LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH] Fix race between attach_task and cpuset_exit
@ 2007-03-25 16:47 Srivatsa Vaddagiri
  2007-03-25 17:52 ` Balbir Singh
  2007-03-25 19:50 ` Paul Jackson
  0 siblings, 2 replies; 16+ messages in thread
From: Srivatsa Vaddagiri @ 2007-03-25 16:47 UTC (permalink / raw)
  To: pj, akpm; +Cc: linux-kernel

Currently cpuset_exit() changes the exiting task's ->cpuset pointer w/o
taking task_lock(). This can lead to ugly races between attach_task and
cpuset_exit. Details of the races are described at 
http://lkml.org/lkml/2007/3/24/132.

Patch below closes those races. It is against 2.6.21-rc4 and has undergone
a simple compile/boot test on an x86_64 box.

Signed-off-by: Srivatsa Vaddagiri <vatsa@in.ibm.com>


---


diff -puN kernel/cpuset.c~cpuset_race_fix kernel/cpuset.c
--- linux-2.6.21-rc4/kernel/cpuset.c~cpuset_race_fix	2007-03-25 21:08:27.000000000 +0530
+++ linux-2.6.21-rc4-vatsa/kernel/cpuset.c	2007-03-25 21:25:05.000000000 +0530
@@ -1182,6 +1182,7 @@ static int attach_task(struct cpuset *cs
 	pid_t pid;
 	struct task_struct *tsk;
 	struct cpuset *oldcs;
+	struct cpuset *oldcs_tobe_released = NULL;
 	cpumask_t cpus;
 	nodemask_t from, to;
 	struct mm_struct *mm;
@@ -1237,6 +1238,8 @@ static int attach_task(struct cpuset *cs
 	}
 	atomic_inc(&cs->count);
 	rcu_assign_pointer(tsk->cpuset, cs);
+	if (atomic_dec_and_test(&oldcs->count))
+		oldcs_tobe_released = oldcs;
 	task_unlock(tsk);
 
 	guarantee_online_cpus(cs, &cpus);
@@ -1257,8 +1260,8 @@ static int attach_task(struct cpuset *cs
 
 	put_task_struct(tsk);
 	synchronize_rcu();
-	if (atomic_dec_and_test(&oldcs->count))
-		check_for_release(oldcs, ppathbuf);
+	if (oldcs_tobe_released)
+		check_for_release(oldcs_tobe_released, ppathbuf);
 	return 0;
 }
 
@@ -2200,10 +2203,6 @@ void cpuset_fork(struct task_struct *chi
  * it is holding that mutex while calling check_for_release(),
  * which calls kmalloc(), so can't be called holding callback_mutex().
  *
- * We don't need to task_lock() this reference to tsk->cpuset,
- * because tsk is already marked PF_EXITING, so attach_task() won't
- * mess with it, or task is a failed fork, never visible to attach_task.
- *
  * the_top_cpuset_hack:
  *
  *    Set the exiting tasks cpuset to the root cpuset (top_cpuset).
@@ -2242,19 +2241,20 @@ void cpuset_exit(struct task_struct *tsk
 {
 	struct cpuset *cs;
 
+	task_lock(tsk);
 	cs = tsk->cpuset;
 	tsk->cpuset = &top_cpuset;	/* the_top_cpuset_hack - see above */
+	atomic_dec(&cs->count);
+	task_unlock(tsk);
 
 	if (notify_on_release(cs)) {
 		char *pathbuf = NULL;
 
 		mutex_lock(&manage_mutex);
-		if (atomic_dec_and_test(&cs->count))
+		if (!atomic_read(&cs->count))
 			check_for_release(cs, &pathbuf);
 		mutex_unlock(&manage_mutex);
 		cpuset_release_agent(pathbuf);
-	} else {
-		atomic_dec(&cs->count);
 	}
 }
 
_




-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Fix race between attach_task and cpuset_exit
  2007-03-25 16:47 [PATCH] Fix race between attach_task and cpuset_exit Srivatsa Vaddagiri
@ 2007-03-25 17:52 ` Balbir Singh
  2007-03-25 19:54   ` Paul Jackson
  2007-03-26 11:50   ` Srivatsa Vaddagiri
  2007-03-25 19:50 ` Paul Jackson
  1 sibling, 2 replies; 16+ messages in thread
From: Balbir Singh @ 2007-03-25 17:52 UTC (permalink / raw)
  To: vatsa; +Cc: pj, akpm, linux-kernel

Hi, Vatsa,

Srivatsa Vaddagiri wrote:
> 
> diff -puN kernel/cpuset.c~cpuset_race_fix kernel/cpuset.c
> --- linux-2.6.21-rc4/kernel/cpuset.c~cpuset_race_fix	2007-03-25 21:08:27.000000000 +0530
> +++ linux-2.6.21-rc4-vatsa/kernel/cpuset.c	2007-03-25 21:25:05.000000000 +0530
> @@ -1182,6 +1182,7 @@ static int attach_task(struct cpuset *cs
>  	pid_t pid;
>  	struct task_struct *tsk;
>  	struct cpuset *oldcs;
> +	struct cpuset *oldcs_tobe_released = NULL;

How about oldcs_to_be_released?

>  	cpumask_t cpus;
>  	nodemask_t from, to;
>  	struct mm_struct *mm;
> @@ -1237,6 +1238,8 @@ static int attach_task(struct cpuset *cs
>  	}
>  	atomic_inc(&cs->count);
>  	rcu_assign_pointer(tsk->cpuset, cs);
> +	if (atomic_dec_and_test(&oldcs->count))
> +		oldcs_tobe_released = oldcs;
>  	task_unlock(tsk);
> 
>  	guarantee_online_cpus(cs, &cpus);
> @@ -1257,8 +1260,8 @@ static int attach_task(struct cpuset *cs
> 
>  	put_task_struct(tsk);
>  	synchronize_rcu();
> -	if (atomic_dec_and_test(&oldcs->count))
> -		check_for_release(oldcs, ppathbuf);
> +	if (oldcs_tobe_released)
> +		check_for_release(oldcs_tobe_released, ppathbuf);
>  	return 0;
>  }
> 
> @@ -2200,10 +2203,6 @@ void cpuset_fork(struct task_struct *chi
>   * it is holding that mutex while calling check_for_release(),
>   * which calls kmalloc(), so can't be called holding callback_mutex().
>   *
> - * We don't need to task_lock() this reference to tsk->cpuset,
> - * because tsk is already marked PF_EXITING, so attach_task() won't
> - * mess with it, or task is a failed fork, never visible to attach_task.
> - *
>   * the_top_cpuset_hack:
>   *
>   *    Set the exiting tasks cpuset to the root cpuset (top_cpuset).
> @@ -2242,19 +2241,20 @@ void cpuset_exit(struct task_struct *tsk
>  {
>  	struct cpuset *cs;
> 
> +	task_lock(tsk);
>  	cs = tsk->cpuset;
>  	tsk->cpuset = &top_cpuset;	/* the_top_cpuset_hack - see above */
> +	atomic_dec(&cs->count);

How about using a local variable like ref_count and using

ref_count = atomic_dec_and_test(&cs->count); This will avoid the two
atomic operations, atomic_dec() and atomic_read() below.

> +	task_unlock(tsk);
> 
>  	if (notify_on_release(cs)) {
>  		char *pathbuf = NULL;
> 
>  		mutex_lock(&manage_mutex);
> -		if (atomic_dec_and_test(&cs->count))
> +		if (!atomic_read(&cs->count))

if (ref_count == 0)

>  			check_for_release(cs, &pathbuf);
>  		mutex_unlock(&manage_mutex);
>  		cpuset_release_agent(pathbuf);
> -	} else {
> -		atomic_dec(&cs->count);
>  	}
>  }
> 

-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Fix race between attach_task and cpuset_exit
  2007-03-25 16:47 [PATCH] Fix race between attach_task and cpuset_exit Srivatsa Vaddagiri
  2007-03-25 17:52 ` Balbir Singh
@ 2007-03-25 19:50 ` Paul Jackson
  2007-03-26 11:55   ` Srivatsa Vaddagiri
  1 sibling, 1 reply; 16+ messages in thread
From: Paul Jackson @ 2007-03-25 19:50 UTC (permalink / raw)
  To: vatsa; +Cc: akpm, linux-kernel

> +	task_lock(tsk);
>  	cs = tsk->cpuset;
>  	tsk->cpuset = &top_cpuset;	/* the_top_cpuset_hack - see above */
> +	atomic_dec(&cs->count);
> +	task_unlock(tsk);
>  
>  	if (notify_on_release(cs)) {
>  		char *pathbuf = NULL;
>  
>  		mutex_lock(&manage_mutex);
> -		if (atomic_dec_and_test(&cs->count))
> +		if (!atomic_read(&cs->count))
>  			check_for_release(cs, &pathbuf);

Is there perhaps another race here?  Could it happen that:
 1) the atomic_dec() lowers the count to say one (any value > zero)
 2) after we drop the task lock, some other task or tasks decrement
    the count to zero
 3) we catch that zero when we atomic_read the count, and issue a spurious
    check_for_release().

I'm thinking that we should use the same oldcs_tobe_released logic
here as we used in attach_task, so that we do an atomic_dec_and_test()
inside the task lock, and if that hit zero, then we know that our
pointer to this cpuset is the last remaining reference, so we can
release that pointer at our convenience, knowing no one else can
reference or mess with that cpuset any more.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Fix race between attach_task and cpuset_exit
  2007-03-25 17:52 ` Balbir Singh
@ 2007-03-25 19:54   ` Paul Jackson
  2007-03-26 11:50   ` Srivatsa Vaddagiri
  1 sibling, 0 replies; 16+ messages in thread
From: Paul Jackson @ 2007-03-25 19:54 UTC (permalink / raw)
  To: balbir; +Cc: vatsa, akpm, linux-kernel

> How about using a local variable like ref_count and using
> 
> ref_count = atomic_dec_and_test(&cs->count); This will avoid the two
> atomic operations, atomic_dec() and atomic_read() below.

This would also seem to address the race I just noticed in my previous
reply.

Though I would suggest that we use the same code pattern in both
cpuset_exit and attach_task -- either both use cpuset_to_be_released,
or both use ref_count.

I tend to prefer the cpuset_to_be_released form - seems a bit more
explicitly clear to me.  But vatsa's doing the patch - he gets the
last vote.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Fix race between attach_task and cpuset_exit
  2007-03-25 17:52 ` Balbir Singh
  2007-03-25 19:54   ` Paul Jackson
@ 2007-03-26 11:50   ` Srivatsa Vaddagiri
  2007-03-26 17:58     ` Paul Jackson
  2007-03-26 18:30     ` Paul Jackson
  1 sibling, 2 replies; 16+ messages in thread
From: Srivatsa Vaddagiri @ 2007-03-26 11:50 UTC (permalink / raw)
  To: Balbir Singh; +Cc: pj, akpm, linux-kernel

On Sun, Mar 25, 2007 at 11:22:15PM +0530, Balbir Singh wrote:
> >+	struct cpuset *oldcs_tobe_released = NULL;
> 
> How about oldcs_to_be_released?

Yes, I wanted to use that, but my typo I guess.

> >@@ -2242,19 +2241,20 @@ void cpuset_exit(struct task_struct *tsk
> > {
> > 	struct cpuset *cs;
> >
> >+	task_lock(tsk);
> > 	cs = tsk->cpuset;
> > 	tsk->cpuset = &top_cpuset;	/* the_top_cpuset_hack - see above */
> >+	atomic_dec(&cs->count);
> 
> How about using a local variable like ref_count and using
> 
> ref_count = atomic_dec_and_test(&cs->count); This will avoid the two
> atomic operations, atomic_dec() and atomic_read() below.

Well, someone may have attached to this cpuset while we were waiting on the 
mutex_lock(). So we need to do a atomic_read again to ensure it is still
unused. But I notice that check_for_release() has that
atomic_read-and-check-for-zero-refcount inbuilt into it, which means we can 
blindly call it. Modified patch in another mail.

-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Fix race between attach_task and cpuset_exit
  2007-03-25 19:50 ` Paul Jackson
@ 2007-03-26 11:55   ` Srivatsa Vaddagiri
  2007-04-05  5:55     ` Paul Menage
  0 siblings, 1 reply; 16+ messages in thread
From: Srivatsa Vaddagiri @ 2007-03-26 11:55 UTC (permalink / raw)
  To: Paul Jackson; +Cc: akpm, linux-kernel, Balbir Singh, menage

On Sun, Mar 25, 2007 at 12:50:25PM -0700, Paul Jackson wrote:
> Is there perhaps another race here? 

Yes, we have!

Modified patch below. Compile/boot tested on a x86_64 box.


Currently cpuset_exit() changes the exiting task's ->cpuset pointer w/o
taking task_lock(). This can lead to ugly races between attach_task and
cpuset_exit. Details of the races are described at
http://lkml.org/lkml/2007/3/24/132.

Patch below closes those races. It is against 2.6.21-rc4 and has
undergone a simple compile/boot test on a x86_64 box.

Signed-off-by: Srivatsa Vaddagiri <vatsa@in.ibm.com>


---


diff -puN kernel/cpuset.c~cpuset_race_fix kernel/cpuset.c
--- linux-2.6.21-rc4/kernel/cpuset.c~cpuset_race_fix	2007-03-25 21:08:27.000000000 +0530
+++ linux-2.6.21-rc4-vatsa/kernel/cpuset.c	2007-03-26 16:48:24.000000000 +0530
@@ -1182,6 +1182,7 @@ static int attach_task(struct cpuset *cs
 	pid_t pid;
 	struct task_struct *tsk;
 	struct cpuset *oldcs;
+	struct cpuset *oldcs_to_be_released = NULL;
 	cpumask_t cpus;
 	nodemask_t from, to;
 	struct mm_struct *mm;
@@ -1237,6 +1238,8 @@ static int attach_task(struct cpuset *cs
 	}
 	atomic_inc(&cs->count);
 	rcu_assign_pointer(tsk->cpuset, cs);
+	if (atomic_dec_and_test(&oldcs->count))
+		oldcs_to_be_released = oldcs;
 	task_unlock(tsk);
 
 	guarantee_online_cpus(cs, &cpus);
@@ -1257,8 +1260,8 @@ static int attach_task(struct cpuset *cs
 
 	put_task_struct(tsk);
 	synchronize_rcu();
-	if (atomic_dec_and_test(&oldcs->count))
-		check_for_release(oldcs, ppathbuf);
+	if (oldcs_to_be_released)
+		check_for_release(oldcs_to_be_released, ppathbuf);
 	return 0;
 }
 
@@ -2200,10 +2203,6 @@ void cpuset_fork(struct task_struct *chi
  * it is holding that mutex while calling check_for_release(),
  * which calls kmalloc(), so can't be called holding callback_mutex().
  *
- * We don't need to task_lock() this reference to tsk->cpuset,
- * because tsk is already marked PF_EXITING, so attach_task() won't
- * mess with it, or task is a failed fork, never visible to attach_task.
- *
  * the_top_cpuset_hack:
  *
  *    Set the exiting tasks cpuset to the root cpuset (top_cpuset).
@@ -2241,20 +2240,23 @@ void cpuset_fork(struct task_struct *chi
 void cpuset_exit(struct task_struct *tsk)
 {
 	struct cpuset *cs;
+	struct cpuset *oldcs_to_be_released = NULL;
 
+	task_lock(tsk);
 	cs = tsk->cpuset;
 	tsk->cpuset = &top_cpuset;	/* the_top_cpuset_hack - see above */
+	if (atomic_dec_and_test(&cs->count))
+		oldcs_to_be_released = cs;
+	task_unlock(tsk);
 
 	if (notify_on_release(cs)) {
 		char *pathbuf = NULL;
 
 		mutex_lock(&manage_mutex);
-		if (atomic_dec_and_test(&cs->count))
-			check_for_release(cs, &pathbuf);
+		if (oldcs_to_be_released)
+			check_for_release(oldcs_to_be_released, &pathbuf);
 		mutex_unlock(&manage_mutex);
 		cpuset_release_agent(pathbuf);
-	} else {
-		atomic_dec(&cs->count);
 	}
 }
 
_


-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Fix race between attach_task and cpuset_exit
  2007-03-26 11:50   ` Srivatsa Vaddagiri
@ 2007-03-26 17:58     ` Paul Jackson
  2007-03-27  6:35       ` Srivatsa Vaddagiri
  2007-03-26 18:30     ` Paul Jackson
  1 sibling, 1 reply; 16+ messages in thread
From: Paul Jackson @ 2007-03-26 17:58 UTC (permalink / raw)
  To: vatsa; +Cc: balbir, akpm, linux-kernel

vatsa wrote:
> Well, someone may have attached to this cpuset while we were waiting on the 
> mutex_lock(). So we need to do a atomic_read again to ensure it is still
> unused.

I don't see how this could happen.  If we hold the task lock that now
(thanks to your good work) guards this pointer, and if we decrement to
zero the reference count on the cpuset to which it points and then
-overwrite- this last remaining visible pointer to that cpuset with a
pointer to a different cpuset, then aren't we guaranteed to be holding
the last remaining reference to the old cpuset in our local variable,
making it impossible for anyone else to attach to it in any way?

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Fix race between attach_task and cpuset_exit
  2007-03-26 11:50   ` Srivatsa Vaddagiri
  2007-03-26 17:58     ` Paul Jackson
@ 2007-03-26 18:30     ` Paul Jackson
  1 sibling, 0 replies; 16+ messages in thread
From: Paul Jackson @ 2007-03-26 18:30 UTC (permalink / raw)
  To: vatsa; +Cc: balbir, akpm, linux-kernel

vatsa wrote:
> Well, someone may have attached to this cpuset while we were waiting on the 
> mutex_lock(). So we need to do a atomic_read again to ensure it is still
> unused

pj replied:
> If we hold the task lock that now
> (thanks to your good work) guards this pointer, and if we decrement to
> zero the reference count on the cpuset to which it points 

I incorrectly described the locking, I think.

A cpusets reference count increases if either another task is attached
to it, or if a task already attached forks.

If we decrement to zero the count, we -know- that no more tasks are
attached to it.

If we hold the cpuset manage_mutex, then we -know- that attach_task can't
attach tasks to it.

But now that you mention it, that additional atomic_read of the count in
check_for_release() seems suspicious to me.  I'm afraid that the following
could happen:

    1) given cpusets A and A/B, with a single task attached to A (none to B)
    2) some other tasks issues a "rmdir A/B"
    3) near the end of the cpuset_rmdir() code, after we have removed A/B, we
	invoke check_for_release()
    4) just at that instant, the single task in A exits, decrementing the
	count on A to zero
    5) both the exiting task and the task doing the rmdir execute the
	cpuset_release_agent() and check_for_release() code.

Aha - yes, maybe that could happen, but it is OK !!

Multiple tasks all pounding on the same cpuset with this release logic
is not a problem. That just ends up being multiple tasks doing a 'rmdir'
on that cpuset from user space.  At most one of them succeeds in
removing the directory, and if it is removed, then the remaining get an
error that there is no such directory.

The race I worried about in last nights post is NOT a problem:
> Is there perhaps another race here?  Could it happen that:
>  1) the atomic_dec() lowers the count to say one (any value > zero)
>  2) after we drop the task lock, some other task or tasks decrement
>     the count to zero
>  3) we catch that zero when we atomic_read the count, and issue a spurious
>     check_for_release().

This is one of the advantages of not actually unlinking cpusets at this point,
when it seems they are no longer used.  We just fire off a user mode helper
thread to attempt a subsequent removal.  That separate thread will get the locking
correct, from the top down, and if the cpuset is still really and truly unused,
then and only then actually remove it.

Simultaneous spurious check_for_release() calls are not a problem!

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Fix race between attach_task and cpuset_exit
  2007-03-26 17:58     ` Paul Jackson
@ 2007-03-27  6:35       ` Srivatsa Vaddagiri
  2007-03-27  8:45         ` Paul Jackson
  0 siblings, 1 reply; 16+ messages in thread
From: Srivatsa Vaddagiri @ 2007-03-27  6:35 UTC (permalink / raw)
  To: Paul Jackson; +Cc: balbir, akpm, linux-kernel

Quoting Paul Jackson <pj@sgi.com>:

> vatsa wrote:
>> Well, someone may have attached to this cpuset while we were waiting on the
>> mutex_lock(). So we need to do a atomic_read again to ensure it is still
>> unused.
>
> I don't see how this could happen. If we hold the task lock that now
> (thanks to your good work) guards this pointer, and if we decrement to
> zero the reference count on the cpuset to which it points and then
> -overwrite- this last remaining visible pointer to that cpuset with a
> pointer to a different cpuset, then aren't we guaranteed to be holding
> the last remaining reference to the old cpuset in our local variable,
> making it impossible for anyone else to attach to it in any way?

Yes, but the cpuset is not made invisible to userspace (in filesystem)  
yet. So as cpuset_exit() discovers that cpuset B has zero refcount now  
and blocks on mutex_lock(&manage_mutex) [ to do a check_for_release  
later ], someone could have done a attach_task to that cpuset.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Fix race between attach_task and cpuset_exit
  2007-03-27  6:35       ` Srivatsa Vaddagiri
@ 2007-03-27  8:45         ` Paul Jackson
  0 siblings, 0 replies; 16+ messages in thread
From: Paul Jackson @ 2007-03-27  8:45 UTC (permalink / raw)
  To: Srivatsa Vaddagiri; +Cc: balbir, akpm, linux-kernel

> Yes, but the cpuset is not made invisible to userspace (in filesystem)  
> yet. So as cpuset_exit() discovers that cpuset B has zero refcount now  
> and blocks on mutex_lock(&manage_mutex) [ to do a check_for_release  
> later ], someone could have done a attach_task to that cpuset.

Agreed.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Fix race between attach_task and cpuset_exit
  2007-03-26 11:55   ` Srivatsa Vaddagiri
@ 2007-04-05  5:55     ` Paul Menage
  2007-04-05  7:00       ` Srivatsa Vaddagiri
  2007-04-10 17:12       ` Srivatsa Vaddagiri
  0 siblings, 2 replies; 16+ messages in thread
From: Paul Menage @ 2007-04-05  5:55 UTC (permalink / raw)
  To: vatsa; +Cc: Paul Jackson, akpm, linux-kernel, Balbir Singh

On 3/26/07, Srivatsa Vaddagiri <vatsa@in.ibm.com> wrote:
> On Sun, Mar 25, 2007 at 12:50:25PM -0700, Paul Jackson wrote:
> > Is there perhaps another race here?
>
> Yes, we have!
>
> Modified patch below. Compile/boot tested on a x86_64 box.
>
>
> Currently cpuset_exit() changes the exiting task's ->cpuset pointer w/o
> taking task_lock(). This can lead to ugly races between attach_task and
> cpuset_exit. Details of the races are described at
> http://lkml.org/lkml/2007/3/24/132.
>
> Patch below closes those races. It is against 2.6.21-rc4 and has
> undergone a simple compile/boot test on a x86_64 box.
>
> Signed-off-by : Srivatsa Vaddagiri <vatsa@in.ibm.com>
>
>
> ---
>
>
> diff -puN kernel/cpuset.c~cpuset_race_fix kernel/cpuset.c
> --- linux-2.6.21-rc4/kernel/cpuset.c~cpuset_race_fix    2007-03-25 21:08:27.000000000 +0530
> +++ linux-2.6.21-rc4-vatsa/kernel/cpuset.c      2007-03-26 16:48:24.000000000 +0530
> @@ -1182,6 +1182,7 @@ static int attach_task(struct cpuset *cs
>         pid_t pid;
>         struct task_struct *tsk;
>         struct cpuset *oldcs;
> +       struct cpuset *oldcs_to_be_released = NULL;
>         cpumask_t cpus;
>         nodemask_t from, to;
>         struct mm_struct *mm;
> @@ -1237,6 +1238,8 @@ static int attach_task(struct cpuset *cs
>         }
>         atomic_inc(&cs->count);
>         rcu_assign_pointer(tsk->cpuset, cs);
> +       if (atomic_dec_and_test(&oldcs->count))
> +               oldcs_to_be_released = oldcs;
>         task_unlock(tsk);
>
>         guarantee_online_cpus(cs, &cpus);
> @@ -1257,8 +1260,8 @@ static int attach_task(struct cpuset *cs
>
>         put_task_struct(tsk);
>         synchronize_rcu();
> -       if (atomic_dec_and_test(&oldcs->count))
> -               check_for_release(oldcs, ppathbuf);
> +       if (oldcs_to_be_released)
> +               check_for_release(oldcs_to_be_released, ppathbuf);
>         return 0;
>  }

Is this part of the patch necessary? If we're adding a task_lock() in
cpuset_exit(), then the problem that Vatsa described (both
cpuset_attach_task() and cpuset_exit() decrementing the same cpuset
count, and cpuset_attach_task() incrementing the count on a cpuset
that the task doesn't eventually end up in) go away, since only one
thread will retrieve the old value of the task's cpuset in order to
decrement its count.


>
> @@ -2200,10 +2203,6 @@ void cpuset_fork(struct task_struct *chi
>   * it is holding that mutex while calling check_for_release(),
>   * which calls kmalloc(), so can't be called holding callback_mutex().
>   *
> - * We don't need to task_lock() this reference to tsk->cpuset,
> - * because tsk is already marked PF_EXITING, so attach_task() won't
> - * mess with it, or task is a failed fork, never visible to attach_task.
> - *
>   * the_top_cpuset_hack:
>   *
>   *    Set the exiting tasks cpuset to the root cpuset (top_cpuset).
> @@ -2241,20 +2240,23 @@ void cpuset_fork(struct task_struct *chi
>  void cpuset_exit(struct task_struct *tsk)
>  {
>         struct cpuset *cs;
> +       struct cpuset *oldcs_to_be_released = NULL;
>
> +       task_lock(tsk);
>         cs = tsk->cpuset;
>         tsk->cpuset = &top_cpuset;      /* the_top_cpuset_hack - see above */
> +       if (atomic_dec_and_test(&cs->count))
> +               oldcs_to_be_released = cs;
> +       task_unlock(tsk);
>

I think this is still racy - at this point we're holding a reference
on a cpuset that could have a zero count, and we don't hold
manage_mutex or callback_mutex. So a concurrent rmdir could zap the
directory and free the cpuset.

Shouldn't we just put a task_lock()/task_unlock() around these lines
and leave everything else as-is?

	task_lock(tsk);
	cs = tsk->cpuset;
	tsk->cpuset = &top_cpuset;	/* the_top_cpuset_hack - see above */
	task_unlock(tsk)

Paul

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Fix race between attach_task and cpuset_exit
  2007-04-05  5:55     ` Paul Menage
@ 2007-04-05  7:00       ` Srivatsa Vaddagiri
  2007-04-05  7:01         ` Paul Menage
  2007-04-10 17:12       ` Srivatsa Vaddagiri
  1 sibling, 1 reply; 16+ messages in thread
From: Srivatsa Vaddagiri @ 2007-04-05  7:00 UTC (permalink / raw)
  To: Paul Menage; +Cc: Paul Jackson, akpm, linux-kernel, Balbir Singh

On Wed, Apr 04, 2007 at 10:55:01PM -0700, Paul Menage wrote:
> >@@ -1257,8 +1260,8 @@ static int attach_task(struct cpuset *cs
> >
> >        put_task_struct(tsk);
> >        synchronize_rcu();
> >-       if (atomic_dec_and_test(&oldcs->count))
> >-               check_for_release(oldcs, ppathbuf);
> >+       if (oldcs_to_be_released)
> >+               check_for_release(oldcs_to_be_released, ppathbuf);
> >        return 0;
> > }
> 
> Is this part of the patch necessary? If we're adding a task_lock() in
> cpuset_exit(), then the problem that Vatsa described (both
> cpuset_attach_task() and cpuset_exit() decrementing the same cpuset
> count, and cpuset_attach_task() incrementing the count on a cpuset
> that the task doesn't eventually end up in) go away, since only one
> thread will retrieve the old value of the task's cpuset in order to
> decrement its count.

You *have* to drop/inc the refcount inside the task_lock, otherwise it is
racy.

	task_lock(T1);
	old_cs = T1->cpuset (C1)
	atomic_inc(&C2->count);
	T1->cpuset = C2;
	task_unlock();

	...

	synchronize_rcu();

	if (atomic_dec_and_test(&C1->count))
		check_for_release(..)

is incorrect. For ex: T1's refcount on C1 may have already been dropped
by now in cpuset_exit() and dropping the refcount again can lead to
negative refcounts. 
		.
> > void cpuset_exit(struct task_struct *tsk)
> > {
> >        struct cpuset *cs;
> >+       struct cpuset *oldcs_to_be_released = NULL;
> >
> >+       task_lock(tsk);
> >        cs = tsk->cpuset;
> >        tsk->cpuset = &top_cpuset;      /* the_top_cpuset_hack - see above 
> >        */
> >+       if (atomic_dec_and_test(&cs->count))
> >+               oldcs_to_be_released = cs;
> >+       task_unlock(tsk);
> >
> 
> I think this is still racy - at this point we're holding a reference
> on a cpuset that could have a zero count,

How's that possible? That you have a zero-refcount cpuset with non empty
tasks in it?

> and we don't hold
> manage_mutex or callback_mutex. So a concurrent rmdir could zap the
> directory and free the cpuset.

I don't think that is possible. Can you explain?

> Shouldn't we just put a task_lock()/task_unlock() around these lines
> and leave everything else as-is?
> 
> 	task_lock(tsk);
> 	cs = tsk->cpuset;
> 	tsk->cpuset = &top_cpuset;	/* the_top_cpuset_hack - see above */
> 	task_unlock(tsk)

If we don't drop refcount inside task_lock() it makes it racy with
attach_task(). 'cs' derived above may not be the right cpuset to drop
refcount on later in cpuset_exit.

-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Fix race between attach_task and cpuset_exit
  2007-04-05  7:00       ` Srivatsa Vaddagiri
@ 2007-04-05  7:01         ` Paul Menage
  2007-04-05  8:14           ` Srivatsa Vaddagiri
  0 siblings, 1 reply; 16+ messages in thread
From: Paul Menage @ 2007-04-05  7:01 UTC (permalink / raw)
  To: vatsa; +Cc: Paul Jackson, akpm, linux-kernel, Balbir Singh

On 4/5/07, Srivatsa Vaddagiri <vatsa@in.ibm.com> wrote:
> On Wed, Apr 04, 2007 at 10:55:01PM -0700, Paul Menage wrote:
> > >@@ -1257,8 +1260,8 @@ static int attach_task(struct cpuset *cs
> > >
> > >        put_task_struct(tsk);
> > >        synchronize_rcu();
> > >-       if (atomic_dec_and_test(&oldcs->count))
> > >-               check_for_release(oldcs, ppathbuf);
> > >+       if (oldcs_to_be_released)
> > >+               check_for_release(oldcs_to_be_released, ppathbuf);
> > >        return 0;
> > > }
> >
> > Is this part of the patch necessary? If we're adding a task_lock() in
> > cpuset_exit(), then the problem that Vatsa described (both
> > cpuset_attach_task() and cpuset_exit() decrementing the same cpuset
> > count, and cpuset_attach_task() incrementing the count on a cpuset
> > that the task doesn't eventually end up in) go away, since only one
> > thread will retrieve the old value of the task's cpuset in order to
> > decrement its count.
>
> You *have* to drop/inc the refcount inside the task_lock, otherwise it is
> racy.
>
>         task_lock(T1);
>         old_cs = T1->cpuset (C1)
>         atomic_inc(&C2->count);
>         T1->cpuset = C2;
>         task_unlock();
>
>         ...
>
>         synchronize_rcu();
>
>         if (atomic_dec_and_test(&C1->count))
>                 check_for_release(..)
>
> is incorrect. For ex: T1's refcount on C1 may have already been dropped
> by now in cpuset_exit() and dropping the refcount again can lead to
> negative refcounts.

I don't see how that could happen. Assuming we add the
task_lock()/task_unlock() in cpuset_exit(), then only one of the two
threads (either cpuset_exit() or attach_task() ) can copy C1 from
T1->cpuset and replace it with something new, and hence only one of
them can drop the refcount.

>                 .
> > > void cpuset_exit(struct task_struct *tsk)
> > > {
> > >        struct cpuset *cs;
> > >+       struct cpuset *oldcs_to_be_released = NULL;
> > >
> > >+       task_lock(tsk);
> > >        cs = tsk->cpuset;
> > >        tsk->cpuset = &top_cpuset;      /* the_top_cpuset_hack - see above
> > >        */
> > >+       if (atomic_dec_and_test(&cs->count))
> > >+               oldcs_to_be_released = cs;
> > >+       task_unlock(tsk);
> > >
> >
> > I think this is still racy - at this point we're holding a reference
> > on a cpuset that could have a zero count,
>
> How's that possible? That you have a zero-refcount cpuset with non empty
> tasks in it?

If this is the last task in cs, then cs->count will be 1. We remove
this task from cs, and decrement its count to 0. Then another cpu does
cpuset_rmdir(), takes manage_mutex, sees that the count is 0, cleans
up the cpuset, drops the dentry, and the cpuset gets freed. Then we
get to run again, and we dereference an invalid cpuset.

>
> > Shouldn't we just put a task_lock()/task_unlock() around these lines
> > and leave everything else as-is?
> >
> >       task_lock(tsk);
> >       cs = tsk->cpuset;
> >       tsk->cpuset = &top_cpuset;      /* the_top_cpuset_hack - see above */
> >       task_unlock(tsk)
>
> If we don't drop refcount inside task_lock() it makes it racy with
> attach_task(). 'cs' derived above may not be the right cpuset to drop
> refcount on later in cpuset_exit.

How so? If we're holding task_lock(tsk) then we're atomic with respect
to the code in attach_task that reads the old cpuset and puts in a new
one. Only the thread that actually reads the value and replaces it
will get to drop the old value.

Paul

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Fix race between attach_task and cpuset_exit
  2007-04-05  8:14           ` Srivatsa Vaddagiri
@ 2007-04-05  8:10             ` Paul Menage
  0 siblings, 0 replies; 16+ messages in thread
From: Paul Menage @ 2007-04-05  8:10 UTC (permalink / raw)
  To: vatsa; +Cc: Paul Jackson, akpm, linux-kernel, Balbir Singh

On 4/5/07, Srivatsa Vaddagiri <vatsa@in.ibm.com> wrote:
>
> Hmm yes.. I am surprised we aren't doing a synchronize_rcu in
> cpuset_rmdir() before dropping the dentry. Did you want to send a patch
> for that?

Currently cpuset_exit() isn't in a rcu section so it wouldn't help.

But this ceases to be a problem if we don't leave the refcount
dropping in cpuset_exit() where it is currently - it doesn't drop the
refcount until it's holding manage_mutex, at which point it's safe to
hold a cpuset reference with a refcount of 0.

Paul

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Fix race between attach_task and cpuset_exit
  2007-04-05  7:01         ` Paul Menage
@ 2007-04-05  8:14           ` Srivatsa Vaddagiri
  2007-04-05  8:10             ` Paul Menage
  0 siblings, 1 reply; 16+ messages in thread
From: Srivatsa Vaddagiri @ 2007-04-05  8:14 UTC (permalink / raw)
  To: Paul Menage; +Cc: Paul Jackson, akpm, linux-kernel, Balbir Singh

On Thu, Apr 05, 2007 at 12:01:53AM -0700, Paul Menage wrote:
> I don't see how that could happen. Assuming we add the
> task_lock()/task_unlock() in cpuset_exit(), then only one of the two
> threads (either cpuset_exit() or attach_task() ) can copy C1 from
> T1->cpuset and replace it with something new, and hence only one of
> them can drop the refcount.

You are correct! We can just add the task_lock()/unlock() in cpuset_exit
and be done away with the races I described.

> >How's that possible? That you have a zero-refcount cpuset with non empty
> >tasks in it?
> 
> If this is the last task in cs, then cs->count will be 1. We remove
> this task from cs, and decrement its count to 0. Then another cpu does
> cpuset_rmdir(), takes manage_mutex, sees that the count is 0, cleans
> up the cpuset, drops the dentry, and the cpuset gets freed. Then we
> get to run again, and we dereference an invalid cpuset.

Hmm yes .. I am surprised we aren't doing a synchronize_rcu in
cpuset_rmdir() before dropping the dentry. Did you want to send a patch
for that?

attach_task() is ugly ....

-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH] Fix race between attach_task and cpuset_exit
  2007-04-05  5:55     ` Paul Menage
  2007-04-05  7:00       ` Srivatsa Vaddagiri
@ 2007-04-10 17:12       ` Srivatsa Vaddagiri
  1 sibling, 0 replies; 16+ messages in thread
From: Srivatsa Vaddagiri @ 2007-04-10 17:12 UTC (permalink / raw)
  To: Paul Menage; +Cc: Paul Jackson, akpm, linux-kernel, Balbir Singh

On Wed, Apr 04, 2007 at 10:55:01PM -0700, Paul Menage wrote:
> Shouldn't we just put a task_lock()/task_unlock() around these lines
> and leave everything else as-is?
> 
> 	task_lock(tsk);
> 	cs = tsk->cpuset;
> 	tsk->cpuset = &top_cpuset;	/* the_top_cpuset_hack - see above */
> 	task_unlock(tsk)

Andrew,
	Can you drop fix-race-between-attach_task-and-cpuset_exit.patch
and take this fix instead, which addresses some points raised by Paul
Menage?





Currently cpuset_exit() changes the exiting task's ->cpuset pointer w/o
taking task_lock().  This can lead to ugly races between attach_task and
cpuset_exit.  Details of the races are described at
http://lkml.org/lkml/2007/3/24/132.

Patch below closes those races.  It is against 2.6.21-rc6-mm1 and has
undergone a simple compile/boot test on a x86_64 box.

Signed-off-by: Srivatsa Vaddagiri <vatsa@in.ibm.com>


---


diff -puN kernel/cpuset.c~cpuset_race_fix kernel/cpuset.c
--- linux-2.6.21-rc6/kernel/cpuset.c~cpuset_race_fix	2007-04-10 20:53:57.000000000 +0530
+++ linux-2.6.21-rc6-vatsa/kernel/cpuset.c	2007-04-10 22:08:46.000000000 +0530
@@ -2119,10 +2119,6 @@ void cpuset_fork(struct task_struct *chi
  * it is holding that mutex while calling check_for_release(),
  * which calls kmalloc(), so can't be called holding callback_mutex().
  *
- * We don't need to task_lock() this reference to tsk->cpuset,
- * because tsk is already marked PF_EXITING, so attach_task() won't
- * mess with it, or task is a failed fork, never visible to attach_task.
- *
  * the_top_cpuset_hack:
  *
  *    Set the exiting tasks cpuset to the root cpuset (top_cpuset).
@@ -2161,8 +2157,10 @@ void cpuset_exit(struct task_struct *tsk
 {
 	struct cpuset *cs;
 
+	task_lock(current);
 	cs = tsk->cpuset;
 	tsk->cpuset = &top_cpuset;	/* the_top_cpuset_hack - see above */
+	task_unlock(current);
 
 	if (notify_on_release(cs)) {
 		char *pathbuf = NULL;
_

-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2007-04-10 17:05 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-03-25 16:47 [PATCH] Fix race between attach_task and cpuset_exit Srivatsa Vaddagiri
2007-03-25 17:52 ` Balbir Singh
2007-03-25 19:54   ` Paul Jackson
2007-03-26 11:50   ` Srivatsa Vaddagiri
2007-03-26 17:58     ` Paul Jackson
2007-03-27  6:35       ` Srivatsa Vaddagiri
2007-03-27  8:45         ` Paul Jackson
2007-03-26 18:30     ` Paul Jackson
2007-03-25 19:50 ` Paul Jackson
2007-03-26 11:55   ` Srivatsa Vaddagiri
2007-04-05  5:55     ` Paul Menage
2007-04-05  7:00       ` Srivatsa Vaddagiri
2007-04-05  7:01         ` Paul Menage
2007-04-05  8:14           ` Srivatsa Vaddagiri
2007-04-05  8:10             ` Paul Menage
2007-04-10 17:12       ` Srivatsa Vaddagiri

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).