LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH 1/2] do_wait reorganization
@ 2008-03-29  3:34 Roland McGrath
  2008-03-29  3:35 ` [PATCH 2/2] ptrace children revamp Roland McGrath
                   ` (2 more replies)
  0 siblings, 3 replies; 23+ messages in thread
From: Roland McGrath @ 2008-03-29  3:34 UTC (permalink / raw)
  To: Andrew Morton, Linus Torvalds; +Cc: Oleg Nesterov, linux-kernel

This breaks out the guts of do_wait into two subfunctions.
The control flow is less nonobvious without so much goto.
do_wait_thread contains the main work of the outer loop.
wait_consider_task contains the main work of the inner loop.

Signed-off-by: Roland McGrath <roland@redhat.com>
---
 kernel/exit.c |  191 +++++++++++++++++++++++++++++++++++----------------------
 1 files changed, 118 insertions(+), 73 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index 53872bf..f2cf0a1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1442,89 +1442,136 @@ static int wait_task_continued(struct task_struct *p, int noreap,
 	return retval;
 }
 
+/*
+ * Consider @p for a wait by @parent.
+ *
+ * -ECHILD should be in *@retval before the first call.
+ * Returns nonzero if we have unlocked tasklist_lock and have
+ * the final return value ready in *@retval.  Returns zero if
+ * the search for a child should continue; then *@retval is 0
+ * if @p is an eligible child, or still -ECHILD.
+ */
+static int wait_consider_task(struct task_struct *parent,
+			      struct task_struct *p, int *retval,
+			      enum pid_type type, struct pid *pid, int options,
+			      struct siginfo __user *infop,
+			      int __user *stat_addr, struct rusage __user *ru)
+{
+	int ret = eligible_child(type, pid, options, p);
+	if (!ret)
+		return 0;
+
+	if (unlikely(ret < 0)) {
+		read_unlock(&tasklist_lock);
+		*retval = ret;
+		return 1;
+	}
+
+	if (task_is_stopped_or_traced(p)) {
+		/*
+		 * It's stopped now, so it might later
+		 * continue, exit, or stop again.
+		 */
+		*retval = 0;
+		if ((p->ptrace & PT_PTRACED) ||
+		    (options & WUNTRACED)) {
+			*retval = wait_task_stopped(p, (options & WNOWAIT),
+						    infop, stat_addr, ru);
+			if (*retval)
+				return 1;
+		}
+	} else if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) {
+		/*
+		 * We don't reap group leaders with subthreads.
+		 */
+		if (likely(options & WEXITED)) {
+			*retval = wait_task_zombie(p, (options & WNOWAIT),
+						   infop, stat_addr, ru);
+			if (*retval)
+				return 1;
+		}
+	} else if (p->exit_state != EXIT_DEAD) {
+		/*
+		 * It's running now, so it might later
+		 * exit, stop, or stop and then continue.
+		 */
+		*retval = 0;
+		if (unlikely(options & WCONTINUED)) {
+			*retval = wait_task_continued(p, (options & WNOWAIT),
+						      infop, stat_addr, ru);
+			if (*retval)
+				return 1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Do the work of do_wait() for one thread in the group, @tsk.
+ * -ECHILD should be in *@retval before the first call.
+ * Returns nonzero if we have unlocked tasklist_lock and have
+ * the final return value ready in *@retval.
+ * Returns zero if the search for a child should continue; then
+ * *@retval is 0 if there are any eligible children, or still -ECHILD.
+ */
+static int do_wait_thread(struct task_struct *tsk, int *retval,
+			  enum pid_type type, struct pid *pid, int options,
+			  struct siginfo __user *infop, int __user *stat_addr,
+			  struct rusage __user *ru)
+{
+	struct task_struct *p;
+
+	list_for_each_entry(p, &tsk->children, sibling) {
+		if (wait_consider_task(tsk, p, retval, type, pid, options,
+				       infop, stat_addr, ru))
+			return 1;
+	}
+
+	/*
+	 * If we never saw an eligile child, check for children stolen by
+	 * ptrace.  We don't leave -ECHILD in *@retval if there are any,
+	 * because we will eventually be allowed to wait for them again.
+	 */
+	if (*retval)
+		list_for_each_entry(p, &tsk->ptrace_children, ptrace_list) {
+			int ret = eligible_child(type, pid, options, p);
+			if (ret) {
+				*retval = unlikely(ret < 0) ? ret : 0;
+				break;
+			}
+		}
+
+	return 0;
+}
+
 static long do_wait(enum pid_type type, struct pid *pid, int options,
 		    struct siginfo __user *infop, int __user *stat_addr,
 		    struct rusage __user *ru)
 {
 	DECLARE_WAITQUEUE(wait, current);
 	struct task_struct *tsk;
-	int flag, retval;
+	int retval;
 
 	add_wait_queue(&current->signal->wait_chldexit,&wait);
 repeat:
-	/* If there is nothing that can match our critier just get out */
+	/*
+	 * If there is nothing that can match our critiera just get out.
+	 * We will clear @retval to zero if we see any child that might later
+	 * match our criteria, even if we are not able to reap it yet.
+	 */
 	retval = -ECHILD;
 	if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type])))
 		goto end;
 
-	/*
-	 * We will set this flag if we see any child that might later
-	 * match our criteria, even if we are not able to reap it yet.
-	 */
-	flag = retval = 0;
 	current->state = TASK_INTERRUPTIBLE;
 	read_lock(&tasklist_lock);
 	tsk = current;
 	do {
-		struct task_struct *p;
+		if (do_wait_thread(tsk, &retval, type, pid, options,
+				   infop, stat_addr, ru))
+			goto end;
 
-		list_for_each_entry(p, &tsk->children, sibling) {
-			int ret = eligible_child(type, pid, options, p);
-			if (!ret)
-				continue;
-
-			if (unlikely(ret < 0)) {
-				retval = ret;
-			} else if (task_is_stopped_or_traced(p)) {
-				/*
-				 * It's stopped now, so it might later
-				 * continue, exit, or stop again.
-				 */
-				flag = 1;
-				if (!(p->ptrace & PT_PTRACED) &&
-				    !(options & WUNTRACED))
-					continue;
-
-				retval = wait_task_stopped(p,
-						(options & WNOWAIT), infop,
-						stat_addr, ru);
-			} else if (p->exit_state == EXIT_ZOMBIE &&
-					!delay_group_leader(p)) {
-				/*
-				 * We don't reap group leaders with subthreads.
-				 */
-				if (!likely(options & WEXITED))
-					continue;
-				retval = wait_task_zombie(p,
-						(options & WNOWAIT), infop,
-						stat_addr, ru);
-			} else if (p->exit_state != EXIT_DEAD) {
-				/*
-				 * It's running now, so it might later
-				 * exit, stop, or stop and then continue.
-				 */
-				flag = 1;
-				if (!unlikely(options & WCONTINUED))
-					continue;
-				retval = wait_task_continued(p,
-						(options & WNOWAIT), infop,
-						stat_addr, ru);
-			}
-			if (retval != 0) /* tasklist_lock released */
-				goto end;
-		}
-		if (!flag) {
-			list_for_each_entry(p, &tsk->ptrace_children,
-								ptrace_list) {
-				flag = eligible_child(type, pid, options, p);
-				if (!flag)
-					continue;
-				if (likely(flag > 0))
-					break;
-				retval = flag;
-				goto end;
-			}
-		}
 		if (options & __WNOTHREAD)
 			break;
 		tsk = next_thread(tsk);
@@ -1532,16 +1579,14 @@ repeat:
 	} while (tsk != current);
 	read_unlock(&tasklist_lock);
 
-	if (flag) {
-		if (options & WNOHANG)
-			goto end;
+	if (!retval && !(options & WNOHANG)) {
 		retval = -ERESTARTSYS;
-		if (signal_pending(current))
-			goto end;
-		schedule();
-		goto repeat;
+		if (!signal_pending(current)) {
+			schedule();
+			goto repeat;
+		}
 	}
-	retval = -ECHILD;
+
 end:
 	current->state = TASK_RUNNING;
 	remove_wait_queue(&current->signal->wait_chldexit,&wait);

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH 2/2] ptrace children revamp
  2008-03-29  3:34 [PATCH 1/2] do_wait reorganization Roland McGrath
@ 2008-03-29  3:35 ` Roland McGrath
  2008-03-29 10:39   ` Oleg Nesterov
  2008-03-29 10:35 ` [PATCH 1/2] do_wait reorganization Oleg Nesterov
  2008-03-29 16:16 ` Linus Torvalds
  2 siblings, 1 reply; 23+ messages in thread
From: Roland McGrath @ 2008-03-29  3:35 UTC (permalink / raw)
  To: Andrew Morton, Linus Torvalds; +Cc: Oleg Nesterov, linux-kernel

ptrace no longer fiddles with the children/sibling links, and the
old ptrace_children list is gone.  Now using PTRACE_ATTACH on
someone else's child just uses the new ptrace_attach list instead.

There should be no user-visible difference that matters.  The only
change is the order in which do_wait() sees multiple stopped children
and stopped ptrace attachees.  Since wait_task_stopped() was changed
earlier so it no longer reorders the children list, we already know
this won't cause any new problems.

Signed-off-by: Roland McGrath <roland@redhat.com>
---
 include/linux/init_task.h |    2 +-
 include/linux/sched.h     |   27 +++----
 kernel/exit.c             |  180 ++++++++++++++++++++++-----------------------
 kernel/fork.c             |    4 +-
 kernel/ptrace.c           |   18 ++---
 5 files changed, 111 insertions(+), 120 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 1f74e1d..70dbb73 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -157,7 +157,7 @@ extern struct group_info init_groups;
 		.nr_cpus_allowed = NR_CPUS,				\
 	},								\
 	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
-	.ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children),		\
+	.ptrace_attach	= LIST_HEAD_INIT(tsk.ptrace_attach),		\
 	.ptrace_list	= LIST_HEAD_INIT(tsk.ptrace_list),		\
 	.real_parent	= &tsk,						\
 	.parent		= &tsk,						\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6c275e8..5a755a0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1043,12 +1043,6 @@ struct task_struct {
 #endif
 
 	struct list_head tasks;
-	/*
-	 * ptrace_list/ptrace_children forms the list of my children
-	 * that were stolen by a ptracer.
-	 */
-	struct list_head ptrace_children;
-	struct list_head ptrace_list;
 
 	struct mm_struct *mm, *active_mm;
 
@@ -1070,18 +1064,26 @@ struct task_struct {
 	/* 
 	 * pointers to (original) parent process, youngest child, younger sibling,
 	 * older sibling, respectively.  (p->father can be replaced with 
-	 * p->parent->pid)
+	 * p->real_parent->pid)
 	 */
-	struct task_struct *real_parent; /* real parent process (when being debugged) */
-	struct task_struct *parent;	/* parent process */
+	struct task_struct *real_parent; /* real parent process */
+	struct task_struct *parent; /* recipient of SIGCHLD, wait4() reports */
 	/*
-	 * children/sibling forms the list of my children plus the
-	 * tasks I'm ptracing.
+	 * children/sibling forms the list of my natural children
 	 */
 	struct list_head children;	/* list of my children */
 	struct list_head sibling;	/* linkage in my parent's children list */
 	struct task_struct *group_leader;	/* threadgroup leader */
 
+	/*
+	 * ptrace_attach is the list of tasks I have PTRACE_ATTACH'd to,
+	 * excluding my own natural children.
+	 * ptrace_list is my own link on my PTRACE_ATTACHer's list,
+	 * which is p->parent->ptrace_attach.
+	 */
+	struct list_head ptrace_attach;
+	struct list_head ptrace_list;
+
 	/* PID/PID hash table linkage. */
 	struct pid_link pids[PIDTYPE_MAX];
 	struct list_head thread_group;
@@ -1794,9 +1796,6 @@ extern void wait_task_inactive(struct task_struct * p);
 #define wait_task_inactive(p)	do { } while (0)
 #endif
 
-#define remove_parent(p)	list_del_init(&(p)->sibling)
-#define add_parent(p)		list_add_tail(&(p)->sibling,&(p)->parent->children)
-
 #define next_task(p)	list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks)
 
 #define for_each_process(p) \
diff --git a/kernel/exit.c b/kernel/exit.c
index f2cf0a1..fdfda5f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -64,7 +64,7 @@ static void __unhash_process(struct task_struct *p)
 		__get_cpu_var(process_counts)--;
 	}
 	list_del_rcu(&p->thread_group);
-	remove_parent(p);
+	list_del_init(&p->sibling);
 }
 
 /*
@@ -140,6 +140,18 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
 	put_task_struct(container_of(rhp, struct task_struct, rcu));
 }
 
+/*
+ * Do final ptrace-related cleanup of a zombie being reaped.
+ *
+ * Called with write_lock(&tasklist_lock) held.
+ */
+static void ptrace_release_task(struct task_struct *p)
+{
+	ptrace_unlink(p);
+	BUG_ON(!list_empty(&p->ptrace_list));
+	BUG_ON(!list_empty(&p->ptrace_attach));
+}
+
 void release_task(struct task_struct * p)
 {
 	struct task_struct *leader;
@@ -148,8 +160,7 @@ repeat:
 	atomic_dec(&p->user->processes);
 	proc_flush_task(p);
 	write_lock_irq(&tasklist_lock);
-	ptrace_unlink(p);
-	BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
+	ptrace_release_task(p);
 	__exit_signal(p);
 
 	/*
@@ -303,9 +314,8 @@ static void reparent_to_kthreadd(void)
 
 	ptrace_unlink(current);
 	/* Reparent to init */
-	remove_parent(current);
 	current->real_parent = current->parent = kthreadd_task;
-	add_parent(current);
+	list_move_tail(&current->sibling, &current->real_parent->children);
 
 	/* Set the exit signal to SIGCHLD so we signal init on exit */
 	current->exit_signal = SIGCHLD;
@@ -616,37 +626,60 @@ static void exit_mm(struct task_struct * tsk)
 	mmput(mm);
 }
 
-static void
-reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
+/*
+ * Detach any ptrace attachees (not our own natural children).
+ * Any that need to be release_task'd are put on the @dead list.
+ *
+ * Called with write_lock(&tasklist_lock) held.
+ */
+static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
 {
-	if (p->pdeath_signal)
-		/* We already hold the tasklist_lock here.  */
-		group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
+	struct task_struct *p, *n;
 
-	/* Move the child from its dying parent to the new one.  */
-	if (unlikely(traced)) {
-		/* Preserve ptrace links if someone else is tracing this child.  */
-		list_del_init(&p->ptrace_list);
-		if (p->parent != p->real_parent)
-			list_add(&p->ptrace_list, &p->real_parent->ptrace_children);
-	} else {
-		/* If this child is being traced, then we're the one tracing it
-		 * anyway, so let go of it.
-		 */
-		p->ptrace = 0;
-		remove_parent(p);
-		p->parent = p->real_parent;
-		add_parent(p);
+	list_for_each_entry_safe(p, n, &parent->ptrace_attach, ptrace_list) {
+		__ptrace_unlink(p);
 
-		if (task_is_traced(p)) {
-			/*
-			 * If it was at a trace stop, turn it into
-			 * a normal stop since it's no longer being
-			 * traced.
-			 */
-			ptrace_untrace(p);
+		/*
+		 * If it's a zombie, our attachedness prevented normal
+		 * parent notification or self-reaping.  Do notification
+		 * now if it would have happened earlier.  If it should
+		 * reap itself, add it to the @dead list.  We can't call
+		 * release_task() here because we already hold tasklist_lock.
+		 */
+		if (p->exit_state == EXIT_ZOMBIE) {
+			if (p->exit_signal != -1 && !thread_group_empty(p))
+				do_notify_parent(p, p->exit_signal);
+			if (p->exit_signal == -1)
+				list_add(&p->ptrace_list, dead);
 		}
 	}
+}
+
+/*
+ * Finish up exit-time ptrace cleanup.
+ *
+ * Called without locks.
+ */
+static void ptrace_exit_finish(struct task_struct *parent,
+			       struct list_head *dead)
+{
+	struct task_struct *p, *n;
+
+	BUG_ON(!list_empty(&parent->ptrace_attach));
+
+	list_for_each_entry_safe(p, n, dead, ptrace_list) {
+		list_del_init(&p->ptrace_list);
+		release_task(p);
+	}
+}
+
+static void reparent_thread(struct task_struct *p, struct task_struct *father)
+{
+	if (p->pdeath_signal)
+		/* We already hold the tasklist_lock here.  */
+		group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
+
+	list_move_tail(&p->sibling, &p->real_parent->children);
 
 	/* If this is a threaded reparent there is no need to
 	 * notify anyone anything has happened.
@@ -661,7 +694,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
 	/* If we'd notified the old parent about this child's death,
 	 * also notify the new parent.
 	 */
-	if (!traced && p->exit_state == EXIT_ZOMBIE &&
+	if (p->exit_state == EXIT_ZOMBIE &&
 	    p->exit_signal != -1 && thread_group_empty(p))
 		do_notify_parent(p, p->exit_signal);
 
@@ -678,9 +711,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
 static void forget_original_parent(struct task_struct *father)
 {
 	struct task_struct *p, *n, *reaper = father;
-	struct list_head ptrace_dead;
-
-	INIT_LIST_HEAD(&ptrace_dead);
+	LIST_HEAD(ptrace_dead);
 
 	write_lock_irq(&tasklist_lock);
 
@@ -692,58 +723,24 @@ static void forget_original_parent(struct task_struct *father)
 		}
 	} while (reaper->flags & PF_EXITING);
 
+	ptrace_exit(father, &ptrace_dead);
+
 	/*
-	 * There are only two places where our children can be:
-	 *
-	 * - in our child list
-	 * - in our ptraced child list
-	 *
-	 * Search them and reparent children.
+	 * Reparent our natural children.
 	 */
 	list_for_each_entry_safe(p, n, &father->children, sibling) {
-		int ptrace;
-
-		ptrace = p->ptrace;
-
-		/* if father isn't the real parent, then ptrace must be enabled */
-		BUG_ON(father != p->real_parent && !ptrace);
-
-		if (father == p->real_parent) {
-			/* reparent with a reaper, real father it's us */
-			p->real_parent = reaper;
-			reparent_thread(p, father, 0);
-		} else {
-			/* reparent ptraced task to its real parent */
-			__ptrace_unlink (p);
-			if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 &&
-			    thread_group_empty(p))
-				do_notify_parent(p, p->exit_signal);
-		}
-
-		/*
-		 * if the ptraced child is a zombie with exit_signal == -1
-		 * we must collect it before we exit, or it will remain
-		 * zombie forever since we prevented it from self-reap itself
-		 * while it was being traced by us, to be able to see it in wait4.
-		 */
-		if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1))
-			list_add(&p->ptrace_list, &ptrace_dead);
-	}
-
-	list_for_each_entry_safe(p, n, &father->ptrace_children, ptrace_list) {
 		p->real_parent = reaper;
-		reparent_thread(p, father, 1);
+		if (p->parent == father) {
+			ptrace_unlink(p);
+			p->parent = p->real_parent;
+		}
+		reparent_thread(p, father);
 	}
 
 	write_unlock_irq(&tasklist_lock);
 	BUG_ON(!list_empty(&father->children));
-	BUG_ON(!list_empty(&father->ptrace_children));
-
-	list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_list) {
-		list_del_init(&p->ptrace_list);
-		release_task(p);
-	}
 
+	ptrace_exit_finish(father, &ptrace_dead);
 }
 
 /*
@@ -1467,6 +1464,15 @@ static int wait_consider_task(struct task_struct *parent,
 		return 1;
 	}
 
+	if (unlikely(p->parent != parent)) {
+		/*
+		 * This child is hidden by someone else who did PTRACE_ATTACH.
+		 * We aren't allowed to see it now, but eventually we will.
+		 */
+		*retval = 0;
+		return 0;
+	}
+
 	if (task_is_stopped_or_traced(p)) {
 		/*
 		 * It's stopped now, so it might later
@@ -1528,19 +1534,11 @@ static int do_wait_thread(struct task_struct *tsk, int *retval,
 			return 1;
 	}
 
-	/*
-	 * If we never saw an eligile child, check for children stolen by
-	 * ptrace.  We don't leave -ECHILD in *@retval if there are any,
-	 * because we will eventually be allowed to wait for them again.
-	 */
-	if (*retval)
-		list_for_each_entry(p, &tsk->ptrace_children, ptrace_list) {
-			int ret = eligible_child(type, pid, options, p);
-			if (ret) {
-				*retval = unlikely(ret < 0) ? ret : 0;
-				break;
-			}
-		}
+	list_for_each_entry(p, &tsk->ptrace_attach, ptrace_list) {
+		if (wait_consider_task(tsk, p, retval, type, pid, options,
+				       infop, stat_addr, ru))
+			return 1;
+	}
 
 	return 0;
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index 9c042f9..d1098a7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1256,7 +1256,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	 */
 	p->group_leader = p;
 	INIT_LIST_HEAD(&p->thread_group);
-	INIT_LIST_HEAD(&p->ptrace_children);
+	INIT_LIST_HEAD(&p->ptrace_attach);
 	INIT_LIST_HEAD(&p->ptrace_list);
 
 	/* Now that the task is set up, run cgroup callbacks if
@@ -1329,7 +1329,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	}
 
 	if (likely(p->pid)) {
-		add_parent(p);
+		list_add_tail(&p->sibling, &p->real_parent->children);
 		if (unlikely(p->ptrace & PT_PTRACED))
 			__ptrace_link(p, current->parent);
 
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index a56a95b..2958ec3 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -34,12 +34,10 @@
 void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
 {
 	BUG_ON(!list_empty(&child->ptrace_list));
-	if (child->parent == new_parent)
-		return;
-	list_add(&child->ptrace_list, &child->parent->ptrace_children);
-	remove_parent(child);
-	child->parent = new_parent;
-	add_parent(child);
+	if (new_parent != child->real_parent) {
+		list_add(&child->ptrace_list, &new_parent->ptrace_attach);
+		child->parent = new_parent;
+	}
 }
  
 /*
@@ -73,12 +71,8 @@ void __ptrace_unlink(struct task_struct *child)
 	BUG_ON(!child->ptrace);
 
 	child->ptrace = 0;
-	if (!list_empty(&child->ptrace_list)) {
-		list_del_init(&child->ptrace_list);
-		remove_parent(child);
-		child->parent = child->real_parent;
-		add_parent(child);
-	}
+	child->parent = child->real_parent;
+	list_del_init(&child->ptrace_list);
 
 	if (task_is_traced(child))
 		ptrace_untrace(child);

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 1/2] do_wait reorganization
  2008-03-29  3:34 [PATCH 1/2] do_wait reorganization Roland McGrath
  2008-03-29  3:35 ` [PATCH 2/2] ptrace children revamp Roland McGrath
@ 2008-03-29 10:35 ` Oleg Nesterov
  2008-03-31  3:54   ` Roland McGrath
  2008-03-29 16:16 ` Linus Torvalds
  2 siblings, 1 reply; 23+ messages in thread
From: Oleg Nesterov @ 2008-03-29 10:35 UTC (permalink / raw)
  To: Roland McGrath; +Cc: Andrew Morton, Linus Torvalds, linux-kernel

On 03/28, Roland McGrath wrote:
>
> +static int wait_consider_task(struct task_struct *parent,
> +			      struct task_struct *p, int *retval,
> +			      enum pid_type type, struct pid *pid, int options,
> +			      struct siginfo __user *infop,
> +			      int __user *stat_addr, struct rusage __user *ru)
> +{
> +	int ret = eligible_child(type, pid, options, p);
> +	if (!ret)
> +		return 0;
> +
> +	if (unlikely(ret < 0)) {
> +		read_unlock(&tasklist_lock);
> +		*retval = ret;

Please note that eligible_child() drops tasklist_lock if it returns error
(ret < 0), so the "read_unlock" above is wrong.

> +static int do_wait_thread(struct task_struct *tsk, int *retval,
> +			  enum pid_type type, struct pid *pid, int options,
> +			  struct siginfo __user *infop, int __user *stat_addr,
> +			  struct rusage __user *ru)
> +{
> +	struct task_struct *p;
> +
> +	list_for_each_entry(p, &tsk->children, sibling) {
> +		if (wait_consider_task(tsk, p, retval, type, pid, options,
> +				       infop, stat_addr, ru))
> +			return 1;
> +	}
> +
> +	/*
> +	 * If we never saw an eligile child, check for children stolen by
> +	 * ptrace.  We don't leave -ECHILD in *@retval if there are any,
> +	 * because we will eventually be allowed to wait for them again.
> +	 */
> +	if (*retval)
> +		list_for_each_entry(p, &tsk->ptrace_children, ptrace_list) {
> +			int ret = eligible_child(type, pid, options, p);
> +			if (ret) {
> +				*retval = unlikely(ret < 0) ? ret : 0;

This is not right. If ret < 0, we set *retval = ret, but then return 0.
We should return 1, so that the caller (do_wait) will report the error.

Note again that eligible_child() has already dropped tasklist, so not
only we don't return the error, we continue to run without tasklist
held, this is bug.

>  static long do_wait(enum pid_type type, struct pid *pid, int options,
>  		    struct siginfo __user *infop, int __user *stat_addr,
>  		    struct rusage __user *ru)
>  {
>
> [...snip...]
>
> -	if (flag) {
> -		if (options & WNOHANG)
> -			goto end;
> +	if (!retval && !(options & WNOHANG)) {
>  		retval = -ERESTARTSYS;
> -		if (signal_pending(current))
> -			goto end;
> -		schedule();
> -		goto repeat;
> +		if (!signal_pending(current)) {
> +			schedule();
> +			goto repeat;
> +		}
>  	}
> -	retval = -ECHILD;
> +
>  end:
>  	current->state = TASK_RUNNING;
>  	remove_wait_queue(&current->signal->wait_chldexit,&wait);

If !retval and WNOHANG, we should return -ECHILD, but with this patch
we return 0.

Perhaps I missed something, but personally I don't like the usage of
"int *retval", imho this really complicates the code. I think it is better
to use the returned values directly, but pass "&flag" to do_wait_thread().
We only need the pointer to avoid the unnecessary scanning of ->ptrace_children.
Better yet, we can split do_wait_thread() into 2 functions, and do not pass
the pointer at all. But I didn't read the next patch yet (will do tomorrow),
perhaps I missed the point of this approach.

Oleg.


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] ptrace children revamp
  2008-03-29  3:35 ` [PATCH 2/2] ptrace children revamp Roland McGrath
@ 2008-03-29 10:39   ` Oleg Nesterov
  2008-03-29 13:10     ` Oleg Nesterov
  0 siblings, 1 reply; 23+ messages in thread
From: Oleg Nesterov @ 2008-03-29 10:39 UTC (permalink / raw)
  To: Roland McGrath; +Cc: Andrew Morton, Linus Torvalds, linux-kernel

I didn't read the patch yet (will do), just a minor nit right now,

On 03/28, Roland McGrath wrote:
>
> @@ -1528,19 +1534,11 @@ static int do_wait_thread(struct task_struct *tsk, int *retval,
>  			return 1;
>  	}
>  
> -	/*
> -	 * If we never saw an eligile child, check for children stolen by
> -	 * ptrace.  We don't leave -ECHILD in *@retval if there are any,
> -	 * because we will eventually be allowed to wait for them again.
> -	 */
> -	if (*retval)
> -		list_for_each_entry(p, &tsk->ptrace_children, ptrace_list) {
> -			int ret = eligible_child(type, pid, options, p);
> -			if (ret) {
> -				*retval = unlikely(ret < 0) ? ret : 0;
> -				break;
> -			}
> -		}
> +	list_for_each_entry(p, &tsk->ptrace_attach, ptrace_list) {
> +		if (wait_consider_task(tsk, p, retval, type, pid, options,
> +				       infop, stat_addr, ru))
> +			return 1;
> +	}

Afaics, this adds a minor pessimization. We shouldn't scan ->ptrace_attach
list if it was already found that another thread has a "hidden" task.

Oleg.


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] ptrace children revamp
  2008-03-29 10:39   ` Oleg Nesterov
@ 2008-03-29 13:10     ` Oleg Nesterov
  2008-03-29 14:37       ` Oleg Nesterov
  2008-04-04 21:00       ` Roland McGrath
  0 siblings, 2 replies; 23+ messages in thread
From: Oleg Nesterov @ 2008-03-29 13:10 UTC (permalink / raw)
  To: Roland McGrath; +Cc: Andrew Morton, Linus Torvalds, linux-kernel

On 03/29, Oleg Nesterov wrote:
> 
> On 03/28, Roland McGrath wrote:
> >
> > @@ -1528,19 +1534,11 @@ static int do_wait_thread(struct task_struct *tsk, int *retval,
> >  			return 1;
> >  	}
> >  
> > -	/*
> > -	 * If we never saw an eligile child, check for children stolen by
> > -	 * ptrace.  We don't leave -ECHILD in *@retval if there are any,
> > -	 * because we will eventually be allowed to wait for them again.
> > -	 */
> > -	if (*retval)
> > -		list_for_each_entry(p, &tsk->ptrace_children, ptrace_list) {
> > -			int ret = eligible_child(type, pid, options, p);
> > -			if (ret) {
> > -				*retval = unlikely(ret < 0) ? ret : 0;
> > -				break;
> > -			}
> > -		}
> > +	list_for_each_entry(p, &tsk->ptrace_attach, ptrace_list) {
> > +		if (wait_consider_task(tsk, p, retval, type, pid, options,
> > +				       infop, stat_addr, ru))
> > +			return 1;
> > +	}
> 
> Afaics, this adds a minor pessimization. We shouldn't scan ->ptrace_attach
> list if it was already found that another thread has a "hidden" task.

Please ignore. I confused ->ptrace_attach with ->ptrace_children which
doesn't exists any longer. The added pessimization is very minor.


I personally think this is very nice change, it really makes the code better.
A couple of nits.

> @@ -1070,18 +1064,26 @@ struct task_struct {
>  	/*
>  	 * pointers to (original) parent process, youngest child, younger sibling,
>  	 * older sibling, respectively.  (p->father can be replaced with
> -	 * p->parent->pid)
> +	 * p->real_parent->pid)

There is no ->father in task_struct, the comment is obsolete

> -#define remove_parent(p)	list_del_init(&(p)->sibling)
> -#define add_parent(p)		list_add_tail(&(p)->sibling,&(p)->parent->children)

FYI, arch/mips/kernel/irixsig.c uses add_parent/remove_parent. I don't know
why irixsig.c reimplements do_wait() and friends...

> @@ -692,58 +723,24 @@ static void forget_original_parent(struct task_struct *father)
>  		}
>  	} while (reaper->flags & PF_EXITING);
>
> +	ptrace_exit(father, &ptrace_dead);
> +
>  	/*
> -	 * There are only two places where our children can be:
> -	 *
> -	 * - in our child list
> -	 * - in our ptraced child list
> -	 *
> -	 * Search them and reparent children.
> +	 * Reparent our natural children.
>  	 */
>  	list_for_each_entry_safe(p, n, &father->children, sibling) {
> -		int ptrace;
> -
> -		ptrace = p->ptrace;
> -
> -		/* if father isn't the real parent, then ptrace must be enabled */
> -		BUG_ON(father != p->real_parent && !ptrace);
> -
> -		if (father == p->real_parent) {
> -			/* reparent with a reaper, real father it's us */
> -			p->real_parent = reaper;
> -			reparent_thread(p, father, 0);
> -		} else {
> -			/* reparent ptraced task to its real parent */
> -			__ptrace_unlink (p);
> -			if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 &&
> -			    thread_group_empty(p))
> -				do_notify_parent(p, p->exit_signal);
> -		}
> -
> -		/*
> -		 * if the ptraced child is a zombie with exit_signal == -1
> -		 * we must collect it before we exit, or it will remain
> -		 * zombie forever since we prevented it from self-reap itself
> -		 * while it was being traced by us, to be able to see it in wait4.
> -		 */
> -		if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1))
> -			list_add(&p->ptrace_list, &ptrace_dead);
> -	}
> -
> -	list_for_each_entry_safe(p, n, &father->ptrace_children, ptrace_list) {
>  		p->real_parent = reaper;
> -		reparent_thread(p, father, 1);
> +		if (p->parent == father) {
> +			ptrace_unlink(p);
> +			p->parent = p->real_parent;
> +		}
> +		reparent_thread(p, father);

Unless I missed something again, this is not 100% right.

Suppose that we are ->real_parent, the child was traced by us, we ignore
SIGCHLD, and we are doing the threaded reparenting. In that case we should
release the child if it is dead, like the current code does.

Even if we don't ignore SIGCHLD, we should notify our thread-group, but
reparent_thread() skips do_notify_parent() when the new parent is from
is from the same thread-group.

Note also that reparent_thread() has a very old bug. When it sees the
EXIT_ZOMBIE task, it notifies the new parent, but doesn't check if the task
becomes detached because the new parent ignores SIGCHLD. Currently this means
that init must have a handler for SIGCHLD or we leak zombies.

(btw, this patch has many conflicts with Linus's or Andrew's tree)

Oleg.


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] ptrace children revamp
  2008-03-29 13:10     ` Oleg Nesterov
@ 2008-03-29 14:37       ` Oleg Nesterov
  2008-04-04 21:00       ` Roland McGrath
  1 sibling, 0 replies; 23+ messages in thread
From: Oleg Nesterov @ 2008-03-29 14:37 UTC (permalink / raw)
  To: Roland McGrath; +Cc: Andrew Morton, Linus Torvalds, linux-kernel

On 03/29, Oleg Nesterov wrote:
>
> > -	list_for_each_entry_safe(p, n, &father->ptrace_children, ptrace_list) {
> >  		p->real_parent = reaper;
> > -		reparent_thread(p, father, 1);
> > +		if (p->parent == father) {
> > +			ptrace_unlink(p);
> > +			p->parent = p->real_parent;
> > +		}
> > +		reparent_thread(p, father);
> 
> Unless I missed something again, this is not 100% right.
> 
> Suppose that we are ->real_parent, the child was traced by us, we ignore
> SIGCHLD, and we are doing the threaded reparenting. In that case we should
> release the child if it is dead, like the current code does.
> 
> Even if we don't ignore SIGCHLD, we should notify our thread-group, but
> reparent_thread() skips do_notify_parent() when the new parent is from
> is from the same thread-group.
> 
> Note also that reparent_thread() has a very old bug. When it sees the
> EXIT_ZOMBIE task, it notifies the new parent, but doesn't check if the task
> becomes detached because the new parent ignores SIGCHLD. Currently this means
> that init must have a handler for SIGCHLD or we leak zombies.

Also, I think ptrace_exit() is not right,

	if (p->exit_signal != -1 && !thread_group_empty(p))
		do_notify_parent(p, p->exit_signal);

note the "!thread_group_empty()" above, I guess this is typo, thread group
must be empty if we are going to release the task or notify its parent.


IOW, perhaps something like the patch below makes sense.

Oleg.

--- x/kernel/exit.c~x	2008-03-29 17:14:54.000000000 +0300
+++ x/kernel/exit.c	2008-03-29 17:28:17.000000000 +0300
@@ -596,6 +596,16 @@ static void exit_mm(struct task_struct *
 	mmput(mm);
 }
 
+static void xxx(struct task_struct *p, struct list_head *dead)
+{
+	if (p->exit_state == EXIT_ZOMBIE) {
+		if (p->exit_signal != -1 && thread_group_empty(p))
+			do_notify_parent(p, p->exit_signal);
+		if (p->exit_signal == -1)
+			list_add(&p->ptrace_list, dead);
+	}
+}
+
 /*
  * Detach any ptrace attachees (not our own natural children).
  * Any that need to be release_task'd are put on the @dead list.
@@ -616,12 +626,7 @@ static void ptrace_exit(struct task_stru
 		 * reap itself, add it to the @dead list.  We can't call
 		 * release_task() here because we already hold tasklist_lock.
 		 */
-		if (p->exit_state == EXIT_ZOMBIE) {
-			if (p->exit_signal != -1 && !thread_group_empty(p))
-				do_notify_parent(p, p->exit_signal);
-			if (p->exit_signal == -1)
-				list_add(&p->ptrace_list, dead);
-		}
+		 xxx(p, dead);
 	}
 }
 
@@ -661,13 +666,6 @@ static void reparent_thread(struct task_
 	if (p->exit_signal != -1)
 		p->exit_signal = SIGCHLD;
 
-	/* If we'd notified the old parent about this child's death,
-	 * also notify the new parent.
-	 */
-	if (p->exit_state == EXIT_ZOMBIE &&
-	    p->exit_signal != -1 && thread_group_empty(p))
-		do_notify_parent(p, p->exit_signal);
-
 	/*
 	 * process group orphan check
 	 * Case ii: Our child is in a different pgrp
@@ -720,6 +718,7 @@ static void forget_original_parent(struc
 			p->parent = p->real_parent;
 		}
 		reparent_thread(p, father);
+		xxx(p, &ptrace_dead);
 	}
 
 	write_unlock_irq(&tasklist_lock);


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 1/2] do_wait reorganization
  2008-03-29  3:34 [PATCH 1/2] do_wait reorganization Roland McGrath
  2008-03-29  3:35 ` [PATCH 2/2] ptrace children revamp Roland McGrath
  2008-03-29 10:35 ` [PATCH 1/2] do_wait reorganization Oleg Nesterov
@ 2008-03-29 16:16 ` Linus Torvalds
  2008-03-31  3:27   ` Roland McGrath
  2008-03-31  3:57   ` [PATCH 1/3] " Roland McGrath
  2 siblings, 2 replies; 23+ messages in thread
From: Linus Torvalds @ 2008-03-29 16:16 UTC (permalink / raw)
  To: Roland McGrath; +Cc: Andrew Morton, Oleg Nesterov, linux-kernel



On Fri, 28 Mar 2008, Roland McGrath wrote:
>
> The control flow is less nonobvious without so much goto.

How about a further non-obviousness?

> +static int wait_consider_task(struct task_struct *parent,
> +			      struct task_struct *p, int *retval,
> +			      enum pid_type type, struct pid *pid, int options,
> +			      struct siginfo __user *infop,
> +			      int __user *stat_addr, struct rusage __user *ru)
> +{
...
> +	if (task_is_stopped_or_traced(p)) {
> +		/*
> +		 * It's stopped now, so it might later
> +		 * continue, exit, or stop again.
> +		 */
> +		*retval = 0;
> +		if ((p->ptrace & PT_PTRACED) ||
> +		    (options & WUNTRACED)) {
> +			*retval = wait_task_stopped(p, (options & WNOWAIT),
> +						    infop, stat_addr, ru);
> +			if (*retval)
> +				return 1;
> +		}
> +	} else if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) {
...
> +	return 0;

I think it would be even more obvious (or, to use your phrase, "less 
nonobvious") if this was written like so:

	if (task_is_stopped_or_traced(p)) {
		...
		....
			if (*retval}
				return 1;
		}
		return 0;
	}

	if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) {
		...
		return 0;
	}

	if (...)

because then you can clearly see that smething like the
"task_is_stopped_or_traced(p)" case is complete in itself, and only has 
its own local stuff going on.

(And at some point I'd also almost make each case a trivial small inline 
function of its on, but in this case there are so many arguments to pass 
around that it probably becomes _less_ readable that way).

I also wonder if you really need both "int *retval" and the return value. 
Isn't "retval" always going to be zero or a negative errno? And the return 
value is going to be either true of false? Why not just fold them into one 
single thing:

 - negative: all done, with error
 - zero: this didn't trigger, continue with the next one in caller
 - positive: this thread triggered, all done, return 0 in the caller.

which is (I think) close to what we already do in eligible_child() (so 
this would not be a new calling convention for this particular code).

		Linus

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 1/2] do_wait reorganization
  2008-03-29 16:16 ` Linus Torvalds
@ 2008-03-31  3:27   ` Roland McGrath
  2008-03-31  3:57   ` [PATCH 1/3] " Roland McGrath
  1 sibling, 0 replies; 23+ messages in thread
From: Roland McGrath @ 2008-03-31  3:27 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Andrew Morton, Oleg Nesterov, linux-kernel

> I also wonder if you really need both "int *retval" and the return value. 
> Isn't "retval" always going to be zero or a negative errno? 

No.

> And the return value is going to be either true of false?

Yes.

> Why not just fold them into one single thing:
> 
>  - negative: all done, with error
>  - zero: this didn't trigger, continue with the next one in caller
>  - positive: this thread triggered, all done, return 0 in the caller.
> 
> which is (I think) close to what we already do in eligible_child() (so 
> this would not be a new calling convention for this particular code).

You listed the three possibilities for eligible_child().
For wait_consider_task(), there are four possibilities:

 - all done, with error
 - this thread was not eligible, look for another; return -ECHILD if none ready
 - this thread was eligible but is not ready; return 0 or block if none ready
 - all done, this thread is ready; return its pid

I'll post another version that I think you'll like a little better.


Thanks,
Roland

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 1/2] do_wait reorganization
  2008-03-29 10:35 ` [PATCH 1/2] do_wait reorganization Oleg Nesterov
@ 2008-03-31  3:54   ` Roland McGrath
  0 siblings, 0 replies; 23+ messages in thread
From: Roland McGrath @ 2008-03-31  3:54 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Andrew Morton, Linus Torvalds, linux-kernel

> Please note that eligible_child() drops tasklist_lock if it returns error
> (ret < 0), so the "read_unlock" above is wrong.

Oops!  Thanks for catching that.  I first got the code into shape before I
noticed your change:

commit f2cc3eb133baa2e9dc8efd40f417106b2ee520f3 "do_wait: fix security checks"

I let that one slip off the end of my queue and never gave it proper review.
I tweaked my patch hastily to keep in line with that change and was careless.

I would have objected to that change at the time had I been up to speed.
AFAICT your objection was based solely on the inconsistent treatment of
children PTRACE_ATTACH'd by another.  That was an inadvertent omission in
my original change in commit 73243284463a761e04d69d22c7516b2be7de096c (and
seems obviously so to me).  That bug in no way invalidates the motivation
for the original change.  But you threw the baby out with the bathwater.

The original change came up in response to an actual problem in the real
world.  There was a bug in SELinux and/or a particular security policy that
made it impossible to debug some daemons with gdb.  The bug was an
internally inconsistent set of security checks, so root running gdb was
allowed to ptrace the daemon process, that process was allowed to send gdb
a SIGCHLD and wake it up, but gdb was not allowed to see it with wait4().

The symptom of this bug was that gdb said "wait: No children".  
The resaonable developer seeing this double-checked by using strace on
gdb, and saw that indeed wait4(-1, ...) had failed with ECHILD even
though ps and /proc/PID/status and everything the developer knew said
wait4() should consider the daemon process a waitable child of gdb
(which had used PTRACE_ATTACH).  The reasonable developer says, "This
is strange bug in ptrace or wait, and I'm just stuck; send it to Roland."

So I debugged the whole thing and fixed an SELinux bug for the SELinux
people.  Then I imagined how it could have been:

The symptom of this bug was that gdb said "wait: Permission denied".
The resaonable developer seeing this double-checked by using strace on
gdb, and saw that indeed wait4(-1, ...) had failed with EACCES.  The
reasonable developer has seen an unreasonable "Permission denied"
error before, and says, "Hmm, maybe I should look in /var/log/secure
or someplace."  The reasonable developer finds a log message about
"avc denied wait for gdb".  The reasonable developer says, "This is
another gosh-darn SELinux issue, so I better report an SELinux bug."
(The reasonably hurried developer then says, "Oh heck, let's just
disable SELinux until I've finished debugging the dag-burn daemon.")

Now try to guess which of these stories I like better.

> If !retval and WNOHANG, we should return -ECHILD, but with this patch
> we return 0.

The meaning of WNOHANG is to return 0 when we would otherwise block.
Returning -ECHILD is wrong.


Thanks,
Roland

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH 1/3] do_wait reorganization
  2008-03-29 16:16 ` Linus Torvalds
  2008-03-31  3:27   ` Roland McGrath
@ 2008-03-31  3:57   ` Roland McGrath
  2008-03-31  3:59     ` [PATCH 2/3] ptrace children revamp Roland McGrath
                       ` (2 more replies)
  1 sibling, 3 replies; 23+ messages in thread
From: Roland McGrath @ 2008-03-31  3:57 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton; +Cc: Oleg Nesterov, linux-kernel

This breaks out the guts of do_wait into two subfunctions.
The control flow is less nonobvious without so much goto.
do_wait_thread contains the main work of the outer loop.
wait_consider_task contains the main work of the inner loop.

Signed-off-by: Roland McGrath <roland@redhat.com>
---
 kernel/exit.c |  194 +++++++++++++++++++++++++++++++++-----------------------
 1 files changed, 114 insertions(+), 80 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index 53872bf..237e3d0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1160,7 +1160,7 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
  * the lock and this task is uninteresting.  If we return nonzero, we have
  * released the lock and the system call should return.
  */
-static int wait_task_zombie(struct task_struct *p, int noreap,
+static int wait_task_zombie(struct task_struct *p, int options,
 			    struct siginfo __user *infop,
 			    int __user *stat_addr, struct rusage __user *ru)
 {
@@ -1168,7 +1168,10 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
 	int retval, status, traced;
 	pid_t pid = task_pid_vnr(p);
 
-	if (unlikely(noreap)) {
+	if (!likely(options & WEXITED))
+		return 0;
+
+	if (unlikely(options & WNOWAIT)) {
 		uid_t uid = p->uid;
 		int exit_code = p->exit_code;
 		int why, status;
@@ -1320,13 +1323,16 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
  * released the lock and the system call should return.
  */
 static int wait_task_stopped(struct task_struct *p,
-			     int noreap, struct siginfo __user *infop,
+			     int options, struct siginfo __user *infop,
 			     int __user *stat_addr, struct rusage __user *ru)
 {
 	int retval, exit_code, why;
 	uid_t uid = 0; /* unneeded, required by compiler */
 	pid_t pid;
 
+	if (!(p->ptrace & PT_PTRACED) && !(options & WUNTRACED))
+		return 0;
+
 	exit_code = 0;
 	spin_lock_irq(&p->sighand->siglock);
 
@@ -1344,7 +1350,7 @@ static int wait_task_stopped(struct task_struct *p,
 	if (!exit_code)
 		goto unlock_sig;
 
-	if (!noreap)
+	if (!unlikely(options & WNOWAIT))
 		p->exit_code = 0;
 
 	uid = p->uid;
@@ -1365,7 +1371,7 @@ unlock_sig:
 	why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED;
 	read_unlock(&tasklist_lock);
 
-	if (unlikely(noreap))
+	if (unlikely(options & WNOWAIT))
 		return wait_noreap_copyout(p, pid, uid,
 					   why, exit_code,
 					   infop, ru);
@@ -1399,7 +1405,7 @@ unlock_sig:
  * the lock and this task is uninteresting.  If we return nonzero, we have
  * released the lock and the system call should return.
  */
-static int wait_task_continued(struct task_struct *p, int noreap,
+static int wait_task_continued(struct task_struct *p, int options,
 			       struct siginfo __user *infop,
 			       int __user *stat_addr, struct rusage __user *ru)
 {
@@ -1407,6 +1413,9 @@ static int wait_task_continued(struct task_struct *p, int noreap,
 	pid_t pid;
 	uid_t uid;
 
+	if (!unlikely(options & WCONTINUED))
+		return 0;
+
 	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
 		return 0;
 
@@ -1416,7 +1425,7 @@ static int wait_task_continued(struct task_struct *p, int noreap,
 		spin_unlock_irq(&p->sighand->siglock);
 		return 0;
 	}
-	if (!noreap)
+	if (!unlikely(options & WNOWAIT))
 		p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
 	spin_unlock_irq(&p->sighand->siglock);
 
@@ -1442,89 +1451,116 @@ static int wait_task_continued(struct task_struct *p, int noreap,
 	return retval;
 }
 
+/*
+ * Consider @p for a wait by @parent.
+ *
+ * -ECHILD should be in *@retval before the first call.
+ * Returns nonzero for a final return, when we have unlocked tasklist_lock.
+ * Returns zero if the search for a child should continue; then *@retval is
+ * 0 if @p is an eligible child, or still -ECHILD.
+ */
+static int wait_consider_task(struct task_struct *parent,
+			      struct task_struct *p, int *retval,
+			      enum pid_type type, struct pid *pid, int options,
+			      struct siginfo __user *infop,
+			      int __user *stat_addr, struct rusage __user *ru)
+{
+	int ret = eligible_child(type, pid, options, p);
+	if (ret <= 0)
+		return ret;
+
+	/*
+	 * We don't reap group leaders with subthreads.
+	 */
+	if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
+		return wait_task_zombie(p, options, infop, stat_addr, ru);
+
+	/*
+	 * It's stopped or running now, so it might
+	 * later continue, exit, or stop again.
+	 */
+	*retval = 0;
+
+	if (task_is_stopped_or_traced(p))
+		return wait_task_stopped(p, options, infop, stat_addr, ru);
+
+	if (p->exit_state != EXIT_DEAD)
+		return wait_task_continued(p, options, infop, stat_addr, ru);
+
+	return 0;
+}
+
+/*
+ * Do the work of do_wait() for one thread in the group, @tsk.
+ *
+ * -ECHILD should be in *@retval before the first call.
+ * Returns nonzero for a final return, when we have unlocked tasklist_lock.
+ * Returns zero if the search for a child should continue; then *@retval is
+ * 0 if there were any eligible children, or still -ECHILD.
+ */
+static int do_wait_thread(struct task_struct *tsk, int *retval,
+			  enum pid_type type, struct pid *pid, int options,
+			  struct siginfo __user *infop, int __user *stat_addr,
+			  struct rusage __user *ru)
+{
+	struct task_struct *p;
+
+	list_for_each_entry(p, &tsk->children, sibling) {
+		int ret = wait_consider_task(tsk, p, retval, type, pid, options,
+					     infop, stat_addr, ru);
+		if (ret)
+			return ret;
+	}
+
+	/*
+	 * If we never saw an eligile child, check for children stolen by
+	 * ptrace.  We don't leave -ECHILD in *@retval if there are any,
+	 * because we will eventually be allowed to wait for them again.
+	 */
+	if (*retval)
+		list_for_each_entry(p, &tsk->ptrace_children, ptrace_list) {
+			int ret = eligible_child(type, pid, options, p);
+			if (unlikely(ret < 0))
+				return ret;
+			if (ret) {
+				*retval = 0;
+				return 0;
+			}
+		}
+
+	return 0;
+}
+
 static long do_wait(enum pid_type type, struct pid *pid, int options,
 		    struct siginfo __user *infop, int __user *stat_addr,
 		    struct rusage __user *ru)
 {
 	DECLARE_WAITQUEUE(wait, current);
 	struct task_struct *tsk;
-	int flag, retval;
+	int retval;
 
 	add_wait_queue(&current->signal->wait_chldexit,&wait);
 repeat:
-	/* If there is nothing that can match our critier just get out */
+	/*
+	 * If there is nothing that can match our critiera just get out.
+	 * We will clear @retval to zero if we see any child that might later
+	 * match our criteria, even if we are not able to reap it yet.
+	 */
 	retval = -ECHILD;
 	if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type])))
 		goto end;
 
-	/*
-	 * We will set this flag if we see any child that might later
-	 * match our criteria, even if we are not able to reap it yet.
-	 */
-	flag = retval = 0;
 	current->state = TASK_INTERRUPTIBLE;
 	read_lock(&tasklist_lock);
 	tsk = current;
 	do {
-		struct task_struct *p;
-
-		list_for_each_entry(p, &tsk->children, sibling) {
-			int ret = eligible_child(type, pid, options, p);
-			if (!ret)
-				continue;
-
-			if (unlikely(ret < 0)) {
-				retval = ret;
-			} else if (task_is_stopped_or_traced(p)) {
-				/*
-				 * It's stopped now, so it might later
-				 * continue, exit, or stop again.
-				 */
-				flag = 1;
-				if (!(p->ptrace & PT_PTRACED) &&
-				    !(options & WUNTRACED))
-					continue;
-
-				retval = wait_task_stopped(p,
-						(options & WNOWAIT), infop,
-						stat_addr, ru);
-			} else if (p->exit_state == EXIT_ZOMBIE &&
-					!delay_group_leader(p)) {
-				/*
-				 * We don't reap group leaders with subthreads.
-				 */
-				if (!likely(options & WEXITED))
-					continue;
-				retval = wait_task_zombie(p,
-						(options & WNOWAIT), infop,
-						stat_addr, ru);
-			} else if (p->exit_state != EXIT_DEAD) {
-				/*
-				 * It's running now, so it might later
-				 * exit, stop, or stop and then continue.
-				 */
-				flag = 1;
-				if (!unlikely(options & WCONTINUED))
-					continue;
-				retval = wait_task_continued(p,
-						(options & WNOWAIT), infop,
-						stat_addr, ru);
-			}
-			if (retval != 0) /* tasklist_lock released */
-				goto end;
-		}
-		if (!flag) {
-			list_for_each_entry(p, &tsk->ptrace_children,
-								ptrace_list) {
-				flag = eligible_child(type, pid, options, p);
-				if (!flag)
-					continue;
-				if (likely(flag > 0))
-					break;
-				retval = flag;
-				goto end;
-			}
+		int ret = do_wait_thread(tsk, &retval, type, pid, options,
+					 infop, stat_addr, ru);
+		if (ret) {
+			retval = ret;
+			goto end;
 		}
+
 		if (options & __WNOTHREAD)
 			break;
 		tsk = next_thread(tsk);
@@ -1532,16 +1568,14 @@ repeat:
 	} while (tsk != current);
 	read_unlock(&tasklist_lock);
 
-	if (flag) {
-		if (options & WNOHANG)
-			goto end;
+	if (!retval && !(options & WNOHANG)) {
 		retval = -ERESTARTSYS;
-		if (signal_pending(current))
-			goto end;
-		schedule();
-		goto repeat;
+		if (!signal_pending(current)) {
+			schedule();
+			goto repeat;
+		}
 	}
-	retval = -ECHILD;
+
 end:
 	current->state = TASK_RUNNING;
 	remove_wait_queue(&current->signal->wait_chldexit,&wait);

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH 2/3] ptrace children revamp
  2008-03-31  3:57   ` [PATCH 1/3] " Roland McGrath
@ 2008-03-31  3:59     ` Roland McGrath
  2008-03-31  9:12       ` Oleg Nesterov
  2008-03-31  3:59     ` [PATCH 3/3] do_wait: return security_task_wait() error code in place of -ECHILD Roland McGrath
  2008-03-31  8:51     ` [PATCH 1/3] do_wait reorganization Oleg Nesterov
  2 siblings, 1 reply; 23+ messages in thread
From: Roland McGrath @ 2008-03-31  3:59 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton; +Cc: Oleg Nesterov, linux-kernel

ptrace no longer fiddles with the children/sibling links, and the
old ptrace_children list is gone.  Now using PTRACE_ATTACH on
someone else's child just uses the new ptrace_attach list instead.

There should be no user-visible difference that matters.  The only
change is the order in which do_wait() sees multiple stopped children
and stopped ptrace attachees.  Since wait_task_stopped() was changed
earlier so it no longer reorders the children list, we already know
this won't cause any new problems.

Signed-off-by: Roland McGrath <roland@redhat.com>
---
 include/linux/init_task.h |    2 +-
 include/linux/sched.h     |   27 +++----
 kernel/exit.c             |  183 ++++++++++++++++++++++-----------------------
 kernel/fork.c             |    4 +-
 kernel/ptrace.c           |   18 ++---
 5 files changed, 112 insertions(+), 122 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 1f74e1d..70dbb73 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -157,7 +157,7 @@ extern struct group_info init_groups;
 		.nr_cpus_allowed = NR_CPUS,				\
 	},								\
 	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
-	.ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children),		\
+	.ptrace_attach	= LIST_HEAD_INIT(tsk.ptrace_attach),		\
 	.ptrace_list	= LIST_HEAD_INIT(tsk.ptrace_list),		\
 	.real_parent	= &tsk,						\
 	.parent		= &tsk,						\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6c275e8..5a755a0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1043,12 +1043,6 @@ struct task_struct {
 #endif
 
 	struct list_head tasks;
-	/*
-	 * ptrace_list/ptrace_children forms the list of my children
-	 * that were stolen by a ptracer.
-	 */
-	struct list_head ptrace_children;
-	struct list_head ptrace_list;
 
 	struct mm_struct *mm, *active_mm;
 
@@ -1070,18 +1064,26 @@ struct task_struct {
 	/* 
 	 * pointers to (original) parent process, youngest child, younger sibling,
 	 * older sibling, respectively.  (p->father can be replaced with 
-	 * p->parent->pid)
+	 * p->real_parent->pid)
 	 */
-	struct task_struct *real_parent; /* real parent process (when being debugged) */
-	struct task_struct *parent;	/* parent process */
+	struct task_struct *real_parent; /* real parent process */
+	struct task_struct *parent; /* recipient of SIGCHLD, wait4() reports */
 	/*
-	 * children/sibling forms the list of my children plus the
-	 * tasks I'm ptracing.
+	 * children/sibling forms the list of my natural children
 	 */
 	struct list_head children;	/* list of my children */
 	struct list_head sibling;	/* linkage in my parent's children list */
 	struct task_struct *group_leader;	/* threadgroup leader */
 
+	/*
+	 * ptrace_attach is the list of tasks I have PTRACE_ATTACH'd to,
+	 * excluding my own natural children.
+	 * ptrace_list is my own link on my PTRACE_ATTACHer's list,
+	 * which is p->parent->ptrace_attach.
+	 */
+	struct list_head ptrace_attach;
+	struct list_head ptrace_list;
+
 	/* PID/PID hash table linkage. */
 	struct pid_link pids[PIDTYPE_MAX];
 	struct list_head thread_group;
@@ -1794,9 +1796,6 @@ extern void wait_task_inactive(struct task_struct * p);
 #define wait_task_inactive(p)	do { } while (0)
 #endif
 
-#define remove_parent(p)	list_del_init(&(p)->sibling)
-#define add_parent(p)		list_add_tail(&(p)->sibling,&(p)->parent->children)
-
 #define next_task(p)	list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks)
 
 #define for_each_process(p) \
diff --git a/kernel/exit.c b/kernel/exit.c
index 237e3d0..b5057ce 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -64,7 +64,7 @@ static void __unhash_process(struct task_struct *p)
 		__get_cpu_var(process_counts)--;
 	}
 	list_del_rcu(&p->thread_group);
-	remove_parent(p);
+	list_del_init(&p->sibling);
 }
 
 /*
@@ -140,6 +140,18 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
 	put_task_struct(container_of(rhp, struct task_struct, rcu));
 }
 
+/*
+ * Do final ptrace-related cleanup of a zombie being reaped.
+ *
+ * Called with write_lock(&tasklist_lock) held.
+ */
+static void ptrace_release_task(struct task_struct *p)
+{
+	ptrace_unlink(p);
+	BUG_ON(!list_empty(&p->ptrace_list));
+	BUG_ON(!list_empty(&p->ptrace_attach));
+}
+
 void release_task(struct task_struct * p)
 {
 	struct task_struct *leader;
@@ -148,8 +160,7 @@ repeat:
 	atomic_dec(&p->user->processes);
 	proc_flush_task(p);
 	write_lock_irq(&tasklist_lock);
-	ptrace_unlink(p);
-	BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
+	ptrace_release_task(p);
 	__exit_signal(p);
 
 	/*
@@ -303,9 +314,8 @@ static void reparent_to_kthreadd(void)
 
 	ptrace_unlink(current);
 	/* Reparent to init */
-	remove_parent(current);
 	current->real_parent = current->parent = kthreadd_task;
-	add_parent(current);
+	list_move_tail(&current->sibling, &current->real_parent->children);
 
 	/* Set the exit signal to SIGCHLD so we signal init on exit */
 	current->exit_signal = SIGCHLD;
@@ -616,37 +626,60 @@ static void exit_mm(struct task_struct * tsk)
 	mmput(mm);
 }
 
-static void
-reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
+/*
+ * Detach any ptrace attachees (not our own natural children).
+ * Any that need to be release_task'd are put on the @dead list.
+ *
+ * Called with write_lock(&tasklist_lock) held.
+ */
+static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
 {
-	if (p->pdeath_signal)
-		/* We already hold the tasklist_lock here.  */
-		group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
+	struct task_struct *p, *n;
 
-	/* Move the child from its dying parent to the new one.  */
-	if (unlikely(traced)) {
-		/* Preserve ptrace links if someone else is tracing this child.  */
-		list_del_init(&p->ptrace_list);
-		if (p->parent != p->real_parent)
-			list_add(&p->ptrace_list, &p->real_parent->ptrace_children);
-	} else {
-		/* If this child is being traced, then we're the one tracing it
-		 * anyway, so let go of it.
-		 */
-		p->ptrace = 0;
-		remove_parent(p);
-		p->parent = p->real_parent;
-		add_parent(p);
+	list_for_each_entry_safe(p, n, &parent->ptrace_attach, ptrace_list) {
+		__ptrace_unlink(p);
 
-		if (task_is_traced(p)) {
-			/*
-			 * If it was at a trace stop, turn it into
-			 * a normal stop since it's no longer being
-			 * traced.
-			 */
-			ptrace_untrace(p);
+		/*
+		 * If it's a zombie, our attachedness prevented normal
+		 * parent notification or self-reaping.  Do notification
+		 * now if it would have happened earlier.  If it should
+		 * reap itself, add it to the @dead list.  We can't call
+		 * release_task() here because we already hold tasklist_lock.
+		 */
+		if (p->exit_state == EXIT_ZOMBIE) {
+			if (p->exit_signal != -1 && !thread_group_empty(p))
+				do_notify_parent(p, p->exit_signal);
+			if (p->exit_signal == -1)
+				list_add(&p->ptrace_list, dead);
 		}
 	}
+}
+
+/*
+ * Finish up exit-time ptrace cleanup.
+ *
+ * Called without locks.
+ */
+static void ptrace_exit_finish(struct task_struct *parent,
+			       struct list_head *dead)
+{
+	struct task_struct *p, *n;
+
+	BUG_ON(!list_empty(&parent->ptrace_attach));
+
+	list_for_each_entry_safe(p, n, dead, ptrace_list) {
+		list_del_init(&p->ptrace_list);
+		release_task(p);
+	}
+}
+
+static void reparent_thread(struct task_struct *p, struct task_struct *father)
+{
+	if (p->pdeath_signal)
+		/* We already hold the tasklist_lock here.  */
+		group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
+
+	list_move_tail(&p->sibling, &p->real_parent->children);
 
 	/* If this is a threaded reparent there is no need to
 	 * notify anyone anything has happened.
@@ -661,7 +694,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
 	/* If we'd notified the old parent about this child's death,
 	 * also notify the new parent.
 	 */
-	if (!traced && p->exit_state == EXIT_ZOMBIE &&
+	if (p->exit_state == EXIT_ZOMBIE &&
 	    p->exit_signal != -1 && thread_group_empty(p))
 		do_notify_parent(p, p->exit_signal);
 
@@ -678,9 +711,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
 static void forget_original_parent(struct task_struct *father)
 {
 	struct task_struct *p, *n, *reaper = father;
-	struct list_head ptrace_dead;
-
-	INIT_LIST_HEAD(&ptrace_dead);
+	LIST_HEAD(ptrace_dead);
 
 	write_lock_irq(&tasklist_lock);
 
@@ -692,58 +723,24 @@ static void forget_original_parent(struct task_struct *father)
 		}
 	} while (reaper->flags & PF_EXITING);
 
+	ptrace_exit(father, &ptrace_dead);
+
 	/*
-	 * There are only two places where our children can be:
-	 *
-	 * - in our child list
-	 * - in our ptraced child list
-	 *
-	 * Search them and reparent children.
+	 * Reparent our natural children.
 	 */
 	list_for_each_entry_safe(p, n, &father->children, sibling) {
-		int ptrace;
-
-		ptrace = p->ptrace;
-
-		/* if father isn't the real parent, then ptrace must be enabled */
-		BUG_ON(father != p->real_parent && !ptrace);
-
-		if (father == p->real_parent) {
-			/* reparent with a reaper, real father it's us */
-			p->real_parent = reaper;
-			reparent_thread(p, father, 0);
-		} else {
-			/* reparent ptraced task to its real parent */
-			__ptrace_unlink (p);
-			if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 &&
-			    thread_group_empty(p))
-				do_notify_parent(p, p->exit_signal);
-		}
-
-		/*
-		 * if the ptraced child is a zombie with exit_signal == -1
-		 * we must collect it before we exit, or it will remain
-		 * zombie forever since we prevented it from self-reap itself
-		 * while it was being traced by us, to be able to see it in wait4.
-		 */
-		if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1))
-			list_add(&p->ptrace_list, &ptrace_dead);
-	}
-
-	list_for_each_entry_safe(p, n, &father->ptrace_children, ptrace_list) {
 		p->real_parent = reaper;
-		reparent_thread(p, father, 1);
+		if (p->parent == father) {
+			ptrace_unlink(p);
+			p->parent = p->real_parent;
+		}
+		reparent_thread(p, father);
 	}
 
 	write_unlock_irq(&tasklist_lock);
 	BUG_ON(!list_empty(&father->children));
-	BUG_ON(!list_empty(&father->ptrace_children));
-
-	list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_list) {
-		list_del_init(&p->ptrace_list);
-		release_task(p);
-	}
 
+	ptrace_exit_finish(father, &ptrace_dead);
 }
 
 /*
@@ -1481,6 +1478,15 @@ static int wait_consider_task(struct task_struct *parent,
 	 */
 	*retval = 0;
 
+	if (unlikely(p->parent != parent)) {
+		/*
+		 * This child is hidden by someone else who did PTRACE_ATTACH.
+		 * We aren't allowed to see it now, but eventually we will.
+		 */
+		*retval = 0;
+		return 0;
+	}
+
 	if (task_is_stopped_or_traced(p))
 		return wait_task_stopped(p, options, infop, stat_addr, ru);
 
@@ -1512,21 +1518,12 @@ static int do_wait_thread(struct task_struct *tsk, int *retval,
 			return ret;
 	}
 
-	/*
-	 * If we never saw an eligile child, check for children stolen by
-	 * ptrace.  We don't leave -ECHILD in *@retval if there are any,
-	 * because we will eventually be allowed to wait for them again.
-	 */
-	if (*retval)
-		list_for_each_entry(p, &tsk->ptrace_children, ptrace_list) {
-			int ret = eligible_child(type, pid, options, p);
-			if (unlikely(ret < 0))
-				return ret;
-			if (ret) {
-				*retval = 0;
-				return 0;
-			}
-		}
+	list_for_each_entry(p, &tsk->ptrace_attach, ptrace_list) {
+		int ret = wait_consider_task(tsk, p, retval, type, pid, options,
+					     infop, stat_addr, ru);
+		if (ret)
+			return ret;
+	}
 
 	return 0;
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index 9c042f9..d1098a7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1256,7 +1256,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	 */
 	p->group_leader = p;
 	INIT_LIST_HEAD(&p->thread_group);
-	INIT_LIST_HEAD(&p->ptrace_children);
+	INIT_LIST_HEAD(&p->ptrace_attach);
 	INIT_LIST_HEAD(&p->ptrace_list);
 
 	/* Now that the task is set up, run cgroup callbacks if
@@ -1329,7 +1329,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	}
 
 	if (likely(p->pid)) {
-		add_parent(p);
+		list_add_tail(&p->sibling, &p->real_parent->children);
 		if (unlikely(p->ptrace & PT_PTRACED))
 			__ptrace_link(p, current->parent);
 
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index a56a95b..2958ec3 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -34,12 +34,10 @@
 void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
 {
 	BUG_ON(!list_empty(&child->ptrace_list));
-	if (child->parent == new_parent)
-		return;
-	list_add(&child->ptrace_list, &child->parent->ptrace_children);
-	remove_parent(child);
-	child->parent = new_parent;
-	add_parent(child);
+	if (new_parent != child->real_parent) {
+		list_add(&child->ptrace_list, &new_parent->ptrace_attach);
+		child->parent = new_parent;
+	}
 }
  
 /*
@@ -73,12 +71,8 @@ void __ptrace_unlink(struct task_struct *child)
 	BUG_ON(!child->ptrace);
 
 	child->ptrace = 0;
-	if (!list_empty(&child->ptrace_list)) {
-		list_del_init(&child->ptrace_list);
-		remove_parent(child);
-		child->parent = child->real_parent;
-		add_parent(child);
-	}
+	child->parent = child->real_parent;
+	list_del_init(&child->ptrace_list);
 
 	if (task_is_traced(child))
 		ptrace_untrace(child);

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH 3/3] do_wait: return security_task_wait() error code in place of -ECHILD
  2008-03-31  3:57   ` [PATCH 1/3] " Roland McGrath
  2008-03-31  3:59     ` [PATCH 2/3] ptrace children revamp Roland McGrath
@ 2008-03-31  3:59     ` Roland McGrath
  2008-03-31 11:03       ` Oleg Nesterov
  2008-03-31  8:51     ` [PATCH 1/3] do_wait reorganization Oleg Nesterov
  2 siblings, 1 reply; 23+ messages in thread
From: Roland McGrath @ 2008-03-31  3:59 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton; +Cc: Oleg Nesterov, linux-kernel

This reverts the effect of commit f2cc3eb133baa2e9dc8efd40f417106b2ee520f3
"do_wait: fix security checks".  That change reverted the effect of commit
73243284463a761e04d69d22c7516b2be7de096c.  The rationale for the original
commit still stands.  The inconsistent treatment of children hidden by
ptrace was an unintended omission in the original change and in no way
invalidates its purpose.

This makes do_wait return the error returned by security_task_wait()
(usually -EACCES) in place of -ECHILD when there are some children the
caller would be able to wait for if not for the permission failure.  A
permission error will give the user a clue to look for security policy
problems, rather than for mysterious wait bugs.

Signed-off-by: Roland McGrath <roland@redhat.com>
---
 kernel/exit.c |   31 +++++++++++++++++++++----------
 1 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index b5057ce..2f0392d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1116,14 +1116,10 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
 		return 0;
 
 	err = security_task_wait(p);
-	if (likely(!err))
-		return 1;
+	if (err)
+		return err;
 
-	if (type != PIDTYPE_PID)
-		return 0;
-	/* This child was explicitly requested, abort */
-	read_unlock(&tasklist_lock);
-	return err;
+	return 1;
 }
 
 static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
@@ -1454,7 +1450,8 @@ static int wait_task_continued(struct task_struct *p, int options,
  * -ECHILD should be in *@retval before the first call.
  * Returns nonzero for a final return, when we have unlocked tasklist_lock.
  * Returns zero if the search for a child should continue; then *@retval is
- * 0 if @p is an eligible child, or still -ECHILD.
+ * 0 if @p is an eligible child, or another error from security_task_wait(),
+ * or still -ECHILD.
  */
 static int wait_consider_task(struct task_struct *parent,
 			      struct task_struct *p, int *retval,
@@ -1463,9 +1460,22 @@ static int wait_consider_task(struct task_struct *parent,
 			      int __user *stat_addr, struct rusage __user *ru)
 {
 	int ret = eligible_child(type, pid, options, p);
-	if (ret <= 0)
+	if (!ret)
 		return ret;
 
+	if (unlikely(ret < 0)) {
+		/*
+		 * If we have not yet seen any eligible child,
+		 * then let this error code replace -ECHILD.
+		 * A permission error will give the user a clue
+		 * to look for security policy problems, rather
+		 * than for mysterious wait bugs.
+		 */
+		if (*retval)
+			*retval = ret;
+		return 0;
+	}
+
 	/*
 	 * We don't reap group leaders with subthreads.
 	 */
@@ -1502,7 +1512,8 @@ static int wait_consider_task(struct task_struct *parent,
  * -ECHILD should be in *@retval before the first call.
  * Returns nonzero for a final return, when we have unlocked tasklist_lock.
  * Returns zero if the search for a child should continue; then *@retval is
- * 0 if there were any eligible children, or still -ECHILD.
+ * 0 if there were any eligible children, or another error from
+ * security_task_wait(), or still -ECHILD.
  */
 static int do_wait_thread(struct task_struct *tsk, int *retval,
 			  enum pid_type type, struct pid *pid, int options,

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 1/3] do_wait reorganization
  2008-03-31  3:57   ` [PATCH 1/3] " Roland McGrath
  2008-03-31  3:59     ` [PATCH 2/3] ptrace children revamp Roland McGrath
  2008-03-31  3:59     ` [PATCH 3/3] do_wait: return security_task_wait() error code in place of -ECHILD Roland McGrath
@ 2008-03-31  8:51     ` Oleg Nesterov
  2008-03-31 20:29       ` Roland McGrath
  2 siblings, 1 reply; 23+ messages in thread
From: Oleg Nesterov @ 2008-03-31  8:51 UTC (permalink / raw)
  To: Roland McGrath; +Cc: Linus Torvalds, Andrew Morton, linux-kernel

(I apologize in advance if I missed something, I'm not able to study this series
 carefully now. But I hope that a false alarm is better than a probable bug.)

On 03/30, Roland McGrath wrote:
> +static int wait_consider_task(struct task_struct *parent,
> +			      struct task_struct *p, int *retval,
> +			      enum pid_type type, struct pid *pid, int options,
> +			      struct siginfo __user *infop,
> +			      int __user *stat_addr, struct rusage __user *ru)
> +{
> +	int ret = eligible_child(type, pid, options, p);
> +	if (ret <= 0)
> +		return ret;
> +
> +	/*
> +	 * We don't reap group leaders with subthreads.
> +	 */
> +	if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
> +		return wait_task_zombie(p, options, infop, stat_addr, ru);
> +
> +	/*
> +	 * It's stopped or running now, so it might
> +	 * later continue, exit, or stop again.
> +	 */
> +	*retval = 0;
> +
> +	if (task_is_stopped_or_traced(p))
> +		return wait_task_stopped(p, options, infop, stat_addr, ru);
> +
> +	if (p->exit_state != EXIT_DEAD)
> +		return wait_task_continued(p, options, infop, stat_addr, ru);
> +
> +	return 0;
> +}

This looks wrong, we shouldn't clear *retval if the child is EXIT_DEAD.

We can race with another thread which changed EXIT_ZOMBIE->EXIT_DEAD but
hasn't released the child yet. Suppose we don't have other childs, in that
case do_wait() will falsely sleep on ->wait_chldexit (unless WHOHANG).

Oleg.


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/3] ptrace children revamp
  2008-03-31  3:59     ` [PATCH 2/3] ptrace children revamp Roland McGrath
@ 2008-03-31  9:12       ` Oleg Nesterov
  0 siblings, 0 replies; 23+ messages in thread
From: Oleg Nesterov @ 2008-03-31  9:12 UTC (permalink / raw)
  To: Roland McGrath; +Cc: Linus Torvalds, Andrew Morton, linux-kernel

On 03/30, Roland McGrath wrote:
>
> +static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
>  {
> -	if (p->pdeath_signal)
> -		/* We already hold the tasklist_lock here.  */
> -		group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
> +	struct task_struct *p, *n;
>  
> -	/* Move the child from its dying parent to the new one.  */
> -	if (unlikely(traced)) {
> -		/* Preserve ptrace links if someone else is tracing this child.  */
> -		list_del_init(&p->ptrace_list);
> -		if (p->parent != p->real_parent)
> -			list_add(&p->ptrace_list, &p->real_parent->ptrace_children);
> -	} else {
> -		/* If this child is being traced, then we're the one tracing it
> -		 * anyway, so let go of it.
> -		 */
> -		p->ptrace = 0;
> -		remove_parent(p);
> -		p->parent = p->real_parent;
> -		add_parent(p);
> +	list_for_each_entry_safe(p, n, &parent->ptrace_attach, ptrace_list) {
> +		__ptrace_unlink(p);
>  
> -		if (task_is_traced(p)) {
> -			/*
> -			 * If it was at a trace stop, turn it into
> -			 * a normal stop since it's no longer being
> -			 * traced.
> -			 */
> -			ptrace_untrace(p);
> +		/*
> +		 * If it's a zombie, our attachedness prevented normal
> +		 * parent notification or self-reaping.  Do notification
> +		 * now if it would have happened earlier.  If it should
> +		 * reap itself, add it to the @dead list.  We can't call
> +		 * release_task() here because we already hold tasklist_lock.
> +		 */
> +		if (p->exit_state == EXIT_ZOMBIE) {
> +			if (p->exit_signal != -1 && !thread_group_empty(p))
                                                    ^^^^^^^^^^^^^^^^^^^^^^^
I still think this is wrong. Please also look at my previous reply,
http://marc.info/?l=linux-kernel&m=120680153931177

> @@ -1481,6 +1478,15 @@ static int wait_consider_task(struct task_struct *parent,
>  	 */
>  	*retval = 0;
>
> +	if (unlikely(p->parent != parent)) {
> +		/*
> +		 * This child is hidden by someone else who did PTRACE_ATTACH.
> +		 * We aren't allowed to see it now, but eventually we will.
> +		 */
> +		*retval = 0;
> +		return 0;
> +	}

Looks wrong... Above this "p->parent != parent" check we have

	if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
		return wait_task_zombie(p, options, infop, stat_addr, ru);

Suppose that p is not a group leader, it is traced by some unrelated task,
and now it is EXIT_ZOMBIE. We shouldn't release it (and return the "bogus"
pid to user-space).

Oleg.


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 3/3] do_wait: return security_task_wait() error code in place of -ECHILD
  2008-03-31  3:59     ` [PATCH 3/3] do_wait: return security_task_wait() error code in place of -ECHILD Roland McGrath
@ 2008-03-31 11:03       ` Oleg Nesterov
  2008-03-31 20:20         ` Roland McGrath
  0 siblings, 1 reply; 23+ messages in thread
From: Oleg Nesterov @ 2008-03-31 11:03 UTC (permalink / raw)
  To: Roland McGrath; +Cc: Linus Torvalds, Andrew Morton, linux-kernel

On 03/30, Roland McGrath wrote:
>
> This reverts the effect of commit f2cc3eb133baa2e9dc8efd40f417106b2ee520f3
> "do_wait: fix security checks".  That change reverted the effect of commit
> 73243284463a761e04d69d22c7516b2be7de096c.  The rationale for the original
> commit still stands.  The inconsistent treatment of children hidden by
> ptrace was an unintended omission in the original change and in no way
> invalidates its purpose.

OK, it turns out I misunderstood the purpose of 73243284463a761e04d69d22c7516b2be7de096c,
its changelog says:

	wait* syscalls return -ECHILD even when an individual PID of a live child
	was requested explicitly, when security_task_wait denies the operation.
	This means that something like a broken SELinux policy can produce an
	unexpected failure that looks just like a bug with wait or ptrace or
	something.

I wrongly thought that "-ECHILD even when an individual PID ... was requested"
was the problem.

> This makes do_wait return the error returned by security_task_wait()
> (usually -EACCES) in place of -ECHILD when there are some children the
> caller would be able to wait for if not for the permission failure.  A
> permission error will give the user a clue to look for security policy
> problems, rather than for mysterious wait bugs.

OK, thanks. Again, I was confused and thought we should "hide" -EACCES
unless the child was explicitly requested.

> @@ -1463,9 +1460,22 @@ static int wait_consider_task(struct task_struct *parent,
>  			      int __user *stat_addr, struct rusage __user *ru)
>  {
>  	int ret = eligible_child(type, pid, options, p);
> -	if (ret <= 0)
> +	if (!ret)
>  		return ret;
>  
> +	if (unlikely(ret < 0)) {
> +		/*
> +		 * If we have not yet seen any eligible child,
> +		 * then let this error code replace -ECHILD.
> +		 * A permission error will give the user a clue
> +		 * to look for security policy problems, rather
> +		 * than for mysterious wait bugs.
> +		 */
> +		if (*retval)
> +			*retval = ret;
> +		return 0;
> +	}

Not that I blame this patch...

Suppose that we have 2 childs. The first one is running, the second is zombie
but nacked by security_task_wait(). Now waitpid(-1, WHOHANG|WEXITED) returns 0,
a bit strange/confusing.

Yes, we have the same behaviour before this patch, but after reading your
explanation I am starting to think this is not "optimal".

Don't get me wrong, I don't claim this should be changed, just I'd like to be
sure this didn't escape your attention.

Oleg.


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 1/3] do_wait reorganization
  2008-03-31 20:29       ` Roland McGrath
@ 2008-03-31 20:07         ` Oleg Nesterov
  0 siblings, 0 replies; 23+ messages in thread
From: Oleg Nesterov @ 2008-03-31 20:07 UTC (permalink / raw)
  To: Roland McGrath; +Cc: Linus Torvalds, Andrew Morton, linux-kernel

On 03/31, Roland McGrath wrote:
>
> I think it should look like this:
> 
>     {
> 	    int ret = eligible_child(type, pid, options, p);
> 	    if (ret <= 0)
> 		    return ret;
> 
> 	    if (p->exit_state == EXIT_DEAD)
> 		    return 0;
> 
> 	    /*
> 	     * We don't reap group leaders with subthreads.
> 	     */
> 	    if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
> 		    return wait_task_zombie(p, options, infop, stat_addr, ru);
> 
> 	    /*
> 	     * It's stopped or running now, so it might
> 	     * later continue, exit, or stop again.
> 	     */
> 	    *retval = 0;
> 
> 	    if (task_is_stopped_or_traced(p))
> 		    return wait_task_stopped(p, options, infop, stat_addr, ru);
> 
> 	    return wait_task_continued(p, options, infop, stat_addr, ru);
>     }

I think you are right.

(BTW, you moved the "options & WHATEVER" checks into wait_task_xxx(),
 I think this really makes the code more readable and maintainable).

Oleg.


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 3/3] do_wait: return security_task_wait() error code in place of -ECHILD
  2008-03-31 11:03       ` Oleg Nesterov
@ 2008-03-31 20:20         ` Roland McGrath
  0 siblings, 0 replies; 23+ messages in thread
From: Roland McGrath @ 2008-03-31 20:20 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Linus Torvalds, Andrew Morton, linux-kernel

> Suppose that we have 2 childs. The first one is running, the second is zombie
> but nacked by security_task_wait(). Now waitpid(-1, WHOHANG|WEXITED) returns 0,
> a bit strange/confusing.

What else would it do?  There is a child that will be reapable eventually
(the running one).  It might be even more strange and more confusing if you
hid the running child just because there is a permission-denied one.  If
you ask specifically for the permission-denied zombie you'll get the
permission error.  I'm not sure we can do better.


Thanks,
Roland



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 1/3] do_wait reorganization
  2008-03-31  8:51     ` [PATCH 1/3] do_wait reorganization Oleg Nesterov
@ 2008-03-31 20:29       ` Roland McGrath
  2008-03-31 20:07         ` Oleg Nesterov
  0 siblings, 1 reply; 23+ messages in thread
From: Roland McGrath @ 2008-03-31 20:29 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Linus Torvalds, Andrew Morton, linux-kernel

> This looks wrong, we shouldn't clear *retval if the child is EXIT_DEAD.

You're right.  Thanks for catching that.
I think it should look like this:

    {
	    int ret = eligible_child(type, pid, options, p);
	    if (ret <= 0)
		    return ret;

	    if (p->exit_state == EXIT_DEAD)
		    return 0;

	    /*
	     * We don't reap group leaders with subthreads.
	     */
	    if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
		    return wait_task_zombie(p, options, infop, stat_addr, ru);

	    /*
	     * It's stopped or running now, so it might
	     * later continue, exit, or stop again.
	     */
	    *retval = 0;

	    if (task_is_stopped_or_traced(p))
		    return wait_task_stopped(p, options, infop, stat_addr, ru);

	    return wait_task_continued(p, options, infop, stat_addr, ru);
    }

What do you think?


Thanks,
Roland

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] ptrace children revamp
  2008-03-29 13:10     ` Oleg Nesterov
  2008-03-29 14:37       ` Oleg Nesterov
@ 2008-04-04 21:00       ` Roland McGrath
  2008-04-05 14:06         ` Oleg Nesterov
  1 sibling, 1 reply; 23+ messages in thread
From: Roland McGrath @ 2008-04-04 21:00 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Andrew Morton, Linus Torvalds, linux-kernel

> >  	/*
> >  	 * pointers to (original) parent process, youngest child, younger sibling,
> >  	 * older sibling, respectively.  (p->father can be replaced with
> > -	 * p->parent->pid)
> > +	 * p->real_parent->pid)
> 
> There is no ->father in task_struct, the comment is obsolete

I took it to be referring to what replaced the field that no longer exists.
Anyway, I just tweaked the bad reference to the ptrace-specific parent field.

> FYI, arch/mips/kernel/irixsig.c uses add_parent/remove_parent. I don't know
> why irixsig.c reimplements do_wait() and friends...

That badly needs to be cleaned up.  As per recent comments from Linus in
another thread, I think we are best off letting this old cruft get broken
and having its arch people fix it to be sane.

> Unless I missed something again, this is not 100% right.
> 
> Suppose that we are ->real_parent, the child was traced by us, we ignore
> SIGCHLD, and we are doing the threaded reparenting. In that case we should
> release the child if it is dead, like the current code does.

I tried to construct a test case to demonstrate this behavior on the
existing kernel.  I couldn't manage to get it to do this.  Are you this
is "like the current code does"?  I'll send you my test case attempts
off-list and we can try together to get the right test to show this.  If
it turns out that it does not really behave this way now and so my code
is not introducing any regression, then (as usual) I'd rather fix this
separately after the cleanup.

> Even if we don't ignore SIGCHLD, we should notify our thread-group, but
> reparent_thread() skips do_notify_parent() when the new parent is from
> is from the same thread-group.

Why do you think this is wrong?  The notification is always group-wide
(a SIGCHLD to the group, and wakeup of the group-shared wait_chldexit).
So there already was a notification, and a second one seems wrong to me.

> Note also that reparent_thread() has a very old bug. When it sees the
> EXIT_ZOMBIE task, it notifies the new parent, but doesn't check if the task
> becomes detached because the new parent ignores SIGCHLD. Currently this means
> that init must have a handler for SIGCHLD or we leak zombies.

I had noticed that.  After the cleanup, we can look into fixing that.
As usual, I am first going for a set of cleanup patches that changes
exactly nothing semantically.

> Also, I think ptrace_exit() is not right,
> 
> 	if (p->exit_signal != -1 && !thread_group_empty(p))
> 		do_notify_parent(p, p->exit_signal);

Yes, that was a typo.  

> Looks wrong... Above this "p->parent != parent" check we have

This was an inadvertent change from some sloppy last-minute rebasing.
Sorry about that.  Both of these two errors were not there in the
version of the code I read over a lot of times, and I flubbed them in
when rejiggering the patches after some of the earlier feedback.  
In the correct code, the ptrace-stolen check is the very first thing
after calling eligible_child and checking its return value.

When we have resolved the issue about the test case above, I will post
a new series that fixes these flubs.


Thanks,
Roland

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] ptrace children revamp
  2008-04-04 21:00       ` Roland McGrath
@ 2008-04-05 14:06         ` Oleg Nesterov
  2008-04-09 20:15           ` Roland McGrath
  0 siblings, 1 reply; 23+ messages in thread
From: Oleg Nesterov @ 2008-04-05 14:06 UTC (permalink / raw)
  To: Roland McGrath; +Cc: Andrew Morton, Linus Torvalds, linux-kernel

On 04/04, Roland McGrath wrote:
>
> > Unless I missed something again, this is not 100% right.
> > 
> > Suppose that we are ->real_parent, the child was traced by us, we ignore
> > SIGCHLD, and we are doing the threaded reparenting. In that case we should
> > release the child if it is dead, like the current code does.
> 
> I tried to construct a test case to demonstrate this behavior on the
> existing kernel.  I couldn't manage to get it to do this.  Are you this
> is "like the current code does"?  I'll send you my test case attempts
> off-list and we can try together to get the right test to show this.  If
> it turns out that it does not really behave this way now and so my code
> is not introducing any regression, then (as usual) I'd rather fix this
> separately after the cleanup.

Heh. You are right, the current kernel has the same (minor) bug.
I beleive it was introduced by b2b2cbc4b2a2f389442549399a993a8306420baf.
Before this patch we notified the new parent even if it is from the same
thread group. If SIGCHLD is ignored, do_notify_parent() sets ->exit_signal = -1,
and then forget_original_parent() notices this and adds the child to ptrace_dead.
Now we don't notify the parent, and so the child "hangs".


However, now I think your patch adds a more serious problem, we can really
leak a zombie. Suppose that that we are ->real_parent, the child was traced
by us, it is zombie now, and the child is not a group leader. No matter who
is the new parent, no matter what is the state of SIGCHLD, we must not reparent
the child: nobody can release it except us. It is not traced any longer,
its ->exit_signal == -1, eligible_child() doesn't like this.

No?

> > Even if we don't ignore SIGCHLD, we should notify our thread-group, but
> > reparent_thread() skips do_notify_parent() when the new parent is from
> > is from the same thread-group.
> 
> Why do you think this is wrong?  The notification is always group-wide
> (a SIGCHLD to the group, and wakeup of the group-shared wait_chldexit).

Well, I was thinking about another thread (the new parent) sleeping in
do_wait(__WNOTHREAD)... not sure this really matters, though.

But,

> So there already was a notification, and a second one seems wrong to me.

Yes, I missed this. The second signal doesn't look right.

> > Note also that reparent_thread() has a very old bug. When it sees the
> > EXIT_ZOMBIE task, it notifies the new parent, but doesn't check if the task
> > becomes detached because the new parent ignores SIGCHLD. Currently this means
> > that init must have a handler for SIGCHLD or we leak zombies.
> 
> I had noticed that.  After the cleanup, we can look into fixing that.
> As usual, I am first going for a set of cleanup patches that changes
> exactly nothing semantically.

Yes, yes, I agree. I just wanted to be sure you didn't miss this bug.

Oleg.


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] ptrace children revamp
  2008-04-05 14:06         ` Oleg Nesterov
@ 2008-04-09 20:15           ` Roland McGrath
  2008-04-13 14:24             ` Oleg Nesterov
  0 siblings, 1 reply; 23+ messages in thread
From: Roland McGrath @ 2008-04-09 20:15 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Andrew Morton, Linus Torvalds, linux-kernel

> Heh. You are right, the current kernel has the same (minor) bug.
> I beleive it was introduced by b2b2cbc4b2a2f389442549399a993a8306420baf.

I have a regression test program for that (the one I sent you).
I've done a fix as a follow-on patch.

> However, now I think your patch adds a more serious problem, we can really
> leak a zombie. Suppose that that we are ->real_parent, the child was traced
> by us, it is zombie now, and the child is not a group leader. No matter who
> is the new parent, no matter what is the state of SIGCHLD, we must not reparent
> the child: nobody can release it except us. It is not traced any longer,
> its ->exit_signal == -1, eligible_child() doesn't like this.
> 
> No?

(Yes!)  Ah, thanks for that.  I wrote a regression test program for this
case that worked before and fails with my last version of the patch.
I've fixed my new code so it passes again.

> Well, I was thinking about another thread (the new parent) sleeping in
> do_wait(__WNOTHREAD)... not sure this really matters, though.

I'm pretty sure it doesn't matter in real life.  I doubt whoever uses
__WNOTHREAD was relying on seeing some other thread's reparented
children.  TBH, I don't care much about __WNOTHREAD and I don't know off
hand of anything that actually uses it.  OTOH, an extra wakeup on
wait_chldexit is always harmless.  And for pure anality, it seems proper
to maintain the invariant that do_wait() wakes up promptly whenever
interrupting and restarting it would complete without blocking.  But, we
don't get that wakeup now and noone has complained, so all in all, I'm
inclined to leave it as it is.

I've put the latest tweaked version of this same patch series at:
	http://people.redhat.com/roland/kernel-patches/ptrace-cleanup.mbox
That adds a fourth patch that fixes the aforementioned bug case that the
current canonical kernel gets wrong.  I think that fix also incidentally
covered the init-ignores-SIGCHLD case, but I didn't test that and I'm
not really positive.

I'm working on a variant of the ptrace revamp where all ptrace'd tasks
go on a list (whether natural children or not).  (This was my original
intent, but then I thought it might be more complication and change that
way.  Now it's seeming attractive again.)  I tend towards this approach
aesthetically because it makes ptrace bookkeeping more consistent across
all kinds of tracees.  For the cases we've been talking about, it means
ptrace_exit() would take care of zombies kept alive by ptrace uniformly
before we get to the reparent_thread loop.  This means the init case is
separate and needs a separate fix, but that kind of seems cleaner.  When
I get the variant patch working, we can see which one looks best.  I'd
like your opinion on that.


Thanks,
Roland

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] ptrace children revamp
  2008-04-09 20:15           ` Roland McGrath
@ 2008-04-13 14:24             ` Oleg Nesterov
  2008-04-15  1:41               ` Roland McGrath
  0 siblings, 1 reply; 23+ messages in thread
From: Oleg Nesterov @ 2008-04-13 14:24 UTC (permalink / raw)
  To: Roland McGrath; +Cc: Andrew Morton, Linus Torvalds, linux-kernel

Sorry for delay!

On 04/09, Roland McGrath wrote:
>
> > Well, I was thinking about another thread (the new parent) sleeping in
> > do_wait(__WNOTHREAD)... not sure this really matters, though.
> 
> I'm pretty sure it doesn't matter in real life.  I doubt whoever uses
> __WNOTHREAD was relying on seeing some other thread's reparented
> children.  TBH, I don't care much about __WNOTHREAD and I don't know off
> hand of anything that actually uses it.

Agreed. I never understood why we need __WNOTHREAD. The same for
->pdeath_signal in its current "per-thread" form. I think it would be
very nice to kill them both (or send the ->pdeath_signal when the whole
process exits). Then we can place all childrens on one signal->children
list. But this is a bit off-topic for now.

> I've put the latest tweaked version of this same patch series at:
> 	http://people.redhat.com/roland/kernel-patches/ptrace-cleanup.mbox
> That adds a fourth patch that fixes the aforementioned bug case that the
> current canonical kernel gets wrong.  I think that fix also incidentally
> covered the init-ignores-SIGCHLD case, but I didn't test that and I'm
> not really positive.

I think the 4th patch has a small problem,

	reparent_zombie:

		if (p->exit_signal == -1 ||
		    (thread_group_empty(p) && ignoring_children(p->real_parent)))
			list_add(&p->ptrace_list, dead);

The 2nd case, "thread_group_empty(p) && ignoring_children", looks racy.
We didn't set ->exit_signal == -1, the new parent can call do_wait() and
release "p" as soon as we drop tasklist_lock, before ptrace_exit_finish().

> I'm working on a variant of the ptrace revamp where all ptrace'd tasks
> go on a list (whether natural children or not).  (This was my original
> intent, but then I thought it might be more complication and change that
> way.  Now it's seeming attractive again.)

Yes! I thought about this too. Actually, I was very sure that this is your
plan from the the very beginning ;)

Oleg.


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] ptrace children revamp
  2008-04-13 14:24             ` Oleg Nesterov
@ 2008-04-15  1:41               ` Roland McGrath
  0 siblings, 0 replies; 23+ messages in thread
From: Roland McGrath @ 2008-04-15  1:41 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Andrew Morton, Linus Torvalds, linux-kernel

> Sorry for delay!

There's no need to apologize!  I heartily appreciate all your help on this.

> Agreed. I never understood why we need __WNOTHREAD. The same for
> ->pdeath_signal in its current "per-thread" form. I think it would be
> very nice to kill them both (or send the ->pdeath_signal when the whole
> process exits). Then we can place all childrens on one signal->children
> list. But this is a bit off-topic for now.

I'm in complete agreement here, but indeed it's to be considered later.
I never understood pdeath_signal, but I recall someone piping up before
and saying it really was used with its current semantics, so go figure.
(Btw, we can move ->children and still keep __WNOTHREAD compatibility.
It just has to check p->parent == current for __WNOTHREAD.  We can do
that if we determine that __WNOTHREAD is purely for anal backward
compatibility, and noone cares about the performance difference of
__WNOTHREAD calls only looping over a shorter list.)

> I think the 4th patch has a small problem,

Thanks.  I think we're moving on to the other variant of the patch now,
so I won't worry about fixing up the version we're abandoning.  

> Yes! I thought about this too. Actually, I was very sure that this is your
> plan from the the very beginning ;)

You were quite right!  I somehow tricked myself into trying something
inferior first.  Silly me.  :-)


Thanks,
Roland

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2008-04-15  1:42 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-03-29  3:34 [PATCH 1/2] do_wait reorganization Roland McGrath
2008-03-29  3:35 ` [PATCH 2/2] ptrace children revamp Roland McGrath
2008-03-29 10:39   ` Oleg Nesterov
2008-03-29 13:10     ` Oleg Nesterov
2008-03-29 14:37       ` Oleg Nesterov
2008-04-04 21:00       ` Roland McGrath
2008-04-05 14:06         ` Oleg Nesterov
2008-04-09 20:15           ` Roland McGrath
2008-04-13 14:24             ` Oleg Nesterov
2008-04-15  1:41               ` Roland McGrath
2008-03-29 10:35 ` [PATCH 1/2] do_wait reorganization Oleg Nesterov
2008-03-31  3:54   ` Roland McGrath
2008-03-29 16:16 ` Linus Torvalds
2008-03-31  3:27   ` Roland McGrath
2008-03-31  3:57   ` [PATCH 1/3] " Roland McGrath
2008-03-31  3:59     ` [PATCH 2/3] ptrace children revamp Roland McGrath
2008-03-31  9:12       ` Oleg Nesterov
2008-03-31  3:59     ` [PATCH 3/3] do_wait: return security_task_wait() error code in place of -ECHILD Roland McGrath
2008-03-31 11:03       ` Oleg Nesterov
2008-03-31 20:20         ` Roland McGrath
2008-03-31  8:51     ` [PATCH 1/3] do_wait reorganization Oleg Nesterov
2008-03-31 20:29       ` Roland McGrath
2008-03-31 20:07         ` Oleg Nesterov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).