* [PATCH 1/4] pid: Remove the child_reaper special case in init/main.c
2011-01-31 10:25 Prepare for the unshare support of the pid namespace Daniel Lezcano
@ 2011-01-31 10:25 ` Daniel Lezcano
2011-01-31 10:25 ` [PATCH 2/4] pidns: Call pid_ns_prepare_proc from create_pid_namespace Daniel Lezcano
` (2 subsequent siblings)
3 siblings, 0 replies; 13+ messages in thread
From: Daniel Lezcano @ 2011-01-31 10:25 UTC (permalink / raw)
To: akpm; +Cc: ebiederm, oleg, containers, linux-kernel, clg
From: Eric W. Biederman <ebiederm@xmission.com>
It turns out that the existing assignment in copy_process of
the child_reaper can handle the initial assignment of child_reaper
we just need to generalize the test in kernel/fork.c
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@free.fr>
---
include/linux/pid.h | 11 +++++++++++
init/main.c | 9 ---------
kernel/fork.c | 2 +-
3 files changed, 12 insertions(+), 10 deletions(-)
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 49f1c2f..efceda0 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -141,6 +141,17 @@ static inline struct pid_namespace *ns_of_pid(struct pid *pid)
}
/*
+ * is_child_reaper returns true if the pid is the init process
+ * of the current namespace. As this one could be checked before
+ * pid_ns->child_reaper is assigned in copy_process, we check
+ * with the pid number.
+ */
+static inline bool is_child_reaper(struct pid *pid)
+{
+ return pid->numbers[pid->level].nr == 1;
+}
+
+/*
* the helpers to get the pid's id seen from different namespaces
*
* pid_nr() : global id, i.e. the id seen from the init namespace;
diff --git a/init/main.c b/init/main.c
index 33c37c3..793ebfd 100644
--- a/init/main.c
+++ b/init/main.c
@@ -875,15 +875,6 @@ static int __init kernel_init(void * unused)
* init can run on any cpu.
*/
set_cpus_allowed_ptr(current, cpu_all_mask);
- /*
- * Tell the world that we're going to be the grim
- * reaper of innocent orphaned children.
- *
- * We don't want people to have to make incorrect
- * assumptions about where in the task array this
- * can be found.
- */
- init_pid_ns.child_reaper = current;
cad_pid = task_pid(current);
diff --git a/kernel/fork.c b/kernel/fork.c
index 25e4291..c9f0784 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1289,7 +1289,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
tracehook_finish_clone(p, clone_flags, trace);
if (thread_group_leader(p)) {
- if (clone_flags & CLONE_NEWPID)
+ if (is_child_reaper(pid))
p->nsproxy->pid_ns->child_reaper = p;
p->signal->leader_pid = pid;
--
1.7.1
^ permalink raw reply [flat|nested] 13+ messages in thread
* [PATCH 2/4] pidns: Call pid_ns_prepare_proc from create_pid_namespace
2011-01-31 10:25 Prepare for the unshare support of the pid namespace Daniel Lezcano
2011-01-31 10:25 ` [PATCH 1/4] pid: Remove the child_reaper special case in init/main.c Daniel Lezcano
@ 2011-01-31 10:25 ` Daniel Lezcano
2011-01-31 13:22 ` Oleg Nesterov
2011-01-31 10:25 ` [PATCH 3/4] procfs: kill the global proc_mnt variable Daniel Lezcano
2011-01-31 10:25 ` [PATCH 4/4] pidns: Use task_active_pid_ns where appropriate Daniel Lezcano
3 siblings, 1 reply; 13+ messages in thread
From: Daniel Lezcano @ 2011-01-31 10:25 UTC (permalink / raw)
To: akpm; +Cc: ebiederm, oleg, containers, linux-kernel, clg
From: Eric W. Biederman <ebiederm@xmission.com>
Reorganize proc_get_sb so it can be called before the struct pid
of the first process is allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@free.fr>
---
fs/proc/root.c | 25 +++++++------------------
kernel/fork.c | 6 ------
kernel/pid_namespace.c | 4 ++++
3 files changed, 11 insertions(+), 24 deletions(-)
diff --git a/fs/proc/root.c b/fs/proc/root.c
index ef9fa8e..e5e2bfa 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -43,17 +43,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
struct pid_namespace *ns;
struct proc_inode *ei;
- if (proc_mnt) {
- /* Seed the root directory with a pid so it doesn't need
- * to be special in base.c. I would do this earlier but
- * the only task alive when /proc is mounted the first time
- * is the init_task and it doesn't have any pids.
- */
- ei = PROC_I(proc_mnt->mnt_sb->s_root->d_inode);
- if (!ei->pid)
- ei->pid = find_get_pid(1);
- }
-
if (flags & MS_KERNMOUNT)
ns = (struct pid_namespace *)data;
else
@@ -71,16 +60,16 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
return ERR_PTR(err);
}
- ei = PROC_I(sb->s_root->d_inode);
- if (!ei->pid) {
- rcu_read_lock();
- ei->pid = get_pid(find_pid_ns(1, ns));
- rcu_read_unlock();
- }
-
sb->s_flags |= MS_ACTIVE;
}
+ ei = PROC_I(sb->s_root->d_inode);
+ if (!ei->pid) {
+ rcu_read_lock();
+ ei->pid = get_pid(find_pid_ns(1, ns));
+ rcu_read_unlock();
+ }
+
return dget(sb->s_root);
}
diff --git a/kernel/fork.c b/kernel/fork.c
index c9f0784..e7a5907 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1180,12 +1180,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
pid = alloc_pid(p->nsproxy->pid_ns);
if (!pid)
goto bad_fork_cleanup_io;
-
- if (clone_flags & CLONE_NEWPID) {
- retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
- if (retval < 0)
- goto bad_fork_free_pid;
- }
}
p->pid = pid_nr(pid);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a5aff94..b90e4ab 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -14,6 +14,7 @@
#include <linux/err.h>
#include <linux/acct.h>
#include <linux/slab.h>
+#include <linux/proc_fs.h>
#define BITS_PER_PAGE (PAGE_SIZE*8)
@@ -96,6 +97,9 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
for (i = 1; i < PIDMAP_ENTRIES; i++)
atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
+ if (pid_ns_prepare_proc(ns))
+ goto out_free_map;
+
return ns;
out_free_map:
--
1.7.1
^ permalink raw reply [flat|nested] 13+ messages in thread
* [PATCH 3/4] procfs: kill the global proc_mnt variable
2011-01-31 10:25 Prepare for the unshare support of the pid namespace Daniel Lezcano
2011-01-31 10:25 ` [PATCH 1/4] pid: Remove the child_reaper special case in init/main.c Daniel Lezcano
2011-01-31 10:25 ` [PATCH 2/4] pidns: Call pid_ns_prepare_proc from create_pid_namespace Daniel Lezcano
@ 2011-01-31 10:25 ` Daniel Lezcano
2011-01-31 10:25 ` [PATCH 4/4] pidns: Use task_active_pid_ns where appropriate Daniel Lezcano
3 siblings, 0 replies; 13+ messages in thread
From: Daniel Lezcano @ 2011-01-31 10:25 UTC (permalink / raw)
To: akpm; +Cc: ebiederm, oleg, containers, linux-kernel, clg
From: Oleg Nesterov <oleg@redhat.com>
After the previous cleanup in proc_get_sb() the global proc_mnt has
no reasons to exists, kill it.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@free.fr>
---
fs/proc/inode.c | 2 --
fs/proc/internal.h | 1 -
fs/proc/root.c | 7 ++++---
3 files changed, 4 insertions(+), 6 deletions(-)
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 176ce4c..ee0f802 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -42,8 +42,6 @@ static void proc_evict_inode(struct inode *inode)
sysctl_head_put(PROC_I(inode)->sysctl);
}
-struct vfsmount *proc_mnt;
-
static struct kmem_cache * proc_inode_cachep;
static struct inode *proc_alloc_inode(struct super_block *sb)
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 9ad561d..c03e8d3 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -107,7 +107,6 @@ static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
}
void pde_put(struct proc_dir_entry *pde);
-extern struct vfsmount *proc_mnt;
int proc_fill_super(struct super_block *);
struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
diff --git a/fs/proc/root.c b/fs/proc/root.c
index e5e2bfa..a9000e9 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -90,19 +90,20 @@ static struct file_system_type proc_fs_type = {
void __init proc_root_init(void)
{
+ struct vfsmount *mnt;
int err;
proc_init_inodecache();
err = register_filesystem(&proc_fs_type);
if (err)
return;
- proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
- if (IS_ERR(proc_mnt)) {
+ mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
+ if (IS_ERR(mnt)) {
unregister_filesystem(&proc_fs_type);
return;
}
- init_pid_ns.proc_mnt = proc_mnt;
+ init_pid_ns.proc_mnt = mnt;
proc_symlink("mounts", NULL, "self/mounts");
proc_net_init();
--
1.7.1
^ permalink raw reply [flat|nested] 13+ messages in thread
* [PATCH 4/4] pidns: Use task_active_pid_ns where appropriate
2011-01-31 10:25 Prepare for the unshare support of the pid namespace Daniel Lezcano
` (2 preceding siblings ...)
2011-01-31 10:25 ` [PATCH 3/4] procfs: kill the global proc_mnt variable Daniel Lezcano
@ 2011-01-31 10:25 ` Daniel Lezcano
2011-01-31 11:26 ` Alexey Dobriyan
3 siblings, 1 reply; 13+ messages in thread
From: Daniel Lezcano @ 2011-01-31 10:25 UTC (permalink / raw)
To: akpm; +Cc: ebiederm, oleg, containers, linux-kernel, clg
From: Eric W. Biederman <ebiederm@xmission.com>
The expressions tsk->nsproxy->pid_ns and task_active_pid_ns
aka ns_of_pid(task_pid(tsk)) should have the same number of
cache line misses with the practical difference that
ns_of_pid(task_pid(tsk)) is released later in a processes life.
Furthermore by using task_active_pid_ns it becomes trivial
to write an unshare implementation for the the pid namespace.
So I have used task_active_pid_ns everywhere I can.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@free.fr>
---
arch/powerpc/platforms/cell/spufs/sched.c | 2 +-
arch/um/drivers/mconsole_kern.c | 2 +-
fs/proc/root.c | 2 +-
kernel/cgroup.c | 3 +--
kernel/perf_event.c | 2 +-
kernel/pid.c | 8 ++++----
kernel/signal.c | 9 ++++-----
kernel/sysctl_binary.c | 2 +-
8 files changed, 14 insertions(+), 16 deletions(-)
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 0b04662..82e26a0 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -1095,7 +1095,7 @@ static int show_spu_loadavg(struct seq_file *s, void *private)
LOAD_INT(c), LOAD_FRAC(c),
count_active_contexts(),
atomic_read(&nr_spu_contexts),
- current->nsproxy->pid_ns->last_pid);
+ task_active_pid_ns(current)->last_pid);
return 0;
}
diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 975613b..edac0da 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -125,7 +125,7 @@ void mconsole_log(struct mc_request *req)
void mconsole_proc(struct mc_request *req)
{
struct nameidata nd;
- struct vfsmount *mnt = current->nsproxy->pid_ns->proc_mnt;
+ struct vfsmount *mnt = task_active_pid_ns(current)->proc_mnt;
struct file *file;
int n, err;
char *ptr = req->request.data, *buf;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index a9000e9..9ea237e 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -46,7 +46,7 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
if (flags & MS_KERNMOUNT)
ns = (struct pid_namespace *)data;
else
- ns = current->nsproxy->pid_ns;
+ ns = task_active_pid_ns(current);
sb = sget(fs_type, proc_test_super, proc_set_super, ns);
if (IS_ERR(sb))
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b24d702..5cb4ae7 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2741,8 +2741,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
{
struct cgroup_pidlist *l;
/* don't need task_nsproxy() if we're looking at ourself */
- struct pid_namespace *ns = current->nsproxy->pid_ns;
-
+ struct pid_namespace *ns = task_active_pid_ns(current);
/*
* We can't drop the pidlist_mutex before taking the l->mutex in case
* the last ref-holder is trying to remove l from the list at the same
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 852ae8c..42bdb40 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -5581,7 +5581,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
event->parent = parent_event;
- event->ns = get_pid_ns(current->nsproxy->pid_ns);
+ event->ns = get_pid_ns(task_active_pid_ns(current));
event->id = atomic64_inc_return(&perf_event_id);
event->state = PERF_EVENT_STATE_INACTIVE;
diff --git a/kernel/pid.c b/kernel/pid.c
index 39b65b6..b45189d 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -339,7 +339,7 @@ EXPORT_SYMBOL_GPL(find_pid_ns);
struct pid *find_vpid(int nr)
{
- return find_pid_ns(nr, current->nsproxy->pid_ns);
+ return find_pid_ns(nr, task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(find_vpid);
@@ -422,7 +422,7 @@ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
struct task_struct *find_task_by_vpid(pid_t vnr)
{
- return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns);
+ return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
}
struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
@@ -474,7 +474,7 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
pid_t pid_vnr(struct pid *pid)
{
- return pid_nr_ns(pid, current->nsproxy->pid_ns);
+ return pid_nr_ns(pid, task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(pid_vnr);
@@ -485,7 +485,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
rcu_read_lock();
if (!ns)
- ns = current->nsproxy->pid_ns;
+ ns = task_active_pid_ns(current);
if (likely(pid_alive(task))) {
if (type != PIDTYPE_PID)
task = task->group_leader;
diff --git a/kernel/signal.c b/kernel/signal.c
index 4e3cff1..6f10e78 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1462,16 +1462,15 @@ int do_notify_parent(struct task_struct *tsk, int sig)
* we are under tasklist_lock here so our parent is tied to
* us and cannot exit and release its namespace.
*
- * the only it can is to switch its nsproxy with sys_unshare,
- * bu uncharing pid namespaces is not allowed, so we'll always
- * see relevant namespace
+ * The only it can is to switch its nsproxy with sys_unshare,
+ * but we use the pid_namespace for task_pid which never changes.
*
* write_lock() currently calls preempt_disable() which is the
* same as rcu_read_lock(), but according to Oleg, this is not
* correct to rely on this
*/
rcu_read_lock();
- info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
+ info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(tsk->parent));
info.si_uid = __task_cred(tsk)->uid;
rcu_read_unlock();
@@ -1542,7 +1541,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
* see comment in do_notify_parent() abot the following 3 lines
*/
rcu_read_lock();
- info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
+ info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent));
info.si_uid = __task_cred(tsk)->uid;
rcu_read_unlock();
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index b875bed..88c69d5 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1349,7 +1349,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
goto out_putname;
}
- mnt = current->nsproxy->pid_ns->proc_mnt;
+ mnt = task_active_pid_ns(current)->proc_mnt;
result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd);
if (result)
goto out_putname;
--
1.7.1
^ permalink raw reply [flat|nested] 13+ messages in thread