* [RFC PATCH 1/4] namespacefs: Introduce 'namespacefs'
2021-11-18 18:12 [RFC PATCH 0/4] namespacefs: Proof-of-Concept Yordan Karadzhov (VMware)
@ 2021-11-18 18:12 ` Yordan Karadzhov (VMware)
2021-11-18 18:12 ` [RFC PATCH 2/4] namespacefs: Add methods to create/remove PID namespace directories Yordan Karadzhov (VMware)
` (4 subsequent siblings)
5 siblings, 0 replies; 28+ messages in thread
From: Yordan Karadzhov (VMware) @ 2021-11-18 18:12 UTC (permalink / raw)
To: linux-kernel, linux-fsdevel
Cc: viro, ebiederm, rostedt, mingo, hagen, rppt, James.Bottomley,
akpm, vvs, shakeelb, christian.brauner, mkoutny,
Yordan Karadzhov (VMware)
Introducing a simple read-only pseudo file system that aims to provide
a direct mechanism for examining the existing hierarchy of namespaces on
the system. When fully functional, 'namespacefs' will allow the user to
see all namespaces that are active on the system and to easily retrieve
the specific data, managed by each namespace. For example the PIDs of
all tasks enclosed in each individual PID namespace.
Here we introduce only the basic definitions of the virtual filesystem
that are based off of 'fs/debugfs/inode.c' and 'fs/tracefs/inode.c'.
The actual coupling between the new filesystem and the namespaces and
all methods for adding/removing namespace directories and files will be
added later.
Signed-off-by: Yordan Karadzhov (VMware) <y.karadz@gmail.com>
---
fs/Kconfig | 1 +
fs/Makefile | 1 +
fs/namespacefs/Kconfig | 6 +
fs/namespacefs/Makefile | 4 +
fs/namespacefs/inode.c | 213 ++++++++++++++++++++++++++++++++++++
include/linux/idr-seq.h | 0
include/linux/namespacefs.h | 47 ++++++++
include/uapi/linux/magic.h | 2 +
8 files changed, 274 insertions(+)
create mode 100644 fs/namespacefs/Kconfig
create mode 100644 fs/namespacefs/Makefile
create mode 100644 fs/namespacefs/inode.c
create mode 100644 include/linux/idr-seq.h
create mode 100644 include/linux/namespacefs.h
diff --git a/fs/Kconfig b/fs/Kconfig
index a6313a969bc5..84c220160615 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -268,6 +268,7 @@ config ARCH_HAS_GIGANTIC_PAGE
source "fs/configfs/Kconfig"
source "fs/efivarfs/Kconfig"
+source "fs/namespacefs/Kconfig"
endmenu
diff --git a/fs/Makefile b/fs/Makefile
index 84c5e4cdfee5..5c850f6a7cb0 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -138,3 +138,4 @@ obj-$(CONFIG_EFIVAR_FS) += efivarfs/
obj-$(CONFIG_EROFS_FS) += erofs/
obj-$(CONFIG_VBOXSF_FS) += vboxsf/
obj-$(CONFIG_ZONEFS_FS) += zonefs/
+obj-$(CONFIG_NAMESPACE_FS) += namespacefs/
diff --git a/fs/namespacefs/Kconfig b/fs/namespacefs/Kconfig
new file mode 100644
index 000000000000..f26bc62376d4
--- /dev/null
+++ b/fs/namespacefs/Kconfig
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config NAMESPACE_FS
+ bool "NameSpace Filesystem support"
+ help
+ This option enables support for namespacefs - a pseudo filesystem
+ that allows to examine the hierarchy of namespaces.
diff --git a/fs/namespacefs/Makefile b/fs/namespacefs/Makefile
new file mode 100644
index 000000000000..23628d3207e3
--- /dev/null
+++ b/fs/namespacefs/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+namespacefs-objs := inode.o
+obj-$(CONFIG_NAMESPACE_FS) += namespacefs.o
diff --git a/fs/namespacefs/inode.c b/fs/namespacefs/inode.c
new file mode 100644
index 000000000000..0f6293b0877d
--- /dev/null
+++ b/fs/namespacefs/inode.c
@@ -0,0 +1,213 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * inode.c - part of namespacefs, pseudo filesystem for examining namespaces.
+ *
+ * Copyright 2021 VMware Inc, Yordan Karadzhov (VMware) <y.karadz@gmail.com>
+ */
+
+#include <linux/fs.h>
+#include <linux/sysfs.h>
+#include <linux/namei.h>
+#include <linux/fsnotify.h>
+#include <linux/magic.h>
+
+/*
+ * Internal mount of namespacefs and the number of pins held on it.
+ * Both are handed to simple_pin_fs() / simple_release_fs() by pin_fs()
+ * and release_namespacefs().
+ */
+static struct vfsmount *namespacefs_mount;
+static int namespacefs_mount_count;
+
+/* Super block operations: only statfs is provided, via the libfs helper. */
+static const struct super_operations namespacefs_super_operations = {
+ .statfs = simple_statfs,
+};
+
+/* Read, respectively execute, permission for user, group and others. */
+#define S_IRALL (S_IRUSR | S_IRGRP | S_IROTH)
+#define S_IXALL (S_IXUSR | S_IXGRP | S_IXOTH)
+
+/*
+ * Fill a new super block: let simple_fill_super() build the root from a
+ * file table that holds only the "" entry, then install the namespacefs
+ * super operations and add read permission for everybody on the root.
+ *
+ * Returns 0 on success or the error reported by simple_fill_super().
+ */
+static int fill_super(struct super_block *sb, void *data, int silent)
+{
+	static const struct tree_descr no_files[] = { { "" } };
+	int ret = simple_fill_super(sb, NAMESPACEFS_MAGIC, no_files);
+
+	if (ret)
+		return ret;
+
+	sb->s_root->d_inode->i_mode |= S_IRALL;
+	sb->s_op = &namespacefs_super_operations;
+
+	return 0;
+}
+
+/*
+ * ->mount() callback of the filesystem type. The work is delegated to
+ * mount_single() with fill_super() as the super block initializer;
+ * @dev_name is not used.
+ */
+static struct dentry *ns_mount(struct file_system_type *fs_type,
+			       int flags, const char *dev_name,
+			       void *data)
+{
+	struct dentry *root = mount_single(fs_type, flags, data, fill_super);
+
+	return root;
+}
+
+/*
+ * Filesystem type registered by namespacefs_init() and pinned by pin_fs().
+ *
+ * NOTE(review): FS_USERNS_MOUNT is set while ns_mount() uses
+ * mount_single(); this presumably lets a mount from inside a user
+ * namespace see the same system-wide tree - confirm that is intended.
+ */
+static struct file_system_type namespacefs_fs_type = {
+ .name = "namespacefs",
+ .mount = ns_mount,
+ .kill_sb = kill_litter_super,
+ .fs_flags = FS_USERNS_MOUNT,
+};
+
+/* Drop one pin on the internal namespacefs mount (counterpart of pin_fs()). */
+static inline void release_namespacefs(void)
+{
+ simple_release_fs(&namespacefs_mount, &namespacefs_mount_count);
+}
+
+/* Return the inode of the directory that holds @dentry. */
+static inline struct inode *parent_inode(struct dentry *dentry)
+{
+	struct dentry *dir = dentry->d_parent;
+
+	return dir->d_inode;
+}
+
+/*
+ * Allocate an inode on @sb, give it the next free inode number and set
+ * its access, modification and change times to the current time.
+ *
+ * Returns the new inode, or NULL if new_inode() failed.
+ */
+static struct inode *get_inode(struct super_block *sb)
+{
+	struct inode *inode = new_inode(sb);
+
+	if (!inode)
+		return NULL;
+
+	inode->i_ino = get_next_ino();
+	inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+
+	return inode;
+}
+
+/*
+ * Turn @inode into a regular file that is readable by its owner and
+ * group, served by @fops, with @data stored in i_private.
+ */
+static inline void set_file_inode(struct inode *inode,
+				  const struct file_operations *fops,
+				  void *data)
+{
+	inode->i_mode = S_IFREG | S_IRUSR | S_IRGRP;
+	inode->i_private = data;
+	inode->i_fop = fops;
+}
+
+/*
+ * Turn @inode into a directory that everybody may read and search,
+ * using the libfs simple directory operations.
+ */
+static inline void set_dir_inode(struct inode *inode)
+{
+	inode->i_mode = S_IFDIR | S_IXALL | S_IRALL;
+	inode->i_fop = &simple_dir_operations;
+	inode->i_op = &simple_dir_inode_operations;
+}
+
+/*
+ * Take one pin on the internal namespacefs mount.
+ * Returns whatever simple_pin_fs() returns (0 on success).
+ */
+static inline int pin_fs(void)
+{
+	int ret;
+
+	ret = simple_pin_fs(&namespacefs_fs_type, &namespacefs_mount,
+			    &namespacefs_mount_count);
+	return ret;
+}
+
+/*
+ * create - add a file or a directory to namespacefs
+ * @name:    name of the new entry
+ * @parent:  directory to create the entry in; NULL selects the root
+ * @user_ns: user namespace whose owner/group become uid/gid of the inode
+ * @fops:    file operations; when NULL a directory is created instead
+ * @data:    stored in i_private of a file inode (unused for directories)
+ *
+ * The filesystem is pinned for the lifetime of the new entry. On any
+ * failure the parent inode is unlocked again, the pin taken here is
+ * dropped and an ERR_PTR() carrying the reason is returned:
+ * the pin_fs() or lookup error, -ESTALE if the parent is already dead,
+ * -EEXIST if @name exists, -ENOMEM if no inode could be allocated.
+ */
+static struct dentry *create(const char *name, struct dentry *parent,
+			     const struct user_namespace *user_ns,
+			     const struct file_operations *fops,
+			     void *data)
+{
+	struct dentry *dentry;
+	struct inode *inode;
+	int err;
+
+	err = pin_fs();
+	if (err)
+		return ERR_PTR(err);
+
+	/*
+	 * If the parent is not specified, we create it in the root.
+	 * We need the root dentry to do this, which is in the super
+	 * block. A pointer to that is in the struct vfsmount that we
+	 * have around.
+	 */
+	if (!parent)
+		parent = namespacefs_mount->mnt_root;
+
+	inode_lock(parent->d_inode);
+	if (unlikely(IS_DEADDIR(parent->d_inode))) {
+		/* Must go through the unwind path: lock and pin are held. */
+		err = -ESTALE;
+		goto fail;
+	}
+
+	dentry = lookup_one_len(name, parent, strlen(name));
+	if (IS_ERR(dentry)) {
+		err = PTR_ERR(dentry);
+		goto fail;
+	}
+
+	if (dentry->d_inode) {
+		/* An entry with this name already exists. */
+		err = -EEXIST;
+		goto fail_dput;
+	}
+
+	inode = get_inode(dentry->d_sb);
+	if (unlikely(!inode)) {
+		err = -ENOMEM;
+		goto fail_dput;
+	}
+
+	inode->i_uid = user_ns->owner;
+	inode->i_gid = user_ns->group;
+
+	if (fops) {
+		/* Create a file. */
+		set_file_inode(inode, fops, data);
+		d_instantiate(dentry, inode);
+		fsnotify_create(parent_inode(dentry), dentry);
+	} else {
+		/* Create a directory. */
+		set_dir_inode(inode);
+		d_instantiate(dentry, inode);
+		set_nlink(inode, 2);
+		inc_nlink(parent_inode(dentry));
+		fsnotify_mkdir(parent_inode(dentry), dentry);
+	}
+
+	inode_unlock(parent->d_inode);
+	return dentry;
+
+fail_dput:
+	dput(dentry);
+fail:
+	inode_unlock(parent->d_inode);
+	release_namespacefs();
+
+	return ERR_PTR(err);
+}
+
+/*
+ * Create the entry @name below @parent (root if NULL), owned by the
+ * owner/group of @user_ns, with @data as its private data.
+ * All arguments are forwarded unchanged to create(); note that create()
+ * makes a directory when @fops is NULL.
+ *
+ * Returns the new dentry or an ERR_PTR().
+ */
+struct dentry *
+namespacefs_create_file(const char *name, struct dentry *parent,
+			const struct user_namespace *user_ns,
+			const struct file_operations *fops,
+			void *data)
+{
+	struct dentry *file = create(name, parent, user_ns, fops, data);
+
+	return file;
+}
+
+/*
+ * Create the directory @name below @parent (root if NULL), owned by the
+ * owner/group of @user_ns. Implemented by calling create() without file
+ * operations and without private data.
+ *
+ * Returns the new dentry or an ERR_PTR().
+ */
+struct dentry *
+namespacefs_create_dir(const char *name, struct dentry *parent,
+		       const struct user_namespace *user_ns)
+{
+	struct dentry *dir = create(name, parent, user_ns, NULL, NULL);
+
+	return dir;
+}
+
+/*
+ * Callback handed to simple_recursive_removal() by
+ * namespacefs_remove_dir(): drops one filesystem pin. @d is not used.
+ */
+static void remove_one(struct dentry *d)
+{
+ release_namespacefs();
+}
+
+/*
+ * Remove @dentry and everything below it from namespacefs.
+ *
+ * NULL and ERR_PTR() values are ignored, as is a failure to pin the
+ * filesystem. The pin taken here is dropped again once the removal is
+ * done; remove_one() is passed as the per-entry callback.
+ */
+void namespacefs_remove_dir(struct dentry *dentry)
+{
+	if (!IS_ERR_OR_NULL(dentry) && !pin_fs()) {
+		simple_recursive_removal(dentry, remove_one);
+		release_namespacefs();
+	}
+}
+
+#define _NS_MOUNT_DIR "namespaces"
+
+/*
+ * Boot-time setup: create the sysfs mount point _NS_MOUNT_DIR under
+ * fs_kobj and register the filesystem type. If registration fails the
+ * mount point is removed again.
+ *
+ * Returns 0 on success or the error of the step that failed.
+ */
+static int __init namespacefs_init(void)
+{
+	int err = sysfs_create_mount_point(fs_kobj, _NS_MOUNT_DIR);
+
+	if (err)
+		return err;
+
+	err = register_filesystem(&namespacefs_fs_type);
+	if (err)
+		sysfs_remove_mount_point(fs_kobj, _NS_MOUNT_DIR);
+
+	return err;
+}
+
+fs_initcall(namespacefs_init);
diff --git a/include/linux/idr-seq.h b/include/linux/idr-seq.h
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/include/linux/namespacefs.h b/include/linux/namespacefs.h
new file mode 100644
index 000000000000..44a760080df7
--- /dev/null
+++ b/include/linux/namespacefs.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * namespacefs.h - a pseudo file system for examining namespaces.
+ */
+
+#ifndef _NAMESPACEFS_H_
+#define _NAMESPACEFS_H_
+
+/*
+ * Forward declarations: only pointers to these types are used below, so
+ * the prototypes and the !CONFIG_NAMESPACE_FS stubs stay valid even when
+ * no other header has declared them yet.
+ */
+struct dentry;
+struct file_operations;
+struct user_namespace;
+
+#ifdef CONFIG_NAMESPACE_FS
+
+#include <linux/fs.h>
+
+/*
+ * Create a file @name below @parent (filesystem root if NULL). The inode
+ * is owned by the owner/group of @user_ns and gets @data as private data.
+ * Returns the new dentry or an ERR_PTR().
+ */
+struct dentry *
+namespacefs_create_file(const char *name, struct dentry *parent,
+			const struct user_namespace *user_ns,
+			const struct file_operations *fops,
+			void *data);
+
+/*
+ * Create a directory @name below @parent (filesystem root if NULL).
+ * Returns the new dentry or an ERR_PTR().
+ */
+struct dentry *
+namespacefs_create_dir(const char *name, struct dentry *parent,
+		       const struct user_namespace *user_ns);
+
+/* Remove @dentry and everything below it; NULL/ERR_PTR() are ignored. */
+void namespacefs_remove_dir(struct dentry *dentry);
+
+#else
+
+/*
+ * Stubs for kernels built without namespacefs. The create helpers return
+ * NULL, which is not an ERR_PTR(), so IS_ERR() based callers carry on.
+ */
+static inline struct dentry *
+namespacefs_create_file(const char *name, struct dentry *parent,
+			const struct user_namespace *user_ns,
+			const struct file_operations *fops,
+			void *data)
+{
+	return NULL;
+}
+
+static inline struct dentry *
+namespacefs_create_dir(const char *name, struct dentry *parent,
+		       const struct user_namespace *user_ns)
+{
+	return NULL;
+}
+
+static inline void namespacefs_remove_dir(struct dentry *dentry)
+{
+}
+
+#endif /* CONFIG_NAMESPACE_FS */
+
+#endif
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index 35687dcb1a42..36b432be0d22 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -62,6 +62,8 @@
#define CGROUP_SUPER_MAGIC 0x27e0eb
#define CGROUP2_SUPER_MAGIC 0x63677270
+#define NAMESPACEFS_MAGIC 0x458728fa
+
#define RDTGROUP_SUPER_MAGIC 0x7655821
#define STACK_END_MAGIC 0x57AC6E9D
--
2.33.1
^ permalink raw reply [flat|nested] 28+ messages in thread
* [RFC PATCH 2/4] namespacefs: Add methods to create/remove PID namespace directories
2021-11-18 18:12 [RFC PATCH 0/4] namespacefs: Proof-of-Concept Yordan Karadzhov (VMware)
2021-11-18 18:12 ` [RFC PATCH 1/4] namespacefs: Introduce 'namespacefs' Yordan Karadzhov (VMware)
@ 2021-11-18 18:12 ` Yordan Karadzhov (VMware)
2021-11-18 18:12 ` [RFC PATCH 3/4] namespacefs: Couple namespacefs to the PID namespace Yordan Karadzhov (VMware)
` (3 subsequent siblings)
5 siblings, 0 replies; 28+ messages in thread
From: Yordan Karadzhov (VMware) @ 2021-11-18 18:12 UTC (permalink / raw)
To: linux-kernel, linux-fsdevel
Cc: viro, ebiederm, rostedt, mingo, hagen, rppt, James.Bottomley,
akpm, vvs, shakeelb, christian.brauner, mkoutny,
Yordan Karadzhov (VMware)
Each existing namespace on the system will be represented by a
corresponding directory in namespacefs. When a namespace is created
a new directory will be added. When a namespace is destroyed, its
corresponding directory will be removed. When fully functional,
'namespacefs' will provide a direct (1 to 1) mapping between the
hierarchy of all namespaces that are currently active on the system
and the hierarchy of directories in 'namespacefs'.
As a first step towards this, here we add methods for creating and
removing PID namespace directories. For the moment the PID namespace
directory contains only one file called 'tasks'. This is a read only
pseudo file that provides a list of PIDs of all tasks enclosed inside
the namespace.
We modify 'struct ns_common' so that each namespace will be able to
own a pointer to the 'dentry' of its corresponding directory in
'namespacefs'. This pointer will be used to couple the creation and
destruction of a namespace with the creation and removal of its
corresponding directory.
In the patch we also add generic helper methods for printing the content
of an 'idr' ('id to pointer' translation service) into synthetic files
from sequences of records (seq_file). These new definitions are used by
'namespacefs' when printing the PIDs of the tasks in each PID namespace.
Signed-off-by: Yordan Karadzhov (VMware) <y.karadz@gmail.com>
---
fs/namespacefs/inode.c | 119 ++++++++++++++++++++++++++++++++++++
include/linux/idr-seq.h | 0
include/linux/namespacefs.h | 13 ++++
include/linux/ns_common.h | 4 ++
4 files changed, 136 insertions(+)
delete mode 100644 include/linux/idr-seq.h
diff --git a/fs/namespacefs/inode.c b/fs/namespacefs/inode.c
index 0f6293b0877d..012c1c43b44d 100644
--- a/fs/namespacefs/inode.c
+++ b/fs/namespacefs/inode.c
@@ -10,6 +10,8 @@
#include <linux/namei.h>
#include <linux/fsnotify.h>
#include <linux/magic.h>
+#include <linux/idr.h>
+#include <linux/seq_file.h>
static struct vfsmount *namespacefs_mount;
static int namespacefs_mount_count;
@@ -188,6 +190,123 @@ void namespacefs_remove_dir(struct dentry *dentry)
release_namespacefs();
}
+/* Per-open iteration state, kept as the private data of the seq_file. */
+struct idr_seq_context {
+ struct idr *idr; /* the IDR being listed, set by idr_seq_open() */
+ int index; /* id cursor handed to idr_get_next() */
+};
+
+/*
+ * Look up the first populated entry whose id is >= idr_ctx->index and
+ * publish the resulting cursor through @pos.
+ *
+ * After a hit *pos is the id of the returned entry (not the id after
+ * it), so that a later ->start() at the same position finds this very
+ * entry again. seq_file relies on that when an entry was fetched but
+ * could not be emitted because the read buffer was full.
+ *
+ * Returns the entry, or NULL when there is nothing left.
+ */
+static void *idr_seq_get_next(struct idr_seq_context *idr_ctx, loff_t *pos)
+{
+	void *entry = idr_get_next(idr_ctx->idr, &idr_ctx->index);
+
+	*pos = idr_ctx->index;
+	return entry;
+}
+
+/*
+ * ->start(): lock the IDR and return the first entry at or after *pos.
+ * The lock is held until idr_seq_stop().
+ *
+ * NOTE(review): idr_lock() only serialises against other users of the
+ * same lock - confirm that the owner of the IDR modifies it under it.
+ */
+static void *idr_seq_start(struct seq_file *m, loff_t *pos)
+{
+	struct idr_seq_context *idr_ctx = m->private;
+
+	idr_lock(idr_ctx->idr);
+	idr_ctx->index = *pos;
+	return idr_seq_get_next(idr_ctx, pos);
+}
+
+/*
+ * ->next(): *pos is the id of the entry in @v, so continue the search
+ * one past it. *pos always moves forward, even at the end of the IDR.
+ */
+static void *idr_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct idr_seq_context *idr_ctx = m->private;
+
+	idr_ctx->index = *pos + 1;
+	return idr_seq_get_next(idr_ctx, pos);
+}
+
+/* ->stop(): release the IDR lock taken in idr_seq_start(). */
+static void idr_seq_stop(struct seq_file *m, void *p)
+{
+	struct idr_seq_context *ctx = m->private;
+	struct idr *idr = ctx->idr;
+
+	idr_unlock(idr);
+}
+
+/*
+ * Open @file as a seq_file driven by @ops, with a freshly allocated
+ * struct idr_seq_context that points at @idr as its private data.
+ *
+ * Returns 0 on success, -ENOMEM if the context could not be allocated.
+ */
+static int idr_seq_open(struct file *file, struct idr *idr,
+			const struct seq_operations *ops)
+{
+	struct idr_seq_context *ctx;
+
+	ctx = __seq_open_private(file, ops, sizeof(struct idr_seq_context));
+	if (ctx) {
+		ctx->idr = idr;
+		return 0;
+	}
+
+	return -ENOMEM;
+}
+
+/*
+ * ->show(): @v is a struct pid taken from the IDR; print the number
+ * returned by pid_nr() for it, followed by a newline.
+ *
+ * NOTE(review): pid_nr() presumably yields the id as seen from the
+ * initial PID namespace - confirm this is the value that is wanted here.
+ */
+static inline int pid_seq_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%d\n", pid_nr((struct pid *)v));
+
+	return 0;
+}
+
+/* seq_file iterator: generic IDR walk, one pid printed per entry. */
+static const struct seq_operations pid_seq_ops = {
+	.start	= idr_seq_start,
+	.stop	= idr_seq_stop,
+	.next	= idr_seq_next,
+	.show	= pid_seq_show,
+};
+
+/*
+ * ->open() of the 'tasks' file: i_private of the inode is the IDR to be
+ * listed (see the namespacefs_create_file() call that creates the file).
+ */
+static int pid_seq_open(struct inode *inode, struct file *file)
+{
+	return idr_seq_open(file, inode->i_private, &pid_seq_ops);
+}
+
+/* File operations of the read-only 'tasks' file (seq_file based). */
+static const struct file_operations tasks_fops = {
+	.open		= pid_seq_open,
+	.release	= seq_release_private,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+};
+
+/*
+ * Create the directory that represents @ns below @parent_dentry. The
+ * directory is named after the inode number of the namespace (ns->inum)
+ * and the result of namespacefs_create_dir() is stored in ns->dentry.
+ *
+ * Returns 0 on success, -ENOMEM if the name could not be allocated, or
+ * the error encoded in the returned dentry.
+ */
+static int create_inode_dir(struct ns_common *ns, struct dentry *parent_dentry,
+			    const struct user_namespace *user_ns)
+{
+	char *name;
+
+	name = kasprintf(GFP_KERNEL, "%u", ns->inum);
+	if (!name)
+		return -ENOMEM;
+
+	ns->dentry = namespacefs_create_dir(name, parent_dentry, user_ns);
+	kfree(name);
+
+	return IS_ERR(ns->dentry) ? PTR_ERR(ns->dentry) : 0;
+}
+
+/*
+ * Create the namespacefs directory of the PID namespace @ns inside the
+ * directory of its parent namespace, and populate it with the 'tasks'
+ * file, which lists the content of ns->idr.
+ *
+ * If the file cannot be created the directory is removed again through
+ * namespacefs_remove_dir() - a bare dput() would leave it linked in the
+ * tree and leak the filesystem pin - and ns->ns.dentry is cleared so no
+ * stale pointer is left behind.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int namespacefs_create_pid_ns_dir(struct pid_namespace *ns)
+{
+	struct dentry *dentry;
+	int err;
+
+	err = create_inode_dir(&ns->ns, ns->parent->ns.dentry, ns->user_ns);
+	if (err)
+		return err;
+
+	dentry = namespacefs_create_file("tasks", ns->ns.dentry, ns->user_ns,
+					 &tasks_fops, &ns->idr);
+	if (IS_ERR(dentry)) {
+		namespacefs_remove_dir(ns->ns.dentry);
+		ns->ns.dentry = NULL;
+		return PTR_ERR(dentry);
+	}
+
+	return 0;
+}
+
+/* Remove the namespacefs directory of the PID namespace @ns. */
+void namespacefs_remove_pid_ns_dir(struct pid_namespace *ns)
+{
+	struct dentry *dir = ns->ns.dentry;
+
+	namespacefs_remove_dir(dir);
+}
+
#define _NS_MOUNT_DIR "namespaces"
static int __init namespacefs_init(void)
diff --git a/include/linux/idr-seq.h b/include/linux/idr-seq.h
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/include/linux/namespacefs.h b/include/linux/namespacefs.h
index 44a760080df7..f41499a7635a 100644
--- a/include/linux/namespacefs.h
+++ b/include/linux/namespacefs.h
@@ -19,6 +19,8 @@ struct dentry *
namespacefs_create_dir(const char *name, struct dentry *parent,
const struct user_namespace *user_ns);
void namespacefs_remove_dir(struct dentry *dentry);
+int namespacefs_create_pid_ns_dir(struct pid_namespace *ns);
+void namespacefs_remove_pid_ns_dir(struct pid_namespace *ns);
#else
@@ -42,6 +44,17 @@ static inline void namespacefs_remove_dir(struct dentry *dentry)
{
}
+/* Stub for the case without namespacefs: nothing to create, report success. */
+static inline int
+namespacefs_create_pid_ns_dir(struct pid_namespace *ns)
+{
+ return 0;
+}
+
+/* Stub for the case without namespacefs: nothing to remove. */
+static inline void
+namespacefs_remove_pid_ns_dir(struct pid_namespace *ns)
+{
+}
+
#endif /* CONFIG_NAMESPACE_FS */
#endif
diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index 0f1d024bd958..1dec75c51b2c 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -11,6 +11,10 @@ struct ns_common {
const struct proc_ns_operations *ops;
unsigned int inum;
refcount_t count;
+
+#ifdef CONFIG_NAMESPACE_FS
+ struct dentry *dentry;
+#endif /* CONFIG_NAMESPACE_FS */
};
#endif
--
2.33.1
^ permalink raw reply [flat|nested] 28+ messages in thread
* [RFC PATCH 3/4] namespacefs: Couple namespacefs to the PID namespace
2021-11-18 18:12 [RFC PATCH 0/4] namespacefs: Proof-of-Concept Yordan Karadzhov (VMware)
2021-11-18 18:12 ` [RFC PATCH 1/4] namespacefs: Introduce 'namespacefs' Yordan Karadzhov (VMware)
2021-11-18 18:12 ` [RFC PATCH 2/4] namespacefs: Add methods to create/remove PID namespace directories Yordan Karadzhov (VMware)
@ 2021-11-18 18:12 ` Yordan Karadzhov (VMware)
2021-11-18 18:12 ` [RFC PATCH 4/4] namespacefs: Couple namespacefs to the UTS namespace Yordan Karadzhov (VMware)
` (2 subsequent siblings)
5 siblings, 0 replies; 28+ messages in thread
From: Yordan Karadzhov (VMware) @ 2021-11-18 18:12 UTC (permalink / raw)
To: linux-kernel, linux-fsdevel
Cc: viro, ebiederm, rostedt, mingo, hagen, rppt, James.Bottomley,
akpm, vvs, shakeelb, christian.brauner, mkoutny,
Yordan Karadzhov (VMware)
When the PID namespace gets initialized, a directory called 'pid' is
added to 'namespacefs'. This directory represents the main PID namespace
and also serves as a trunk (parent) of all child PID namespaces. Every
time when a new PID namespace is created a corresponding directory is
added to 'namespacefs/pid/parent/hierarchy/'. The 'inum' of the new
namespace gives the name of its directory. When the PID namespace is
destroyed the corresponding directory is removed.
Signed-off-by: Yordan Karadzhov (VMware) <y.karadz@gmail.com>
---
fs/namespacefs/inode.c | 21 +++++++++++++++++++++
kernel/pid_namespace.c | 9 +++++++++
2 files changed, 30 insertions(+)
diff --git a/fs/namespacefs/inode.c b/fs/namespacefs/inode.c
index 012c1c43b44d..55d71733164c 100644
--- a/fs/namespacefs/inode.c
+++ b/fs/namespacefs/inode.c
@@ -11,7 +11,9 @@
#include <linux/fsnotify.h>
#include <linux/magic.h>
#include <linux/idr.h>
+#include <linux/proc_ns.h>
#include <linux/seq_file.h>
+#include <linux/pid_namespace.h>
static struct vfsmount *namespacefs_mount;
static int namespacefs_mount_count;
@@ -307,6 +309,19 @@ void namespacefs_remove_pid_ns_dir(struct pid_namespace *ns)
namespacefs_remove_dir(ns->ns.dentry);
}
+/*
+ * Create a directory in the root of namespacefs named after the type of
+ * @ns (ns->ops->name), owned by init_user_ns, and remember it in
+ * ns->dentry. On failure ns->dentry is left untouched.
+ *
+ * Returns 0 on success or the error encoded in the returned dentry.
+ */
+static int add_ns_dentry(struct ns_common *ns)
+{
+	struct dentry *dir;
+
+	dir = namespacefs_create_dir(ns->ops->name, NULL, &init_user_ns);
+	if (!IS_ERR(dir)) {
+		ns->dentry = dir;
+		return 0;
+	}
+
+	return PTR_ERR(dir);
+}
+
#define _NS_MOUNT_DIR "namespaces"
static int __init namespacefs_init(void)
@@ -321,8 +336,14 @@ static int __init namespacefs_init(void)
if (err)
goto rm_mount;
+ err = add_ns_dentry(&init_pid_ns.ns);
+ if (err)
+ goto unreg;
+
return 0;
+ unreg:
+ unregister_filesystem(&namespacefs_fs_type);
rm_mount:
sysfs_remove_mount_point(fs_kobj, _NS_MOUNT_DIR);
fail:
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a46a3723bc66..1690b2c87661 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -12,6 +12,7 @@
#include <linux/pid.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
+#include <linux/namespacefs.h>
#include <linux/syscalls.h>
#include <linux/cred.h>
#include <linux/err.h>
@@ -101,6 +102,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
err = ns_alloc_inum(&ns->ns);
if (err)
goto out_free_idr;
+
ns->ns.ops = &pidns_operations;
refcount_set(&ns->ns.count, 1);
@@ -110,8 +112,14 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->ucounts = ucounts;
ns->pid_allocated = PIDNS_ADDING;
+ err = namespacefs_create_pid_ns_dir(ns);
+ if (err)
+ goto out_free_inum;
+
return ns;
+out_free_inum:
+ ns_free_inum(&ns->ns);
out_free_idr:
idr_destroy(&ns->idr);
kmem_cache_free(pid_ns_cachep, ns);
@@ -133,6 +141,7 @@ static void delayed_free_pidns(struct rcu_head *p)
static void destroy_pid_namespace(struct pid_namespace *ns)
{
+ namespacefs_remove_pid_ns_dir(ns);
ns_free_inum(&ns->ns);
idr_destroy(&ns->idr);
--
2.33.1
^ permalink raw reply [flat|nested] 28+ messages in thread
* [RFC PATCH 4/4] namespacefs: Couple namespacefs to the UTS namespace
2021-11-18 18:12 [RFC PATCH 0/4] namespacefs: Proof-of-Concept Yordan Karadzhov (VMware)
` (2 preceding siblings ...)
2021-11-18 18:12 ` [RFC PATCH 3/4] namespacefs: Couple namespacefs to the PID namespace Yordan Karadzhov (VMware)
@ 2021-11-18 18:12 ` Yordan Karadzhov (VMware)
2021-11-18 18:55 ` [RFC PATCH 0/4] namespacefs: Proof-of-Concept Eric W. Biederman
2021-11-18 21:24 ` Mike Rapoport
5 siblings, 0 replies; 28+ messages in thread
From: Yordan Karadzhov (VMware) @ 2021-11-18 18:12 UTC (permalink / raw)
To: linux-kernel, linux-fsdevel
Cc: viro, ebiederm, rostedt, mingo, hagen, rppt, James.Bottomley,
akpm, vvs, shakeelb, christian.brauner, mkoutny,
Yordan Karadzhov (VMware)
When the UTS namespace gets initialized, a directory called 'uts'
is added to namespacefs. This directory represents the main UTS
namespace and also serves as a trunk (parent) of all other UTS
namespaces. Every time when a new UTS namespace is created a
corresponding directory is added to 'namespacefs/uts/'. The 'inum'
of the new namespace gives the name of its directory. When a UTS
namespace is destroyed the corresponding directory is removed. Each
directory contains a file called 'uname' that can be used to get the
unique data fields of the UTS namespace (sysname, nodename, ...).
Signed-off-by: Yordan Karadzhov (VMware) <y.karadz@gmail.com>
---
fs/namespacefs/inode.c | 57 +++++++++++++++++++++++++++++++++++++
include/linux/namespacefs.h | 13 +++++++++
kernel/utsname.c | 9 ++++++
3 files changed, 79 insertions(+)
diff --git a/fs/namespacefs/inode.c b/fs/namespacefs/inode.c
index 55d71733164c..4b661bdd4d9c 100644
--- a/fs/namespacefs/inode.c
+++ b/fs/namespacefs/inode.c
@@ -14,6 +14,7 @@
#include <linux/proc_ns.h>
#include <linux/seq_file.h>
#include <linux/pid_namespace.h>
+#include <linux/utsname.h>
static struct vfsmount *namespacefs_mount;
static int namespacefs_mount_count;
@@ -309,6 +310,58 @@ void namespacefs_remove_pid_ns_dir(struct pid_namespace *ns)
namespacefs_remove_dir(ns->ns.dentry);
}
+/* Number of strings in struct new_utsname and room for all of them. */
+#define _UNAME_N_FIELDS 6
+#define _UNAME_MAX_LEN ((__NEW_UTS_LEN + 1) * _UNAME_N_FIELDS)
+
+/*
+ * ->read() of the 'uname' file: format the six utsname strings, separated
+ * by spaces and terminated by a newline, and copy the requested window
+ * of that text to user space.
+ *
+ * The strings are sampled under uts_sem so that a concurrent update of
+ * the names cannot be observed half-way. scnprintf() is used because its
+ * return value is the number of bytes really stored in the buffer, which
+ * is what simple_read_from_buffer() needs as the available length.
+ *
+ * NOTE(review): file->private_data points into the uts_namespace; nothing
+ * in this function keeps that namespace alive while the file is open -
+ * confirm the lifetime is guaranteed elsewhere.
+ */
+static ssize_t uts_ns_read(struct file *file, char __user *ubuf,
+			   size_t count, loff_t *pos)
+{
+	struct new_utsname *name = file->private_data;
+	char buff[_UNAME_MAX_LEN + 1];
+	int n;
+
+	down_read(&uts_sem);
+	n = scnprintf(buff, sizeof(buff),
+		      "%s %s %s %s %s %s\n",
+		      name->sysname,
+		      name->nodename,
+		      name->release,
+		      name->version,
+		      name->machine,
+		      name->domainname);
+	up_read(&uts_sem);
+
+	return simple_read_from_buffer(ubuf, count, pos, buff, n);
+}
+
+/* File operations of the read-only 'uname' file. */
+static const struct file_operations uts_fops = {
+	.open	= simple_open,
+	.llseek	= default_llseek,
+	.read	= uts_ns_read,
+};
+
+/*
+ * Create the namespacefs directory of the UTS namespace @ns inside the
+ * directory of init_uts_ns, and populate it with the 'uname' file, which
+ * exposes ns->name.
+ *
+ * If the file cannot be created the directory is removed again through
+ * namespacefs_remove_dir() - a bare dput() would leave it linked in the
+ * tree and leak the filesystem pin - and ns->ns.dentry is cleared so no
+ * stale pointer is left behind.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int namespacefs_create_uts_ns_dir(struct uts_namespace *ns)
+{
+	struct dentry *dentry;
+	int err;
+
+	err = create_inode_dir(&ns->ns, init_uts_ns.ns.dentry, ns->user_ns);
+	if (err)
+		return err;
+
+	dentry = namespacefs_create_file("uname", ns->ns.dentry, ns->user_ns,
+					 &uts_fops, &ns->name);
+	if (IS_ERR(dentry)) {
+		namespacefs_remove_dir(ns->ns.dentry);
+		ns->ns.dentry = NULL;
+		return PTR_ERR(dentry);
+	}
+
+	return 0;
+}
+
+/* Remove the namespacefs directory of the UTS namespace @ns. */
+void namespacefs_remove_uts_ns_dir(struct uts_namespace *ns)
+{
+	struct dentry *dir = ns->ns.dentry;
+
+	namespacefs_remove_dir(dir);
+}
+
static int add_ns_dentry(struct ns_common *ns)
{
struct dentry *dentry =
@@ -340,6 +393,10 @@ static int __init namespacefs_init(void)
if (err)
goto unreg;
+ err = add_ns_dentry(&init_uts_ns.ns);
+ if (err)
+ goto unreg;
+
return 0;
unreg:
diff --git a/include/linux/namespacefs.h b/include/linux/namespacefs.h
index f41499a7635a..3815a7bbeb1c 100644
--- a/include/linux/namespacefs.h
+++ b/include/linux/namespacefs.h
@@ -21,6 +21,8 @@ namespacefs_create_dir(const char *name, struct dentry *parent,
void namespacefs_remove_dir(struct dentry *dentry);
int namespacefs_create_pid_ns_dir(struct pid_namespace *ns);
void namespacefs_remove_pid_ns_dir(struct pid_namespace *ns);
+int namespacefs_create_uts_ns_dir(struct uts_namespace *ns);
+void namespacefs_remove_uts_ns_dir(struct uts_namespace *ns);
#else
@@ -55,6 +57,17 @@ namespacefs_remove_pid_ns_dir(struct pid_namespace *ns)
{
}
+/* Stub for the case without namespacefs: nothing to create, report success. */
+static inline int
+namespacefs_create_uts_ns_dir(struct uts_namespace *ns)
+{
+ return 0;
+}
+
+/* Stub for the case without namespacefs: nothing to remove. */
+static inline void
+namespacefs_remove_uts_ns_dir(struct uts_namespace *ns)
+{
+}
+
#endif /* CONFIG_NAMESPACE_FS */
#endif
diff --git a/kernel/utsname.c b/kernel/utsname.c
index b1ac3ca870f2..d44b307cffdc 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -12,6 +12,7 @@
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/user_namespace.h>
+#include <linux/namespacefs.h>
#include <linux/proc_ns.h>
#include <linux/sched/task.h>
@@ -70,8 +71,15 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
ns->user_ns = get_user_ns(user_ns);
up_read(&uts_sem);
+
+ err = namespacefs_create_uts_ns_dir(ns);
+ if (err)
+ goto fail_free_inum;
+
return ns;
+fail_free_inum:
+ ns_free_inum(&ns->ns);
fail_free:
kmem_cache_free(uts_ns_cache, ns);
fail_dec:
@@ -105,6 +113,7 @@ struct uts_namespace *copy_utsname(unsigned long flags,
void free_uts_ns(struct uts_namespace *ns)
{
+ namespacefs_remove_uts_ns_dir(ns);
dec_uts_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
ns_free_inum(&ns->ns);
--
2.33.1
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC PATCH 0/4] namespacefs: Proof-of-Concept
2021-11-18 18:12 [RFC PATCH 0/4] namespacefs: Proof-of-Concept Yordan Karadzhov (VMware)
` (3 preceding siblings ...)
2021-11-18 18:12 ` [RFC PATCH 4/4] namespacefs: Couple namespacefs to the UTS namespace Yordan Karadzhov (VMware)
@ 2021-11-18 18:55 ` Eric W. Biederman
2021-11-18 19:02 ` Steven Rostedt
` (2 more replies)
2021-11-18 21:24 ` Mike Rapoport
5 siblings, 3 replies; 28+ messages in thread
From: Eric W. Biederman @ 2021-11-18 18:55 UTC (permalink / raw)
To: Yordan Karadzhov (VMware)
Cc: linux-kernel, linux-fsdevel, viro, rostedt, mingo, hagen, rppt,
James.Bottomley, akpm, vvs, shakeelb, christian.brauner, mkoutny,
Linux Containers
Adding the containers mailing list which is for discussions like this.
"Yordan Karadzhov (VMware)" <y.karadz@gmail.com> writes:
> We introduce a simple read-only virtual filesystem that provides
> direct mechanism for examining the existing hierarchy of namespaces
> on the system. For the purposes of this PoC, we tried to keep the
> implementation of the pseudo filesystem as simple as possible. Only
> two namespace types (PID and UTS) are coupled to it for the moment.
> Nevertheless, we do not expect having significant problems when
> adding all other namespace types.
>
> When fully functional, 'namespacefs' will allow the user to see all
> namespaces that are active on the system and to easily retrieve the
> specific data, managed by each namespace. For example the PIDs of
> all tasks enclosed in the individual PID namespaces. Any existing
> namespace on the system will be represented by its corresponding
> directory in namespacesfs. When a namespace is created a directory
> will be added. When a namespace is destroyed, its corresponding
> directory will be removed. The hierarchy of the directories will
> follow the hierarchy of the namespaces.
It is not correct to use inode numbers as the actual names for
namespaces.
I can not see anything else you can possibly use as names for
namespaces.
To allow container migration between machines and similar things
you wind up needing a namespace for your names of namespaces.
Further you talk about hierarchy and you have not added support for the
user namespace. Without the user namespace there is no hierarchy with
any namespace but the pid namespace. There is definitely no meaningful
hierarchy without the user namespace.
As far as I can tell merging this will break CRIU and container
migration in general (as the namespace of namespaces problem is not
solved).
Since you are not solving the problem of a namespace for namespaces,
yet implementing something that requires it.
Since you are implementing hierarchy and ignoring the user namespace
which gives structure and hierarchy to the namespaces.
Since this breaks existing use cases without giving a solution.
Nacked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Eric
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC PATCH 0/4] namespacefs: Proof-of-Concept
2021-11-18 18:55 ` [RFC PATCH 0/4] namespacefs: Proof-of-Concept Eric W. Biederman
@ 2021-11-18 19:02 ` Steven Rostedt
2021-11-18 19:22 ` Eric W. Biederman
2021-11-18 19:24 ` Steven Rostedt
2021-11-19 14:26 ` Yordan Karadzhov
2 siblings, 1 reply; 28+ messages in thread
From: Steven Rostedt @ 2021-11-18 19:02 UTC (permalink / raw)
To: Eric W. Biederman
Cc: Yordan Karadzhov (VMware),
linux-kernel, linux-fsdevel, viro, mingo, hagen, rppt,
James.Bottomley, akpm, vvs, shakeelb, christian.brauner, mkoutny,
Linux Containers
On Thu, 18 Nov 2021 12:55:07 -0600
ebiederm@xmission.com (Eric W. Biederman) wrote:
> Nacked-by: "Eric W. Biederman" <ebiederm@xmission.com>
>
> Eric
Eric,
As you can see, the subject says "Proof-of-Concept" and every patch in the
series says "RFC". All you did was point out problems with no help in
fixing those problems, and then gave a nasty Nacked-by before it even got
into a conversation.
From this response, I have to say:
It is not correct to nack a proof of concept that is asking for
discussion.
So, I nack your nack, because it's way too early to nack this.
-- Steve
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC PATCH 0/4] namespacefs: Proof-of-Concept
2021-11-18 19:02 ` Steven Rostedt
@ 2021-11-18 19:22 ` Eric W. Biederman
2021-11-18 19:36 ` Steven Rostedt
0 siblings, 1 reply; 28+ messages in thread
From: Eric W. Biederman @ 2021-11-18 19:22 UTC (permalink / raw)
To: Steven Rostedt
Cc: Yordan Karadzhov (VMware),
linux-kernel, linux-fsdevel, viro, mingo, hagen, rppt,
James.Bottomley, akpm, vvs, shakeelb, christian.brauner, mkoutny,
Linux Containers
Steven Rostedt <rostedt@goodmis.org> writes:
> On Thu, 18 Nov 2021 12:55:07 -0600
> ebiederm@xmission.com (Eric W. Biederman) wrote:
>
>> Nacked-by: "Eric W. Biederman" <ebiederm@xmission.com>
>>
>> Eric
>
> Eric,
>
> As you can see, the subject says "Proof-of-Concept" and every patch in the
> the series says "RFC". All you did was point out problems with no help in
> fixing those problems, and then gave a nasty Nacked-by before it even got
> into a conversation.
>
> From this response, I have to say:
>
> It is not correct to nack a proof of concept that is asking for
> discussion.
>
> So, I nack your nack, because it's way to early to nack this.
I am refreshing my nack on the concept. My nack has been in place for
good technical reasons since about 2006.
I see no way forward. I do not see a compelling use case.
There have been many conversations in the past attempting to implement
something that requires a namespace of namespaces and they have never
gotten anywhere.
I see no attempt at due diligence or at actually understanding what
hierarchy already exists in namespaces.
I don't mean to be nasty but I do mean to be clear. Without a
compelling new idea in this space I see no hope of an implementation.
What they are attempting to do makes it impossible to migrate a set of
processes that uses this feature from one machine to another. AKA this
would be a breaking change and a regression if merged.
The breaking and regression are caused by assigning names to namespaces
without putting those names into a namespace of their own. That
appears fundamental to the concept not to the implementation.
Since the concept if merged would cause a regression it qualifies for
a nack.
We can explore what problems they are trying to solve with this and
explore other ways to solve those problems. All I saw was a comment
about monitoring tools and wanting a global view. I did not see
any comments about dealing with all of the reasons why a global view
tends to be a bad idea.
I should have added that we have to some extent a way to walk through
namespaces using ioctls on nsfs inodes.
Eric
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC PATCH 0/4] namespacefs: Proof-of-Concept
2021-11-18 19:22 ` Eric W. Biederman
@ 2021-11-18 19:36 ` Steven Rostedt
0 siblings, 0 replies; 28+ messages in thread
From: Steven Rostedt @ 2021-11-18 19:36 UTC (permalink / raw)
To: Eric W. Biederman
Cc: Yordan Karadzhov (VMware),
linux-kernel, linux-fsdevel, viro, mingo, hagen, rppt,
James.Bottomley, akpm, vvs, shakeelb, christian.brauner, mkoutny,
Linux Containers
On Thu, 18 Nov 2021 13:22:16 -0600
ebiederm@xmission.com (Eric W. Biederman) wrote:
> Steven Rostedt <rostedt@goodmis.org> writes:
> >
> I am refreshing my nack on the concept. My nack has been in place for
> good technical reasons since about 2006.
I'll admit, we are new to this, as we are now trying to add more visibility
into the workings of things like kubernetes. And having a way of knowing
what containers are running and how to monitor them is needed, and we need
to do this for all container infrastructures.
>
> I see no way forward. I do not see a compelling use case.
What do you use to debug issues in a kubernetes cluster of hundreds of
machines running thousands of containers? Currently, if something is amiss,
a node is restarted in the hopes that the issue does not appear again. But
we would like to add infrastructure that takes advantage of tracing and
profiling to be able to narrow that down. But to do so, we need to
understand what tasks belong to what containers.
>
> There have been many conversations in the past attempt to implement
> something that requires a namespace of namespaces and they have never
> gotten anywhere.
We are not asking about a "namespace" of namespaces, but a filesystem (one,
not a namespace of one), that holds the information at the system scale,
not a container view.
I would be happy to implement something that makes a container having this
file system available "special" as most containers do not need this.
>
> I see no attempt a due diligence or of actually understanding what
> hierarchy already exists in namespaces.
This is not trivial. What did we miss?
>
> I don't mean to be nasty but I do mean to be clear. Without a
> compelling new idea in this space I see no hope of an implementation.
>
> What they are attempting to do makes it impossible to migrate a set of
> process that uses this feature from one machine to another. AKA this
> would be a breaking change and a regression if merged.
The point of this is not to allow that migration. I'd be happy to add that
if a container has access to this file system, it is pinned to the system
and can not be migrated. The whole point of this file system is to monitor
all containers on the system, and it makes no sense to migrate it.
We would duplicate it over several systems, but there's no reason to move
it once it is running.
>
> The breaking and regression are caused by assigning names to namespaces
> without putting those names into a namespace of their own. That
> appears fundamental to the concept not to the implementation.
If you think this should be migrated then yes, it is broken. But we don't
want this to work across migrations. That defeats the purpose of this work.
>
> Since the concept if merged would cause a regression it qualifies for
> a nack.
>
> We can explore what problems they are trying to solve with this and
> explore other ways to solve those problems. All I saw was a comment
> about monitoring tools and wanting a global view. I did not see
> any comments about dealing with all of the reasons why a global view
> tends to be a bad idea.
If you only care about a working environment of the system that runs a set
of containers, how is that a bad idea. Again, I'm happy with implementing
something that makes having this file system prevent it from being
migrated. A pinned privileged container.
>
> I should have added that we have to some extent a way to walk through
> namespaces using ioctls on nsfs inodes.
How robust is this? And is there a library or tooling around it?
-- Steve
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC PATCH 0/4] namespacefs: Proof-of-Concept
2021-11-18 18:55 ` [RFC PATCH 0/4] namespacefs: Proof-of-Concept Eric W. Biederman
2021-11-18 19:02 ` Steven Rostedt
@ 2021-11-18 19:24 ` Steven Rostedt
2021-11-19 9:50 ` Kirill Tkhai
2021-11-19 12:45 ` James Bottomley
2021-11-19 14:26 ` Yordan Karadzhov
2 siblings, 2 replies; 28+ messages in thread
From: Steven Rostedt @ 2021-11-18 19:24 UTC (permalink / raw)
To: Eric W. Biederman
Cc: Yordan Karadzhov (VMware),
linux-kernel, linux-fsdevel, viro, mingo, hagen, rppt,
James.Bottomley, akpm, vvs, shakeelb, christian.brauner, mkoutny,
Linux Containers
On Thu, 18 Nov 2021 12:55:07 -0600
ebiederm@xmission.com (Eric W. Biederman) wrote:
> It is not correct to use inode numbers as the actual names for
> namespaces.
>
> I can not see anything else you can possibly use as names for
> namespaces.
This is why we used inode numbers.
>
> To allow container migration between machines and similar things
> then you wind up needing a namespace for your names of namespaces.
Is this why you say inode numbers are incorrect?
There's no reason to make this into its own namespace. Ideally, this file
system should only be for privileged containers. As the entire point of this
file system is to monitor the other containers on the system. In other
words, this file system is not to be used like procfs, but instead a global
information of the containers running on the host.
At first, we were not going to let this file system be part of any
namespace but the host itself, but because we want to wrap up tooling into
a container that we can install on other machines as a way to monitor the
containers on each machine, we had to open that up.
>
> Further you talk about hierarchy and you have not added support for the
> user namespace. Without the user namespace there is no hierarchy with
> any namespace but the pid namespace. There is definitely no meaningful
> hierarchy without the user namespace.
Great, help us implement this.
>
> As far as I can tell merging this will break CRIU and container
> migration in general (as the namespace of namespaces problem is not
> solved).
This is not to be a file system that is to be migrated. As the point of
this file system is to monitor the other containers, so it does not make
sense to migrate it.
>
> Since you are not solving the problem of a namespace for namespaces,
> yet implementing something that requires it.
Why is it needed?
>
> Since you are implementing hierarchy and ignoring the user namespace
> which gives structure and hierarchy to the namespaces.
We are not ignoring it, we are RFC'ing for advice on how to implement it.
>
> Since this breaks existing use cases without giving a solution.
You don't understand proof-of-concepts and RFCs do you?
-- Steve
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC PATCH 0/4] namespacefs: Proof-of-Concept
2021-11-18 19:24 ` Steven Rostedt
@ 2021-11-19 9:50 ` Kirill Tkhai
2021-11-19 12:45 ` James Bottomley
1 sibling, 0 replies; 28+ messages in thread
From: Kirill Tkhai @ 2021-11-19 9:50 UTC (permalink / raw)
To: Steven Rostedt, Eric W. Biederman
Cc: Yordan Karadzhov (VMware),
linux-kernel, linux-fsdevel, viro, mingo, hagen, rppt,
James.Bottomley, akpm, vvs, shakeelb, christian.brauner, mkoutny,
Linux Containers
On 18.11.2021 22:24, Steven Rostedt wrote:
> On Thu, 18 Nov 2021 12:55:07 -0600
> ebiederm@xmission.com (Eric W. Biederman) wrote:
>
>> It is not correct to use inode numbers as the actual names for
>> namespaces.
>>
>> I can not see anything else you can possibly uses as names for
>> namespaces.
>
> This is why we used inode numbers.
The migration problem may be solved if the new filesystem
allows rename.
The kernel may use a random UUID as the initial namespace file name. After the migration,
we recreate this namespace, and it will have another UUID generated by the kernel.
Then, we just rename it to the correct one.
I sent something like this for /proc fs (except rename):
http://archive.lwn.net:8080/linux-fsdevel/97fdcff1-1cce-7eab-6449-7fe10451162d@virtuozzo.com/T/#m7579f79a6ba8422b57463049f52d2043986b5cac
>>
>> To allow container migration between machines and similar things
>> the you wind up needing a namespace for your names of namespaces.
>
> Is this why you say inode numbers are incorrect?
>
> There's no reason to make this into its own namespace. Ideally, this file
> system should only be for privilege containers. As the entire point of this
> file system is to monitor the other containers on the system. In other
> words, this file system is not to be used like procfs, but instead a global
> information of the containers running on the host.
>
> At first, we were not going to let this file system be part of any
> namespace but the host itself, but because we want to wrap up tooling into
> a container that we can install on other machines as a way to monitor the
> containers on each machine, we had to open that up.
>
>>
>> Further you talk about hierarchy and you have not added support for the
>> user namespace. Without the user namespace there is not hierarchy with
>> any namespace but the pid namespace. There is definitely no meaningful
>> hierarchy without the user namespace.
>
> Great, help us implement this.
>
>>
>> As far as I can tell merging this will break CRIU and container
>> migration in general (as the namespace of namespaces problem is not
>> solved).
>
> This is not to be a file system that is to be migrated. As the point of
> this file system is to monitor the other containers, so it does not make
> sense to migrate it.
>
>>
>> Since you are not solving the problem of a namespace for namespaces,
>> yet implementing something that requires it.
>
> Why is it needed?
>
>>
>> Since you are implementing hierarchy and ignoring the user namespace
>> which gives structure and hierarchy to the namespaces.
>
> We are not ignoring it, we are RFC'ing for advice on how to implement it.
>
>>
>> Since this breaks existing use cases without giving a solution.
>
> You don't understand proof-of-concepts and RFCs do you?
>
> -- Steve
>
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC PATCH 0/4] namespacefs: Proof-of-Concept
2021-11-18 19:24 ` Steven Rostedt
2021-11-19 9:50 ` Kirill Tkhai
@ 2021-11-19 12:45 ` James Bottomley
[not found] ` <20211119092758.1012073e@gandalf.local.home>
1 sibling, 1 reply; 28+ messages in thread
From: James Bottomley @ 2021-11-19 12:45 UTC (permalink / raw)
To: Steven Rostedt, Eric W. Biederman
Cc: Yordan Karadzhov (VMware),
linux-kernel, linux-fsdevel, viro, mingo, hagen, rppt, akpm, vvs,
shakeelb, christian.brauner, mkoutny, Linux Containers
On Thu, 2021-11-18 at 14:24 -0500, Steven Rostedt wrote:
> On Thu, 18 Nov 2021 12:55:07 -0600
> ebiederm@xmission.com (Eric W. Biederman) wrote:
>
> > It is not correct to use inode numbers as the actual names for
> > namespaces.
> >
> > I can not see anything else you can possibly uses as names for
> > namespaces.
>
> This is why we used inode numbers.
>
> > To allow container migration between machines and similar things
> > the you wind up needing a namespace for your names of namespaces.
>
> Is this why you say inode numbers are incorrect?
The problem is you seem to have picked on one orchestration system
without considering all the uses of namespaces and how this would
impact them. So let me explain why inode numbers are incorrect and it
will possibly illuminate some of the cans of worms you're opening.
We have a container checkpoint/restore system called CRIU that can be
used to snapshot the state of a pid subtree and restore it. It can be
used for the entire system or piece of it. It is also used by some
orchestration systems to live migrate containers. Any property of a
container system that has meaning must be saved and restored by CRIU.
The inode number is simply a semi random number assigned to the
namespace. It shows up in /proc/<pid>/ns but nowhere else and isn't
used by anything. When CRIU migrates or restores containers, all the
namespaces that compose them get different inode values on the restore.
If you want to make the inode number equivalent to the container name,
they'd have to restore to the previous number because you've made it a
property of the namespace. The way everything is set up now, that's
just not possible and never will be. Inode numbers are a 32 bit space
and can't be globally unique. If you want a container name, it will
have to be something like a new UUID and that's the first problem you
should tackle.
James
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC PATCH 0/4] namespacefs: Proof-of-Concept
2021-11-18 18:55 ` [RFC PATCH 0/4] namespacefs: Proof-of-Concept Eric W. Biederman
2021-11-18 19:02 ` Steven Rostedt
2021-11-18 19:24 ` Steven Rostedt
@ 2021-11-19 14:26 ` Yordan Karadzhov
2 siblings, 0 replies; 28+ messages in thread
From: Yordan Karadzhov @ 2021-11-19 14:26 UTC (permalink / raw)
To: Eric W. Biederman
Cc: linux-kernel, linux-fsdevel, viro, rostedt, mingo, hagen, rppt,
James.Bottomley, akpm, vvs, shakeelb, christian.brauner, mkoutny,
Linux Containers
Dear Eric,
Thank you very much for pointing out all the weaknesses of this Proof-of-Concept!
I tried to make it clear in the Cover letter that this is nothing more than a PoC. It is OK that you are giving it a
'Nacked-by'. We never had an expectation that this particular version of the code can be merged. Nevertheless, we hope
to receive constructive guidance on how to improve. I will try to comment on your arguments below.
On 18.11.21 г. 20:55 ч., Eric W. Biederman wrote:
>
> Adding the containers mailing list which is for discussions like this.
>
> "Yordan Karadzhov (VMware)" <y.karadz@gmail.com> writes:
>
>> We introduce a simple read-only virtual filesystem that provides
>> direct mechanism for examining the existing hierarchy of namespaces
>> on the system. For the purposes of this PoC, we tried to keep the
>> implementation of the pseudo filesystem as simple as possible. Only
>> two namespace types (PID and UTS) are coupled to it for the moment.
>> Nevertheless, we do not expect having significant problems when
>> adding all other namespace types.
>>
>> When fully functional, 'namespacefs' will allow the user to see all
>> namespaces that are active on the system and to easily retrieve the
>> specific data, managed by each namespace. For example the PIDs of
>> all tasks enclosed in the individual PID namespaces. Any existing
>> namespace on the system will be represented by its corresponding
>> directory in namespacesfs. When a namespace is created a directory
>> will be added. When a namespace is destroyed, its corresponding
>> directory will be removed. The hierarchy of the directories will
>> follow the hierarchy of the namespaces.
>
> It is not correct to use inode numbers as the actual names for
> namespaces.
It is unclear to me why exposing the inode number of a namespace is such a fundamental problem. This information is
already available in /proc/PID/ns. If you are worried by the fact that the inode number gives the name of the
corresponding directory in the filesystem and that someone can interpret this as a name of the namespace itself, then we
can make the inum available inside the directory (and make it identical with /proc/PID/ns/) and think of some other
naming convention for the directories.
>
> I can not see anything else you can possibly uses as names for
> namespaces.
>
> To allow container migration between machines and similar things
> the you wind up needing a namespace for your names of namespaces.
>
This filesystem aims to provide a snapshot of the current structure of the namespaces on the entire host, so migrating
it to another machine where this structure will be anyway different seems to be meaningless by definition, unless you
really migrate the entire machine.
This may be a stupid question, but are you currently migrating 'debugfs' or 'tracefs' together with a container?
> Further you talk about hierarchy and you have not added support for the
> user namespace. Without the user namespace there is not hierarchy with
> any namespace but the pid namespace. There is definitely no meaningful
> hierarchy without the user namespace.
>
I do agree that the user namespace plays a central role in the global hierarchy of namespaces.
> As far as I can tell merging this will break CRIU and container
> migration in general (as the namespace of namespaces problem is not
> solved).
>
> Since you are not solving the problem of a namespace for namespaces,
> yet implementing something that requires it.
>
> Since you are implementing hierarchy and ignoring the user namespace
> which gives structure and hierarchy to the namespaces.
>
If we provide a second version of the PoC that includes the user namespace, is this going to make you give the idea a
second consideration?
It is OK if you give us a second "Nacked-by" after this ;-)
Once again, thank you very much for your comments!
Best,
Yordan
> Since this breaks existing use cases without giving a solution.
>
> Nacked-by: "Eric W. Biederman" <ebiederm@xmission.com>
>
> Eric
>
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC PATCH 0/4] namespacefs: Proof-of-Concept
2021-11-18 18:12 [RFC PATCH 0/4] namespacefs: Proof-of-Concept Yordan Karadzhov (VMware)
` (4 preceding siblings ...)
2021-11-18 18:55 ` [RFC PATCH 0/4] namespacefs: Proof-of-Concept Eric W. Biederman
@ 2021-11-18 21:24 ` Mike Rapoport
5 siblings, 0 replies; 28+ messages in thread
From: Mike Rapoport @ 2021-11-18 21:24 UTC (permalink / raw)
To: Yordan Karadzhov (VMware)
Cc: linux-kernel, linux-fsdevel, viro, ebiederm, rostedt, mingo,
hagen, James.Bottomley, akpm, vvs, shakeelb, christian.brauner,
mkoutny, Pavel Emelyanov, Andrei Vagin, criu
(added more CRIU folks)
On Thu, Nov 18, 2021 at 08:12:06PM +0200, Yordan Karadzhov (VMware) wrote:
> We introduce a simple read-only virtual filesystem that provides
> direct mechanism for examining the existing hierarchy of namespaces
> on the system. For the purposes of this PoC, we tried to keep the
> implementation of the pseudo filesystem as simple as possible. Only
> two namespace types (PID and UTS) are coupled to it for the moment.
> Nevertheless, we do not expect having significant problems when
> adding all other namespace types.
>
> When fully functional, 'namespacefs' will allow the user to see all
> namespaces that are active on the system and to easily retrieve the
> specific data, managed by each namespace. For example the PIDs of
> all tasks enclosed in the individual PID namespaces. Any existing
> namespace on the system will be represented by its corresponding
> directory in namespacesfs. When a namespace is created a directory
> will be added. When a namespace is destroyed, its corresponding
> directory will be removed. The hierarchy of the directories will
> follow the hierarchy of the namespaces.
>
> One may argue that most of the information, being exposed by this
> new filesystem is already provided by 'procfs' in /proc/*/ns/. In
> fact, 'namespacefs' aims to be complementary to 'procfs', showing not
> only the individual connections between a process and its namespaces,
> but also the global hierarchy of these connections. As a usage example,
> before playing with 'namespacefs', I had no idea that the Chrome web
> browser creates a number of nested PID namespaces. I can only guess
> that each tab or each site is isolated in a nested namespace.
>
> Being able to see the structure of the namespaces can be very useful
> in the context of the containerized workloads. This will provide
> universal methods for detecting, examining and monitoring all sorts
> of containers running on the system, without relying on any specific
> user-space software. For example, with the help of 'namespacefs',
> the simple Python script below can discover all containers, created
> by 'Docker' and 'Podman' (by all users) that are currently running on
> the system.
>
>
> import sys
> import os
> import pwd
>
> path = '/sys/fs/namespaces'
>
> def pid_ns_tasks(inum):
> tasks_file = '{0}/pid/{1}/tasks'.format(path ,inum)
> with open(tasks_file) as f:
> return [int(pid) for pid in f]
>
> def uts_ns_inum(pid):
> uts_ns_file = '/proc/{0}/ns/uts'.format(pid)
> uts_ns = os.readlink(uts_ns_file)
> return uts_ns.split('[')[1].split(']')[0]
>
> def container_info(pid_inum):
> pids = pid_ns_tasks(inum)
> name = ''
> uid = -1
>
> if len(pids):
> uts_inum = uts_ns_inum(pids[0])
> uname_file = '{0}/uts/{1}/uname'.format(path, uts_inum)
> if os.path.exists(uname_file):
> stat_info = os.stat(uname_file)
> uid = stat_info.st_uid
> with open(uname_file) as f:
> name = f.read().split()[1]
>
> return name, pids, uid
>
> if __name__ == "__main__":
> pid_ns_list = os.listdir('{0}/pid'.format(path))
> for inum in pid_ns_list:
> name, pids, uid = container_info(inum)
> if (name):
> user = pwd.getpwuid(uid).pw_name
> print("{0} -> pids: {1} user: {2}".format(name, pids, user))
>
>
>
> The idea for 'namespacefs' is inspired by the discussion of the
> 'Container tracing' topic [1] during the 'Tracing micro-conference' [2]
> at LPC 2021.
>
> 1. https://www.youtube.com/watch?v=09bVK3f0MPg&t=5455s
> 2. https://www.linuxplumbersconf.org/event/11/page/104-accepted-microconferences
>
>
> Yordan Karadzhov (VMware) (4):
> namespacefs: Introduce 'namespacefs'
> namespacefs: Add methods to create/remove PID namespace directories
> namespacefs: Couple namespacefs to the PID namespace
> namespacefs: Couple namespacefs to the UTS namespace
>
> fs/Kconfig | 1 +
> fs/Makefile | 1 +
> fs/namespacefs/Kconfig | 6 +
> fs/namespacefs/Makefile | 4 +
> fs/namespacefs/inode.c | 410 ++++++++++++++++++++++++++++++++++++
> include/linux/namespacefs.h | 73 +++++++
> include/linux/ns_common.h | 4 +
> include/uapi/linux/magic.h | 2 +
> kernel/pid_namespace.c | 9 +
> kernel/utsname.c | 9 +
> 10 files changed, 519 insertions(+)
> create mode 100644 fs/namespacefs/Kconfig
> create mode 100644 fs/namespacefs/Makefile
> create mode 100644 fs/namespacefs/inode.c
> create mode 100644 include/linux/namespacefs.h
>
> --
> 2.33.1
>
--
Sincerely yours,
Mike.
^ permalink raw reply [flat|nested] 28+ messages in thread