LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [patch 0/6] vfs: mountinfo update
@ 2008-03-13 21:26 Miklos Szeredi
  2008-03-13 21:26 ` [patch 1/6] vfs: mountinfo -mm fix Miklos Szeredi
                   ` (6 more replies)
  0 siblings, 7 replies; 38+ messages in thread
From: Miklos Szeredi @ 2008-03-13 21:26 UTC (permalink / raw)
  To: akpm; +Cc: viro, linuxram, linux-fsdevel, linux-kernel

Hi Andrew,

Here are several updates to the /proc/<pid>/mountinfo patch by Ram and
me.  These hopefully address all comments by you, Al and others.

Thanks,
Miklos

--

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [patch 1/6] vfs: mountinfo -mm fix
  2008-03-13 21:26 [patch 0/6] vfs: mountinfo update Miklos Szeredi
@ 2008-03-13 21:26 ` Miklos Szeredi
  2008-03-13 21:26 ` [patch 2/6] vfs: pnode cleanup Miklos Szeredi
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 38+ messages in thread
From: Miklos Szeredi @ 2008-03-13 21:26 UTC (permalink / raw)
  To: akpm; +Cc: viro, linuxram, linux-fsdevel, linux-kernel

[-- Attachment #1: mountinfo_fix.patch --]
[-- Type: text/plain, Size: 959 bytes --]

From: Miklos Szeredi <mszeredi@suse.cz>

The read-only flag showing got broken by the ro-bind patches going in
and out of -mm.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/namespace.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

Index: linux/fs/namespace.c
===================================================================
--- linux.orig/fs/namespace.c	2008-03-13 20:45:15.000000000 +0100
+++ linux/fs/namespace.c	2008-03-13 20:45:49.000000000 +0100
@@ -789,9 +789,9 @@ static int show_mountinfo(struct seq_fil
 	seq_dentry(m, mnt->mnt_root, " \t\n\\");
 	seq_putc(m, ' ');
 	seq_path(m, &mnt_path, " \t\n\\");
+	seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw");
 	show_mnt_opts(m, mnt);
-	seq_putc(m, ' ');
-	seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
+	seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw");
 	show_sb_opts(m, sb);
 	if (sb->s_op->show_options)
 		err = sb->s_op->show_options(m, mnt);

--

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [patch 2/6] vfs: pnode cleanup
  2008-03-13 21:26 [patch 0/6] vfs: mountinfo update Miklos Szeredi
  2008-03-13 21:26 ` [patch 1/6] vfs: mountinfo -mm fix Miklos Szeredi
@ 2008-03-13 21:26 ` Miklos Szeredi
  2008-03-19 11:16   ` Al Viro
  2008-03-13 21:26 ` [patch 3/6] vfs: mountinfo stable peer group id Miklos Szeredi
                   ` (4 subsequent siblings)
  6 siblings, 1 reply; 38+ messages in thread
From: Miklos Szeredi @ 2008-03-13 21:26 UTC (permalink / raw)
  To: akpm; +Cc: viro, linuxram, linux-fsdevel, linux-kernel

[-- Attachment #1: pnode_cleanup.patch --]
[-- Type: text/plain, Size: 3547 bytes --]

From: Miklos Szeredi <mszeredi@suse.cz>

Clean up mnt->mnt_slave_list being initialized too many times.

Move set_mnt_shared from pnode.h to pnode.c.  Change
CLEAR_MNT_SHARED() to clear_mnt_shared() function, and move to
pnode.c.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/namespace.c |    2 +-
 fs/pnode.c     |   23 ++++++++++++++++-------
 fs/pnode.h     |    9 ++-------
 3 files changed, 19 insertions(+), 15 deletions(-)

Index: linux/fs/pnode.c
===================================================================
--- linux.orig/fs/pnode.c	2008-03-13 20:45:15.000000000 +0100
+++ linux/fs/pnode.c	2008-03-13 20:45:49.000000000 +0100
@@ -27,6 +27,17 @@ static inline struct vfsmount *next_slav
 	return list_entry(p->mnt_slave.next, struct vfsmount, mnt_slave);
 }
 
+void set_mnt_shared(struct vfsmount *mnt)
+{
+	mnt->mnt_flags &= ~MNT_PNODE_MASK;
+	mnt->mnt_flags |= MNT_SHARED;
+}
+
+void clear_mnt_shared(struct vfsmount *mnt)
+{
+	mnt->mnt_flags &= ~MNT_SHARED;
+}
+
 static int __peer_group_id(struct vfsmount *mnt)
 {
 	struct vfsmount *m;
@@ -89,20 +100,18 @@ static int do_make_slave(struct vfsmount
 		list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave)
 			slave_mnt->mnt_master = master;
 		list_move(&mnt->mnt_slave, &master->mnt_slave_list);
-		list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev);
-		INIT_LIST_HEAD(&mnt->mnt_slave_list);
+		list_splice_init(&mnt->mnt_slave_list,
+				 master->mnt_slave_list.prev);
 	} else {
-		struct list_head *p = &mnt->mnt_slave_list;
-		while (!list_empty(p)) {
-                        slave_mnt = list_first_entry(p,
+		while (!list_empty(&mnt->mnt_slave_list)) {
+			slave_mnt = list_first_entry(&mnt->mnt_slave_list,
 					struct vfsmount, mnt_slave);
 			list_del_init(&slave_mnt->mnt_slave);
 			slave_mnt->mnt_master = NULL;
 		}
 	}
 	mnt->mnt_master = master;
-	CLEAR_MNT_SHARED(mnt);
-	INIT_LIST_HEAD(&mnt->mnt_slave_list);
+	clear_mnt_shared(mnt);
 	return 0;
 }
 
Index: linux/fs/namespace.c
===================================================================
--- linux.orig/fs/namespace.c	2008-03-13 20:45:49.000000000 +0100
+++ linux/fs/namespace.c	2008-03-13 20:45:49.000000000 +0100
@@ -537,7 +537,7 @@ static struct vfsmount *clone_mnt(struct
 		if (flag & CL_SLAVE) {
 			list_add(&mnt->mnt_slave, &old->mnt_slave_list);
 			mnt->mnt_master = old;
-			CLEAR_MNT_SHARED(mnt);
+			clear_mnt_shared(mnt);
 		} else if (!(flag & CL_PRIVATE)) {
 			if ((flag & CL_PROPAGATION) || IS_MNT_SHARED(old))
 				list_add(&mnt->mnt_share, &old->mnt_share);
Index: linux/fs/pnode.h
===================================================================
--- linux.orig/fs/pnode.h	2008-03-13 20:45:15.000000000 +0100
+++ linux/fs/pnode.h	2008-03-13 20:45:49.000000000 +0100
@@ -14,7 +14,6 @@
 #define IS_MNT_SHARED(mnt) (mnt->mnt_flags & MNT_SHARED)
 #define IS_MNT_SLAVE(mnt) (mnt->mnt_master)
 #define IS_MNT_NEW(mnt)  (!mnt->mnt_ns)
-#define CLEAR_MNT_SHARED(mnt) (mnt->mnt_flags &= ~MNT_SHARED)
 #define IS_MNT_UNBINDABLE(mnt) (mnt->mnt_flags & MNT_UNBINDABLE)
 
 #define CL_EXPIRE    		0x01
@@ -24,12 +23,8 @@
 #define CL_PROPAGATION 		0x10
 #define CL_PRIVATE 		0x20
 
-static inline void set_mnt_shared(struct vfsmount *mnt)
-{
-	mnt->mnt_flags &= ~MNT_PNODE_MASK;
-	mnt->mnt_flags |= MNT_SHARED;
-}
-
+void set_mnt_shared(struct vfsmount *);
+void clear_mnt_shared(struct vfsmount *);
 void change_mnt_propagation(struct vfsmount *, int);
 int propagate_mnt(struct vfsmount *, struct dentry *, struct vfsmount *,
 		struct list_head *);

--

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [patch 3/6] vfs: mountinfo stable peer group id
  2008-03-13 21:26 [patch 0/6] vfs: mountinfo update Miklos Szeredi
  2008-03-13 21:26 ` [patch 1/6] vfs: mountinfo -mm fix Miklos Szeredi
  2008-03-13 21:26 ` [patch 2/6] vfs: pnode cleanup Miklos Szeredi
@ 2008-03-13 21:26 ` Miklos Szeredi
  2008-03-19 11:48   ` Al Viro
  2008-03-13 21:26 ` [patch 4/6] vfs: mountinfo show dominating " Miklos Szeredi
                   ` (3 subsequent siblings)
  6 siblings, 1 reply; 38+ messages in thread
From: Miklos Szeredi @ 2008-03-13 21:26 UTC (permalink / raw)
  To: akpm; +Cc: viro, linuxram, linux-fsdevel, linux-kernel

[-- Attachment #1: mountinfo_stable_peer_group_id.patch --]
[-- Type: text/plain, Size: 6560 bytes --]

From: Miklos Szeredi <mszeredi@suse.cz>

Add a stable identifier for shared mounts.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 Documentation/filesystems/proc.txt |   12 +------
 fs/namespace.c                     |    7 ++--
 fs/pnode.c                         |   63 ++++++++++++++++++++++++-------------
 fs/pnode.h                         |    1 
 include/linux/mount.h              |    1 
 5 files changed, 52 insertions(+), 32 deletions(-)

Index: linux/fs/pnode.c
===================================================================
--- linux.orig/fs/pnode.c	2008-03-13 20:45:49.000000000 +0100
+++ linux/fs/pnode.c	2008-03-13 20:45:50.000000000 +0100
@@ -9,8 +9,12 @@
 #include <linux/mnt_namespace.h>
 #include <linux/mount.h>
 #include <linux/fs.h>
+#include <linux/idr.h>
 #include "pnode.h"
 
+static DEFINE_SPINLOCK(mnt_pgid_lock);
+static DEFINE_IDA(mnt_pgid_ida);
+
 /* return the next shared peer mount of @p */
 static inline struct vfsmount *next_peer(struct vfsmount *p)
 {
@@ -27,47 +31,58 @@ static inline struct vfsmount *next_slav
 	return list_entry(p->mnt_slave.next, struct vfsmount, mnt_slave);
 }
 
-void set_mnt_shared(struct vfsmount *mnt)
+static void __set_mnt_shared(struct vfsmount *mnt)
 {
 	mnt->mnt_flags &= ~MNT_PNODE_MASK;
 	mnt->mnt_flags |= MNT_SHARED;
 }
 
-void clear_mnt_shared(struct vfsmount *mnt)
+void set_mnt_shared(struct vfsmount *mnt)
 {
-	mnt->mnt_flags &= ~MNT_SHARED;
+	int res;
+
+ retry:
+	spin_lock(&mnt_pgid_lock);
+	if (IS_MNT_SHARED(mnt)) {
+		spin_unlock(&mnt_pgid_lock);
+		return;
+	}
+
+	res = ida_get_new(&mnt_pgid_ida, &mnt->mnt_pgid);
+	spin_unlock(&mnt_pgid_lock);
+	if (res == -EAGAIN) {
+		if (ida_pre_get(&mnt_pgid_ida, GFP_KERNEL))
+			goto retry;
+	}
+	__set_mnt_shared(mnt);
 }
 
-static int __peer_group_id(struct vfsmount *mnt)
+void clear_mnt_shared(struct vfsmount *mnt)
 {
-	struct vfsmount *m;
-	int id = mnt->mnt_id;
-
-	for (m = next_peer(mnt); m != mnt; m = next_peer(m))
-		id = min(id, m->mnt_id);
+	if (IS_MNT_SHARED(mnt)) {
+		mnt->mnt_flags &= ~MNT_SHARED;
+		mnt->mnt_pgid = -1;
+	}
+}
 
-	return id;
+void make_mnt_peer(struct vfsmount *old, struct vfsmount *mnt)
+{
+	mnt->mnt_pgid = old->mnt_pgid;
+	list_add(&mnt->mnt_share, &old->mnt_share);
+	__set_mnt_shared(mnt);
 }
 
-/* return the smallest ID within the peer group */
 int get_peer_group_id(struct vfsmount *mnt)
 {
-	int id;
-
-	spin_lock(&vfsmount_lock);
-	id = __peer_group_id(mnt);
-	spin_unlock(&vfsmount_lock);
-
-	return id;
+	return mnt->mnt_pgid;
 }
 
-/* return the smallest ID within the master's peer group */
 int get_master_id(struct vfsmount *mnt)
 {
 	int id;
 
 	spin_lock(&vfsmount_lock);
-	id = __peer_group_id(mnt->mnt_master);
+	id = get_peer_group_id(mnt->mnt_master);
 	spin_unlock(&vfsmount_lock);
 
 	return id;
@@ -91,7 +106,13 @@ static int do_make_slave(struct vfsmount
 		if (peer_mnt == mnt)
 			peer_mnt = NULL;
 	}
-	list_del_init(&mnt->mnt_share);
+	if (!list_empty(&mnt->mnt_share))
+		list_del_init(&mnt->mnt_share);
+	else if (IS_MNT_SHARED(mnt)) {
+		spin_lock(&mnt_pgid_lock);
+		ida_remove(&mnt_pgid_ida, mnt->mnt_pgid);
+		spin_unlock(&mnt_pgid_lock);
+	}
 
 	if (peer_mnt)
 		master = peer_mnt;
Index: linux/include/linux/mount.h
===================================================================
--- linux.orig/include/linux/mount.h	2008-03-13 20:45:15.000000000 +0100
+++ linux/include/linux/mount.h	2008-03-13 20:45:50.000000000 +0100
@@ -57,6 +57,7 @@ struct vfsmount {
 	struct vfsmount *mnt_master;	/* slave is on master->mnt_slave_list */
 	struct mnt_namespace *mnt_ns;	/* containing namespace */
 	int mnt_id;			/* mount identifier */
+	int mnt_pgid;			/* peer group identifier */
 	/*
 	 * We put mnt_count & mnt_expiry_mark at the end of struct vfsmount
 	 * to let these frequently modified fields in a separate cache line
Index: linux/fs/namespace.c
===================================================================
--- linux.orig/fs/namespace.c	2008-03-13 20:45:49.000000000 +0100
+++ linux/fs/namespace.c	2008-03-13 20:45:50.000000000 +0100
@@ -95,6 +95,7 @@ struct vfsmount *alloc_vfsmnt(const char
 			return NULL;
 		}
 
+		mnt->mnt_pgid = -1;
 		atomic_set(&mnt->mnt_count, 1);
 		INIT_LIST_HEAD(&mnt->mnt_hash);
 		INIT_LIST_HEAD(&mnt->mnt_child);
@@ -539,8 +540,10 @@ static struct vfsmount *clone_mnt(struct
 			mnt->mnt_master = old;
 			clear_mnt_shared(mnt);
 		} else if (!(flag & CL_PRIVATE)) {
-			if ((flag & CL_PROPAGATION) || IS_MNT_SHARED(old))
-				list_add(&mnt->mnt_share, &old->mnt_share);
+			if (flag & CL_PROPAGATION)
+				set_mnt_shared(old);
+			if (IS_MNT_SHARED(old))
+				make_mnt_peer(old, mnt);
 			if (IS_MNT_SLAVE(old))
 				list_add(&mnt->mnt_slave, &old->mnt_slave);
 			mnt->mnt_master = old->mnt_master;
Index: linux/Documentation/filesystems/proc.txt
===================================================================
--- linux.orig/Documentation/filesystems/proc.txt	2008-03-13 20:45:15.000000000 +0100
+++ linux/Documentation/filesystems/proc.txt	2008-03-13 20:45:50.000000000 +0100
@@ -2367,18 +2367,12 @@ MNTOPTS: per mount options
 SBOPTS: per super block options
 PROPAGATION: propagation type
 
-propagation type: <propagation_flag>[:<mntid>][,...]
-	note: 'shared' flag is followed by the mntid of its peer mount
-	      'slave' flag is followed by the mntid of its master mount
+propagation type: <propagation_flag>[:<peergrpid>][,...]
+	note: 'shared' flag is followed by the id of this mount's peer group
+	      'slave' flag is followed by the peer group id of its master mount
 	      'private' flag stands by itself
 	      'unbindable' flag stands by itself
 
-The 'mntid' used in the propagation type is a canonical ID of the peer
-group (currently the smallest ID within the group is used for this
-purpose, but this should not be relied on).  Since mounts can be added
-or removed from the peer group, this ID only guaranteed to stay the
-same on a static propagation tree.
-
 For more information see:
 
   Documentation/filesystems/sharedsubtree.txt
Index: linux/fs/pnode.h
===================================================================
--- linux.orig/fs/pnode.h	2008-03-13 20:45:49.000000000 +0100
+++ linux/fs/pnode.h	2008-03-13 20:45:50.000000000 +0100
@@ -25,6 +25,7 @@
 
 void set_mnt_shared(struct vfsmount *);
 void clear_mnt_shared(struct vfsmount *);
+void make_mnt_peer(struct vfsmount *, struct vfsmount *);
 void change_mnt_propagation(struct vfsmount *, int);
 int propagate_mnt(struct vfsmount *, struct dentry *, struct vfsmount *,
 		struct list_head *);

--

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [patch 4/6] vfs: mountinfo show dominating group id
  2008-03-13 21:26 [patch 0/6] vfs: mountinfo update Miklos Szeredi
                   ` (2 preceding siblings ...)
  2008-03-13 21:26 ` [patch 3/6] vfs: mountinfo stable peer group id Miklos Szeredi
@ 2008-03-13 21:26 ` Miklos Szeredi
  2008-03-19 11:37   ` Al Viro
  2008-03-13 21:26 ` [patch 5/6] vfs: optimization to /proc/<pid>/mountinfo patch Miklos Szeredi
                   ` (2 subsequent siblings)
  6 siblings, 1 reply; 38+ messages in thread
From: Miklos Szeredi @ 2008-03-13 21:26 UTC (permalink / raw)
  To: akpm; +Cc: viro, linuxram, linux-fsdevel, linux-kernel

[-- Attachment #1: mountinfo_dominator_id.patch --]
[-- Type: text/plain, Size: 4580 bytes --]

From: Miklos Szeredi <mszeredi@suse.cz>

Show peer group ID of nearest dominating group that has intersection
with the mount's namespace.

See the the documentation update for details.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 Documentation/filesystems/proc.txt |   11 ++++++++---
 fs/namespace.c                     |   24 ++++++++++++++++--------
 fs/pnode.c                         |   32 ++++++++++++++++++++++++++++++++
 fs/pnode.h                         |    1 +
 4 files changed, 57 insertions(+), 11 deletions(-)

Index: linux/fs/namespace.c
===================================================================
--- linux.orig/fs/namespace.c	2008-03-13 20:45:50.000000000 +0100
+++ linux/fs/namespace.c	2008-03-13 20:45:51.000000000 +0100
@@ -798,16 +798,24 @@ static int show_mountinfo(struct seq_fil
 	show_sb_opts(m, sb);
 	if (sb->s_op->show_options)
 		err = sb->s_op->show_options(m, mnt);
-	if (IS_MNT_SHARED(mnt)) {
-		seq_printf(m, " shared:%i", get_peer_group_id(mnt));
-		if (IS_MNT_SLAVE(mnt))
-			seq_printf(m, ",slave:%i", get_master_id(mnt));
-	} else if (IS_MNT_SLAVE(mnt)) {
-		seq_printf(m, " slave:%i", get_master_id(mnt));
+	seq_putc(m, ' ');
+	if (IS_MNT_SHARED(mnt) || IS_MNT_SLAVE(mnt)) {
+		if (IS_MNT_SHARED(mnt))
+			seq_printf(m, "shared:%i", get_peer_group_id(mnt));
+		if (IS_MNT_SLAVE(mnt)) {
+			int dominator_id = get_dominator_id_same_ns(mnt);
+
+			if (IS_MNT_SHARED(mnt))
+				seq_putc(m, ',');
+
+			seq_printf(m, "slave:%i", get_master_id(mnt));
+			if (dominator_id != -1)
+				seq_printf(m, ":%i", dominator_id);
+		}
 	} else if (IS_MNT_UNBINDABLE(mnt)) {
-		seq_printf(m, " unbindable");
+		seq_printf(m, "unbindable");
 	} else {
-		seq_printf(m, " private");
+		seq_printf(m, "private");
 	}
 	seq_putc(m, '\n');
 	return err;
Index: linux/fs/pnode.c
===================================================================
--- linux.orig/fs/pnode.c	2008-03-13 20:45:50.000000000 +0100
+++ linux/fs/pnode.c	2008-03-13 20:45:51.000000000 +0100
@@ -88,6 +88,38 @@ int get_master_id(struct vfsmount *mnt)
 	return id;
 }
 
+static struct vfsmount *get_peer_in_ns(struct vfsmount *mnt,
+				       struct mnt_namespace *ns)
+{
+	struct vfsmount *m = mnt;
+
+	do {
+		if (m->mnt_ns == ns)
+			return m;
+		m = next_peer(m);
+	} while (m != mnt);
+
+	return NULL;
+}
+
+int get_dominator_id_same_ns(struct vfsmount *mnt)
+{
+	int id = -1;
+	struct vfsmount *m;
+
+	spin_lock(&vfsmount_lock);
+	for (m = mnt->mnt_master; m != NULL; m = m->mnt_master) {
+		struct vfsmount *d = get_peer_in_ns(m, mnt->mnt_ns);
+		if (d) {
+			id = d->mnt_pgid;
+			break;
+		}
+	}
+	spin_unlock(&vfsmount_lock);
+
+	return id;
+}
+
 static int do_make_slave(struct vfsmount *mnt)
 {
 	struct vfsmount *peer_mnt = mnt, *master = mnt->mnt_master;
Index: linux/fs/pnode.h
===================================================================
--- linux.orig/fs/pnode.h	2008-03-13 20:45:50.000000000 +0100
+++ linux/fs/pnode.h	2008-03-13 20:45:51.000000000 +0100
@@ -33,4 +33,5 @@ int propagate_umount(struct list_head *)
 int propagate_mount_busy(struct vfsmount *, int);
 int get_peer_group_id(struct vfsmount *);
 int get_master_id(struct vfsmount *);
+int get_dominator_id_same_ns(struct vfsmount *);
 #endif /* _LINUX_PNODE_H */
Index: linux/Documentation/filesystems/proc.txt
===================================================================
--- linux.orig/Documentation/filesystems/proc.txt	2008-03-13 20:45:50.000000000 +0100
+++ linux/Documentation/filesystems/proc.txt	2008-03-13 20:45:51.000000000 +0100
@@ -2367,15 +2367,20 @@ MNTOPTS: per mount options
 SBOPTS: per super block options
 PROPAGATION: propagation type
 
-propagation type: <propagation_flag>[:<peergrpid>][,...]
+propagation type: <propagation_flag>[:<peergrpid>[:<domgrpid>]][,...]
 	note: 'shared' flag is followed by the id of this mount's peer group
-	      'slave' flag is followed by the peer group id of its master mount
+	      'slave' flag is followed by the peer group id of its master mount,
+	      	      optionally followed by the id of the closest dominant(*)
+		      peer group in the same namespace, if one exists.
 	      'private' flag stands by itself
 	      'unbindable' flag stands by itself
 
+(*) A dominant peer group is an ancestor of this mount in the
+propagation tree, in other words, this mount receives propagation from
+the dominant peer group, but not the other way round.
+
 For more information see:
 
   Documentation/filesystems/sharedsubtree.txt
 
-
 ------------------------------------------------------------------------------

--

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [patch 5/6] vfs: optimization to /proc/<pid>/mountinfo patch
  2008-03-13 21:26 [patch 0/6] vfs: mountinfo update Miklos Szeredi
                   ` (3 preceding siblings ...)
  2008-03-13 21:26 ` [patch 4/6] vfs: mountinfo show dominating " Miklos Szeredi
@ 2008-03-13 21:26 ` Miklos Szeredi
  2008-03-19 11:56   ` Al Viro
  2008-03-13 21:26 ` [patch 6/6] vfs: mountinfo: only show mounts under tasks root Miklos Szeredi
  2008-03-13 22:53 ` [patch 0/6] vfs: mountinfo update Andrew Morton
  6 siblings, 1 reply; 38+ messages in thread
From: Miklos Szeredi @ 2008-03-13 21:26 UTC (permalink / raw)
  To: akpm; +Cc: viro, linuxram, linux-fsdevel, linux-kernel

[-- Attachment #1: mountinfo_ifdefs.patch --]
[-- Type: text/plain, Size: 8690 bytes --]

From: Ram Pai <linuxram@us.ibm.com>

1) reports deleted inode in dentry_path() consistent with that in __d_path()
2) modified __d_path() to use prepend(), reducing the size of __d_path()
3) moved all the functionality that reports mount information in /proc under
	CONFIG_PROC_FS.

Code compile tested only with and without CONFIG_PROC_FS.

Signed-off-by: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/dcache.c              |   63 ++++++++++++++++++++---------------------------
 fs/namespace.c           |    4 ++
 fs/pnode.c               |   10 ++++---
 fs/pnode.h               |    4 ++
 fs/seq_file.c            |    3 +-
 include/linux/dcache.h   |    3 ++
 include/linux/seq_file.h |    3 ++
 7 files changed, 47 insertions(+), 43 deletions(-)

Index: linux/fs/dcache.c
===================================================================
--- linux.orig/fs/dcache.c	2008-03-13 20:45:15.000000000 +0100
+++ linux/fs/dcache.c	2008-03-13 20:45:52.000000000 +0100
@@ -1747,6 +1747,17 @@ shouldnt_be_hashed:
 	goto shouldnt_be_hashed;
 }
 
+static int prepend(char **buffer, int *buflen, const char *str,
+			  int namelen)
+{
+	*buflen -= namelen;
+	if (*buflen < 0)
+		return -ENAMETOOLONG;
+	*buffer -= namelen;
+	memcpy(*buffer, str, namelen);
+	return 0;
+}
+
 /**
  * d_path - return the path of a dentry
  * @dentry: dentry to report
@@ -1768,17 +1779,11 @@ static char *__d_path(struct dentry *den
 {
 	char * end = buffer+buflen;
 	char * retval;
-	int namelen;
 
-	*--end = '\0';
-	buflen--;
-	if (!IS_ROOT(dentry) && d_unhashed(dentry)) {
-		buflen -= 10;
-		end -= 10;
-		if (buflen < 0)
+	prepend(&end, &buflen, "\0", 1);
+	if (!IS_ROOT(dentry) && d_unhashed(dentry) &&
+		(prepend(&end, &buflen, " (deleted)", 10) != 0))
 			goto Elong;
-		memcpy(end, " (deleted)", 10);
-	}
 
 	if (buflen < 1)
 		goto Elong;
@@ -1805,13 +1810,10 @@ static char *__d_path(struct dentry *den
 		}
 		parent = dentry->d_parent;
 		prefetch(parent);
-		namelen = dentry->d_name.len;
-		buflen -= namelen + 1;
-		if (buflen < 0)
+		if ((prepend(&end, &buflen, dentry->d_name.name,
+				dentry->d_name.len) != 0) ||
+		    (prepend(&end, &buflen, "/", 1) != 0))
 			goto Elong;
-		end -= namelen;
-		memcpy(end, dentry->d_name.name, namelen);
-		*--end = '/';
 		retval = end;
 		dentry = parent;
 	}
@@ -1819,12 +1821,10 @@ static char *__d_path(struct dentry *den
 	return retval;
 
 global_root:
-	namelen = dentry->d_name.len;
-	buflen -= namelen;
-	if (buflen < 0)
+	retval += 1;	/* hit the slash */
+	if (prepend(&retval, &buflen, dentry->d_name.name,
+		    dentry->d_name.len) != 0)
 		goto Elong;
-	retval -= namelen-1;	/* hit the slash */
-	memcpy(retval, dentry->d_name.name, namelen);
 	return retval;
 Elong:
 	return ERR_PTR(-ENAMETOOLONG);
@@ -1890,17 +1890,8 @@ char *dynamic_dname(struct dentry *dentr
 	return memcpy(buffer, temp, sz);
 }
 
-static int prepend(char **buffer, int *buflen, const char *str,
-			  int namelen)
-{
-	*buflen -= namelen;
-	if (*buflen < 0)
-		return 1;
-	*buffer -= namelen;
-	memcpy(*buffer, str, namelen);
-	return 0;
-}
 
+#ifdef CONFIG_PROC_FS
 /*
  * Write full pathname from the root of the filesystem into the buffer.
  */
@@ -1911,10 +1902,9 @@ char *dentry_path(struct dentry *dentry,
 
 	spin_lock(&dcache_lock);
 	prepend(&end, &buflen, "\0", 1);
-	if (!IS_ROOT(dentry) && d_unhashed(dentry)) {
-		if (prepend(&end, &buflen, "//deleted", 9))
+	if (!IS_ROOT(dentry) && d_unhashed(dentry) &&
+		(prepend(&end, &buflen, " (deleted)", 10) != 0))
 			goto Elong;
-	}
 	if (buflen < 1)
 		goto Elong;
 	/* Get '/' right */
@@ -1929,9 +1919,9 @@ char *dentry_path(struct dentry *dentry,
 		parent = dentry->d_parent;
 		prefetch(parent);
 
-		if (prepend(&end, &buflen, dentry->d_name.name,
-				dentry->d_name.len) ||
-		    prepend(&end, &buflen, "/", 1))
+		if ((prepend(&end, &buflen, dentry->d_name.name,
+				dentry->d_name.len) != 0) ||
+		    (prepend(&end, &buflen, "/", 1) != 0))
 			goto Elong;
 
 		retval = end;
@@ -1943,6 +1933,7 @@ Elong:
 	spin_unlock(&dcache_lock);
 	return ERR_PTR(-ENAMETOOLONG);
 }
+#endif /* CONFIG_PROC_FS */
 
 /*
  * NOTE! The user-level library version returns a
Index: linux/fs/namespace.c
===================================================================
--- linux.orig/fs/namespace.c	2008-03-13 20:45:51.000000000 +0100
+++ linux/fs/namespace.c	2008-03-13 20:45:52.000000000 +0100
@@ -679,6 +679,7 @@ void save_mount_options(struct super_blo
 }
 EXPORT_SYMBOL(save_mount_options);
 
+#ifdef CONFIG_PROC_FS
 /* iterator */
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
@@ -808,7 +809,7 @@ static int show_mountinfo(struct seq_fil
 			if (IS_MNT_SHARED(mnt))
 				seq_putc(m, ',');
 
-			seq_printf(m, "slave:%i", get_master_id(mnt));
+			seq_printf(m, "slave:%i", get_master_group_id(mnt));
 			if (dominator_id != -1)
 				seq_printf(m, ":%i", dominator_id);
 		}
@@ -866,6 +867,7 @@ const struct seq_operations mountstats_o
 	.stop	= m_stop,
 	.show	= show_vfsstat,
 };
+#endif  /* CONFIG_PROC_FS */
 
 /**
  * may_umount_tree - check if a mount tree is busy
Index: linux/fs/seq_file.c
===================================================================
--- linux.orig/fs/seq_file.c	2008-03-13 20:45:15.000000000 +0100
+++ linux/fs/seq_file.c	2008-03-13 20:45:52.000000000 +0100
@@ -391,6 +391,7 @@ int seq_path(struct seq_file *m, struct 
 }
 EXPORT_SYMBOL(seq_path);
 
+#ifdef CONFIG_PROC_FS
 /*
  * returns the path of the 'dentry' from the root of its filesystem.
  */
@@ -411,7 +412,7 @@ int seq_dentry(struct seq_file *m, struc
 	m->count = m->size;
 	return -1;
 }
-EXPORT_SYMBOL(seq_dentry);
+#endif /* CONFIG_PROC_FS */
 
 static void *single_start(struct seq_file *p, loff_t *pos)
 {
Index: linux/include/linux/dcache.h
===================================================================
--- linux.orig/include/linux/dcache.h	2008-03-13 20:45:15.000000000 +0100
+++ linux/include/linux/dcache.h	2008-03-13 20:45:52.000000000 +0100
@@ -303,7 +303,10 @@ extern int d_validate(struct dentry *, s
 extern char *dynamic_dname(struct dentry *, char *, int, const char *, ...);
 
 extern char *d_path(struct path *, char *, int);
+
+#ifdef CONFIG_PROC_FS
 extern char *dentry_path(struct dentry *, char *, int);
+#endif /* CONFIG_PROC_FS */
 
 /* Allocation counts.. */
 
Index: linux/include/linux/seq_file.h
===================================================================
--- linux.orig/include/linux/seq_file.h	2008-03-13 20:45:15.000000000 +0100
+++ linux/include/linux/seq_file.h	2008-03-13 20:45:52.000000000 +0100
@@ -44,7 +44,10 @@ int seq_printf(struct seq_file *, const 
 	__attribute__ ((format (printf,2,3)));
 
 int seq_path(struct seq_file *, struct path *, char *);
+
+#ifdef CONFIG_PROC_FS
 int seq_dentry(struct seq_file *, struct dentry *, char *);
+#endif /* CONFIG_PROC_FS */
 
 int single_open(struct file *, int (*)(struct seq_file *, void *), void *);
 int single_release(struct inode *, struct file *);
Index: linux/fs/pnode.c
===================================================================
--- linux.orig/fs/pnode.c	2008-03-13 20:45:51.000000000 +0100
+++ linux/fs/pnode.c	2008-03-13 20:45:52.000000000 +0100
@@ -72,12 +72,13 @@ void make_mnt_peer(struct vfsmount *old,
 	__set_mnt_shared(mnt);
 }
 
+#ifdef CONFIG_PROC_FS
 int get_peer_group_id(struct vfsmount *mnt)
 {
 	return mnt->mnt_pgid;
 }
 
-int get_master_id(struct vfsmount *mnt)
+int get_master_group_id(struct vfsmount *mnt)
 {
 	int id;
 
@@ -119,6 +120,7 @@ int get_dominator_id_same_ns(struct vfsm
 
 	return id;
 }
+#endif
 
 static int do_make_slave(struct vfsmount *mnt)
 {
@@ -138,13 +140,13 @@ static int do_make_slave(struct vfsmount
 		if (peer_mnt == mnt)
 			peer_mnt = NULL;
 	}
-	if (!list_empty(&mnt->mnt_share))
-		list_del_init(&mnt->mnt_share);
-	else if (IS_MNT_SHARED(mnt)) {
+
+	if (IS_MNT_SHARED(mnt) && list_empty(&mnt->mnt_share)) {
 		spin_lock(&mnt_pgid_lock);
 		ida_remove(&mnt_pgid_ida, mnt->mnt_pgid);
 		spin_unlock(&mnt_pgid_lock);
 	}
+	list_del_init(&mnt->mnt_share);
 
 	if (peer_mnt)
 		master = peer_mnt;
Index: linux/fs/pnode.h
===================================================================
--- linux.orig/fs/pnode.h	2008-03-13 20:45:51.000000000 +0100
+++ linux/fs/pnode.h	2008-03-13 20:45:52.000000000 +0100
@@ -31,7 +31,9 @@ int propagate_mnt(struct vfsmount *, str
 		struct list_head *);
 int propagate_umount(struct list_head *);
 int propagate_mount_busy(struct vfsmount *, int);
+
 int get_peer_group_id(struct vfsmount *);
-int get_master_id(struct vfsmount *);
+int get_master_group_id(struct vfsmount *);
 int get_dominator_id_same_ns(struct vfsmount *);
+
 #endif /* _LINUX_PNODE_H */

--

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [patch 6/6] vfs: mountinfo: only show mounts under tasks root
  2008-03-13 21:26 [patch 0/6] vfs: mountinfo update Miklos Szeredi
                   ` (4 preceding siblings ...)
  2008-03-13 21:26 ` [patch 5/6] vfs: optimization to /proc/<pid>/mountinfo patch Miklos Szeredi
@ 2008-03-13 21:26 ` Miklos Szeredi
  2008-03-19 12:12   ` Al Viro
  2008-03-13 22:53 ` [patch 0/6] vfs: mountinfo update Andrew Morton
  6 siblings, 1 reply; 38+ messages in thread
From: Miklos Szeredi @ 2008-03-13 21:26 UTC (permalink / raw)
  To: akpm; +Cc: viro, linuxram, linux-fsdevel, linux-kernel

[-- Attachment #1: mountinfo_root.patch --]
[-- Type: text/plain, Size: 12891 bytes --]

From: Miklos Szeredi <mszeredi@suse.cz>

1. Only show reachable mounts in /proc/<pid>/mountinfo, this makes
mountpoints unambiguous for chrooted processes.

2. Instead of showing mountpoints relative to the current root, always
show them relative to the queried task's root.

This means, that a particular mountinfo file will always have the same
contents, regardless of which process is reading the file.  Which is a
lot more consistent, than the current behavior of /proc/<pid>/mounts.

Addressed comments from Jan Blunck and Ram Pai.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/dcache.c                   |   16 ++++++---
 fs/namespace.c                |   23 +++++++++-----
 fs/pnode.c                    |   28 ++++++++++++++---
 fs/pnode.h                    |    2 -
 fs/proc/base.c                |   68 ++++++++++++++++++++++++++----------------
 fs/seq_file.c                 |   25 +++++++++++++++
 include/linux/dcache.h        |    1 
 include/linux/mnt_namespace.h |    8 ++++
 include/linux/seq_file.h      |    2 +
 9 files changed, 128 insertions(+), 45 deletions(-)

Index: linux/fs/dcache.c
===================================================================
--- linux.orig/fs/dcache.c	2008-03-13 20:45:52.000000000 +0100
+++ linux/fs/dcache.c	2008-03-13 20:45:53.000000000 +0100
@@ -1760,12 +1760,12 @@ static int prepend(char **buffer, int *b
 
 /**
  * d_path - return the path of a dentry
- * @dentry: dentry to report
- * @vfsmnt: vfsmnt to which the dentry belongs
+ * @path: the dentry/vfsmount to report
  * @root: root dentry
  * @rootmnt: vfsmnt to which the root dentry belongs
  * @buffer: buffer to return value in
  * @buflen: buffer length
+ * @only_reachable: if true, then path must be reachable from root
  *
  * Convert a dentry into an ASCII path name. If the entry has been deleted
  * the string " (deleted)" is appended. Note that this is ambiguous.
@@ -1774,9 +1774,11 @@ static int prepend(char **buffer, int *b
  *
  * "buflen" should be positive. Caller holds the dcache_lock.
  */
-static char *__d_path(struct dentry *dentry, struct vfsmount *vfsmnt,
-		       struct path *root, char *buffer, int buflen)
+char *__d_path(struct path *path, struct path *root, char *buffer, int buflen,
+	       bool only_reachable)
 {
+	struct dentry *dentry = path->dentry;
+	struct vfsmount *vfsmnt = path->mnt;
 	char * end = buffer+buflen;
 	char * retval;
 
@@ -1821,6 +1823,8 @@ static char *__d_path(struct dentry *den
 	return retval;
 
 global_root:
+	if (only_reachable)
+		return ERR_PTR(-EINVAL);
 	retval += 1;	/* hit the slash */
 	if (prepend(&retval, &buflen, dentry->d_name.name,
 		    dentry->d_name.len) != 0)
@@ -1863,7 +1867,7 @@ char *d_path(struct path *path, char *bu
 	path_get(&current->fs->root);
 	read_unlock(&current->fs->lock);
 	spin_lock(&dcache_lock);
-	res = __d_path(path->dentry, path->mnt, &root, buf, buflen);
+	res = __d_path(path, &root, buf, buflen, false);
 	spin_unlock(&dcache_lock);
 	path_put(&root);
 	return res;
@@ -1976,7 +1980,7 @@ asmlinkage long sys_getcwd(char __user *
 		unsigned long len;
 		char * cwd;
 
-		cwd = __d_path(pwd.dentry, pwd.mnt, &root, page, PAGE_SIZE);
+		cwd = __d_path(&pwd, &root, page, PAGE_SIZE, false);
 		spin_unlock(&dcache_lock);
 
 		error = PTR_ERR(cwd);
Index: linux/fs/namespace.c
===================================================================
--- linux.orig/fs/namespace.c	2008-03-13 20:45:52.000000000 +0100
+++ linux/fs/namespace.c	2008-03-13 20:45:53.000000000 +0100
@@ -683,17 +683,17 @@ EXPORT_SYMBOL(save_mount_options);
 /* iterator */
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
-	struct mnt_namespace *n = m->private;
+	struct proc_mounts *p = m->private;
 
 	down_read(&namespace_sem);
-	return seq_list_start(&n->list, *pos);
+	return seq_list_start(&p->ns->list, *pos);
 }
 
 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	struct mnt_namespace *n = m->private;
+	struct proc_mounts *p = m->private;
 
-	return seq_list_next(v, &n->list, pos);
+	return seq_list_next(v, &p->ns->list, pos);
 }
 
 static void m_stop(struct seq_file *m, void *v)
@@ -779,6 +779,8 @@ const struct seq_operations mounts_op = 
 
 static int show_mountinfo(struct seq_file *m, void *v)
 {
+	struct proc_mounts *p = m->private;
+	size_t count_save = m->count;
 	struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
 	struct super_block *sb = mnt->mnt_sb;
 	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
@@ -792,7 +794,11 @@ static int show_mountinfo(struct seq_fil
 	seq_putc(m, ' ');
 	seq_dentry(m, mnt->mnt_root, " \t\n\\");
 	seq_putc(m, ' ');
-	seq_path(m, &mnt_path, " \t\n\\");
+	if (seq_path_root(m, &mnt_path, &p->root, " \t\n\\") == -EINVAL) {
+		/* path is outside root */
+		m->count = count_save;
+		return 0;
+	}
 	seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw");
 	show_mnt_opts(m, mnt);
 	seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw");
@@ -804,14 +810,15 @@ static int show_mountinfo(struct seq_fil
 		if (IS_MNT_SHARED(mnt))
 			seq_printf(m, "shared:%i", get_peer_group_id(mnt));
 		if (IS_MNT_SLAVE(mnt)) {
-			int dominator_id = get_dominator_id_same_ns(mnt);
+			int dominator;
 
 			if (IS_MNT_SHARED(mnt))
 				seq_putc(m, ',');
 
 			seq_printf(m, "slave:%i", get_master_group_id(mnt));
-			if (dominator_id != -1)
-				seq_printf(m, ":%i", dominator_id);
+			dominator = get_dominator_id_same_root(mnt, &p->root);
+			if (dominator != -1)
+				seq_printf(m, ":%i", dominator);
 		}
 	} else if (IS_MNT_UNBINDABLE(mnt)) {
 		seq_printf(m, "unbindable");
Index: linux/fs/proc/base.c
===================================================================
--- linux.orig/fs/proc/base.c	2008-03-13 20:45:15.000000000 +0100
+++ linux/fs/proc/base.c	2008-03-13 20:45:53.000000000 +0100
@@ -502,17 +502,14 @@ static const struct inode_operations pro
 	.setattr	= proc_setattr,
 };
 
-struct proc_mounts {
-	struct seq_file m;
-	int event;
-};
-
 static int mounts_open_common(struct inode *inode, struct file *file,
 			      const struct seq_operations *op)
 {
 	struct task_struct *task = get_proc_task(inode);
 	struct nsproxy *nsp;
 	struct mnt_namespace *ns = NULL;
+	struct fs_struct *fs = NULL;
+	struct path root;
 	struct proc_mounts *p;
 	int ret = -EINVAL;
 
@@ -525,40 +522,61 @@ static int mounts_open_common(struct ino
 				get_mnt_ns(ns);
 		}
 		rcu_read_unlock();
-
+		if (ns)
+			fs = get_fs_struct(task);
 		put_task_struct(task);
 	}
 
-	if (ns) {
-		ret = -ENOMEM;
-		p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);
-		if (p) {
-			file->private_data = &p->m;
-			ret = seq_open(file, op);
-			if (!ret) {
-				p->m.private = ns;
-				p->event = ns->event;
-				return 0;
-			}
-			kfree(p);
-		}
-		put_mnt_ns(ns);
-	}
+	if (!ns)
+		goto err;
+	if (!fs)
+		goto err_put_ns;
+
+	read_lock(&fs->lock);
+	root = fs->root;
+	path_get(&root);
+	read_unlock(&fs->lock);
+	put_fs_struct(fs);
+
+	ret = -ENOMEM;
+	p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);
+	if (!p)
+		goto err_put_path;
+
+	file->private_data = &p->m;
+	ret = seq_open(file, op);
+	if (ret)
+		goto err_free;
+
+	p->m.private = p;
+	p->ns = ns;
+	p->root = root;
+	p->event = ns->event;
+
+	return 0;
+
+ err_free:
+	kfree(p);
+ err_put_path:
+	path_put(&root);
+ err_put_ns:
+	put_mnt_ns(ns);
+ err:
 	return ret;
 }
 
 static int mounts_release(struct inode *inode, struct file *file)
 {
-	struct seq_file *m = file->private_data;
-	struct mnt_namespace *ns = m->private;
-	put_mnt_ns(ns);
+	struct proc_mounts *p = file->private_data;
+	path_put(&p->root);
+	put_mnt_ns(p->ns);
 	return seq_release(inode, file);
 }
 
 static unsigned mounts_poll(struct file *file, poll_table *wait)
 {
 	struct proc_mounts *p = file->private_data;
-	struct mnt_namespace *ns = p->m.private;
+	struct mnt_namespace *ns = p->ns;
 	unsigned res = 0;
 
 	poll_wait(file, &ns->poll, wait);
Index: linux/fs/seq_file.c
===================================================================
--- linux.orig/fs/seq_file.c	2008-03-13 20:45:52.000000000 +0100
+++ linux/fs/seq_file.c	2008-03-13 20:45:53.000000000 +0100
@@ -392,6 +392,31 @@ int seq_path(struct seq_file *m, struct 
 EXPORT_SYMBOL(seq_path);
 
 #ifdef CONFIG_PROC_FS
+int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
+		  char *esc)
+{
+	int err = -ENAMETOOLONG;
+	if (m->count < m->size) {
+		char *s = m->buf + m->count;
+		char *p;
+
+		spin_lock(&dcache_lock);
+		p = __d_path(path, root, s, m->size - m->count, true);
+		spin_unlock(&dcache_lock);
+		err = PTR_ERR(p);
+		if (!IS_ERR(p)) {
+			s = mangle_path(s, p, esc);
+			if (s) {
+				p = m->buf + m->count;
+				m->count = s - m->buf;
+				return 0;
+			}
+		}
+	}
+	m->count = m->size;
+	return err;
+}
+
 /*
  * returns the path of the 'dentry' from the root of its filesystem.
  */
Index: linux/include/linux/dcache.h
===================================================================
--- linux.orig/include/linux/dcache.h	2008-03-13 20:45:52.000000000 +0100
+++ linux/include/linux/dcache.h	2008-03-13 20:45:53.000000000 +0100
@@ -302,6 +302,7 @@ extern int d_validate(struct dentry *, s
  */
 extern char *dynamic_dname(struct dentry *, char *, int, const char *, ...);
 
+extern char *__d_path(struct path *path, struct path *root, char *, int, bool);
 extern char *d_path(struct path *, char *, int);
 
 #ifdef CONFIG_PROC_FS
Index: linux/include/linux/mnt_namespace.h
===================================================================
--- linux.orig/include/linux/mnt_namespace.h	2008-03-13 20:45:15.000000000 +0100
+++ linux/include/linux/mnt_namespace.h	2008-03-13 20:45:53.000000000 +0100
@@ -5,6 +5,7 @@
 #include <linux/mount.h>
 #include <linux/sched.h>
 #include <linux/nsproxy.h>
+#include <linux/seq_file.h>
 
 struct mnt_namespace {
 	atomic_t		count;
@@ -14,6 +15,13 @@ struct mnt_namespace {
 	int event;
 };
 
+struct proc_mounts {
+	struct seq_file m; /* must be the first element */
+	struct mnt_namespace *ns;
+	struct path root;
+	int event;
+};
+
 extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *,
 		struct fs_struct *);
 extern void __put_mnt_ns(struct mnt_namespace *ns);
Index: linux/include/linux/seq_file.h
===================================================================
--- linux.orig/include/linux/seq_file.h	2008-03-13 20:45:52.000000000 +0100
+++ linux/include/linux/seq_file.h	2008-03-13 20:45:53.000000000 +0100
@@ -46,6 +46,8 @@ int seq_printf(struct seq_file *, const 
 int seq_path(struct seq_file *, struct path *, char *);
 
 #ifdef CONFIG_PROC_FS
+int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
+		  char *esc);
 int seq_dentry(struct seq_file *, struct dentry *, char *);
 #endif /* CONFIG_PROC_FS */
 
Index: linux/fs/pnode.c
===================================================================
--- linux.orig/fs/pnode.c	2008-03-13 20:45:52.000000000 +0100
+++ linux/fs/pnode.c	2008-03-13 20:45:53.000000000 +0100
@@ -89,28 +89,46 @@ int get_master_group_id(struct vfsmount 
 	return id;
 }
 
-static struct vfsmount *get_peer_in_ns(struct vfsmount *mnt,
-				       struct mnt_namespace *ns)
+/*
+ * Return true if path is reachable from root
+ *
+ * Caller must hold vfsmount_lock
+ */
+static bool is_path_reachable(struct vfsmount *mnt, struct dentry *dentry,
+			 const struct path *root)
+{
+	while (mnt != root->mnt && mnt->mnt_parent != mnt) {
+		dentry = mnt->mnt_mountpoint;
+		mnt = mnt->mnt_parent;
+	}
+	return mnt == root->mnt && is_subdir(dentry, root->dentry);
+}
+
+static struct vfsmount *get_peer_under_root(struct vfsmount *mnt,
+					    struct mnt_namespace *ns,
+					    const struct path *root)
 {
 	struct vfsmount *m = mnt;
 
 	do {
-		if (m->mnt_ns == ns)
+		/* Check the namespace first for optimization */
+		if (m->mnt_ns == ns && is_path_reachable(m, m->mnt_root, root))
 			return m;
+
 		m = next_peer(m);
 	} while (m != mnt);
 
 	return NULL;
 }
 
-int get_dominator_id_same_ns(struct vfsmount *mnt)
+int get_dominator_id_same_root(struct vfsmount *mnt, const struct path *root)
 {
 	int id = -1;
 	struct vfsmount *m;
 
 	spin_lock(&vfsmount_lock);
 	for (m = mnt->mnt_master; m != NULL; m = m->mnt_master) {
-		struct vfsmount *d = get_peer_in_ns(m, mnt->mnt_ns);
+		struct vfsmount *d = get_peer_under_root(m, mnt->mnt_ns, root);
 		if (d) {
 			id = d->mnt_pgid;
 			break;
Index: linux/fs/pnode.h
===================================================================
--- linux.orig/fs/pnode.h	2008-03-13 20:45:52.000000000 +0100
+++ linux/fs/pnode.h	2008-03-13 20:45:53.000000000 +0100
@@ -34,6 +34,6 @@ int propagate_mount_busy(struct vfsmount
 
 int get_peer_group_id(struct vfsmount *);
 int get_master_group_id(struct vfsmount *);
-int get_dominator_id_same_ns(struct vfsmount *);
+int get_dominator_id_same_root(struct vfsmount *, const struct path *);
 
 #endif /* _LINUX_PNODE_H */

--

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 0/6] vfs: mountinfo update
  2008-03-13 21:26 [patch 0/6] vfs: mountinfo update Miklos Szeredi
                   ` (5 preceding siblings ...)
  2008-03-13 21:26 ` [patch 6/6] vfs: mountinfo: only show mounts under tasks root Miklos Szeredi
@ 2008-03-13 22:53 ` Andrew Morton
  2008-03-14  8:17   ` Miklos Szeredi
  6 siblings, 1 reply; 38+ messages in thread
From: Andrew Morton @ 2008-03-13 22:53 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: viro, linuxram, linux-fsdevel, linux-kernel

On Thu, 13 Mar 2008 22:26:41 +0100
Miklos Szeredi <miklos@szeredi.hu> wrote:

> Hi Andrew,
> 
> Here are several updates to the /proc/<pid>/mountinfo patch by Ram and
> me.  These hopefully address all comments by you, Al and others.
> 

diffstat for all seven patches is:

 Documentation/filesystems/proc.txt |   35 +++++
 fs/dcache.c                        |  101 +++++++++++---
 fs/namespace.c                     |  182 ++++++++++++++++++++++-----
 fs/pnode.c                         |  131 ++++++++++++++++++-
 fs/pnode.h                         |   15 +-
 fs/proc/base.c                     |  121 +++++++++--------
 fs/seq_file.c                      |   92 +++++++++++--
 include/linux/dcache.h             |    5 
 include/linux/mnt_namespace.h      |   12 +
 include/linux/mount.h              |    2 
 include/linux/seq_file.h           |    7 +
 11 files changed, 559 insertions(+), 144 deletions(-)

that's a mountain of tricksy new core-kernel code just for some /proc file.

Is this all really justifiable?

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 0/6] vfs: mountinfo update
  2008-03-13 22:53 ` [patch 0/6] vfs: mountinfo update Andrew Morton
@ 2008-03-14  8:17   ` Miklos Szeredi
  2008-03-14 19:29     ` Ram Pai
  0 siblings, 1 reply; 38+ messages in thread
From: Miklos Szeredi @ 2008-03-14  8:17 UTC (permalink / raw)
  To: akpm; +Cc: miklos, viro, linuxram, linux-fsdevel, linux-kernel

> diffstat for all seven patches is:
> 
>  Documentation/filesystems/proc.txt |   35 +++++
>  fs/dcache.c                        |  101 +++++++++++---
>  fs/namespace.c                     |  182 ++++++++++++++++++++++-----
>  fs/pnode.c                         |  131 ++++++++++++++++++-
>  fs/pnode.h                         |   15 +-
>  fs/proc/base.c                     |  121 +++++++++--------
>  fs/seq_file.c                      |   92 +++++++++++--
>  include/linux/dcache.h             |    5 
>  include/linux/mnt_namespace.h      |   12 +
>  include/linux/mount.h              |    2 
>  include/linux/seq_file.h           |    7 +
>  11 files changed, 559 insertions(+), 144 deletions(-)
> 
> that's a mountain of tricksy new core-kernel code just for some /proc file.
> 
> Is this all really justifiable?

This is an approximate order of added features from most complex to
least complex:

 1) basic info about mount propagation
 2) root of the mount (source directory of bind)
 3) disambiguating reachable mounts from unreachable (/proc/mounts
    under chroot is a mess, and people are afraid to fix it:
    http://lkml.org/lkml/2007/4/17/271)
 4) showing "dominating group" which intersects with current namespace/root
 5) allocating ID's from IDR pools instead of ever increasing 64bit counters
 6) showing ID of mount and parent mount
 7) st_dev of mount

1, 2, 3 and 6 we definitely want.  Al wants 4.  I think 5 is useful
for readability, but not absolutely necessary.

All in all, I think there's very little that we can do, without
throwing out the baby with the bathwater.

Also compare this with the number of lines and mind boggling
complexity added for the mount propagation stuff, which is still very
little used years after it's introduction.  Possibly because some of
the infrastructure is still missing from kernel as well as userspace
to make it all really useful.  And I think this patch set goes towards
getting that infrastructure in place.

Possibly mount propagation will remain on the fringe for ever, in
which case a big chunk of this patch set (1 and 4) also end up being
useless.  The worst that can happen is that we end up adding some
#ifdef CONFIG_MOUNT_PROPAGATION clutter to namespace.c.

Miklos

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 0/6] vfs: mountinfo update
  2008-03-14  8:17   ` Miklos Szeredi
@ 2008-03-14 19:29     ` Ram Pai
  0 siblings, 0 replies; 38+ messages in thread
From: Ram Pai @ 2008-03-14 19:29 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: akpm, viro, linux-fsdevel, linux-kernel

On Fri, 2008-03-14 at 09:17 +0100, Miklos Szeredi wrote:
> > diffstat for all seven patches is:
> > 
> >  Documentation/filesystems/proc.txt |   35 +++++
> >  fs/dcache.c                        |  101 +++++++++++---
> >  fs/namespace.c                     |  182 ++++++++++++++++++++++-----
> >  fs/pnode.c                         |  131 ++++++++++++++++++-
> >  fs/pnode.h                         |   15 +-
> >  fs/proc/base.c                     |  121 +++++++++--------
> >  fs/seq_file.c                      |   92 +++++++++++--
> >  include/linux/dcache.h             |    5 
> >  include/linux/mnt_namespace.h      |   12 +
> >  include/linux/mount.h              |    2 
> >  include/linux/seq_file.h           |    7 +
> >  11 files changed, 559 insertions(+), 144 deletions(-)
> > 
> > that's a mountain of tricksy new core-kernel code just for some /proc file.
> > 
> > Is this all really justifiable?
> 
> This is an approximate order of added features from most complex to
> least complex:
> 
>  1) basic info about mount propagation
>  2) root of the mount (source directory of bind)
>  3) disambiguating reachable mounts from unreachable (/proc/mounts
>     under chroot is a mess, and people are afraid to fix it:
>     http://lkml.org/lkml/2007/4/17/271)
>  4) showing "dominating group" which intersects with current namespace/root
>  5) allocating ID's from IDR pools instead of ever increasing 64bit counters
>  6) showing ID of mount and parent mount
>  7) st_dev of mount
> 
> 1, 2, 3 and 6 we definitely want.  Al wants 4.  I think 5 is useful
> for readability, but not absolutely necessary.
> 
> All in all, I think there's very little that we can do, without
> throwing out the baby with the bathwater.
> 
> Also compare this with the number of lines and mind boggling
> complexity added for the mount propagation stuff, which is still very
> little used years after it's introduction.  Possibly because some of
> the infrastructure is still missing from kernel as well as userspace
> to make it all really useful.  And I think this patch set goes towards
> getting that infrastructure in place.

The inability to see the propagation tree was one major block towards
the use of sharedsubtree. Patch (1) fills up that gap and hence
absolutely needed. I view the remaining patches as patches directly
or indirectly supporting (1). 

As far as the 'mind boggling complex code' I believe the container folks
use it or at-least plan to use it. I know one our software groups
actively leveraging this functionality. 

RP


> 
> Possibly mount propagation will remain on the fringe for ever, in
> which case a big chunk of this patch set (1 and 4) also end up being
> useless.  The worst that can happen is that we end up adding some
> #ifdef CONFIG_MOUNT_PROPAGATION clutter to namespace.c.
> 
> Miklos


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 2/6] vfs: pnode cleanup
  2008-03-13 21:26 ` [patch 2/6] vfs: pnode cleanup Miklos Szeredi
@ 2008-03-19 11:16   ` Al Viro
  2008-03-19 11:48     ` Miklos Szeredi
  0 siblings, 1 reply; 38+ messages in thread
From: Al Viro @ 2008-03-19 11:16 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: akpm, linuxram, linux-fsdevel, linux-kernel

On Thu, Mar 13, 2008 at 10:26:43PM +0100, Miklos Szeredi wrote:
> Move set_mnt_shared from pnode.h to pnode.c.  Change
> CLEAR_MNT_SHARED() to clear_mnt_shared() function, and move to
> pnode.c.

I don't see why these two are a cleanup, actually.

> @@ -89,20 +100,18 @@ static int do_make_slave(struct vfsmount
>  		list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave)
>  			slave_mnt->mnt_master = master;
>  		list_move(&mnt->mnt_slave, &master->mnt_slave_list);
> -		list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev);
> -		INIT_LIST_HEAD(&mnt->mnt_slave_list);
> +		list_splice_init(&mnt->mnt_slave_list,
> +				 master->mnt_slave_list.prev);

Umm...  OK.

>  	} else {
> -		struct list_head *p = &mnt->mnt_slave_list;
> -		while (!list_empty(p)) {
> -                        slave_mnt = list_first_entry(p,
> +		while (!list_empty(&mnt->mnt_slave_list)) {
> +			slave_mnt = list_first_entry(&mnt->mnt_slave_list,

How is that better?


> -	INIT_LIST_HEAD(&mnt->mnt_slave_list);

Fine by me.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 4/6] vfs: mountinfo show dominating group id
  2008-03-13 21:26 ` [patch 4/6] vfs: mountinfo show dominating " Miklos Szeredi
@ 2008-03-19 11:37   ` Al Viro
  2008-03-19 12:03     ` Miklos Szeredi
  0 siblings, 1 reply; 38+ messages in thread
From: Al Viro @ 2008-03-19 11:37 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: akpm, linuxram, linux-fsdevel, linux-kernel

On Thu, Mar 13, 2008 at 10:26:45PM +0100, Miklos Szeredi wrote:
> From: Miklos Szeredi <mszeredi@suse.cz>
> 
> Show peer group ID of nearest dominating group that has intersection
> with the mount's namespace.

There's an obvious problem here: ->show_options() can spew _anything_,
including a string that ends on " shared:42".  Makes reliable parsing of
the damn thing in userland impossible.  IOW, fs options should go _last_
and they should follow an unconditionally present field.

>  	if (sb->s_op->show_options)
>  		err = sb->s_op->show_options(m, mnt);
> -	if (IS_MNT_SHARED(mnt)) {
> -		seq_printf(m, " shared:%i", get_peer_group_id(mnt));
> -		if (IS_MNT_SLAVE(mnt))
> -			seq_printf(m, ",slave:%i", get_master_id(mnt));
> -	} else if (IS_MNT_SLAVE(mnt)) {
> -		seq_printf(m, " slave:%i", get_master_id(mnt));
> +	seq_putc(m, ' ');
> +	if (IS_MNT_SHARED(mnt) || IS_MNT_SLAVE(mnt)) {
> +		if (IS_MNT_SHARED(mnt))
> +			seq_printf(m, "shared:%i", get_peer_group_id(mnt));
> +		if (IS_MNT_SLAVE(mnt)) {
> +			int dominator_id = get_dominator_id_same_ns(mnt);

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 3/6] vfs: mountinfo stable peer group id
  2008-03-13 21:26 ` [patch 3/6] vfs: mountinfo stable peer group id Miklos Szeredi
@ 2008-03-19 11:48   ` Al Viro
  2008-03-19 16:41     ` Miklos Szeredi
  0 siblings, 1 reply; 38+ messages in thread
From: Al Viro @ 2008-03-19 11:48 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: akpm, linuxram, linux-fsdevel, linux-kernel

On Thu, Mar 13, 2008 at 10:26:44PM +0100, Miklos Szeredi wrote:
> From: Miklos Szeredi <mszeredi@suse.cz>
> 
> Add a stable identifier for shared mounts.

> +static DEFINE_SPINLOCK(mnt_pgid_lock);

Um?  Do you ever need to take it outside of vfsmount_lock?

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 2/6] vfs: pnode cleanup
  2008-03-19 11:16   ` Al Viro
@ 2008-03-19 11:48     ` Miklos Szeredi
  0 siblings, 0 replies; 38+ messages in thread
From: Miklos Szeredi @ 2008-03-19 11:48 UTC (permalink / raw)
  To: viro; +Cc: miklos, akpm, linuxram, linux-fsdevel, linux-kernel

> On Thu, Mar 13, 2008 at 10:26:43PM +0100, Miklos Szeredi wrote:
> > Move set_mnt_shared from pnode.h to pnode.c.  Change
> > CLEAR_MNT_SHARED() to clear_mnt_shared() function, and move to
> > pnode.c.
> 
> I don't see why these two are a cleanup, actually.

People don't like my cleanups :( sniff...

> > @@ -89,20 +100,18 @@ static int do_make_slave(struct vfsmount
> >  		list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave)
> >  			slave_mnt->mnt_master = master;
> >  		list_move(&mnt->mnt_slave, &master->mnt_slave_list);
> > -		list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev);
> > -		INIT_LIST_HEAD(&mnt->mnt_slave_list);
> > +		list_splice_init(&mnt->mnt_slave_list,
> > +				 master->mnt_slave_list.prev);
> 
> Umm...  OK.

Yeah, not much of an improvement.

> >  	} else {
> > -		struct list_head *p = &mnt->mnt_slave_list;
> > -		while (!list_empty(p)) {
> > -                        slave_mnt = list_first_entry(p,
> > +		while (!list_empty(&mnt->mnt_slave_list)) {
> > +			slave_mnt = list_first_entry(&mnt->mnt_slave_list,
> 
> How is that better?

I find it easier to read.  Renaming "p" to "slaves" would also have
been better.  Single letter variables are reserved for iterators in my
mind...

Miklos

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 5/6] vfs: optimization to /proc/<pid>/mountinfo patch
  2008-03-13 21:26 ` [patch 5/6] vfs: optimization to /proc/<pid>/mountinfo patch Miklos Szeredi
@ 2008-03-19 11:56   ` Al Viro
  2008-03-19 16:56     ` Miklos Szeredi
  0 siblings, 1 reply; 38+ messages in thread
From: Al Viro @ 2008-03-19 11:56 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: akpm, linuxram, linux-fsdevel, linux-kernel

On Thu, Mar 13, 2008 at 10:26:46PM +0100, Miklos Szeredi wrote:
> +static int prepend(char **buffer, int *buflen, const char *str,
> +			  int namelen)

inline, please.

> @@ -1911,10 +1902,9 @@ char *dentry_path(struct dentry *dentry,
>  
>  	spin_lock(&dcache_lock);
>  	prepend(&end, &buflen, "\0", 1);
> -	if (!IS_ROOT(dentry) && d_unhashed(dentry)) {
> -		if (prepend(&end, &buflen, "//deleted", 9))
> +	if (!IS_ROOT(dentry) && d_unhashed(dentry) &&
> +		(prepend(&end, &buflen, " (deleted)", 10) != 0))
>  			goto Elong;

That's a bad idea:
	* we bloody well might want to use it outside of procfs
	* //deleted is _better_; you can't have an empty path component,
but you can have a pathname ending on " (deleted)".

> +#ifdef CONFIG_PROC_FS
>  /*
>   * returns the path of the 'dentry' from the root of its filesystem.
>   */
> @@ -411,7 +412,7 @@ int seq_dentry(struct seq_file *m, struc
>  	m->count = m->size;
>  	return -1;
>  }
> -EXPORT_SYMBOL(seq_dentry);
> +#endif /* CONFIG_PROC_FS */

Same comment; you don't need procfs to use seq_file.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 4/6] vfs: mountinfo show dominating group id
  2008-03-19 11:37   ` Al Viro
@ 2008-03-19 12:03     ` Miklos Szeredi
  2008-03-19 12:19       ` Miklos Szeredi
  0 siblings, 1 reply; 38+ messages in thread
From: Miklos Szeredi @ 2008-03-19 12:03 UTC (permalink / raw)
  To: viro; +Cc: miklos, akpm, linuxram, linux-fsdevel, linux-kernel

> > From: Miklos Szeredi <mszeredi@suse.cz>
> > 
> > Show peer group ID of nearest dominating group that has intersection
> > with the mount's namespace.
> 
> There's an obvious problem here: ->show_options() can spew _anything_,
> including a string that ends on " shared:42".

Yeah, even though I'd call that very broken, I wouldn't like to go
auditing filesystems for such breakage.

>  Makes reliable parsing of
> the damn thing in userland impossible.  IOW, fs options should go _last_
> and they should follow an unconditionally present field.

But then we lose the ability to later extend the format by adding
fields at the end.  Which is one of the things that would be nice to
have, in contrast to /proc/mounts, which we are so afraid to touch now.

So maybe some alternative, multi line format would be better?

MountID: 99
ParentID: 88
DevID: 0:34
Type: foofs
Source: /dev/foo
Root: /
MountPoint: /mnt/foo
MountOpts: rw,noatime
Opts: rw,errors=continue
Propagation: shared:42

Miklos

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 6/6] vfs: mountinfo: only show mounts under tasks root
  2008-03-13 21:26 ` [patch 6/6] vfs: mountinfo: only show mounts under tasks root Miklos Szeredi
@ 2008-03-19 12:12   ` Al Viro
  2008-03-19 12:25     ` Miklos Szeredi
  0 siblings, 1 reply; 38+ messages in thread
From: Al Viro @ 2008-03-19 12:12 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: akpm, linuxram, linux-fsdevel, linux-kernel

On Thu, Mar 13, 2008 at 10:26:47PM +0100, Miklos Szeredi wrote:
> From: Miklos Szeredi <mszeredi@suse.cz>
> 
> 1. Only show reachable mounts in /proc/<pid>/mountinfo, this makes
> mountpoints unambiguous for chrooted processes.
> 
> 2. Instead of showing mountpoints relative to the current root, always
> show them relative to the queried task's root.
> 
> This means, that a particular mountinfo file will always have the same
> contents, regardless of which process is reading the file.  Which is a
> lot more consistent, than the current behavior of /proc/<pid>/mounts.
> 
> Addressed comments from Jan Blunck and Ram Pai.

>  	return retval;
>  
>  global_root:
> +	if (only_reachable)
> +		return ERR_PTR(-EINVAL);

Humm...  Might make sense to update *root in case we hit that place and
do the rest in callers, instead of playing with extra arguments...
Hell knows, I'd try to massage in that direction and see if anything
clean shows up.

>  static int show_mountinfo(struct seq_file *m, void *v)
>  {
> +	struct proc_mounts *p = m->private;
> +	size_t count_save = m->count;

*UGH*.  Do you really need that?  Frankly, in that case I'd rather
separate the check from __d_path(); unwinds like that are Not Nice(tm).

> +	if (seq_path_root(m, &mnt_path, &p->root, " \t\n\\") == -EINVAL) {
> +		/* path is outside root */
> +		m->count = count_save;

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 4/6] vfs: mountinfo show dominating group id
  2008-03-19 12:03     ` Miklos Szeredi
@ 2008-03-19 12:19       ` Miklos Szeredi
  2008-03-19 12:41         ` Al Viro
  0 siblings, 1 reply; 38+ messages in thread
From: Miklos Szeredi @ 2008-03-19 12:19 UTC (permalink / raw)
  To: miklos; +Cc: viro, miklos, akpm, linuxram, linux-fsdevel, linux-kernel

> >  Makes reliable parsing of
> > the damn thing in userland impossible.  IOW, fs options should go _last_
> > and they should follow an unconditionally present field.
> 
> But then we lose the ability to later extend the format by adding
> fields at the end.  Which is one of the things that would be nice to
> have, in contrast to /proc/mounts, which we are so afraid to touch now.
> 
> So maybe some alternative, multi line format would be better?
> 
> MountID: 99
> ParentID: 88
> DevID: 0:34
> Type: foofs
> Source: /dev/foo
> Root: /
> MountPoint: /mnt/foo
> MountOpts: rw,noatime
> Opts: rw,errors=continue
> Propagation: shared:42

Which still doesn't fully solve the problem, since ->show_options()
can also spew newlines + MountID:.  Oh well.

Miklos

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 6/6] vfs: mountinfo: only show mounts under tasks root
  2008-03-19 12:12   ` Al Viro
@ 2008-03-19 12:25     ` Miklos Szeredi
  0 siblings, 0 replies; 38+ messages in thread
From: Miklos Szeredi @ 2008-03-19 12:25 UTC (permalink / raw)
  To: viro; +Cc: miklos, akpm, linuxram, linux-fsdevel, linux-kernel

> >  static int show_mountinfo(struct seq_file *m, void *v)
> >  {
> > +	struct proc_mounts *p = m->private;
> > +	size_t count_save = m->count;
> 
> *UGH*.  Do you really need that?  Frankly, in that case I'd rather
> separate the check from __d_path(); unwinds like that are Not Nice(tm).

Agreed.  I did the separate check first, but then there's a window
between the check and __d_path() where the mountpoint might move
outside the root, and trying to prevent that would also add quite a
bit of ugliness.

Miklos

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 4/6] vfs: mountinfo show dominating group id
  2008-03-19 12:19       ` Miklos Szeredi
@ 2008-03-19 12:41         ` Al Viro
  2008-03-19 13:07           ` Miklos Szeredi
  0 siblings, 1 reply; 38+ messages in thread
From: Al Viro @ 2008-03-19 12:41 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: akpm, linuxram, linux-fsdevel, linux-kernel

On Wed, Mar 19, 2008 at 01:19:42PM +0100, Miklos Szeredi wrote:
> > So maybe some alternative, multi line format would be better?
> > 
> > MountID: 99
> > ParentID: 88
> > DevID: 0:34
> > Type: foofs
> > Source: /dev/foo
> > Root: /
> > MountPoint: /mnt/foo
> > MountOpts: rw,noatime
> > Opts: rw,errors=continue
> > Propagation: shared:42
> 
> Which still doesn't fully solve the problem, since ->show_options()
> can also spew newlines + MountID:.  Oh well.

a) ban newlines in ->show_options(); that's a requirement that is easy
to formulate and understand, so it has a chance to survive the contact
with reality.

b) the order is all wrong - *everything* that depends on fs type should
be after fs type and everything else should be prior to it.  That way
you don't need to know what the hell does this fs type spew in order to
parse type-independent information.  In particular, "source" (BTW, why
do you capitalize those?) certainly has no business being in front of
fs type; as the matter of fact, I'm not at all sure that we _want_ it
separated from the rest of type-dependent options.  The fact that mount(2)
gets it in a separate argument is a historical accident...

c) since you are tagging the fields anyway, why do you need newlines?
Moreover, you don't really need to tag everything - there's a well-defined
beginning and optional fields between it and (type+rest) are the only
things that needs to be tagged...  BTW, why bother with Propagation: part
and gluing shared:... with slave:... into a single field?  Separate them
with whitespace - you have recognizable prefixes right there.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 4/6] vfs: mountinfo show dominating group id
  2008-03-19 12:41         ` Al Viro
@ 2008-03-19 13:07           ` Miklos Szeredi
  0 siblings, 0 replies; 38+ messages in thread
From: Miklos Szeredi @ 2008-03-19 13:07 UTC (permalink / raw)
  To: viro; +Cc: miklos, akpm, linuxram, linux-fsdevel, linux-kernel

> > > So maybe some alternative, multi line format would be better?
> > > 
> > > MountID: 99
> > > ParentID: 88
> > > DevID: 0:34
> > > Type: foofs
> > > Source: /dev/foo
> > > Root: /
> > > MountPoint: /mnt/foo
> > > MountOpts: rw,noatime
> > > Opts: rw,errors=continue
> > > Propagation: shared:42
> > 
> > Which still doesn't fully solve the problem, since ->show_options()
> > can also spew newlines + MountID:.  Oh well.
> 
> a) ban newlines in ->show_options(); that's a requirement that is easy
> to formulate and understand, so it has a chance to survive the contact
> with reality.
> 
> b) the order is all wrong - *everything* that depends on fs type should
> be after fs type and everything else should be prior to it.  That way
> you don't need to know what the hell does this fs type spew in order to
> parse type-independent information.  In particular, "source" (BTW, why
> do you capitalize those?) certainly has no business being in front of
> fs type; as the matter of fact, I'm not at all sure that we _want_ it
> separated from the rest of type-dependent options.  The fact that mount(2)
> gets it in a separate argument is a historical accident...
> 
> c) since you are tagging the fields anyway, why do you need newlines?
> Moreover, you don't really need to tag everything - there's a well-defined
> beginning and optional fields between it and (type+rest) are the only
> things that needs to be tagged...  BTW, why bother with Propagation: part
> and gluing shared:... with slave:... into a single field?  Separate them
> with whitespace - you have recognizable prefixes right there.
> 

99 88 0:34 / /mnt/foo rw,noatime shared:42 slave:13 foofs /dev/foo,errors=continue

Something like that?  It assumes, that fs types never have ':' in
them, but that's acceptable.

Miklos

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 3/6] vfs: mountinfo stable peer group id
  2008-03-19 11:48   ` Al Viro
@ 2008-03-19 16:41     ` Miklos Szeredi
  2008-03-19 18:20       ` Al Viro
  0 siblings, 1 reply; 38+ messages in thread
From: Miklos Szeredi @ 2008-03-19 16:41 UTC (permalink / raw)
  To: viro; +Cc: miklos, akpm, linuxram, linux-fsdevel, linux-kernel

> > From: Miklos Szeredi <mszeredi@suse.cz>
> > 
> > Add a stable identifier for shared mounts.
> 
> > +static DEFINE_SPINLOCK(mnt_pgid_lock);
> 
> Um?  Do you ever need to take it outside of vfsmount_lock?
> 

Tried to think this through:

It's always called with namespace_sem, which is enough, no need for a
new lock.  The bigger problem, is that it _is_ called with
vfsmount_lock in one case, which is bad, since the allocation may
sleep.

That is in do_change_type().  But do we really need to hold
vfsmount_lock in that case?  I think not, the propagation tree has no
relevance outside namespace_sem, so that one should be sufficient.

This patch should fix it (untested, just for review).

I have a half mind to throw out the IDR allocation altogether, and
just go with a 64bit counter, some poeple would much prefer that...

Thanks,
Miklos

---
 fs/namespace.c |    2 --
 fs/pnode.c     |   20 ++++++++------------
 2 files changed, 8 insertions(+), 14 deletions(-)

Index: linux/fs/pnode.c
===================================================================
--- linux.orig/fs/pnode.c	2008-03-19 16:39:28.000000000 +0100
+++ linux/fs/pnode.c	2008-03-19 17:26:44.000000000 +0100
@@ -12,7 +12,6 @@
 #include <linux/idr.h>
 #include "pnode.h"
 
-static DEFINE_SPINLOCK(mnt_pgid_lock);
 static DEFINE_IDA(mnt_pgid_ida);
 
 /* return the next shared peer mount of @p */
@@ -37,19 +36,19 @@ static void __set_mnt_shared(struct vfsm
 	mnt->mnt_flags |= MNT_SHARED;
 }
 
+/*
+ * - must hold namespace_sem to protect the mnt_pgid_ida
+ * - must not hold vfsmount_lock, because this function might sleep
+ */
 void set_mnt_shared(struct vfsmount *mnt)
 {
 	int res;
 
- retry:
-	spin_lock(&mnt_pgid_lock);
-	if (IS_MNT_SHARED(mnt)) {
-		spin_unlock(&mnt_pgid_lock);
+	might_sleep();
+	if (IS_MNT_SHARED(mnt))
 		return;
-	}
-
+ retry:
 	res = ida_get_new(&mnt_pgid_ida, &mnt->mnt_pgid);
-	spin_unlock(&mnt_pgid_lock);
 	if (res == -EAGAIN) {
 		if (ida_pre_get(&mnt_pgid_ida, GFP_KERNEL))
 			goto retry;
@@ -159,11 +158,8 @@ static int do_make_slave(struct vfsmount
 			peer_mnt = NULL;
 	}
 
-	if (IS_MNT_SHARED(mnt) && list_empty(&mnt->mnt_share)) {
-		spin_lock(&mnt_pgid_lock);
+	if (IS_MNT_SHARED(mnt) && list_empty(&mnt->mnt_share))
 		ida_remove(&mnt_pgid_ida, mnt->mnt_pgid);
-		spin_unlock(&mnt_pgid_lock);
-	}
 	list_del_init(&mnt->mnt_share);
 
 	if (peer_mnt)
Index: linux/fs/namespace.c
===================================================================
--- linux.orig/fs/namespace.c	2008-03-19 16:39:28.000000000 +0100
+++ linux/fs/namespace.c	2008-03-19 17:23:06.000000000 +0100
@@ -1351,10 +1351,8 @@ static noinline int do_change_type(struc
 		return -EINVAL;
 
 	down_write(&namespace_sem);
-	spin_lock(&vfsmount_lock);
 	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
 		change_mnt_propagation(m, type);
-	spin_unlock(&vfsmount_lock);
 	up_write(&namespace_sem);
 	return 0;
 }


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 5/6] vfs: optimization to /proc/<pid>/mountinfo patch
  2008-03-19 11:56   ` Al Viro
@ 2008-03-19 16:56     ` Miklos Szeredi
  0 siblings, 0 replies; 38+ messages in thread
From: Miklos Szeredi @ 2008-03-19 16:56 UTC (permalink / raw)
  To: viro; +Cc: miklos, akpm, linuxram, linux-fsdevel, linux-kernel

> On Thu, Mar 13, 2008 at 10:26:46PM +0100, Miklos Szeredi wrote:
> > +static int prepend(char **buffer, int *buflen, const char *str,
> > +			  int namelen)
> 
> inline, please.

Akpm wrote earlier about this function:
| Both the newly-added inlines in this patch are wrong.  They will result in
| a larger and slower kernel.  This should be very well known by now.

Please fight that out with him :)

> > @@ -1911,10 +1902,9 @@ char *dentry_path(struct dentry *dentry,
> >  
> >  	spin_lock(&dcache_lock);
> >  	prepend(&end, &buflen, "\0", 1);
> > -	if (!IS_ROOT(dentry) && d_unhashed(dentry)) {
> > -		if (prepend(&end, &buflen, "//deleted", 9))
> > +	if (!IS_ROOT(dentry) && d_unhashed(dentry) &&
> > +		(prepend(&end, &buflen, " (deleted)", 10) != 0))
> >  			goto Elong;
> 
> That's a bad idea:
> 	* we bloody well might want to use it outside of procfs
> 	* //deleted is _better_; you can't have an empty path component,
> but you can have a pathname ending on " (deleted)".

This was again Andrew's comment (making it look the same as the links
in /proc/PID/fd), but here I have to agree that //deleted is probably
better in this case.

Miklos

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 3/6] vfs: mountinfo stable peer group id
  2008-03-19 16:41     ` Miklos Szeredi
@ 2008-03-19 18:20       ` Al Viro
  2008-03-19 18:37         ` Miklos Szeredi
  0 siblings, 1 reply; 38+ messages in thread
From: Al Viro @ 2008-03-19 18:20 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: akpm, linuxram, linux-fsdevel, linux-kernel

On Wed, Mar 19, 2008 at 05:41:15PM +0100, Miklos Szeredi wrote:
> > > From: Miklos Szeredi <mszeredi@suse.cz>
> > Um?  Do you ever need to take it outside of vfsmount_lock?
> > 
> 
> Tried to think this through:
> 
> It's always called with namespace_sem, which is enough, no need for a
> new lock.  The bigger problem, is that it _is_ called with
> vfsmount_lock in one case, which is bad, since the allocation may
> sleep.

It is called with vfsmount_lock in *all* cases.  You've missed one
in umount_tree(), BTW; you won't block in that case, though.

> That is in do_change_type().  But do we really need to hold
> vfsmount_lock in that case?

Not the issue.

>  I think not, the propagation tree has no
> relevance outside namespace_sem, so that one should be sufficient.

Callers manipulate more than propagation tree.  Note that e.g.
umount_tree() changes all sorts of data structures, including ones
that are traversed without namespace_sem.

I _really_ don't like the idea of different locking rules for caller
of a function depending on the value of argument of that function.
They are complicated enough as it is.

Argh...  OK, I'll try to put something together tonight, after I get some
sleep - 31 hours of uptime _suck_ ;-/  BTW, on top of everything else,
the current variant plays interesting games with CL_PROPAGATION behaviour
and I really don't like the look of what it's doing there.  Later...

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 3/6] vfs: mountinfo stable peer group id
  2008-03-19 18:20       ` Al Viro
@ 2008-03-19 18:37         ` Miklos Szeredi
  2008-03-20 21:43           ` Al Viro
  2008-03-22 16:27           ` Al Viro
  0 siblings, 2 replies; 38+ messages in thread
From: Miklos Szeredi @ 2008-03-19 18:37 UTC (permalink / raw)
  To: viro; +Cc: miklos, akpm, linuxram, linux-fsdevel, linux-kernel

> > > > From: Miklos Szeredi <mszeredi@suse.cz>
> > > Um?  Do you ever need to take it outside of vfsmount_lock?
> > > 
> > 
> > Tried to think this through:
> > 
> > It's always called with namespace_sem, which is enough, no need for a
> > new lock.  The bigger problem, is that it _is_ called with
> > vfsmount_lock in one case, which is bad, since the allocation may
> > sleep.
> 
> It is called with vfsmount_lock in *all* cases.  You've missed one
> in umount_tree(), BTW; you won't block in that case, though.

set_mnt_shared() is called from namespace.c as well, without
vfsmount_lock.  But agreed, that's not the real issue.

> 
> > That is in do_change_type().  But do we really need to hold
> > vfsmount_lock in that case?
> 
> Not the issue.
> 
> >  I think not, the propagation tree has no
> > relevance outside namespace_sem, so that one should be sufficient.
> 
> Callers manipulate more than propagation tree.  Note that e.g.
> umount_tree() changes all sorts of data structures, including ones
> that are traversed without namespace_sem.
> 
> I _really_ don't like the idea of different locking rules for caller
> of a function depending on the value of argument of that function.
> They are complicated enough as it is.
> 
> Argh...  OK, I'll try to put something together tonight, after I get some
> sleep - 31 hours of uptime _suck_ ;-/

Gosh, yes.

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 3/6] vfs: mountinfo stable peer group id
  2008-03-19 18:37         ` Miklos Szeredi
@ 2008-03-20 21:43           ` Al Viro
  2008-03-21  8:57             ` Miklos Szeredi
                               ` (2 more replies)
  2008-03-22 16:27           ` Al Viro
  1 sibling, 3 replies; 38+ messages in thread
From: Al Viro @ 2008-03-20 21:43 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: akpm, linuxram, linux-fsdevel, linux-kernel, Trond.Myklebust, dhowells

On Wed, Mar 19, 2008 at 07:37:51PM +0100, Miklos Szeredi wrote:

> > Argh...  OK, I'll try to put something together tonight, after I get some
> > sleep - 31 hours of uptime _suck_ ;-/
> 
> Gosh, yes.

Folks, we have some problems.  I've sat down to write documentation on
locking rules, etc. for struct vfsmount and there are some serious
turds in the area.  Among the other things we have interesting race
between shrink_submounts() and copy_tree() - the former calls
propagate_mount_busy(), which walks ->mnt_share and friends.  The
latter adds elements to those.  Wouldn't be a problem, except that
shrink_submounts() only holds vfsmount_lock while everything else
(including copy_tree()) uses namespace_sem to protect those.

There's another interesting issue with shrink_submounts() - it can
put vfsmounts back on the wrong list.  If e.g. cifs is mounted under
nfs and does an automount, umount of nfs might push cifs expirables
*to* *nfs* *list*.

I'm not sure that I understand Trond's intentions with that stuff;
what exactly are we trying to achieve and why the hell is it fs-specific
to start with?

Another interesting thing is locking in mark_mounts_for_expiry(); we
go to all sorts of convolutions that are, AFAICS, not needed anymore.
We used to need rather interesting logics back when we had per-namespace
semaphores; thus grabbing the namespace, dropping vfsmount_lock, dealing
with races in case if something manages to walk into the potential
victim, etc.

I think that having the damn thing global *and* having umount_tree()
callable under vfsmount_lock (we unhash everything in the subtree and
put it on the kill list, so that nothing can walk into those suckers
via hash lookup; release_mounts() called after we are done (and after
releasing vfsmount_lock *and* namespace_sem) will take care of actual
talking to filesystems) removes the need of all that crap.

IOW, mark_mounts_for_expiry() should do the following:
	grabbing namespace_sem exclusive
	grab vfsmount_lock
	walk the list as it does now, except that it should do the right
check from the very beginning (propagate_mount_busy())
	without dropping the vfsmount_lock, go through the collected list,
calling umount_tree()
	drop the locks
	do release_mounts()
The second pass is needed since umount_tree() might do interesting things
to expiry list, so we make life easier for ourselves by leaving that to
second pass when we just want to drain the resulting list until it's empty.

Does anybody see holes in the above?

shrink_submounts() is _probably_ similar (lock/collect/umount_tree on all/
unlock/release_mounts), but I'm not sure if I understand WTF is really
attempted in there.

Is there any reason why we do that in ->umount_begin() and not *after*
it, unconditionally, straight from do_umount()?  AFAICS, the only reason
why it's done from fs-specific code is figuring out which mount-list
should the stuff go back to, and that's both broken *and* not needed
with sanitized locking as above.  While we are at it, I'd rather return
->umount_begin() to its previous prototype, TYVM - the less filesystem
sees vfsmounts, the better off we all are...

Comments?  If nobody objects, I'm going to do that in vfs-fixes branch
and then push to Linus...

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 3/6] vfs: mountinfo stable peer group id
  2008-03-20 21:43           ` Al Viro
@ 2008-03-21  8:57             ` Miklos Szeredi
  2008-03-22  3:49             ` Al Viro
  2008-03-24  8:50             ` Ram Pai
  2 siblings, 0 replies; 38+ messages in thread
From: Miklos Szeredi @ 2008-03-21  8:57 UTC (permalink / raw)
  To: viro
  Cc: miklos, akpm, linuxram, linux-fsdevel, linux-kernel,
	Trond.Myklebust, dhowells

> IOW, mark_mounts_for_expiry() should do the following:
> 	grabbing namespace_sem exclusive
> 	grab vfsmount_lock
> 	walk the list as it does now, except that it should do the right
> check from the very beginning (propagate_mount_busy())
> 	without dropping the vfsmount_lock, go through the collected list,
> calling umount_tree()
> 	drop the locks
> 	do release_mounts()
> The second pass is needed since umount_tree() might do interesting things
> to expiry list, so we make life easier for ourselves by leaving that to
> second pass when we just want to drain the resulting list until it's empty.
> 
> Does anybody see holes in the above?
> 
> shrink_submounts() is _probably_ similar (lock/collect/umount_tree on all/
> unlock/release_mounts), but I'm not sure if I understand WTF is really
> attempted in there.
> 
> Is there any reason why we do that in ->umount_begin() and not *after*
> it, unconditionally, straight from do_umount()?  AFAICS, the only reason
> why it's done from fs-specific code is figuring out which mount-list
> should the stuff go back to, and that's both broken *and* not needed
> with sanitized locking as above.  While we are at it, I'd rather return
> ->umount_begin() to its previous prototype, TYVM - the less filesystem
> sees vfsmounts, the better off we all are...

All of that seems sane to me.

Miklos

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 3/6] vfs: mountinfo stable peer group id
  2008-03-20 21:43           ` Al Viro
  2008-03-21  8:57             ` Miklos Szeredi
@ 2008-03-22  3:49             ` Al Viro
  2008-03-22  3:54               ` Al Viro
  2008-03-22  4:11               ` Al Viro
  2008-03-24  8:50             ` Ram Pai
  2 siblings, 2 replies; 38+ messages in thread
From: Al Viro @ 2008-03-22  3:49 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: akpm, linuxram, linux-fsdevel, linux-kernel, Trond.Myklebust, dhowells

On Thu, Mar 20, 2008 at 09:43:19PM +0000, Al Viro wrote:
> shrink_submounts() is _probably_ similar (lock/collect/umount_tree on all/
> unlock/release_mounts), but I'm not sure if I understand WTF is really
> attempted in there.

Argh...  Doing release_mounts() after collection phase won't work ;-/
It would leave references to parents until the very end, leaving us
with false-busy shrinkable vfsmounts if we had shrinkable automounted
on top of shrinkable...

It does work for mark_mounts_for_expiry(), but not here.  We could do
the same kind of loops as now, releasing namespace_sem after each
portion of candidates, doing release_mounts() and regaining namespace_sem,
but that leaves us with indefinitely long stalls if somebody keeps
doing lookups triggering automounts.  OTOH, we probably could get away
with separate counter covering only that kind of references...  That
would be bumped in umount_tree() (at the same point where we decrement
d_mounted) and dropped in release_mounts() when we reset ->mnt_parent
and do mntput() on it.

Then we would simply make do_refcount_check() in pnode.c do
        int mycount = atomic_read(&mnt->mnt_count) - mnt->mnt_ghosts;
        return (mycount > count);
instead of what it does now, and everything would work fine...

So, let's define mnt->mnt_ghosts by requiring that outside of vfsmount_lock
it would be equal to number of vfsmounts with ->mnt_parent == mnt that are
_not_ on child list of mnt.

	We'd need to decrement it in release_mounts(), increment in
mnt_set_mountpoint(), decrement again in attach_mnt() (which strongly
suggests that increment should happen in _callers_ of mnt_set_mountpoint(),
so that attach_mnt() wouldn't modify it at all), decrement in commit_tree(),
and increment in umount_tree() at the same point where we play with d_mounted.
AFAICS, that's all.

	Shifting increment from mnt_set_mountpoint() and commit_tree()
to theirs callers and collapsing where possible, we get the following:
	* decrement in release_mounts() when resetting ->mnt_parent
	* increment in propagate_mnt() after call of mnt_set_mountpoint()
	* decrement in attach_recursive_mnt() in the loop calling
commit_tree() for clones (on mountpoint of each clone).
	* increment in umount_tree() at the point where we update d_mounted.

All these places are under vfsmount_lock, so we are fine with plain int; no
atomics needed.

So...  Attack plan: introduce mnt_ghosts+use it in propagate_mnt_busy()
(that gets rid of false-busy stuff), then switch shrink_submounts() and
mark_mounts_for_expiry() to the scheme from the previous posting, then
call shrink_submounts() from do_umount() unconditionally, removing it from
->umount_begin() instances, then restore sane prototype for shrink_submounts().
Four patches...

Comments?  Ram, Miklos, Trond?

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 3/6] vfs: mountinfo stable peer group id
  2008-03-22  3:49             ` Al Viro
@ 2008-03-22  3:54               ` Al Viro
  2008-03-22  4:11               ` Al Viro
  1 sibling, 0 replies; 38+ messages in thread
From: Al Viro @ 2008-03-22  3:54 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: akpm, linuxram, linux-fsdevel, linux-kernel, Trond.Myklebust, dhowells

On Sat, Mar 22, 2008 at 03:49:50AM +0000, Al Viro wrote:
> ->umount_begin() instances, then restore sane prototype for shrink_submounts().

s/shrink_submounts/->umount_begin/ in the line above

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 3/6] vfs: mountinfo stable peer group id
  2008-03-22  3:49             ` Al Viro
  2008-03-22  3:54               ` Al Viro
@ 2008-03-22  4:11               ` Al Viro
  2008-03-22  4:56                 ` Al Viro
  2008-03-30 19:33                 ` Ram Pai
  1 sibling, 2 replies; 38+ messages in thread
From: Al Viro @ 2008-03-22  4:11 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: akpm, linuxram, linux-fsdevel, linux-kernel, Trond.Myklebust, dhowells

On Sat, Mar 22, 2008 at 03:49:50AM +0000, Al Viro wrote:

> 	Shifting increment from mnt_set_mountpoint() and commit_tree()
> to theirs callers and collapsing where possible, we get the following:
> 	* decrement in release_mounts() when resetting ->mnt_parent
> 	* increment in propagate_mnt() after call of mnt_set_mountpoint()
> 	* decrement in attach_recursive_mnt() in the loop calling
> commit_tree() for clones (on mountpoint of each clone).
> 	* increment in umount_tree() at the point where we update d_mounted.

... except that it'd give a leak in case of mount to shared mountpoint
failing halfway through - we'll get double increments since umount_tree()
would hit the mountpoints of cloned trees with extra increment, even though 
reference from root of cloned to its mountpoint is _already_ a ghost.

OTOH, we probably don't want to bother with counting those anyway - i.e.
it's simply a bad definition and the right one would be along the lines of
"number of vfsmounts that are doomed to be eaten by release_mounts() and
that have ->mnt_parent pointing to us".  IOW, dropping the 2nd and 3rd
in the above would do the right thing - anything chewed by umount_tree()
*will* go to release_mounts() and ones in flight are what we are interested
in...

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 3/6] vfs: mountinfo stable peer group id
  2008-03-22  4:11               ` Al Viro
@ 2008-03-22  4:56                 ` Al Viro
  2008-03-30 19:33                 ` Ram Pai
  1 sibling, 0 replies; 38+ messages in thread
From: Al Viro @ 2008-03-22  4:56 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: akpm, linuxram, linux-fsdevel, linux-kernel, Trond.Myklebust, dhowells

	OK, preliminary patches are in
git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6.git/ expiry-fixes

BTW, cifs use of MNT_SHRINKABLE is broken; it updates mnt_flags on
*mountpoint* instead of just having them set on new vfsmount.
Their expiry policy is also very odd - "expire everything cifs
whenever some cifs filesystem is unmounted".  Hell knows what had
been intended there (dfs_shrink_umount_helper() and friends)...

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 3/6] vfs: mountinfo stable peer group id
  2008-03-19 18:37         ` Miklos Szeredi
  2008-03-20 21:43           ` Al Viro
@ 2008-03-22 16:27           ` Al Viro
  2008-03-24  8:19             ` Ram Pai
  1 sibling, 1 reply; 38+ messages in thread
From: Al Viro @ 2008-03-22 16:27 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: akpm, linuxram, linux-fsdevel, linux-kernel

On Wed, Mar 19, 2008 at 07:37:51PM +0100, Miklos Szeredi wrote:
> set_mnt_shared() is called from namespace.c as well, without
> vfsmount_lock.  But agreed, that's not the real issue.

How about the following: let's separate set_mnt_shared() and inventing
group ids.  All we need is this:
invent_group_ids(mnt)	/* call under namespace_sem */
	for all vfsmounts p in subtree rooted at mnt
		if p->mnt_share is non-empty
			continue
		get ID for p
		if allocation fails
			goto cleanup
	return 0
cleanup:
	for all vfsmounts q in subtree rooted at mnt
		if q == p
			break
		if q->mnt_share is non-empty
			continue
		release ID of q
	return -ENOMEM

Now here's what we do:
	* in do_change_type(), outside of vfsmount_lock, do invent_group_ids()
If it fails - bugger off, if not - proceed as now.
	* in attach_recursive_mnt() if IS_MNT_SHARED(dest_mnt) do
invent_group_ids() on the dest_mnt immediately and if it fails do
umount_tree(dest_mnt, 0, ) under vfsmount_lock, then release_mounts()
and bugger off (FWIW, we might want to lift the last part to caller
and do the same to release_mounts() in propagate_mnt()).  If it hadn't
failed, we proceed as now.
	* in clone_mnt() do
	int new_group = group ID of old;
	int free_group = 0;
	if (flag & (CL_SLAVE | CL_PRIVATE))
		new_group = 0; /* not a peer of original */
	if ((flag & CL_MAKE_SHARED) && !new_group)
		new_group = allocate new ID
		if failed
			return 0;
		free_group = 1;
	}
	mnt = alloc_vfsmount();
	if (mnt) {
		set group ID of mnt to new_group;
		free_group = 0;
		/* as in mainline */
	}
	if (free_group)
		release group ID found in new_group;
	return mnt;

then (after allocating new vfsmount) set its group ID to new_group if
alloc_vfsmount() succeeds.  Otherwise release group ID if needed and
bugger off as usual.

No need to mess with any additional exclusion for idr protection or with
any kind of retries; allocation failure is allocation failure.

Releasing group ID should be done from do_make_slave(), along with clearing
group ID in vfsmount.

Care to do that using mountinfo-base in vfs-2.6.git as base tree?

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 3/6] vfs: mountinfo stable peer group id
  2008-03-22 16:27           ` Al Viro
@ 2008-03-24  8:19             ` Ram Pai
  2008-03-24  9:34               ` Al Viro
  0 siblings, 1 reply; 38+ messages in thread
From: Ram Pai @ 2008-03-24  8:19 UTC (permalink / raw)
  To: Al Viro; +Cc: Miklos Szeredi, akpm, linux-fsdevel, linux-kernel

On Sat, 2008-03-22 at 16:27 +0000, Al Viro wrote:
> On Wed, Mar 19, 2008 at 07:37:51PM +0100, Miklos Szeredi wrote:
> > set_mnt_shared() is called from namespace.c as well, without
> > vfsmount_lock.  But agreed, that's not the real issue.
> 
> How about the following: let's separate set_mnt_shared() and inventing
> group ids.  All we need is this:
> invent_group_ids(mnt)	/* call under namespace_sem */
> 	for all vfsmounts p in subtree rooted at mnt
> 		if p->mnt_share is non-empty
> 			continue
> 		get ID for p
> 		if allocation fails
> 			goto cleanup
> 	return 0
> cleanup:
> 	for all vfsmounts q in subtree rooted at mnt
> 		if q == p
> 			break
> 		if q->mnt_share is non-empty
> 			continue
> 		release ID of q
> 	return -ENOMEM
> 
> Now here's what we do:
> 	* in do_change_type(), outside of vfsmount_lock, do invent_group_ids()
> If it fails - bugger off, if not - proceed as now.

Has it to be done outside vfsmount_lock?  AFAICT, invent_group_ids()
does not sleep, nor does change_mnt_propagation().

> 	* in attach_recursive_mnt() if IS_MNT_SHARED(dest_mnt) do
> invent_group_ids() on the dest_mnt immediately and if it fails do

I think you meant, invent_group_ids() on the source_mnt.  But again
applying invent_group_ids() on the source_mnt has to be done carefully,
because, source_mnt may have been shared to begin with.

right?
RP

> umount_tree(dest_mnt, 0, ) under vfsmount_lock, then release_mounts()
> and bugger off (FWIW, we might want to lift the last part to caller
> and do the same to release_mounts() in propagate_mnt()).  If it hadn't
> failed, we proceed as now.
> 	* in clone_mnt() do
> 	int new_group = group ID of old;
> 	int free_group = 0;
> 	if (flag & (CL_SLAVE | CL_PRIVATE))
> 		new_group = 0; /* not a peer of original */
> 	if ((flag & CL_MAKE_SHARED) && !new_group)
> 		new_group = allocate new ID
> 		if failed
> 			return 0;
> 		free_group = 1;
> 	}
> 	mnt = alloc_vfsmount();
> 	if (mnt) {
> 		set group ID of mnt to new_group;
> 		free_group = 0;
> 		/* as in mainline */
> 	}
> 	if (free_group)
> 		release group ID found in new_group;
> 	return mnt;
> 
> then (after allocating new vfsmount) set its group ID to new_group if
> alloc_vfsmount() succeeds.  Otherwise release group ID if needed and
> bugger off as usual.
> 
> No need to mess with any additional exclusion for idr protection or with
> any kind of retries; allocation failure is allocation failure.
> 
> Releasing group ID should be done from do_make_slave(), along with clearing
> group ID in vfsmount.
> 
> Care to do that using mountinfo-base in vfs-2.6.git as base tree?


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 3/6] vfs: mountinfo stable peer group id
  2008-03-20 21:43           ` Al Viro
  2008-03-21  8:57             ` Miklos Szeredi
  2008-03-22  3:49             ` Al Viro
@ 2008-03-24  8:50             ` Ram Pai
  2008-03-24  8:54               ` Christoph Hellwig
  2008-03-24  9:53               ` Al Viro
  2 siblings, 2 replies; 38+ messages in thread
From: Ram Pai @ 2008-03-24  8:50 UTC (permalink / raw)
  To: Al Viro
  Cc: Miklos Szeredi, akpm, linux-fsdevel, linux-kernel,
	Trond.Myklebust, dhowells

On Thu, 2008-03-20 at 21:43 +0000, Al Viro wrote:
> On Wed, Mar 19, 2008 at 07:37:51PM +0100, Miklos Szeredi wrote:
> 
> > > Argh...  OK, I'll try to put something together tonight, after I get some
> > > sleep - 31 hours of uptime _suck_ ;-/
> > 
> > Gosh, yes.
> 
.....snip...
> Is there any reason why we do that in ->umount_begin() and not *after*
> it, unconditionally, straight from do_umount()?  AFAICS, the only reason
> why it's done from fs-specific code is figuring out which mount-list
> should the stuff go back to, and that's both broken *and* not needed
> with sanitized locking as above.  While we are at it, I'd rather return
> ->umount_begin() to its previous prototype, TYVM - the less filesystem
> sees vfsmounts, the better off we all are...

I think that ->umount_begin also acts a hook for providing pre-umount
event notification to userspace from filesystems; something that is
required by DMAPI interface.

RP

> 
> Comments?  If nobody objects, I'm going to do that in vfs-fixes branch
> and then push to Linus...


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 3/6] vfs: mountinfo stable peer group id
  2008-03-24  8:50             ` Ram Pai
@ 2008-03-24  8:54               ` Christoph Hellwig
  2008-03-24  9:53               ` Al Viro
  1 sibling, 0 replies; 38+ messages in thread
From: Christoph Hellwig @ 2008-03-24  8:54 UTC (permalink / raw)
  To: Ram Pai
  Cc: Al Viro, Miklos Szeredi, akpm, linux-fsdevel, linux-kernel,
	Trond.Myklebust, dhowells

On Mon, Mar 24, 2008 at 01:50:14AM -0700, Ram Pai wrote:
> I think that ->umount_begin also acts a hook for providing pre-umount
> event notification to userspace from filesystems; something that is
> required by DMAPI interface.

No such thing like dmapi near mainline, and we ever want user-space
assiseted HDM in mainline we'd do it in VFS.

FYI: SGI out of tree dmapi doesn't make use of ->unmount_begin either.


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 3/6] vfs: mountinfo stable peer group id
  2008-03-24  8:19             ` Ram Pai
@ 2008-03-24  9:34               ` Al Viro
  0 siblings, 0 replies; 38+ messages in thread
From: Al Viro @ 2008-03-24  9:34 UTC (permalink / raw)
  To: Ram Pai; +Cc: Miklos Szeredi, akpm, linux-fsdevel, linux-kernel

On Mon, Mar 24, 2008 at 01:19:41AM -0700, Ram Pai wrote:
> > 	* in do_change_type(), outside of vfsmount_lock, do invent_group_ids()
> > If it fails - bugger off, if not - proceed as now.
> 
> Has it to be done outside vfsmount_lock?  AFAICT, invent_group_ids()
> does not sleep, nor does change_mnt_propagation().

It does allocation.  And no, GFP_ATOMIC is not appropriate for that.
The same goes for mount IDs, BTW, and there we _also_ don't need
vfsmount_lock - see my current tree, there we have clone_mnt()
serialized by namespace_sem in all cases.

> > 	* in attach_recursive_mnt() if IS_MNT_SHARED(dest_mnt) do
> > invent_group_ids() on the dest_mnt immediately and if it fails do
> 
> I think you meant, invent_group_ids() on the source_mnt.
Yes

> But again
> applying invent_group_ids() on the source_mnt has to be done carefully,
> because, source_mnt may have been shared to begin with.

And?  See the original posting - invent_group_ids() skips the vfsmounts
with already set group ID.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 3/6] vfs: mountinfo stable peer group id
  2008-03-24  8:50             ` Ram Pai
  2008-03-24  8:54               ` Christoph Hellwig
@ 2008-03-24  9:53               ` Al Viro
  1 sibling, 0 replies; 38+ messages in thread
From: Al Viro @ 2008-03-24  9:53 UTC (permalink / raw)
  To: Ram Pai
  Cc: Miklos Szeredi, akpm, linux-fsdevel, linux-kernel,
	Trond.Myklebust, dhowells

On Mon, Mar 24, 2008 at 01:50:14AM -0700, Ram Pai wrote:
> > Is there any reason why we do that in ->umount_begin() and not *after*
> > it, unconditionally, straight from do_umount()?  AFAICS, the only reason
> > why it's done from fs-specific code is figuring out which mount-list
> > should the stuff go back to, and that's both broken *and* not needed
> > with sanitized locking as above.  While we are at it, I'd rather return
> > ->umount_begin() to its previous prototype, TYVM - the less filesystem
> > sees vfsmounts, the better off we all are...
> 
> I think that ->umount_begin also acts a hook for providing pre-umount
> event notification to userspace from filesystems; something that is
> required by DMAPI interface.

"Why, so can I, and so can any man..."

IOW, DMAPI might require whatever its authors want it to require, but
what does that have to do with us?

BTW, I've dropped several more patches into the tree (sanitizing
namespace.c/pnode.c) and merged that into mountinfo-base.  Documentation
is mostly done, so it will be the next thing to go there, then it's
time for 'dissolve unreachable subtrees on d_invalidate()' stuff.
Which probably will mean *another* cyclic list in vfsmount, not anchored
but pointed to by replacement of d_mounted; same protection as for
mnt_child/mnt_mounts, contains vfsmounts with given mnt_mountpoint.
I'm not too fond of that, but I don't see cleaner way to do it at the
moment.  Any better ideas are welcome...

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 3/6] vfs: mountinfo stable peer group id
  2008-03-22  4:11               ` Al Viro
  2008-03-22  4:56                 ` Al Viro
@ 2008-03-30 19:33                 ` Ram Pai
  1 sibling, 0 replies; 38+ messages in thread
From: Ram Pai @ 2008-03-30 19:33 UTC (permalink / raw)
  To: Al Viro
  Cc: Miklos Szeredi, akpm, linux-fsdevel, linux-kernel,
	Trond.Myklebust, dhowells

On Sat, 2008-03-22 at 04:11 +0000, Al Viro wrote:
> On Sat, Mar 22, 2008 at 03:49:50AM +0000, Al Viro wrote:
> 
> >       Shifting increment from mnt_set_mountpoint() and commit_tree()
> > to theirs callers and collapsing where possible, we get the
> following:
> >       * decrement in release_mounts() when resetting ->mnt_parent
> >       * increment in propagate_mnt() after call of
> mnt_set_mountpoint()
> >       * decrement in attach_recursive_mnt() in the loop calling
> > commit_tree() for clones (on mountpoint of each clone).
> >       * increment in umount_tree() at the point where we update
> d_mounted.
> 
> ... except that it'd give a leak in case of mount to shared mountpoint
> failing halfway through - we'll get double increments since
> umount_tree()
> would hit the mountpoints of cloned trees with extra increment, even
> though 
> reference from root of cloned to its mountpoint is _already_ a ghost.

> OTOH, we probably don't want to bother with counting those anyway -
> i.e.
> it's simply a bad definition and the right one would be along the
> lines of
> "number of vfsmounts that are doomed to be eaten by release_mounts()
> and
> that have ->mnt_parent pointing to us".  IOW, dropping the 2nd and 3rd
> in the above would do the right thing - anything chewed by
> umount_tree()
> *will* go to release_mounts() and ones in flight are what we are
> interested
> in... 

By not accounting for the ghost reference created in propagate_mnt(),
i.e case 2 and 3; the race is still on with shrink_mounts. But I think,
you are right. We don't want the shrink_mounts and friends to think that
the mounts are available to be purged, by accounting them into
mnt_ghosts.

RP





^ permalink raw reply	[flat|nested] 38+ messages in thread

end of thread, other threads:[~2008-03-30 19:33 UTC | newest]

Thread overview: 38+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-03-13 21:26 [patch 0/6] vfs: mountinfo update Miklos Szeredi
2008-03-13 21:26 ` [patch 1/6] vfs: mountinfo -mm fix Miklos Szeredi
2008-03-13 21:26 ` [patch 2/6] vfs: pnode cleanup Miklos Szeredi
2008-03-19 11:16   ` Al Viro
2008-03-19 11:48     ` Miklos Szeredi
2008-03-13 21:26 ` [patch 3/6] vfs: mountinfo stable peer group id Miklos Szeredi
2008-03-19 11:48   ` Al Viro
2008-03-19 16:41     ` Miklos Szeredi
2008-03-19 18:20       ` Al Viro
2008-03-19 18:37         ` Miklos Szeredi
2008-03-20 21:43           ` Al Viro
2008-03-21  8:57             ` Miklos Szeredi
2008-03-22  3:49             ` Al Viro
2008-03-22  3:54               ` Al Viro
2008-03-22  4:11               ` Al Viro
2008-03-22  4:56                 ` Al Viro
2008-03-30 19:33                 ` Ram Pai
2008-03-24  8:50             ` Ram Pai
2008-03-24  8:54               ` Christoph Hellwig
2008-03-24  9:53               ` Al Viro
2008-03-22 16:27           ` Al Viro
2008-03-24  8:19             ` Ram Pai
2008-03-24  9:34               ` Al Viro
2008-03-13 21:26 ` [patch 4/6] vfs: mountinfo show dominating " Miklos Szeredi
2008-03-19 11:37   ` Al Viro
2008-03-19 12:03     ` Miklos Szeredi
2008-03-19 12:19       ` Miklos Szeredi
2008-03-19 12:41         ` Al Viro
2008-03-19 13:07           ` Miklos Szeredi
2008-03-13 21:26 ` [patch 5/6] vfs: optimization to /proc/<pid>/mountinfo patch Miklos Szeredi
2008-03-19 11:56   ` Al Viro
2008-03-19 16:56     ` Miklos Szeredi
2008-03-13 21:26 ` [patch 6/6] vfs: mountinfo: only show mounts under tasks root Miklos Szeredi
2008-03-19 12:12   ` Al Viro
2008-03-19 12:25     ` Miklos Szeredi
2008-03-13 22:53 ` [patch 0/6] vfs: mountinfo update Andrew Morton
2008-03-14  8:17   ` Miklos Szeredi
2008-03-14 19:29     ` Ram Pai

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).