LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
From: Dave Hansen <haveblue@us.ibm.com>
To: linux-kernel@vger.kernel.org
Cc: hch@lst.de, viro@ZenIV.linux.org.uk, viro@ftp.linux.org.uk,
	miklos@szeredi.hu, Dave Hansen <haveblue@us.ibm.com>
Subject: [RFC][PATCH 20/30] r/o bind mounts: elevate write count for open()s
Date: Fri, 08 Feb 2008 14:27:14 -0800	[thread overview]
Message-ID: <20080208222714.989BEE44@kernel> (raw)
In-Reply-To: <20080208222641.6024A7CC@kernel>


This is the first really tricky patch in the series.  It elevates the writer
count on a mount each time a non-special file is opened for write.

We used to do this in may_open(), but Miklos pointed out that __dentry_open()
is used as well to create filps.  This will cover even those cases, while a
call in may_open() would not have.

There is also an elevated count around the vfs_create() call in open_namei(). 
See the comments for more details, but we need this to fix a 'create, remount,
fail r/w open()' race.

Some filesystems forego the use of normal vfs calls to create
struct files.   Make sure that these users elevate the mnt
writer count because they will get __fput(), and we need
to make sure they're balanced.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 linux-2.6.git-dave/fs/file_table.c |   14 ++++++
 linux-2.6.git-dave/fs/namei.c      |   75 ++++++++++++++++++++++++++++++++-----
 linux-2.6.git-dave/fs/open.c       |   36 ++++++++++++++++-
 linux-2.6.git-dave/ipc/mqueue.c    |   16 ++++++-
 4 files changed, 127 insertions(+), 14 deletions(-)

diff -puN fs/file_table.c~r-o-bind-mounts-elevate-write-count-opened-files fs/file_table.c
--- linux-2.6.git/fs/file_table.c~r-o-bind-mounts-elevate-write-count-opened-files	2008-02-08 13:04:55.000000000 -0800
+++ linux-2.6.git-dave/fs/file_table.c	2008-02-08 13:04:55.000000000 -0800
@@ -199,6 +199,17 @@ int init_file(struct file *file, struct 
 	file->f_mapping = dentry->d_inode->i_mapping;
 	file->f_mode = mode;
 	file->f_op = fop;
+
+	/*
+	 * These mounts don't really matter in practice
+	 * for r/o bind mounts.  They aren't userspace-
+	 * visible.  We do this for consistency, and so
+	 * that we can do debugging checks at __fput()
+	 */
+	if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) {
+		error = mnt_want_write(mnt);
+		WARN_ON(error);
+	}
 	return error;
 }
 EXPORT_SYMBOL(init_file);
@@ -221,10 +232,13 @@ EXPORT_SYMBOL(fput);
  */
 void drop_file_write_access(struct file *file)
 {
+	struct vfsmount *mnt = file->f_path.mnt;
 	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
 
 	put_write_access(inode);
+	if (!special_file(inode->i_mode))
+		mnt_drop_write(mnt);
 }
 EXPORT_SYMBOL_GPL(drop_file_write_access);
 
diff -puN fs/namei.c~r-o-bind-mounts-elevate-write-count-opened-files fs/namei.c
--- linux-2.6.git/fs/namei.c~r-o-bind-mounts-elevate-write-count-opened-files	2008-02-08 13:04:55.000000000 -0800
+++ linux-2.6.git-dave/fs/namei.c	2008-02-08 13:04:55.000000000 -0800
@@ -1620,8 +1620,7 @@ int may_open(struct nameidata *nd, int a
 			return -EACCES;
 
 		flag &= ~O_TRUNC;
-	} else if (IS_RDONLY(inode) && (acc_mode & MAY_WRITE))
-		return -EROFS;
+	}
 
 	error = vfs_permission(nd, acc_mode);
 	if (error)
@@ -1721,18 +1720,32 @@ static inline int open_to_namei_flags(in
 	return flag;
 }
 
+static int open_will_write_to_fs(int flag, struct inode *inode)
+{
+	/*
+	 * We'll never write to the fs underlying
+	 * a device file.
+	 */
+	if (special_file(inode->i_mode))
+		return 0;
+	return (flag & O_TRUNC);
+}
+
 /*
- * Note that the low bits of "flag" aren't the same as in the open
- * system call.  See open_to_namei_flags().
+ * Note that the low bits of the passed in "open_flag"
+ * are not the same as in the local variable "flag". See
+ * open_to_namei_flags() for more details.
  */
 struct file *do_filp_open(int dfd, const char *pathname,
 		int open_flag, int mode)
 {
+	struct file *filp;
 	struct nameidata nd;
 	int acc_mode, error;
 	struct path path;
 	struct dentry *dir;
 	int count = 0;
+	int will_write;
 	int flag = open_to_namei_flags(open_flag);
 
 	acc_mode = ACC_MODE(flag);
@@ -1788,17 +1801,30 @@ do_last:
 	}
 
 	if (IS_ERR(nd.intent.open.file)) {
-		mutex_unlock(&dir->d_inode->i_mutex);
 		error = PTR_ERR(nd.intent.open.file);
-		goto exit_dput;
+		goto exit_mutex_unlock;
 	}
 
 	/* Negative dentry, just create the file */
 	if (!path.dentry->d_inode) {
-		error = __open_namei_create(&nd, &path, flag, mode);
+		/*
+		 * This write is needed to ensure that a
+		 * ro->rw transition does not occur between
+		 * the time when the file is created and when
+		 * a permanent write count is taken through
+		 * the 'struct file' in nameidata_to_filp().
+		 */
+		error = mnt_want_write(nd.mnt);
 		if (error)
+			goto exit_mutex_unlock;
+		error = __open_namei_create(&nd, &path, flag, mode);
+		if (error) {
+			mnt_drop_write(nd.mnt);
 			goto exit;
-		return nameidata_to_filp(&nd, open_flag);
+		}
+		filp = nameidata_to_filp(&nd, open_flag);
+		mnt_drop_write(nd.mnt);
+		return filp;
 	}
 
 	/*
@@ -1828,11 +1854,40 @@ do_last:
 	if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
 		goto exit;
 ok:
+	/*
+	 * Consider:
+	 * 1. may_open() truncates a file
+	 * 2. a rw->ro mount transition occurs
+	 * 3. nameidata_to_filp() fails due to
+	 *    the ro mount.
+	 * That would be inconsistent, and should
+	 * be avoided. Taking this mnt write here
+	 * ensures that (2) can not occur.
+	 */
+	will_write = open_will_write_to_fs(flag, nd.dentry->d_inode);
+	if (will_write) {
+		error = mnt_want_write(nd.mnt);
+		if (error)
+			goto exit;
+	}
 	error = may_open(&nd, acc_mode, flag);
-	if (error)
+	if (error) {
+		if (will_write)
+			mnt_drop_write(nd.mnt);
 		goto exit;
-	return nameidata_to_filp(&nd, open_flag);
+	}
+	filp = nameidata_to_filp(&nd, open_flag);
+	/*
+	 * It is now safe to drop the mnt write
+	 * because the filp has had a write taken
+	 * on its behalf.
+	 */
+	if (will_write)
+		mnt_drop_write(nd.mnt);
+	return filp;
 
+exit_mutex_unlock:
+	mutex_unlock(&dir->d_inode->i_mutex);
 exit_dput:
 	dput_path(&path, &nd);
 exit:
diff -puN fs/open.c~r-o-bind-mounts-elevate-write-count-opened-files fs/open.c
--- linux-2.6.git/fs/open.c~r-o-bind-mounts-elevate-write-count-opened-files	2008-02-08 13:04:55.000000000 -0800
+++ linux-2.6.git-dave/fs/open.c	2008-02-08 13:04:55.000000000 -0800
@@ -734,6 +734,35 @@ out:
 	return error;
 }
 
+/*
+ * You have to be very careful that these write
+ * counts get cleaned up in error cases and
+ * upon __fput().  This should probably never
+ * be called outside of __dentry_open().
+ */
+static inline int __get_file_write_access(struct inode *inode,
+					  struct vfsmount *mnt)
+{
+	int error;
+	error = get_write_access(inode);
+	if (error)
+		return error;
+	/*
+	 * Do not take mount writer counts on
+	 * special files since no writes to
+	 * the mount itself will occur.
+	 */
+	if (!special_file(inode->i_mode)) {
+		/*
+		 * Balanced in __fput()
+		 */
+		error = mnt_want_write(mnt);
+		if (error)
+			put_write_access(inode);
+	}
+	return error;
+}
+
 static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
 					int flags, struct file *f,
 					int (*open)(struct inode *, struct file *))
@@ -746,7 +775,7 @@ static struct file *__dentry_open(struct
 				FMODE_PREAD | FMODE_PWRITE;
 	inode = dentry->d_inode;
 	if (f->f_mode & FMODE_WRITE) {
-		error = get_write_access(inode);
+		error = __get_file_write_access(inode, mnt);
 		if (error)
 			goto cleanup_file;
 	}
@@ -788,8 +817,11 @@ static struct file *__dentry_open(struct
 
 cleanup_all:
 	fops_put(f->f_op);
-	if (f->f_mode & FMODE_WRITE)
+	if (f->f_mode & FMODE_WRITE) {
 		put_write_access(inode);
+		if (!special_file(inode->i_mode))
+			mnt_drop_write(mnt);
+	}
 	file_kill(f);
 	f->f_path.dentry = NULL;
 	f->f_path.mnt = NULL;
diff -puN ipc/mqueue.c~r-o-bind-mounts-elevate-write-count-opened-files ipc/mqueue.c
--- linux-2.6.git/ipc/mqueue.c~r-o-bind-mounts-elevate-write-count-opened-files	2008-02-08 13:04:55.000000000 -0800
+++ linux-2.6.git-dave/ipc/mqueue.c	2008-02-08 13:04:55.000000000 -0800
@@ -599,6 +599,7 @@ static struct file *do_create(struct den
 			int oflag, mode_t mode, struct mq_attr __user *u_attr)
 {
 	struct mq_attr attr;
+	struct file *result;
 	int ret;
 
 	if (u_attr) {
@@ -613,13 +614,24 @@ static struct file *do_create(struct den
 	}
 
 	mode &= ~current->fs->umask;
+	ret = mnt_want_write(mqueue_mnt);
+	if (ret)
+		goto out;
 	ret = vfs_create(dir->d_inode, dentry, mode, NULL);
 	dentry->d_fsdata = NULL;
 	if (ret)
-		goto out;
+		goto out_drop_write;
 
-	return dentry_open(dentry, mqueue_mnt, oflag);
+	result = dentry_open(dentry, mqueue_mnt, oflag);
+	/*
+	 * dentry_open() took a persistent mnt_want_write(),
+	 * so we can now drop this one.
+	 */
+	mnt_drop_write(mqueue_mnt);
+	return result;
 
+out_drop_write:
+	mnt_drop_write(mqueue_mnt);
 out:
 	dput(dentry);
 	mntput(mqueue_mnt);
_

  parent reply	other threads:[~2008-02-08 22:36 UTC|newest]

Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-02-08 22:26 [RFC][PATCH 00/30] Read-only bind mounts (-mm resend) Dave Hansen
2008-02-08 22:26 ` [RFC][PATCH 01/30] reiserfs: eliminate private use of struct file in xattr Dave Hansen
2008-02-08 22:26 ` [RFC][PATCH 02/30] hppfs pass vfsmount to dentry_open() Dave Hansen
2008-02-08 22:26 ` [RFC][PATCH 03/30] check for null vfsmount in dentry_open() Dave Hansen
2008-02-08 22:26 ` [RFC][PATCH 04/30] fix up new filp allocators Dave Hansen
2008-02-08 22:26 ` [RFC][PATCH 05/30] do namei_flags calculation inside open_namei() Dave Hansen
2008-02-08 22:26 ` [RFC][PATCH 06/30] make open_namei() return a filp Dave Hansen
2008-02-09  5:09   ` Christoph Hellwig
2008-02-08 22:26 ` [RFC][PATCH 07/30] r/o bind mounts: stub functions Dave Hansen
2008-02-08 22:26 ` [RFC][PATCH 08/30] r/o bind mounts: create helper to drop file write access Dave Hansen
2008-02-08 22:26 ` [RFC][PATCH 09/30] r/o bind mounts: drop write during emergency remount Dave Hansen
2008-02-08 22:27 ` [RFC][PATCH 10/30] r/o bind mounts: elevate write count for vfs_rmdir() Dave Hansen
2008-02-08 22:27 ` [RFC][PATCH 11/30] r/o bind mounts: elevate write count for callers of vfs_mkdir() Dave Hansen
2008-02-08 22:27 ` [RFC][PATCH 12/30] r/o bind mounts: elevate mnt_writers for unlink callers Dave Hansen
2008-02-08 22:27 ` [RFC][PATCH 13/30] r/o bind mounts: elevate write count for xattr_permission() callers Dave Hansen
2008-02-08 22:27 ` [RFC][PATCH 14/30] r/o bind mounts: elevate write count for ncp_ioctl() Dave Hansen
2008-02-08 22:27 ` [RFC][PATCH 15/30] r/o bind mounts: write counts for time functions Dave Hansen
2008-02-08 22:27 ` [RFC][PATCH 16/30] r/o bind mounts: elevate write count for do_utimes() Dave Hansen
2008-02-08 22:27 ` [RFC][PATCH 17/30] r/o bind mounts: write count for file_update_time() Dave Hansen
2008-02-08 22:27 ` [RFC][PATCH 18/30] r/o bind mounts: write counts for link/symlink Dave Hansen
2008-02-08 22:27 ` [RFC][PATCH 19/30] r/o bind mounts: elevate write count for ioctls() Dave Hansen
2008-02-08 22:27 ` Dave Hansen [this message]
2008-02-08 22:27 ` [RFC][PATCH 21/30] r/o bind mounts: get write access for vfs_rename() callers Dave Hansen
2008-02-08 22:27 ` [RFC][PATCH 22/30] r/o bind mounts: elevate write count for chmod/chown callers Dave Hansen
2008-02-08 22:27 ` [RFC][PATCH 23/30] r/o bind mounts: write counts for truncate() Dave Hansen
2008-02-08 22:27 ` [RFC][PATCH 24/30] r/o bind mounts: elevate count for xfs timestamp updates Dave Hansen
2008-02-08 22:27 ` [RFC][PATCH 25/30] r/o bind mounts: make access() use new r/o helper Dave Hansen
2008-02-08 22:27 ` [RFC][PATCH 26/30] r/o bind mounts: check mnt instead of superblock directly Dave Hansen
2008-02-08 22:27 ` [RFC][PATCH 27/30] r/o bind mounts: get callers of vfs_mknod/create() Dave Hansen
2008-02-08 22:27 ` [RFC][PATCH 28/30] r/o bind mounts: track numbers of writers to mounts Dave Hansen
2008-02-08 22:27 ` [RFC][PATCH 29/30] r/o bind mounts: honor mount writer counts at remount Dave Hansen
2008-02-08 22:27 ` [RFC][PATCH 30/30] r/o bind mounts: debugging for missed calls Dave Hansen
2008-02-09  6:39 ` [RFC][PATCH 00/30] Read-only bind mounts (-mm resend) Christoph Hellwig
2008-02-09  7:57 ` Al Viro
2008-02-12  5:06 ` Christoph Hellwig

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20080208222714.989BEE44@kernel \
    --to=haveblue@us.ibm.com \
    --cc=hch@lst.de \
    --cc=linux-kernel@vger.kernel.org \
    --cc=miklos@szeredi.hu \
    --cc=viro@ZenIV.linux.org.uk \
    --cc=viro@ftp.linux.org.uk \
    --subject='Re: [RFC][PATCH 20/30] r/o bind mounts: elevate write count for open()s' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).