LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
From: Andreas Gruenbacher <agruenba@redhat.com>
To: Linus Torvalds <torvalds@linux-foundation.org>,
	Alexander Viro <viro@zeniv.linux.org.uk>,
	Christoph Hellwig <hch@infradead.org>,
	"Darrick J. Wong" <djwong@kernel.org>
Cc: Jan Kara <jack@suse.cz>, Matthew Wilcox <willy@infradead.org>,
	cluster-devel@redhat.com, linux-fsdevel@vger.kernel.org,
	linux-kernel@vger.kernel.org, ocfs2-devel@oss.oracle.com,
	Andreas Gruenbacher <agruenba@redhat.com>
Subject: [PATCH v7 13/19] gfs2: Fix mmap + page fault deadlocks for buffered I/O
Date: Fri, 27 Aug 2021 18:49:20 +0200	[thread overview]
Message-ID: <20210827164926.1726765-14-agruenba@redhat.com> (raw)
In-Reply-To: <20210827164926.1726765-1-agruenba@redhat.com>

In the .read_iter and .write_iter file operations, we're accessing
user-space memory while holding the inode glock.  There is a possibility
that the memory is mapped to the same file, in which case we'd recurse
on the same glock.

More complex scenarios can involve multiple glocks, processes, and even
cluster nodes.

Avoid these kinds of problems by disabling page faults while holding the
inode glock.  If a page fault would occur, we either end up with a
partial read or write, or with -EFAULT if nothing could be read or
written.  In either case, we know that we're not done with the
operation, so we indicate that we're willing to give up the inode glock
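(HIF_MAY_DEMOTE) and then we fault in the missing pages.  If that made
us lose the inode glock, we return a partial read or write if anything
has been transferred so far; if nothing has been transferred yet, we
reacquire the inode glock and retry.  Otherwise, we resume the operation
under the glock we are still holding.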

This locking problem was originally reported by Jan Kara.  Linus came up
with the proposal to disable page faults.  Many thanks to Al Viro and
Matthew Wilcox for their feedback.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/file.c | 91 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 87 insertions(+), 4 deletions(-)

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 5f328bc21d0b..fce3a5249e19 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -776,6 +776,36 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
 	return ret ? ret : ret1;
 }
 
+static bool should_fault_in_pages(struct iov_iter *i, size_t *prev_count,
+				  size_t *window_size)
+{
+	char __user *p = i->iov[0].iov_base + i->iov_offset;
+	size_t count = iov_iter_count(i);
+	size_t size;
+
+	if (!iter_is_iovec(i))
+		return false;
+
+	if (*prev_count != count || !*window_size) {
+		int pages, nr_dirtied;
+
+		pages = min_t(int, BIO_MAX_VECS,
+			      DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE));
+		nr_dirtied = max(current->nr_dirtied_pause -
+				 current->nr_dirtied, 1);
+		pages = min(pages, nr_dirtied);
+		size = (size_t)PAGE_SIZE * pages - offset_in_page(p);
+	} else {
+		size = (size_t)PAGE_SIZE - offset_in_page(p);
+		if (*window_size <= size)
+			return false;
+	}
+
+	*prev_count = count;
+	*window_size = size;
+	return true;
+}
+
 static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
 				     struct gfs2_holder *gh)
 {
@@ -840,9 +870,16 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
 	struct gfs2_inode *ip;
 	struct gfs2_holder gh;
+	size_t prev_count = 0, window_size = 0;
 	size_t written = 0;
 	ssize_t ret;
 
+	/*
+	 * In this function, we disable page faults when we're holding the
+	 * inode glock while doing I/O.  If a page fault occurs, we allow the
+	 * inode glock to be demoted, fault in the pages manually, and retry.
+	 */
+
 	if (iocb->ki_flags & IOCB_DIRECT) {
 		ret = gfs2_file_direct_read(iocb, to, &gh);
 		if (likely(ret != -ENOTBLK))
@@ -864,13 +901,35 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	}
 	ip = GFS2_I(iocb->ki_filp->f_mapping->host);
 	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+retry:
 	ret = gfs2_glock_nq(&gh);
 	if (ret)
 		goto out_uninit;
+retry_under_glock:
+	pagefault_disable();
 	ret = generic_file_read_iter(iocb, to);
+	pagefault_enable();
 	if (ret > 0)
 		written += ret;
-	gfs2_glock_dq(&gh);
+
+	if (unlikely(iov_iter_count(to) && (ret > 0 || ret == -EFAULT)) &&
+	    should_fault_in_pages(to, &prev_count, &window_size)) {
+		size_t leftover;
+
+		gfs2_holder_allow_demote(&gh);
+		leftover = fault_in_iov_iter_writeable(to, window_size);
+		gfs2_holder_disallow_demote(&gh);
+		if (leftover != window_size) {
+			if (!gfs2_holder_queued(&gh)) {
+				if (written)
+					goto out_uninit;
+				goto retry;
+			}
+			goto retry_under_glock;
+		}
+	}
+	if (gfs2_holder_queued(&gh))
+		gfs2_glock_dq(&gh);
 out_uninit:
 	gfs2_holder_uninit(&gh);
 	return written ? written : ret;
@@ -885,6 +944,8 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb,
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct gfs2_holder *statfs_gh = NULL;
+	size_t prev_count = 0, window_size = 0;
+	size_t read = 0;
 	ssize_t ret;
 
 	/*
@@ -900,10 +961,11 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb,
 	}
 
 	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, gh);
+retry:
 	ret = gfs2_glock_nq(gh);
 	if (ret)
 		goto out_uninit;
-
+retry_under_glock:
 	if (inode == sdp->sd_rindex) {
 		struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
 
@@ -914,19 +976,40 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb,
 	}
 
 	current->backing_dev_info = inode_to_bdi(inode);
+	pagefault_disable();
 	ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
+	pagefault_enable();
 	current->backing_dev_info = NULL;
+	if (ret > 0)
+		read += ret;
 
 	if (inode == sdp->sd_rindex)
 		gfs2_glock_dq_uninit(statfs_gh);
 
+	if (unlikely(iov_iter_count(from) && (ret > 0 || ret == -EFAULT)) &&
+	    should_fault_in_pages(from, &prev_count, &window_size)) {
+		size_t leftover;
+
+		gfs2_holder_allow_demote(gh);
+		leftover = fault_in_iov_iter_readable(from, window_size);
+		gfs2_holder_disallow_demote(gh);
+		if (leftover != window_size) {
+			if (!gfs2_holder_queued(gh)) {
+				if (read)
+					goto out_uninit;
+				goto retry;
+			}
+			goto retry_under_glock;
+		}
+	}
 out_unlock:
-	gfs2_glock_dq(gh);
+	if (gfs2_holder_queued(gh))
+		gfs2_glock_dq(gh);
 out_uninit:
 	gfs2_holder_uninit(gh);
 	if (statfs_gh)
 		kfree(statfs_gh);
-	return ret;
+	return read ? read : ret;
 }
 
 /**
-- 
2.26.3


  parent reply	other threads:[~2021-08-27 16:51 UTC|newest]

Thread overview: 99+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-08-27 16:49 [PATCH v7 00/19] gfs2: Fix mmap + page fault deadlocks Andreas Gruenbacher
2021-08-27 16:49 ` [PATCH v7 01/19] iov_iter: Fix iov_iter_get_pages{,_alloc} page fault return value Andreas Gruenbacher
2021-09-09 11:09   ` Christoph Hellwig
2021-08-27 16:49 ` [PATCH v7 02/19] powerpc/kvm: Fix kvm_use_magic_page Andreas Gruenbacher
2021-08-27 16:49 ` [PATCH v7 03/19] gup: Turn fault_in_pages_{readable,writeable} into fault_in_{readable,writeable} Andreas Gruenbacher
2021-08-27 19:08   ` Al Viro
2021-09-03 14:56   ` Filipe Manana
2021-09-28 15:02     ` Andreas Gruenbacher
2021-09-28 16:37       ` Matthew Wilcox
2021-09-28 20:41         ` Andreas Gruenbacher
2021-08-27 16:49 ` [PATCH v7 04/19] iov_iter: Turn iov_iter_fault_in_readable into fault_in_iov_iter_readable Andreas Gruenbacher
2021-08-27 18:53   ` Al Viro
2021-08-27 18:57     ` Linus Torvalds
2021-08-27 19:16       ` Al Viro
2021-08-27 20:56   ` Kari Argillander
2021-08-28 17:13     ` Linus Torvalds
2021-08-27 16:49 ` [PATCH v7 05/19] iov_iter: Introduce fault_in_iov_iter_writeable Andreas Gruenbacher
2021-08-27 18:49   ` Al Viro
2021-08-27 19:05     ` Linus Torvalds
2021-08-27 19:23       ` Al Viro
2021-08-27 19:33         ` Linus Torvalds
2021-08-27 19:37           ` Al Viro
2021-08-27 21:48             ` Al Viro
2021-08-27 21:57               ` Al Viro
2021-08-27 23:22                 ` Luck, Tony
2021-08-28  2:20                   ` Luck, Tony
2021-08-28 21:47                   ` Thomas Gleixner
2021-08-28 22:04                     ` Al Viro
2021-08-28 22:11                       ` Al Viro
2021-08-28 22:19                         ` Al Viro
2021-08-28 22:51                           ` Al Viro
2021-08-29 18:44                             ` Thomas Gleixner
2021-08-29 19:46                               ` Al Viro
2021-08-29 19:51                                 ` Thomas Gleixner
2021-08-28 22:20                         ` Tony Luck
2021-08-29  1:40                           ` Matthew Wilcox
2021-08-30 15:41                             ` Luck, Tony
2021-08-28 22:23                       ` Thomas Gleixner
2021-08-28 19:28               ` [RFC][arm64] possible infinite loop in btrfs search_ioctl() Al Viro
2021-08-31 13:54                 ` Catalin Marinas
2021-08-31 15:28                   ` Al Viro
2021-08-31 16:01                     ` Catalin Marinas
2021-10-11 17:37                     ` Catalin Marinas
2021-10-11 19:15                       ` Linus Torvalds
2021-10-11 21:08                         ` Catalin Marinas
2021-10-11 23:59                           ` Linus Torvalds
2021-10-12 17:27                             ` Catalin Marinas
2021-10-12 17:58                               ` Linus Torvalds
2021-10-18 17:13                                 ` Catalin Marinas
2021-10-21  0:46                             ` Andreas Gruenbacher
2021-10-21 10:05                               ` Catalin Marinas
2021-10-21 14:42                                 ` Andreas Gruenbacher
2021-10-21 17:09                                   ` Catalin Marinas
2021-10-21 18:00                                     ` Andreas Gruenbacher
2021-10-22 18:41                                       ` Catalin Marinas
2021-10-25 19:37                                         ` Andreas Gruenbacher
2021-10-22  2:30                                   ` Linus Torvalds
2021-10-22  9:34                                     ` Catalin Marinas
2021-08-29  0:58               ` [PATCH v7 05/19] iov_iter: Introduce fault_in_iov_iter_writeable Al Viro
2021-08-27 16:49 ` [PATCH v7 06/19] gfs2: Add wrapper for iomap_file_buffered_write Andreas Gruenbacher
2021-08-27 16:49 ` [PATCH v7 07/19] gfs2: Clean up function may_grant Andreas Gruenbacher
2021-08-27 16:49 ` [PATCH v7 08/19] gfs2: Eliminate vestigial HIF_FIRST Andreas Gruenbacher
2021-08-27 16:49 ` [PATCH v7 09/19] gfs2: Remove redundant check from gfs2_glock_dq Andreas Gruenbacher
2021-08-27 16:49 ` [PATCH v7 10/19] gfs2: Introduce flag for glock holder auto-demotion Andreas Gruenbacher
2021-08-27 16:49 ` [PATCH v7 11/19] gfs2: Move the inode glock locking to gfs2_file_buffered_write Andreas Gruenbacher
2021-08-27 16:49 ` [PATCH v7 12/19] gfs2: Eliminate ip->i_gh Andreas Gruenbacher
2021-08-27 16:49 ` Andreas Gruenbacher [this message]
2021-08-27 16:49 ` [PATCH v7 14/19] iomap: Fix iomap_dio_rw return value for user copies Andreas Gruenbacher
2021-09-03 18:54   ` Darrick J. Wong
2021-09-09 11:17   ` Christoph Hellwig
2021-08-27 16:49 ` [PATCH v7 15/19] iomap: Support partial direct I/O on user copy failures Andreas Gruenbacher
2021-09-03 18:54   ` Darrick J. Wong
2021-09-09 11:20   ` Christoph Hellwig
2021-09-28 15:05     ` Andreas Gruenbacher
2021-08-27 16:49 ` [PATCH v7 16/19] iomap: Add done_before argument to iomap_dio_rw Andreas Gruenbacher
2021-08-27 18:30   ` Darrick J. Wong
2021-08-27 20:15     ` Andreas Gruenbacher
2021-08-27 21:32       ` Darrick J. Wong
2021-08-27 21:49         ` Andreas Grünbacher
2021-08-27 22:35         ` Linus Torvalds
2021-09-03 18:47           ` Darrick J. Wong
2021-09-03 18:53   ` Darrick J. Wong
2021-09-09 11:30   ` Christoph Hellwig
2021-09-09 17:22     ` Linus Torvalds
2021-09-10  7:36       ` Christoph Hellwig
2021-08-27 16:49 ` [PATCH v7 17/19] gup: Introduce FOLL_NOFAULT flag to disable page faults Andreas Gruenbacher
2021-09-09 11:36   ` Christoph Hellwig
2021-09-09 17:17     ` Linus Torvalds
2021-09-10  7:24       ` Christoph Hellwig
2021-08-27 16:49 ` [PATCH v7 18/19] iov_iter: Introduce nofault " Andreas Gruenbacher
2021-08-27 18:47   ` Al Viro
2021-08-27 19:56     ` Andreas Gruenbacher
2021-08-27 16:49 ` [PATCH v7 19/19] gfs2: Fix mmap + page fault deadlocks for direct I/O Andreas Gruenbacher
2021-08-27 17:16 ` [PATCH v7 00/19] gfs2: Fix mmap + page fault deadlocks Linus Torvalds
2021-09-01 19:52   ` Andreas Gruenbacher
2021-09-03 15:52     ` Linus Torvalds
2021-09-03 18:25       ` Al Viro
2021-09-03 18:47         ` Linus Torvalds
2021-09-03 15:07 ` Filipe Manana

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210827164926.1726765-14-agruenba@redhat.com \
    --to=agruenba@redhat.com \
    --cc=cluster-devel@redhat.com \
    --cc=djwong@kernel.org \
    --cc=hch@infradead.org \
    --cc=jack@suse.cz \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=ocfs2-devel@oss.oracle.com \
    --cc=torvalds@linux-foundation.org \
    --cc=viro@zeniv.linux.org.uk \
    --cc=willy@infradead.org \
    --subject='Re: [PATCH v7 13/19] gfs2: Fix mmap + page fault deadlocks for buffered I/O' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).