Linux-Fsdevel Archive on lore.kernel.org
help / color / mirror / Atom feed
* [RFC PATCH 0/4] fsdax: introduce fs query to support reflink
@ 2020-09-15 10:13 Shiyang Ruan
  2020-09-15 10:13 ` [RFC PATCH 1/4] fs: introduce ->storage_lost() for memory-failure Shiyang Ruan
                   ` (3 more replies)
  0 siblings, 4 replies; 9+ messages in thread
From: Shiyang Ruan @ 2020-09-15 10:13 UTC (permalink / raw)
  To: linux-kernel, linux-xfs, linux-nvdimm, linux-mm
  Cc: linux-fsdevel, darrick.wong, dan.j.williams, david, hch,
	rgoldwyn, qi.fuli, y-goto

This patchset is a try to resolve the problem of tracking shared page
for fsdax.

This patchset moves owner tracking from dax_assocaite_entry() to pmem
device, by introducing an interface ->memory_failure() of struct
pagemap.  The interface is called by memory_failure() in mm, and
implemented by pmem device.  Then pmem device find the filesystem
which the damaged page located in, and call ->storage_lost() to track
files or metadata assocaited with this page.  Finally we are able to
try to fix the damaged data in filesystem and do other necessary
processing, such as killing processes who are using the files
affected.

The call trace is like this:
 memory_failure()
   pgmap->ops->memory_failure()      => pmem_pgmap_memory_failure()
     bdev->bd_super->storage_lost()  => xfs_fs_storage_lost()
       xfs_rmap_query_range()
         xfs_storage_lost_helper()
           mf_recover_controller->recover_fn => \ 
                            memory_failure_dev_pagemap_kill_procs()

The collect_procs() and kill_procs() are moved into a callback which
is passed from memory_failure() to xfs_storage_lost_helper().  So we
can call it when a file assocaited is found, instead of creating a
file list and iterate it.

The fsdax & reflink support for XFS is not contained in this patchset.

(Rebased on v5.9-rc2)
==

Shiyang Ruan (4):
  fs: introduce ->storage_lost() for memory-failure
  pagemap: introduce ->memory_failure()
  mm, fsdax: refactor dax handler in memory-failure
  fsdax: remove useless (dis)associate functions

 block/genhd.c            |  12 ++++
 drivers/nvdimm/pmem.c    |  31 ++++++++++
 fs/dax.c                 |  64 ++------------------
 fs/xfs/xfs_super.c       |  80 ++++++++++++++++++++++++
 include/linux/dax.h      |   5 +-
 include/linux/fs.h       |   1 +
 include/linux/genhd.h    |   2 +
 include/linux/memremap.h |   3 +
 include/linux/mm.h       |  14 +++++
 mm/memory-failure.c      | 127 ++++++++++++++++++++++++---------------
 10 files changed, 229 insertions(+), 110 deletions(-)

-- 
2.28.0




^ permalink raw reply	[flat|nested] 9+ messages in thread

* [RFC PATCH 1/4] fs: introduce ->storage_lost() for memory-failure
  2020-09-15 10:13 [RFC PATCH 0/4] fsdax: introduce fs query to support reflink Shiyang Ruan
@ 2020-09-15 10:13 ` Shiyang Ruan
  2020-09-15 16:16   ` Darrick J. Wong
  2020-09-15 10:13 ` [RFC PATCH 2/4] pagemap: introduce ->memory_failure() Shiyang Ruan
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 9+ messages in thread
From: Shiyang Ruan @ 2020-09-15 10:13 UTC (permalink / raw)
  To: linux-kernel, linux-xfs, linux-nvdimm, linux-mm
  Cc: linux-fsdevel, darrick.wong, dan.j.williams, david, hch,
	rgoldwyn, qi.fuli, y-goto

This function is used to handle errors which may cause data lost in
filesystem.  Such as memory-failure in fsdax mode.

In XFS, it requires "rmapbt" feature in order to query for files or
metadata which associated to the error block.  Then we could call fs
recover functions to try to repair the damaged data.(did not implemented
in this patch)

After that, the memory-failure also needs to kill processes who are
using those files.  The struct mf_recover_controller is created to store
necessary parameters.

Signed-off-by: Shiyang Ruan <ruansy.fnst@cn.fujitsu.com>
---
 fs/xfs/xfs_super.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h |  1 +
 include/linux/mm.h |  6 ++++
 3 files changed, 87 insertions(+)

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 71ac6c1cdc36..118d9c5d9e1e 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -35,6 +35,10 @@
 #include "xfs_refcount_item.h"
 #include "xfs_bmap_item.h"
 #include "xfs_reflink.h"
+#include "xfs_alloc.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_bit.h"
 
 #include <linux/magic.h>
 #include <linux/fs_context.h>
@@ -1104,6 +1108,81 @@ xfs_fs_free_cached_objects(
 	return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
 }
 
+static int
+xfs_storage_lost_helper(
+	struct xfs_btree_cur		*cur,
+	struct xfs_rmap_irec		*rec,
+	void				*priv)
+{
+	struct xfs_inode		*ip;
+	struct mf_recover_controller	*mfrc = priv;
+	int				rc = 0;
+
+	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner)) {
+		// TODO check and try to fix metadata
+	} else {
+		/*
+		 * Get files that incore, filter out others that are not in use.
+		 */
+		xfs_iget(cur->bc_mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE,
+			 0, &ip);
+		if (!ip)
+			return 0;
+		if (!VFS_I(ip)->i_mapping)
+			goto out;
+
+		rc = mfrc->recover_fn(mfrc->pfn, mfrc->flags,
+				      VFS_I(ip)->i_mapping, rec->rm_offset);
+
+		// TODO try to fix data
+out:
+		xfs_irele(ip);
+	}
+
+	return rc;
+}
+
+static int
+xfs_fs_storage_lost(
+	struct super_block	*sb,
+	loff_t			offset,
+	void			*priv)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+	struct xfs_trans	*tp = NULL;
+	struct xfs_btree_cur	*cur = NULL;
+	struct xfs_rmap_irec	rmap_low, rmap_high;
+	struct xfs_buf		*agf_bp = NULL;
+	xfs_fsblock_t		fsbno = XFS_B_TO_FSB(mp, offset);
+	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(mp, fsbno);
+	xfs_agblock_t		agbno = XFS_FSB_TO_AGBNO(mp, fsbno);
+	int			error = 0;
+
+	error = xfs_trans_alloc_empty(mp, &tp);
+	if (error)
+		return error;
+
+	error = xfs_alloc_read_agf(mp, tp, agno, 0, &agf_bp);
+	if (error)
+		return error;
+
+	cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, agno);
+
+	/* Construct a range for rmap query */
+	memset(&rmap_low, 0, sizeof(rmap_low));
+	memset(&rmap_high, 0xFF, sizeof(rmap_high));
+	rmap_low.rm_startblock = rmap_high.rm_startblock = agbno;
+
+	error = xfs_rmap_query_range(cur, &rmap_low, &rmap_high,
+				     xfs_storage_lost_helper, priv);
+	if (error == -ECANCELED)
+		error = 0;
+
+	xfs_btree_del_cursor(cur, error);
+	xfs_trans_brelse(tp, agf_bp);
+	return error;
+}
+
 static const struct super_operations xfs_super_operations = {
 	.alloc_inode		= xfs_fs_alloc_inode,
 	.destroy_inode		= xfs_fs_destroy_inode,
@@ -1117,6 +1196,7 @@ static const struct super_operations xfs_super_operations = {
 	.show_options		= xfs_fs_show_options,
 	.nr_cached_objects	= xfs_fs_nr_cached_objects,
 	.free_cached_objects	= xfs_fs_free_cached_objects,
+	.storage_lost		= xfs_fs_storage_lost,
 };
 
 static int
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e019ea2f1347..bd90485cfdc9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1951,6 +1951,7 @@ struct super_operations {
 				  struct shrink_control *);
 	long (*free_cached_objects)(struct super_block *,
 				    struct shrink_control *);
+	int (*storage_lost)(struct super_block *sb, loff_t offset, void *priv);
 };
 
 /*
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1983e08f5906..3f0c36e1bf3d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3002,6 +3002,12 @@ extern void shake_page(struct page *p, int access);
 extern atomic_long_t num_poisoned_pages __read_mostly;
 extern int soft_offline_page(unsigned long pfn, int flags);
 
+struct mf_recover_controller {
+	int (*recover_fn)(unsigned long pfn, int flags,
+		struct address_space *mapping, pgoff_t index);
+	unsigned long pfn;
+	int flags;
+};
 
 /*
  * Error handlers for various types of pages.
-- 
2.28.0




^ permalink raw reply	[flat|nested] 9+ messages in thread

* [RFC PATCH 2/4] pagemap: introduce ->memory_failure()
  2020-09-15 10:13 [RFC PATCH 0/4] fsdax: introduce fs query to support reflink Shiyang Ruan
  2020-09-15 10:13 ` [RFC PATCH 1/4] fs: introduce ->storage_lost() for memory-failure Shiyang Ruan
@ 2020-09-15 10:13 ` Shiyang Ruan
  2020-09-15 16:31   ` Darrick J. Wong
  2020-09-15 10:13 ` [RFC PATCH 3/4] mm, fsdax: refactor dax handler in memory-failure Shiyang Ruan
  2020-09-15 10:13 ` [RFC PATCH 4/4] fsdax: remove useless (dis)associate functions Shiyang Ruan
  3 siblings, 1 reply; 9+ messages in thread
From: Shiyang Ruan @ 2020-09-15 10:13 UTC (permalink / raw)
  To: linux-kernel, linux-xfs, linux-nvdimm, linux-mm
  Cc: linux-fsdevel, darrick.wong, dan.j.williams, david, hch,
	rgoldwyn, qi.fuli, y-goto

When memory-failure occurs, we call this function which is implemented
by each devices.  For fsdax, pmem device implements it.  Pmem device
will find out the block device where the error page located in, gets the
filesystem on this block device, and finally call ->storage_lost() to
handle the error in filesystem layer.

Normally, a pmem device may contain one or more partitions, each
partition contains a block device, each block device contains a
filesystem.  So we are able to find out the filesystem by one offset on
this pmem device.  However, in other cases, such as mapped device, I
didn't find a way to obtain the filesystem laying on it.  It is a
problem need to be fixed.

Signed-off-by: Shiyang Ruan <ruansy.fnst@cn.fujitsu.com>
---
 block/genhd.c            | 12 ++++++++++++
 drivers/nvdimm/pmem.c    | 31 +++++++++++++++++++++++++++++++
 include/linux/genhd.h    |  2 ++
 include/linux/memremap.h |  3 +++
 4 files changed, 48 insertions(+)

diff --git a/block/genhd.c b/block/genhd.c
index 99c64641c314..e7442b60683e 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1063,6 +1063,18 @@ struct block_device *bdget_disk(struct gendisk *disk, int partno)
 }
 EXPORT_SYMBOL(bdget_disk);
 
+struct block_device *bdget_disk_sector(struct gendisk *disk, sector_t sector)
+{
+	struct block_device *bdev = NULL;
+	struct hd_struct *part = disk_map_sector_rcu(disk, sector);
+
+	if (part)
+		bdev = bdget(part_devt(part));
+
+	return bdev;
+}
+EXPORT_SYMBOL(bdget_disk_sector);
+
 /*
  * print a full list of all partitions - intended for places where the root
  * filesystem can't be mounted and thus to give the victim some idea of what
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index fab29b514372..3ed96486c883 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -364,9 +364,40 @@ static void pmem_release_disk(void *__pmem)
 	put_disk(pmem->disk);
 }
 
+static int pmem_pagemap_memory_failure(struct dev_pagemap *pgmap,
+		struct mf_recover_controller *mfrc)
+{
+	struct pmem_device *pdev;
+	struct block_device *bdev;
+	sector_t disk_sector;
+	loff_t bdev_offset;
+
+	pdev = container_of(pgmap, struct pmem_device, pgmap);
+	if (!pdev->disk)
+		return -ENXIO;
+
+	disk_sector = (PFN_PHYS(mfrc->pfn) - pdev->phys_addr) >> SECTOR_SHIFT;
+	bdev = bdget_disk_sector(pdev->disk, disk_sector);
+	if (!bdev)
+		return -ENXIO;
+
+	// TODO what if block device contains a mapped device
+	if (!bdev->bd_super)
+		goto out;
+
+	bdev_offset = ((disk_sector - get_start_sect(bdev)) << SECTOR_SHIFT) -
+			pdev->data_offset;
+	bdev->bd_super->s_op->storage_lost(bdev->bd_super, bdev_offset, mfrc);
+
+out:
+	bdput(bdev);
+	return 0;
+}
+
 static const struct dev_pagemap_ops fsdax_pagemap_ops = {
 	.kill			= pmem_pagemap_kill,
 	.cleanup		= pmem_pagemap_cleanup,
+	.memory_failure		= pmem_pagemap_memory_failure,
 };
 
 static int pmem_attach_disk(struct device *dev,
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 4ab853461dff..16e9e13e0841 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -303,6 +303,8 @@ static inline void add_disk_no_queue_reg(struct gendisk *disk)
 extern void del_gendisk(struct gendisk *gp);
 extern struct gendisk *get_gendisk(dev_t dev, int *partno);
 extern struct block_device *bdget_disk(struct gendisk *disk, int partno);
+extern struct block_device *bdget_disk_sector(struct gendisk *disk,
+			sector_t sector);
 
 extern void set_device_ro(struct block_device *bdev, int flag);
 extern void set_disk_ro(struct gendisk *disk, int flag);
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 5f5b2df06e61..efebefa70d00 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -6,6 +6,7 @@
 
 struct resource;
 struct device;
+struct mf_recover_controller;
 
 /**
  * struct vmem_altmap - pre-allocated storage for vmemmap_populate
@@ -87,6 +88,8 @@ struct dev_pagemap_ops {
 	 * the page back to a CPU accessible page.
 	 */
 	vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf);
+	int (*memory_failure)(struct dev_pagemap *pgmap,
+			      struct mf_recover_controller *mfrc);
 };
 
 #define PGMAP_ALTMAP_VALID	(1 << 0)
-- 
2.28.0




^ permalink raw reply	[flat|nested] 9+ messages in thread

* [RFC PATCH 3/4] mm, fsdax: refactor dax handler in memory-failure
  2020-09-15 10:13 [RFC PATCH 0/4] fsdax: introduce fs query to support reflink Shiyang Ruan
  2020-09-15 10:13 ` [RFC PATCH 1/4] fs: introduce ->storage_lost() for memory-failure Shiyang Ruan
  2020-09-15 10:13 ` [RFC PATCH 2/4] pagemap: introduce ->memory_failure() Shiyang Ruan
@ 2020-09-15 10:13 ` Shiyang Ruan
  2020-09-15 10:13 ` [RFC PATCH 4/4] fsdax: remove useless (dis)associate functions Shiyang Ruan
  3 siblings, 0 replies; 9+ messages in thread
From: Shiyang Ruan @ 2020-09-15 10:13 UTC (permalink / raw)
  To: linux-kernel, linux-xfs, linux-nvdimm, linux-mm
  Cc: linux-fsdevel, darrick.wong, dan.j.williams, david, hch,
	rgoldwyn, qi.fuli, y-goto

With the ->memory_failure() implemented in pmem device and
->storage_lost() in XFS, we are able to track files or metadata
and process them further.

We don't track files by page->mapping, page->index any more, so
some of functions who obtain ->mapping, ->index from struct page
parameter need to be changed by directly passing mapping and index.

Signed-off-by: Shiyang Ruan <ruansy.fnst@cn.fujitsu.com>
---
 fs/dax.c            |  18 +++----
 include/linux/dax.h |   5 +-
 include/linux/mm.h  |   8 +++
 mm/memory-failure.c | 127 +++++++++++++++++++++++++++-----------------
 4 files changed, 94 insertions(+), 64 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 95341af1a966..1ec592f0aadd 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -379,14 +379,14 @@ static struct page *dax_busy_page(void *entry)
 }
 
 /*
- * dax_lock_mapping_entry - Lock the DAX entry corresponding to a page
+ * dax_lock - Lock the DAX entry corresponding to a page
  * @page: The page whose entry we want to lock
  *
  * Context: Process context.
  * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could
  * not be locked.
  */
-dax_entry_t dax_lock_page(struct page *page)
+dax_entry_t dax_lock(struct address_space *mapping, pgoff_t index)
 {
 	XA_STATE(xas, NULL, 0);
 	void *entry;
@@ -394,8 +394,6 @@ dax_entry_t dax_lock_page(struct page *page)
 	/* Ensure page->mapping isn't freed while we look at it */
 	rcu_read_lock();
 	for (;;) {
-		struct address_space *mapping = READ_ONCE(page->mapping);
-
 		entry = NULL;
 		if (!mapping || !dax_mapping(mapping))
 			break;
@@ -413,11 +411,7 @@ dax_entry_t dax_lock_page(struct page *page)
 
 		xas.xa = &mapping->i_pages;
 		xas_lock_irq(&xas);
-		if (mapping != page->mapping) {
-			xas_unlock_irq(&xas);
-			continue;
-		}
-		xas_set(&xas, page->index);
+		xas_set(&xas, index);
 		entry = xas_load(&xas);
 		if (dax_is_locked(entry)) {
 			rcu_read_unlock();
@@ -433,10 +427,10 @@ dax_entry_t dax_lock_page(struct page *page)
 	return (dax_entry_t)entry;
 }
 
-void dax_unlock_page(struct page *page, dax_entry_t cookie)
+void dax_unlock(struct address_space *mapping, pgoff_t index,
+		dax_entry_t cookie)
 {
-	struct address_space *mapping = page->mapping;
-	XA_STATE(xas, &mapping->i_pages, page->index);
+	XA_STATE(xas, &mapping->i_pages, index);
 
 	if (S_ISCHR(mapping->host->i_mode))
 		return;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 6904d4e0b2e0..669ba768b89e 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -141,8 +141,9 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 		struct dax_device *dax_dev, struct writeback_control *wbc);
 
 struct page *dax_layout_busy_page(struct address_space *mapping);
-dax_entry_t dax_lock_page(struct page *page);
-void dax_unlock_page(struct page *page, dax_entry_t cookie);
+dax_entry_t dax_lock(struct address_space *mapping, pgoff_t index);
+void dax_unlock(struct address_space *mapping, pgoff_t index,
+		dax_entry_t cookie);
 #else
 static inline bool bdev_dax_supported(struct block_device *bdev,
 		int blocksize)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3f0c36e1bf3d..d170b3f74d83 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1126,6 +1126,14 @@ static inline bool is_device_private_page(const struct page *page)
 		page->pgmap->type == MEMORY_DEVICE_PRIVATE;
 }
 
+static inline bool is_device_fsdax_page(const struct page *page)
+{
+	return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
+		IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
+		is_zone_device_page(page) &&
+		page->pgmap->type == MEMORY_DEVICE_FS_DAX;
+}
+
 static inline bool is_pci_p2pdma_page(const struct page *page)
 {
 	return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index f1aa6433f404..0c4a25bf276f 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -93,6 +93,9 @@ static int hwpoison_filter_dev(struct page *p)
 	if (PageSlab(p))
 		return -EINVAL;
 
+	if (is_device_fsdax_page(p))
+		return 0;
+
 	mapping = page_mapping(p);
 	if (mapping == NULL || mapping->host == NULL)
 		return -EINVAL;
@@ -263,9 +266,8 @@ void shake_page(struct page *p, int access)
 EXPORT_SYMBOL_GPL(shake_page);
 
 static unsigned long dev_pagemap_mapping_shift(struct page *page,
-		struct vm_area_struct *vma)
+		struct vm_area_struct *vma, unsigned long address)
 {
-	unsigned long address = vma_address(page, vma);
 	pgd_t *pgd;
 	p4d_t *p4d;
 	pud_t *pud;
@@ -306,8 +308,8 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page,
  * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
  */
 static void add_to_kill(struct task_struct *tsk, struct page *p,
-		       struct vm_area_struct *vma,
-		       struct list_head *to_kill)
+		       struct address_space *mapping, pgoff_t pgoff,
+		       struct vm_area_struct *vma, struct list_head *to_kill)
 {
 	struct to_kill *tk;
 
@@ -317,12 +319,18 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
 		return;
 	}
 
-	tk->addr = page_address_in_vma(p, vma);
-	if (is_zone_device_page(p))
-		tk->size_shift = dev_pagemap_mapping_shift(p, vma);
-	else
-		tk->size_shift = page_shift(compound_head(p));
-
+	if (is_device_fsdax_page(p)) {
+		tk->addr = vma->vm_start +
+				((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+		tk->size_shift = dev_pagemap_mapping_shift(p, vma, tk->addr);
+	} else {
+		tk->addr = page_address_in_vma(p, vma);
+		if (is_zone_device_page(p)) {
+			tk->size_shift = dev_pagemap_mapping_shift(p, vma,
+							vma_address(p, vma));
+		} else
+			tk->size_shift = page_shift(compound_head(p));
+	}
 	/*
 	 * Send SIGKILL if "tk->addr == -EFAULT". Also, as
 	 * "tk->size_shift" is always non-zero for !is_zone_device_page(),
@@ -468,7 +476,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 			if (!page_mapped_in_vma(page, vma))
 				continue;
 			if (vma->vm_mm == t->mm)
-				add_to_kill(t, page, vma, to_kill);
+				add_to_kill(t, page, NULL, 0, vma, to_kill);
 		}
 	}
 	read_unlock(&tasklist_lock);
@@ -478,23 +486,19 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 /*
  * Collect processes when the error hit a file mapped page.
  */
-static void collect_procs_file(struct page *page, struct list_head *to_kill,
-				int force_early)
+static void collect_procs_file(struct page *page, struct address_space *mapping,
+		pgoff_t pgoff, struct list_head *to_kill, int force_early)
 {
 	struct vm_area_struct *vma;
 	struct task_struct *tsk;
-	struct address_space *mapping = page->mapping;
 
 	i_mmap_lock_read(mapping);
 	read_lock(&tasklist_lock);
 	for_each_process(tsk) {
-		pgoff_t pgoff = page_to_pgoff(page);
 		struct task_struct *t = task_early_kill(tsk, force_early);
-
 		if (!t)
 			continue;
-		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
-				      pgoff) {
+		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 			/*
 			 * Send early kill signal to tasks where a vma covers
 			 * the page but the corrupted page is not necessarily
@@ -502,8 +506,10 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 			 * Assume applications who requested early kill want
 			 * to be informed of all such data corruptions.
 			 */
-			if (vma->vm_mm == t->mm)
-				add_to_kill(t, page, vma, to_kill);
+			if (vma->vm_mm == t->mm) {
+				add_to_kill(t, page, mapping, pgoff, vma,
+					    to_kill);
+			}
 		}
 	}
 	read_unlock(&tasklist_lock);
@@ -522,7 +528,8 @@ static void collect_procs(struct page *page, struct list_head *tokill,
 	if (PageAnon(page))
 		collect_procs_anon(page, tokill, force_early);
 	else
-		collect_procs_file(page, tokill, force_early);
+		collect_procs_file(page, page->mapping, page_to_pgoff(page),
+				   tokill, force_early);
 }
 
 static const char *action_name[] = {
@@ -1176,14 +1183,14 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
 	return res;
 }
 
-static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
-		struct dev_pagemap *pgmap)
+static int memory_failure_dev_pagemap_kill_procs(unsigned long pfn, int flags,
+		struct address_space *mapping, pgoff_t index)
 {
 	struct page *page = pfn_to_page(pfn);
 	const bool unmap_success = true;
 	unsigned long size = 0;
 	struct to_kill *tk;
-	LIST_HEAD(tokill);
+	LIST_HEAD(to_kill);
 	int rc = -EBUSY;
 	loff_t start;
 	dax_entry_t cookie;
@@ -1195,28 +1202,9 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
 	 * also prevents changes to the mapping of this pfn until
 	 * poison signaling is complete.
 	 */
-	cookie = dax_lock_page(page);
+	cookie = dax_lock(mapping, index);
 	if (!cookie)
-		goto out;
-
-	if (hwpoison_filter(page)) {
-		rc = 0;
 		goto unlock;
-	}
-
-	if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
-		/*
-		 * TODO: Handle HMM pages which may need coordination
-		 * with device-side memory.
-		 */
-		goto unlock;
-	}
-
-	/*
-	 * Use this flag as an indication that the dax page has been
-	 * remapped UC to prevent speculative consumption of poison.
-	 */
-	SetPageHWPoison(page);
 
 	/*
 	 * Unlike System-RAM there is no possibility to swap in a
@@ -1225,9 +1213,10 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
 	 * SIGBUS (i.e. MF_MUST_KILL)
 	 */
 	flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
-	collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED);
+	collect_procs_file(page, mapping, index, &to_kill,
+			   flags & MF_ACTION_REQUIRED);
 
-	list_for_each_entry(tk, &tokill, nd)
+	list_for_each_entry(tk, &to_kill, nd)
 		if (tk->size_shift)
 			size = max(size, 1UL << tk->size_shift);
 	if (size) {
@@ -1237,13 +1226,51 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
 		 * actual size of the mapping being torn down is
 		 * communicated in siginfo, see kill_proc()
 		 */
-		start = (page->index << PAGE_SHIFT) & ~(size - 1);
-		unmap_mapping_range(page->mapping, start, start + size, 0);
+		start = (index << PAGE_SHIFT) & ~(size - 1);
+		unmap_mapping_range(mapping, start, start + size, 0);
 	}
-	kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
+
+	kill_procs(&to_kill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
 	rc = 0;
 unlock:
-	dax_unlock_page(page, cookie);
+	dax_unlock(mapping, index, cookie);
+	return rc;
+}
+
+static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
+		struct dev_pagemap *pgmap)
+{
+	struct page *page = pfn_to_page(pfn);
+	struct mf_recover_controller mfrc = {
+		.recover_fn = memory_failure_dev_pagemap_kill_procs,
+		.pfn = pfn,
+		.flags = flags,
+	};
+	int rc;
+
+	if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+		/*
+		 * TODO: Handle HMM pages which may need coordination
+		 * with device-side memory.
+		 */
+		goto out;
+	}
+
+	if (hwpoison_filter(page)) {
+		rc = 0;
+		goto out;
+	}
+
+	/*
+	 * Use this flag as an indication that the dax page has been
+	 * remapped UC to prevent speculative consumption of poison.
+	 */
+	SetPageHWPoison(page);
+
+	/* call driver to handle the memory failure */
+	if (pgmap->ops->memory_failure)
+		rc = pgmap->ops->memory_failure(pgmap, &mfrc);
+
 out:
 	/* drop pgmap ref acquired in caller */
 	put_dev_pagemap(pgmap);
-- 
2.28.0




^ permalink raw reply	[flat|nested] 9+ messages in thread

* [RFC PATCH 4/4] fsdax: remove useless (dis)associate functions
  2020-09-15 10:13 [RFC PATCH 0/4] fsdax: introduce fs query to support reflink Shiyang Ruan
                   ` (2 preceding siblings ...)
  2020-09-15 10:13 ` [RFC PATCH 3/4] mm, fsdax: refactor dax handler in memory-failure Shiyang Ruan
@ 2020-09-15 10:13 ` Shiyang Ruan
  3 siblings, 0 replies; 9+ messages in thread
From: Shiyang Ruan @ 2020-09-15 10:13 UTC (permalink / raw)
  To: linux-kernel, linux-xfs, linux-nvdimm, linux-mm
  Cc: linux-fsdevel, darrick.wong, dan.j.williams, david, hch,
	rgoldwyn, qi.fuli, y-goto

Since owner tarcking is triggerred by pmem device, these functions are
useless.  So remove it.

Signed-off-by: Shiyang Ruan <ruansy.fnst@cn.fujitsu.com>
---
 fs/dax.c | 46 ----------------------------------------------
 1 file changed, 46 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 1ec592f0aadd..4c85b07abcc0 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -323,48 +323,6 @@ static unsigned long dax_end_pfn(void *entry)
 	for (pfn = dax_to_pfn(entry); \
 			pfn < dax_end_pfn(entry); pfn++)
 
-/*
- * TODO: for reflink+dax we need a way to associate a single page with
- * multiple address_space instances at different linear_page_index()
- * offsets.
- */
-static void dax_associate_entry(void *entry, struct address_space *mapping,
-		struct vm_area_struct *vma, unsigned long address)
-{
-	unsigned long size = dax_entry_size(entry), pfn, index;
-	int i = 0;
-
-	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
-		return;
-
-	index = linear_page_index(vma, address & ~(size - 1));
-	for_each_mapped_pfn(entry, pfn) {
-		struct page *page = pfn_to_page(pfn);
-
-		WARN_ON_ONCE(page->mapping);
-		page->mapping = mapping;
-		page->index = index + i++;
-	}
-}
-
-static void dax_disassociate_entry(void *entry, struct address_space *mapping,
-		bool trunc)
-{
-	unsigned long pfn;
-
-	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
-		return;
-
-	for_each_mapped_pfn(entry, pfn) {
-		struct page *page = pfn_to_page(pfn);
-
-		WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
-		WARN_ON_ONCE(page->mapping && page->mapping != mapping);
-		page->mapping = NULL;
-		page->index = 0;
-	}
-}
-
 static struct page *dax_busy_page(void *entry)
 {
 	unsigned long pfn;
@@ -516,7 +474,6 @@ static void *grab_mapping_entry(struct xa_state *xas,
 			xas_lock_irq(xas);
 		}
 
-		dax_disassociate_entry(entry, mapping, false);
 		xas_store(xas, NULL);	/* undo the PMD join */
 		dax_wake_entry(xas, entry, true);
 		mapping->nrexceptional--;
@@ -636,7 +593,6 @@ static int __dax_invalidate_entry(struct address_space *mapping,
 	    (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
 	     xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
 		goto out;
-	dax_disassociate_entry(entry, mapping, trunc);
 	xas_store(&xas, NULL);
 	mapping->nrexceptional--;
 	ret = 1;
@@ -730,8 +686,6 @@ static void *dax_insert_entry(struct xa_state *xas,
 	if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
 		void *old;
 
-		dax_disassociate_entry(entry, mapping, false);
-		dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
 		/*
 		 * Only swap our new entry into the page cache if the current
 		 * entry is a zero page or an empty entry.  If a normal PTE or
-- 
2.28.0




^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH 1/4] fs: introduce ->storage_lost() for memory-failure
  2020-09-15 10:13 ` [RFC PATCH 1/4] fs: introduce ->storage_lost() for memory-failure Shiyang Ruan
@ 2020-09-15 16:16   ` Darrick J. Wong
  2020-09-16  2:15     ` Ruan Shiyang
  0 siblings, 1 reply; 9+ messages in thread
From: Darrick J. Wong @ 2020-09-15 16:16 UTC (permalink / raw)
  To: Shiyang Ruan
  Cc: linux-kernel, linux-xfs, linux-nvdimm, linux-mm, linux-fsdevel,
	dan.j.williams, david, hch, rgoldwyn, qi.fuli, y-goto

On Tue, Sep 15, 2020 at 06:13:08PM +0800, Shiyang Ruan wrote:
> This function is used to handle errors which may cause data lost in
> filesystem.  Such as memory-failure in fsdax mode.
> 
> In XFS, it requires "rmapbt" feature in order to query for files or
> metadata which associated to the error block.  Then we could call fs
> recover functions to try to repair the damaged data.(did not implemented
> in this patch)
> 
> After that, the memory-failure also needs to kill processes who are
> using those files.  The struct mf_recover_controller is created to store
> necessary parameters.
> 
> Signed-off-by: Shiyang Ruan <ruansy.fnst@cn.fujitsu.com>
> ---
>  fs/xfs/xfs_super.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/fs.h |  1 +
>  include/linux/mm.h |  6 ++++
>  3 files changed, 87 insertions(+)
> 
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 71ac6c1cdc36..118d9c5d9e1e 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -35,6 +35,10 @@
>  #include "xfs_refcount_item.h"
>  #include "xfs_bmap_item.h"
>  #include "xfs_reflink.h"
> +#include "xfs_alloc.h"
> +#include "xfs_rmap.h"
> +#include "xfs_rmap_btree.h"
> +#include "xfs_bit.h"
>  
>  #include <linux/magic.h>
>  #include <linux/fs_context.h>
> @@ -1104,6 +1108,81 @@ xfs_fs_free_cached_objects(
>  	return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
>  }
>  
> +static int
> +xfs_storage_lost_helper(
> +	struct xfs_btree_cur		*cur,
> +	struct xfs_rmap_irec		*rec,
> +	void				*priv)
> +{
> +	struct xfs_inode		*ip;
> +	struct mf_recover_controller	*mfrc = priv;
> +	int				rc = 0;
> +
> +	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner)) {
> +		// TODO check and try to fix metadata
> +	} else {
> +		/*
> +		 * Get files that incore, filter out others that are not in use.
> +		 */
> +		xfs_iget(cur->bc_mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE,
> +			 0, &ip);

Missing return value check here.

> +		if (!ip)
> +			return 0;
> +		if (!VFS_I(ip)->i_mapping)
> +			goto out;
> +
> +		rc = mfrc->recover_fn(mfrc->pfn, mfrc->flags,
> +				      VFS_I(ip)->i_mapping, rec->rm_offset);
> +
> +		// TODO try to fix data
> +out:
> +		xfs_irele(ip);
> +	}
> +
> +	return rc;
> +}
> +
> +static int
> +xfs_fs_storage_lost(
> +	struct super_block	*sb,
> +	loff_t			offset,

offset into which device?  XFS supports three...

I'm also a little surprised you don't pass in a length.

I guess that means this function will get called repeatedly for every
byte in the poisoned range?

> +	void			*priv)
> +{
> +	struct xfs_mount	*mp = XFS_M(sb);
> +	struct xfs_trans	*tp = NULL;
> +	struct xfs_btree_cur	*cur = NULL;
> +	struct xfs_rmap_irec	rmap_low, rmap_high;
> +	struct xfs_buf		*agf_bp = NULL;
> +	xfs_fsblock_t		fsbno = XFS_B_TO_FSB(mp, offset);
> +	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(mp, fsbno);
> +	xfs_agblock_t		agbno = XFS_FSB_TO_AGBNO(mp, fsbno);
> +	int			error = 0;
> +
> +	error = xfs_trans_alloc_empty(mp, &tp);
> +	if (error)
> +		return error;
> +
> +	error = xfs_alloc_read_agf(mp, tp, agno, 0, &agf_bp);
> +	if (error)
> +		return error;
> +
> +	cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, agno);

...and this is definitely the wrong call sequence if the malfunctioning
device is the realtime device.  If a dax rt device dies, you'll be
shooting down files on the data device, which will cause all sorts of
problems.

Question: Should all this poison recovery stuff go into a new file?
xfs_poison.c?  There's already a lot of code in xfs_super.c.

--D

> +
> +	/* Construct a range for rmap query */
> +	memset(&rmap_low, 0, sizeof(rmap_low));
> +	memset(&rmap_high, 0xFF, sizeof(rmap_high));
> +	rmap_low.rm_startblock = rmap_high.rm_startblock = agbno;
> +
> +	error = xfs_rmap_query_range(cur, &rmap_low, &rmap_high,
> +				     xfs_storage_lost_helper, priv);
> +	if (error == -ECANCELED)
> +		error = 0;
> +
> +	xfs_btree_del_cursor(cur, error);
> +	xfs_trans_brelse(tp, agf_bp);
> +	return error;
> +}
> +
>  static const struct super_operations xfs_super_operations = {
>  	.alloc_inode		= xfs_fs_alloc_inode,
>  	.destroy_inode		= xfs_fs_destroy_inode,
> @@ -1117,6 +1196,7 @@ static const struct super_operations xfs_super_operations = {
>  	.show_options		= xfs_fs_show_options,
>  	.nr_cached_objects	= xfs_fs_nr_cached_objects,
>  	.free_cached_objects	= xfs_fs_free_cached_objects,
> +	.storage_lost		= xfs_fs_storage_lost,
>  };
>  
>  static int
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index e019ea2f1347..bd90485cfdc9 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1951,6 +1951,7 @@ struct super_operations {
>  				  struct shrink_control *);
>  	long (*free_cached_objects)(struct super_block *,
>  				    struct shrink_control *);
> +	int (*storage_lost)(struct super_block *sb, loff_t offset, void *priv);
>  };
>  
>  /*
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 1983e08f5906..3f0c36e1bf3d 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3002,6 +3002,12 @@ extern void shake_page(struct page *p, int access);
>  extern atomic_long_t num_poisoned_pages __read_mostly;
>  extern int soft_offline_page(unsigned long pfn, int flags);
>  
> +struct mf_recover_controller {
> +	int (*recover_fn)(unsigned long pfn, int flags,
> +		struct address_space *mapping, pgoff_t index);
> +	unsigned long pfn;
> +	int flags;
> +};
>  
>  /*
>   * Error handlers for various types of pages.
> -- 
> 2.28.0
> 
> 
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH 2/4] pagemap: introduce ->memory_failure()
  2020-09-15 10:13 ` [RFC PATCH 2/4] pagemap: introduce ->memory_failure() Shiyang Ruan
@ 2020-09-15 16:31   ` Darrick J. Wong
  2020-09-16  2:15     ` Ruan Shiyang
  0 siblings, 1 reply; 9+ messages in thread
From: Darrick J. Wong @ 2020-09-15 16:31 UTC (permalink / raw)
  To: Shiyang Ruan
  Cc: linux-kernel, linux-xfs, linux-nvdimm, linux-mm, linux-fsdevel,
	dan.j.williams, david, hch, rgoldwyn, qi.fuli, y-goto

On Tue, Sep 15, 2020 at 06:13:09PM +0800, Shiyang Ruan wrote:
> When memory-failure occurs, we call this function which is implemented
> by each devices.  For fsdax, pmem device implements it.  Pmem device
> will find out the block device where the error page located in, gets the
> filesystem on this block device, and finally call ->storage_lost() to
> handle the error in filesystem layer.
> 
> Normally, a pmem device may contain one or more partitions, each
> partition contains a block device, each block device contains a
> filesystem.  So we are able to find out the filesystem by one offset on
> this pmem device.  However, in other cases, such as mapped device, I
> didn't find a way to obtain the filesystem laying on it.  It is a
> problem need to be fixed.
> 
> Signed-off-by: Shiyang Ruan <ruansy.fnst@cn.fujitsu.com>
> ---
>  block/genhd.c            | 12 ++++++++++++
>  drivers/nvdimm/pmem.c    | 31 +++++++++++++++++++++++++++++++
>  include/linux/genhd.h    |  2 ++
>  include/linux/memremap.h |  3 +++
>  4 files changed, 48 insertions(+)
> 
> diff --git a/block/genhd.c b/block/genhd.c
> index 99c64641c314..e7442b60683e 100644
> --- a/block/genhd.c
> +++ b/block/genhd.c
> @@ -1063,6 +1063,18 @@ struct block_device *bdget_disk(struct gendisk *disk, int partno)
>  }
>  EXPORT_SYMBOL(bdget_disk);
>  
> +struct block_device *bdget_disk_sector(struct gendisk *disk, sector_t sector)
> +{
> +	struct block_device *bdev = NULL;
> +	struct hd_struct *part = disk_map_sector_rcu(disk, sector);
> +
> +	if (part)
> +		bdev = bdget(part_devt(part));
> +
> +	return bdev;
> +}
> +EXPORT_SYMBOL(bdget_disk_sector);
> +
>  /*
>   * print a full list of all partitions - intended for places where the root
>   * filesystem can't be mounted and thus to give the victim some idea of what
> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
> index fab29b514372..3ed96486c883 100644
> --- a/drivers/nvdimm/pmem.c
> +++ b/drivers/nvdimm/pmem.c
> @@ -364,9 +364,40 @@ static void pmem_release_disk(void *__pmem)
>  	put_disk(pmem->disk);
>  }
>  
> +static int pmem_pagemap_memory_failure(struct dev_pagemap *pgmap,
> +		struct mf_recover_controller *mfrc)
> +{
> +	struct pmem_device *pdev;
> +	struct block_device *bdev;
> +	sector_t disk_sector;
> +	loff_t bdev_offset;
> +
> +	pdev = container_of(pgmap, struct pmem_device, pgmap);
> +	if (!pdev->disk)
> +		return -ENXIO;
> +
> +	disk_sector = (PFN_PHYS(mfrc->pfn) - pdev->phys_addr) >> SECTOR_SHIFT;

Ah, I see, looking at the current x86 MCE code, the MCE handler gets a
physical address, which is then rounded down to a PFN, which is then
blown back up into a byte address(?) and then rounded down to sectors.
That is then blown back up into a byte address and passed on to XFS,
which rounds it down to fs blocksize.

/me wishes that wasn't so convoluted, but reforming the whole mm poison
system to have smaller blast radii isn't the purpose of this patch. :)

> +	bdev = bdget_disk_sector(pdev->disk, disk_sector);
> +	if (!bdev)
> +		return -ENXIO;
> +
> +	// TODO what if block device contains a mapped device

Find its dev_pagemap_ops and invoke its memory_failure function? ;)

> +	if (!bdev->bd_super)
> +		goto out;
> +
> +	bdev_offset = ((disk_sector - get_start_sect(bdev)) << SECTOR_SHIFT) -
> +			pdev->data_offset;
> +	bdev->bd_super->s_op->storage_lost(bdev->bd_super, bdev_offset, mfrc);

->storage_lost is required for all filesystems?

--D

> +
> +out:
> +	bdput(bdev);
> +	return 0;
> +}
> +
>  static const struct dev_pagemap_ops fsdax_pagemap_ops = {
>  	.kill			= pmem_pagemap_kill,
>  	.cleanup		= pmem_pagemap_cleanup,
> +	.memory_failure		= pmem_pagemap_memory_failure,
>  };
>  
>  static int pmem_attach_disk(struct device *dev,
> diff --git a/include/linux/genhd.h b/include/linux/genhd.h
> index 4ab853461dff..16e9e13e0841 100644
> --- a/include/linux/genhd.h
> +++ b/include/linux/genhd.h
> @@ -303,6 +303,8 @@ static inline void add_disk_no_queue_reg(struct gendisk *disk)
>  extern void del_gendisk(struct gendisk *gp);
>  extern struct gendisk *get_gendisk(dev_t dev, int *partno);
>  extern struct block_device *bdget_disk(struct gendisk *disk, int partno);
> +extern struct block_device *bdget_disk_sector(struct gendisk *disk,
> +			sector_t sector);
>  
>  extern void set_device_ro(struct block_device *bdev, int flag);
>  extern void set_disk_ro(struct gendisk *disk, int flag);
> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
> index 5f5b2df06e61..efebefa70d00 100644
> --- a/include/linux/memremap.h
> +++ b/include/linux/memremap.h
> @@ -6,6 +6,7 @@
>  
>  struct resource;
>  struct device;
> +struct mf_recover_controller;
>  
>  /**
>   * struct vmem_altmap - pre-allocated storage for vmemmap_populate
> @@ -87,6 +88,8 @@ struct dev_pagemap_ops {
>  	 * the page back to a CPU accessible page.
>  	 */
>  	vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf);
> +	int (*memory_failure)(struct dev_pagemap *pgmap,
> +			      struct mf_recover_controller *mfrc);
>  };
>  
>  #define PGMAP_ALTMAP_VALID	(1 << 0)
> -- 
> 2.28.0
> 
> 
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH 1/4] fs: introduce ->storage_lost() for memory-failure
  2020-09-15 16:16   ` Darrick J. Wong
@ 2020-09-16  2:15     ` Ruan Shiyang
  0 siblings, 0 replies; 9+ messages in thread
From: Ruan Shiyang @ 2020-09-16  2:15 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: linux-kernel, linux-xfs, linux-nvdimm, linux-mm, linux-fsdevel,
	dan.j.williams, david, hch, rgoldwyn, qi.fuli, y-goto



On 2020/9/16 上午12:16, Darrick J. Wong wrote:
> On Tue, Sep 15, 2020 at 06:13:08PM +0800, Shiyang Ruan wrote:
>> This function is used to handle errors which may cause data lost in
>> filesystem.  Such as memory-failure in fsdax mode.
>>
>> In XFS, it requires "rmapbt" feature in order to query for files or
>> metadata which associated to the error block.  Then we could call fs
>> recover functions to try to repair the damaged data.(did not implemented
>> in this patch)
>>
>> After that, the memory-failure also needs to kill processes who are
>> using those files.  The struct mf_recover_controller is created to store
>> necessary parameters.
>>
>> Signed-off-by: Shiyang Ruan <ruansy.fnst@cn.fujitsu.com>
>> ---
>>   fs/xfs/xfs_super.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++
>>   include/linux/fs.h |  1 +
>>   include/linux/mm.h |  6 ++++
>>   3 files changed, 87 insertions(+)
>>
>> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
>> index 71ac6c1cdc36..118d9c5d9e1e 100644
>> --- a/fs/xfs/xfs_super.c
>> +++ b/fs/xfs/xfs_super.c
>> @@ -35,6 +35,10 @@
>>   #include "xfs_refcount_item.h"
>>   #include "xfs_bmap_item.h"
>>   #include "xfs_reflink.h"
>> +#include "xfs_alloc.h"
>> +#include "xfs_rmap.h"
>> +#include "xfs_rmap_btree.h"
>> +#include "xfs_bit.h"
>>   
>>   #include <linux/magic.h>
>>   #include <linux/fs_context.h>
>> @@ -1104,6 +1108,81 @@ xfs_fs_free_cached_objects(
>>   	return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
>>   }
>>   
>> +static int
>> +xfs_storage_lost_helper(
>> +	struct xfs_btree_cur		*cur,
>> +	struct xfs_rmap_irec		*rec,
>> +	void				*priv)
>> +{
>> +	struct xfs_inode		*ip;
>> +	struct mf_recover_controller	*mfrc = priv;
>> +	int				rc = 0;
>> +
>> +	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner)) {
>> +		// TODO check and try to fix metadata
>> +	} else {
>> +		/*
>> +		 * Get files that incore, filter out others that are not in use.
>> +		 */
>> +		xfs_iget(cur->bc_mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE,
>> +			 0, &ip);
> 
> Missing return value check here.

Yes, I ignored it.  My fault.

> 
>> +		if (!ip)
>> +			return 0;
>> +		if (!VFS_I(ip)->i_mapping)
>> +			goto out;
>> +
>> +		rc = mfrc->recover_fn(mfrc->pfn, mfrc->flags,
>> +				      VFS_I(ip)->i_mapping, rec->rm_offset);
>> +
>> +		// TODO try to fix data
>> +out:
>> +		xfs_irele(ip);
>> +	}
>> +
>> +	return rc;
>> +}
>> +
>> +static int
>> +xfs_fs_storage_lost(
>> +	struct super_block	*sb,
>> +	loff_t			offset,
> 
> offset into which device?  XFS supports three...
> 
> I'm also a little surprised you don't pass in a length.
> 
> I guess that means this function will get called repeatedly for every
> byte in the poisoned range?

The memory-failure will triggered on one PFN each time, so its length 
could be one PAGE_SIZE.  But I think you are right, it's better to tell 
filesystem how much range is effected and need to be fixed.  I didn't 
know but I think there may be some other cases besides memory-failure. 
So the length is necessary.

> 
>> +	void			*priv)
>> +{
>> +	struct xfs_mount	*mp = XFS_M(sb);
>> +	struct xfs_trans	*tp = NULL;
>> +	struct xfs_btree_cur	*cur = NULL;
>> +	struct xfs_rmap_irec	rmap_low, rmap_high;
>> +	struct xfs_buf		*agf_bp = NULL;
>> +	xfs_fsblock_t		fsbno = XFS_B_TO_FSB(mp, offset);
>> +	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(mp, fsbno);
>> +	xfs_agblock_t		agbno = XFS_FSB_TO_AGBNO(mp, fsbno);
>> +	int			error = 0;
>> +
>> +	error = xfs_trans_alloc_empty(mp, &tp);
>> +	if (error)
>> +		return error;
>> +
>> +	error = xfs_alloc_read_agf(mp, tp, agno, 0, &agf_bp);
>> +	if (error)
>> +		return error;
>> +
>> +	cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, agno);
> 
> ...and this is definitely the wrong call sequence if the malfunctioning
> device is the realtime device.  If a dax rt device dies, you'll be
> shooting down files on the data device, which will cause all sorts of
> problems.

I didn't notice that.  Let me think about it.
> 
> Question: Should all this poison recovery stuff go into a new file?
> xfs_poison.c?  There's already a lot of code in xfs_super.c.

Yes, it's a bit too much.  I'll move them into a new file.


--
Thanks,
Ruan Shiyang.
> 
> --D
> 
>> +
>> +	/* Construct a range for rmap query */
>> +	memset(&rmap_low, 0, sizeof(rmap_low));
>> +	memset(&rmap_high, 0xFF, sizeof(rmap_high));
>> +	rmap_low.rm_startblock = rmap_high.rm_startblock = agbno;
>> +
>> +	error = xfs_rmap_query_range(cur, &rmap_low, &rmap_high,
>> +				     xfs_storage_lost_helper, priv);
>> +	if (error == -ECANCELED)
>> +		error = 0;
>> +
>> +	xfs_btree_del_cursor(cur, error);
>> +	xfs_trans_brelse(tp, agf_bp);
>> +	return error;
>> +}
>> +
>>   static const struct super_operations xfs_super_operations = {
>>   	.alloc_inode		= xfs_fs_alloc_inode,
>>   	.destroy_inode		= xfs_fs_destroy_inode,
>> @@ -1117,6 +1196,7 @@ static const struct super_operations xfs_super_operations = {
>>   	.show_options		= xfs_fs_show_options,
>>   	.nr_cached_objects	= xfs_fs_nr_cached_objects,
>>   	.free_cached_objects	= xfs_fs_free_cached_objects,
>> +	.storage_lost		= xfs_fs_storage_lost,
>>   };
>>   
>>   static int
>> diff --git a/include/linux/fs.h b/include/linux/fs.h
>> index e019ea2f1347..bd90485cfdc9 100644
>> --- a/include/linux/fs.h
>> +++ b/include/linux/fs.h
>> @@ -1951,6 +1951,7 @@ struct super_operations {
>>   				  struct shrink_control *);
>>   	long (*free_cached_objects)(struct super_block *,
>>   				    struct shrink_control *);
>> +	int (*storage_lost)(struct super_block *sb, loff_t offset, void *priv);
>>   };
>>   
>>   /*
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index 1983e08f5906..3f0c36e1bf3d 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -3002,6 +3002,12 @@ extern void shake_page(struct page *p, int access);
>>   extern atomic_long_t num_poisoned_pages __read_mostly;
>>   extern int soft_offline_page(unsigned long pfn, int flags);
>>   
>> +struct mf_recover_controller {
>> +	int (*recover_fn)(unsigned long pfn, int flags,
>> +		struct address_space *mapping, pgoff_t index);
>> +	unsigned long pfn;
>> +	int flags;
>> +};
>>   
>>   /*
>>    * Error handlers for various types of pages.
>> -- 
>> 2.28.0
>>
>>
>>
> 
> 



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH 2/4] pagemap: introduce ->memory_failure()
  2020-09-15 16:31   ` Darrick J. Wong
@ 2020-09-16  2:15     ` Ruan Shiyang
  0 siblings, 0 replies; 9+ messages in thread
From: Ruan Shiyang @ 2020-09-16  2:15 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: linux-kernel, linux-xfs, linux-nvdimm, linux-mm, linux-fsdevel,
	dan.j.williams, david, hch, rgoldwyn, qi.fuli, y-goto



On 2020/9/16 上午12:31, Darrick J. Wong wrote:
> On Tue, Sep 15, 2020 at 06:13:09PM +0800, Shiyang Ruan wrote:
>> When memory-failure occurs, we call this function which is implemented
>> by each devices.  For fsdax, pmem device implements it.  Pmem device
>> will find out the block device where the error page located in, gets the
>> filesystem on this block device, and finally call ->storage_lost() to
>> handle the error in filesystem layer.
>>
>> Normally, a pmem device may contain one or more partitions, each
>> partition contains a block device, each block device contains a
>> filesystem.  So we are able to find out the filesystem by one offset on
>> this pmem device.  However, in other cases, such as mapped device, I
>> didn't find a way to obtain the filesystem laying on it.  It is a
>> problem need to be fixed.
>>
>> Signed-off-by: Shiyang Ruan <ruansy.fnst@cn.fujitsu.com>
>> ---
>>   block/genhd.c            | 12 ++++++++++++
>>   drivers/nvdimm/pmem.c    | 31 +++++++++++++++++++++++++++++++
>>   include/linux/genhd.h    |  2 ++
>>   include/linux/memremap.h |  3 +++
>>   4 files changed, 48 insertions(+)
>>
>> diff --git a/block/genhd.c b/block/genhd.c
>> index 99c64641c314..e7442b60683e 100644
>> --- a/block/genhd.c
>> +++ b/block/genhd.c
>> @@ -1063,6 +1063,18 @@ struct block_device *bdget_disk(struct gendisk *disk, int partno)
>>   }
>>   EXPORT_SYMBOL(bdget_disk);
>>   
>> +struct block_device *bdget_disk_sector(struct gendisk *disk, sector_t sector)
>> +{
>> +	struct block_device *bdev = NULL;
>> +	struct hd_struct *part = disk_map_sector_rcu(disk, sector);
>> +
>> +	if (part)
>> +		bdev = bdget(part_devt(part));
>> +
>> +	return bdev;
>> +}
>> +EXPORT_SYMBOL(bdget_disk_sector);
>> +
>>   /*
>>    * print a full list of all partitions - intended for places where the root
>>    * filesystem can't be mounted and thus to give the victim some idea of what
>> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
>> index fab29b514372..3ed96486c883 100644
>> --- a/drivers/nvdimm/pmem.c
>> +++ b/drivers/nvdimm/pmem.c
>> @@ -364,9 +364,40 @@ static void pmem_release_disk(void *__pmem)
>>   	put_disk(pmem->disk);
>>   }
>>   
>> +static int pmem_pagemap_memory_failure(struct dev_pagemap *pgmap,
>> +		struct mf_recover_controller *mfrc)
>> +{
>> +	struct pmem_device *pdev;
>> +	struct block_device *bdev;
>> +	sector_t disk_sector;
>> +	loff_t bdev_offset;
>> +
>> +	pdev = container_of(pgmap, struct pmem_device, pgmap);
>> +	if (!pdev->disk)
>> +		return -ENXIO;
>> +
>> +	disk_sector = (PFN_PHYS(mfrc->pfn) - pdev->phys_addr) >> SECTOR_SHIFT;
> 
> Ah, I see, looking at the current x86 MCE code, the MCE handler gets a
> physical address, which is then rounded down to a PFN, which is then
> blown back up into a byte address(?) and then rounded down to sectors.
> That is then blown back up into a byte address and passed on to XFS,
> which rounds it down to fs blocksize.
> 
> /me wishes that wasn't so convoluted, but reforming the whole mm poison
> system to have smaller blast radii isn't the purpose of this patch. :)
> 
>> +	bdev = bdget_disk_sector(pdev->disk, disk_sector);
>> +	if (!bdev)
>> +		return -ENXIO;
>> +
>> +	// TODO what if block device contains a mapped device
> 
> Find its dev_pagemap_ops and invoke its memory_failure function? ;)

Thanks for pointing out.  I'll think about it in this way.
> 
>> +	if (!bdev->bd_super)
>> +		goto out;
>> +
>> +	bdev_offset = ((disk_sector - get_start_sect(bdev)) << SECTOR_SHIFT) -
>> +			pdev->data_offset;
>> +	bdev->bd_super->s_op->storage_lost(bdev->bd_super, bdev_offset, mfrc);
> 
> ->storage_lost is required for all filesystems?

I think it is required for filesystems that support fsdax, since the 
owner tracking is moved here.  But anyway, there should have a non-NULL 
judgment.


--
Thanks,
Ruan Shiyang.
> 
> --D
> 
>> +
>> +out:
>> +	bdput(bdev);
>> +	return 0;
>> +}
>> +
>>   static const struct dev_pagemap_ops fsdax_pagemap_ops = {
>>   	.kill			= pmem_pagemap_kill,
>>   	.cleanup		= pmem_pagemap_cleanup,
>> +	.memory_failure		= pmem_pagemap_memory_failure,
>>   };
>>   
>>   static int pmem_attach_disk(struct device *dev,
>> diff --git a/include/linux/genhd.h b/include/linux/genhd.h
>> index 4ab853461dff..16e9e13e0841 100644
>> --- a/include/linux/genhd.h
>> +++ b/include/linux/genhd.h
>> @@ -303,6 +303,8 @@ static inline void add_disk_no_queue_reg(struct gendisk *disk)
>>   extern void del_gendisk(struct gendisk *gp);
>>   extern struct gendisk *get_gendisk(dev_t dev, int *partno);
>>   extern struct block_device *bdget_disk(struct gendisk *disk, int partno);
>> +extern struct block_device *bdget_disk_sector(struct gendisk *disk,
>> +			sector_t sector);
>>   
>>   extern void set_device_ro(struct block_device *bdev, int flag);
>>   extern void set_disk_ro(struct gendisk *disk, int flag);
>> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
>> index 5f5b2df06e61..efebefa70d00 100644
>> --- a/include/linux/memremap.h
>> +++ b/include/linux/memremap.h
>> @@ -6,6 +6,7 @@
>>   
>>   struct resource;
>>   struct device;
>> +struct mf_recover_controller;
>>   
>>   /**
>>    * struct vmem_altmap - pre-allocated storage for vmemmap_populate
>> @@ -87,6 +88,8 @@ struct dev_pagemap_ops {
>>   	 * the page back to a CPU accessible page.
>>   	 */
>>   	vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf);
>> +	int (*memory_failure)(struct dev_pagemap *pgmap,
>> +			      struct mf_recover_controller *mfrc);
>>   };
>>   
>>   #define PGMAP_ALTMAP_VALID	(1 << 0)
>> -- 
>> 2.28.0
>>
>>
>>
> 
> 



^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2020-09-16  2:17 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-09-15 10:13 [RFC PATCH 0/4] fsdax: introduce fs query to support reflink Shiyang Ruan
2020-09-15 10:13 ` [RFC PATCH 1/4] fs: introduce ->storage_lost() for memory-failure Shiyang Ruan
2020-09-15 16:16   ` Darrick J. Wong
2020-09-16  2:15     ` Ruan Shiyang
2020-09-15 10:13 ` [RFC PATCH 2/4] pagemap: introduce ->memory_failure() Shiyang Ruan
2020-09-15 16:31   ` Darrick J. Wong
2020-09-16  2:15     ` Ruan Shiyang
2020-09-15 10:13 ` [RFC PATCH 3/4] mm, fsdax: refactor dax handler in memory-failure Shiyang Ruan
2020-09-15 10:13 ` [RFC PATCH 4/4] fsdax: remove useless (dis)associate functions Shiyang Ruan

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).