LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH] mm/damon/vaddr: Safely walk page table
@ 2021-08-27 15:04 SeongJae Park
  2021-08-30 11:52 ` Boehme, Markus
  2021-08-31  9:53 ` David Hildenbrand
  0 siblings, 2 replies; 6+ messages in thread
From: SeongJae Park @ 2021-08-27 15:04 UTC (permalink / raw)
  To: akpm; +Cc: david, markubo, linux-mm, linux-kernel, SeongJae Park

From: SeongJae Park <sjpark@amazon.de>

Commit d7f647622761 ("mm/damon: implement primitives for the virtual
memory address spaces") of linux-mm[1] tries to find the PTE or PMD for
an arbitrary virtual address using 'follow_invalidate_pte()' without
proper locking[2].  This commit fixes the issue by using another page
table walk function, intended for more general use cases, under proper
locking.

[1] https://github.com/hnaz/linux-mm/commit/d7f647622761
[2] https://lore.kernel.org/linux-mm/3b094493-9c1e-6024-bfd5-7eca66399b7e@redhat.com

Fixes: d7f647622761 ("mm/damon: implement primitives for the virtual memory address spaces")
Reported-by: David Hildenbrand <david@redhat.com>
Signed-off-by: SeongJae Park <sjpark@amazon.de>
---
 mm/damon/vaddr.c | 81 +++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 74 insertions(+), 7 deletions(-)

diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 230db7413278..b3677f2ef54b 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -8,10 +8,12 @@
 #define pr_fmt(fmt) "damon-va: " fmt
 
 #include <linux/damon.h>
+#include <linux/hugetlb.h>
 #include <linux/mm.h>
 #include <linux/mmu_notifier.h>
 #include <linux/highmem.h>
 #include <linux/page_idle.h>
+#include <linux/pagewalk.h>
 #include <linux/random.h>
 #include <linux/sched/mm.h>
 #include <linux/slab.h>
@@ -446,14 +448,69 @@ static void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm,
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 }
 
+struct damon_walk_private {
+	pmd_t *pmd;
+	pte_t *pte;
+	spinlock_t *ptl;
+};
+
+static int damon_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
+		struct mm_walk *walk)
+{
+	struct damon_walk_private *priv = walk->private;
+
+	if (pmd_huge(*pmd)) {
+		priv->ptl = pmd_lock(walk->mm, pmd);
+		if (pmd_huge(*pmd)) {
+			priv->pmd = pmd;
+			return 0;
+		}
+		spin_unlock(priv->ptl);
+	}
+
+	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+		return -EINVAL;
+	priv->pte = pte_offset_map_lock(walk->mm, pmd, addr, &priv->ptl);
+	if (!pte_present(*priv->pte)) {
+		pte_unmap_unlock(priv->pte, priv->ptl);
+		priv->pte = NULL;
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static struct mm_walk_ops damon_walk_ops = {
+	.pmd_entry = damon_pmd_entry,
+};
+
+int damon_follow_pte_pmd(struct mm_struct *mm, unsigned long addr,
+		struct damon_walk_private *private)
+{
+	int rc;
+
+	private->pte = NULL;
+	private->pmd = NULL;
+	rc = walk_page_range(mm, addr, addr + 1, &damon_walk_ops, private);
+	if (!rc && !private->pte && !private->pmd)
+		return -EINVAL;
+	return rc;
+}
+
 static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
 {
-	pte_t *pte = NULL;
-	pmd_t *pmd = NULL;
+	struct damon_walk_private walk_result;
+	pte_t *pte;
+	pmd_t *pmd;
 	spinlock_t *ptl;
 
-	if (follow_invalidate_pte(mm, addr, NULL, &pte, &pmd, &ptl))
+	mmap_write_lock(mm);
+	if (damon_follow_pte_pmd(mm, addr, &walk_result)) {
+		mmap_write_unlock(mm);
 		return;
+	}
+	pte = walk_result.pte;
+	pmd = walk_result.pmd;
+	ptl = walk_result.ptl;
 
 	if (pte) {
 		damon_ptep_mkold(pte, mm, addr);
@@ -462,6 +519,7 @@ static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
 		damon_pmdp_mkold(pmd, mm, addr);
 		spin_unlock(ptl);
 	}
+	mmap_write_unlock(mm);
 }
 
 /*
@@ -495,14 +553,21 @@ void damon_va_prepare_access_checks(struct damon_ctx *ctx)
 static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
 			unsigned long *page_sz)
 {
-	pte_t *pte = NULL;
-	pmd_t *pmd = NULL;
+	struct damon_walk_private walk_result;
+	pte_t *pte;
+	pmd_t *pmd;
 	spinlock_t *ptl;
 	struct page *page;
 	bool young = false;
 
-	if (follow_invalidate_pte(mm, addr, NULL, &pte, &pmd, &ptl))
+	mmap_write_lock(mm);
+	if (damon_follow_pte_pmd(mm, addr, &walk_result)) {
+		mmap_write_unlock(mm);
 		return false;
+	}
+	pte = walk_result.pte;
+	pmd = walk_result.pmd;
+	ptl = walk_result.ptl;
 
 	*page_sz = PAGE_SIZE;
 	if (pte) {
@@ -513,7 +578,7 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
 		if (page)
 			put_page(page);
 		pte_unmap_unlock(pte, ptl);
-		return young;
+		goto out;
 	}
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -528,6 +593,8 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
 	*page_sz = ((1UL) << HPAGE_PMD_SHIFT);
 #endif	/* CONFIG_TRANSPARENT_HUGEPAGE */
 
+out:
+	mmap_write_unlock(mm);
 	return young;
 }
 
-- 
2.17.1


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] mm/damon/vaddr: Safely walk page table
  2021-08-27 15:04 [PATCH] mm/damon/vaddr: Safely walk page table SeongJae Park
@ 2021-08-30 11:52 ` Boehme, Markus
  2021-08-31  9:53 ` David Hildenbrand
  1 sibling, 0 replies; 6+ messages in thread
From: Boehme, Markus @ 2021-08-30 11:52 UTC (permalink / raw)
  To: sj38.park, akpm
  Cc: Boehme, Markus, linux-mm, linux-kernel, david, Park, Seongjae

On Fri, 2021-08-27 at 15:04 +0000, SeongJae Park wrote:
> 
> From: SeongJae Park <sjpark@amazon.de>
> 
> Commit d7f647622761 ("mm/damon: implement primitives for the virtual
> memory address spaces") of linux-mm[1] tries to find PTE or PMD for
> arbitrary virtual address using 'follow_invalidate_pte()' without proper
> locking[2].  This commit fixes the issue by using another page table
> walk function for more general use case under proper locking.
> 
> [1] https://github.com/hnaz/linux-mm/commit/d7f647622761
> [2] https://lore.kernel.org/linux-mm/3b094493-9c1e-6024-bfd5-7eca66399b7e@redhat.com
> 
> Fixes: d7f647622761 ("mm/damon: implement primitives for the virtual memory address spaces")
> Reported-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: SeongJae Park <sjpark@amazon.de>
> ---
>  mm/damon/vaddr.c | 81 +++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 74 insertions(+), 7 deletions(-)
> 
> diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
> index 230db7413278..b3677f2ef54b 100644
> --- a/mm/damon/vaddr.c
> +++ b/mm/damon/vaddr.c
> @@ -8,10 +8,12 @@
>  #define pr_fmt(fmt) "damon-va: " fmt
> 
>  #include <linux/damon.h>
> +#include <linux/hugetlb.h>
>  #include <linux/mm.h>
>  #include <linux/mmu_notifier.h>
>  #include <linux/highmem.h>
>  #include <linux/page_idle.h>
> +#include <linux/pagewalk.h>
>  #include <linux/random.h>
>  #include <linux/sched/mm.h>
>  #include <linux/slab.h>
> @@ -446,14 +448,69 @@ static void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm,
>  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>  }
> 
> +struct damon_walk_private {
> +       pmd_t *pmd;
> +       pte_t *pte;
> +       spinlock_t *ptl;
> +};
> +
> +static int damon_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
> +               struct mm_walk *walk)
> +{
> +       struct damon_walk_private *priv = walk->private;
> +
> +       if (pmd_huge(*pmd)) {
> +               priv->ptl = pmd_lock(walk->mm, pmd);
> +               if (pmd_huge(*pmd)) {
> +                       priv->pmd = pmd;
> +                       return 0;
> +               }
> +               spin_unlock(priv->ptl);
> +       }
> +
> +       if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
> +               return -EINVAL;
> +       priv->pte = pte_offset_map_lock(walk->mm, pmd, addr, &priv->ptl);
> +       if (!pte_present(*priv->pte)) {
> +               pte_unmap_unlock(priv->pte, priv->ptl);
> +               priv->pte = NULL;
> +               return -EINVAL;
> +       }
> +       return 0;
> +}
> +
> +static struct mm_walk_ops damon_walk_ops = {
> +       .pmd_entry = damon_pmd_entry,
> +};
> +
> +int damon_follow_pte_pmd(struct mm_struct *mm, unsigned long addr,
> +               struct damon_walk_private *private)
> +{
> +       int rc;
> +
> +       private->pte = NULL;
> +       private->pmd = NULL;
> +       rc = walk_page_range(mm, addr, addr + 1, &damon_walk_ops, private);
> +       if (!rc && !private->pte && !private->pmd)
> +               return -EINVAL;
> +       return rc;
> +}
> +
>  static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
>  {
> -       pte_t *pte = NULL;
> -       pmd_t *pmd = NULL;
> +       struct damon_walk_private walk_result;
> +       pte_t *pte;
> +       pmd_t *pmd;
>         spinlock_t *ptl;
> 
> -       if (follow_invalidate_pte(mm, addr, NULL, &pte, &pmd, &ptl))
> +       mmap_write_lock(mm);
> +       if (damon_follow_pte_pmd(mm, addr, &walk_result)) {
> +               mmap_write_unlock(mm);
>                 return;
> +       }
> +       pte = walk_result.pte;
> +       pmd = walk_result.pmd;
> +       ptl = walk_result.ptl;
> 
>         if (pte) {
>                 damon_ptep_mkold(pte, mm, addr);
> @@ -462,6 +519,7 @@ static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
>                 damon_pmdp_mkold(pmd, mm, addr);
>                 spin_unlock(ptl);
>         }
> +       mmap_write_unlock(mm);
>  }
> 
>  /*
> @@ -495,14 +553,21 @@ void damon_va_prepare_access_checks(struct damon_ctx *ctx)
>  static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
>                         unsigned long *page_sz)
>  {
> -       pte_t *pte = NULL;
> -       pmd_t *pmd = NULL;
> +       struct damon_walk_private walk_result;
> +       pte_t *pte;
> +       pmd_t *pmd;
>         spinlock_t *ptl;
>         struct page *page;
>         bool young = false;
> 
> -       if (follow_invalidate_pte(mm, addr, NULL, &pte, &pmd, &ptl))
> +       mmap_write_lock(mm);
> +       if (damon_follow_pte_pmd(mm, addr, &walk_result)) {
> +               mmap_write_unlock(mm);
>                 return false;
> +       }
> +       pte = walk_result.pte;
> +       pmd = walk_result.pmd;
> +       ptl = walk_result.ptl;
> 
>         *page_sz = PAGE_SIZE;
>         if (pte) {
> @@ -513,7 +578,7 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
>                 if (page)
>                         put_page(page);
>                 pte_unmap_unlock(pte, ptl);
> -               return young;
> +               goto out;
>         }
> 
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> @@ -528,6 +593,8 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
>         *page_sz = ((1UL) << HPAGE_PMD_SHIFT);
>  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
> 
> +out:
> +       mmap_write_unlock(mm);
>         return young;
>  }
> 
> --
> 2.17.1
> 

Reviewed-by: Markus Boehme <markubo@amazon.de>



Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879



^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] mm/damon/vaddr: Safely walk page table
  2021-08-27 15:04 [PATCH] mm/damon/vaddr: Safely walk page table SeongJae Park
  2021-08-30 11:52 ` Boehme, Markus
@ 2021-08-31  9:53 ` David Hildenbrand
  2021-08-31 10:49   ` SeongJae Park
  1 sibling, 1 reply; 6+ messages in thread
From: David Hildenbrand @ 2021-08-31  9:53 UTC (permalink / raw)
  To: SeongJae Park, akpm; +Cc: markubo, linux-mm, linux-kernel, SeongJae Park

On 27.08.21 17:04, SeongJae Park wrote:
> From: SeongJae Park <sjpark@amazon.de>
> 
> Commit d7f647622761 ("mm/damon: implement primitives for the virtual
> memory address spaces") of linux-mm[1] tries to find PTE or PMD for
> arbitrary virtual address using 'follow_invalidate_pte()' without proper
> locking[2].  This commit fixes the issue by using another page table
> walk function for more general use case under proper locking.
> 
> [1] https://github.com/hnaz/linux-mm/commit/d7f647622761
> [2] https://lore.kernel.org/linux-mm/3b094493-9c1e-6024-bfd5-7eca66399b7e@redhat.com
> 
> Fixes: d7f647622761 ("mm/damon: implement primitives for the virtual memory address spaces")
> Reported-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: SeongJae Park <sjpark@amazon.de>
> ---
>   mm/damon/vaddr.c | 81 +++++++++++++++++++++++++++++++++++++++++++-----
>   1 file changed, 74 insertions(+), 7 deletions(-)
> 
> diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
> index 230db7413278..b3677f2ef54b 100644
> --- a/mm/damon/vaddr.c
> +++ b/mm/damon/vaddr.c
> @@ -8,10 +8,12 @@
>   #define pr_fmt(fmt) "damon-va: " fmt
>   
>   #include <linux/damon.h>
> +#include <linux/hugetlb.h>
>   #include <linux/mm.h>
>   #include <linux/mmu_notifier.h>
>   #include <linux/highmem.h>
>   #include <linux/page_idle.h>
> +#include <linux/pagewalk.h>
>   #include <linux/random.h>
>   #include <linux/sched/mm.h>
>   #include <linux/slab.h>
> @@ -446,14 +448,69 @@ static void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm,
>   #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>   }
>   
> +struct damon_walk_private {
> +	pmd_t *pmd;
> +	pte_t *pte;
> +	spinlock_t *ptl;
> +};
> +
> +static int damon_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
> +		struct mm_walk *walk)
> +{
> +	struct damon_walk_private *priv = walk->private;
> +
> +	if (pmd_huge(*pmd)) {
> +		priv->ptl = pmd_lock(walk->mm, pmd);
> +		if (pmd_huge(*pmd)) {
> +			priv->pmd = pmd;
> +			return 0;
> +		}
> +		spin_unlock(priv->ptl);
> +	}
> +
> +	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
> +		return -EINVAL;
> +	priv->pte = pte_offset_map_lock(walk->mm, pmd, addr, &priv->ptl);
> +	if (!pte_present(*priv->pte)) {
> +		pte_unmap_unlock(priv->pte, priv->ptl);
> +		priv->pte = NULL;
> +		return -EINVAL;
> +	}
> +	return 0;
> +}
> +
> +static struct mm_walk_ops damon_walk_ops = {
> +	.pmd_entry = damon_pmd_entry,
> +};
> +
> +int damon_follow_pte_pmd(struct mm_struct *mm, unsigned long addr,
> +		struct damon_walk_private *private)
> +{
> +	int rc;
> +
> +	private->pte = NULL;
> +	private->pmd = NULL;
> +	rc = walk_page_range(mm, addr, addr + 1, &damon_walk_ops, private);
> +	if (!rc && !private->pte && !private->pmd)
> +		return -EINVAL;
> +	return rc;
> +}
> +
>   static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
>   {
> -	pte_t *pte = NULL;
> -	pmd_t *pmd = NULL;
> +	struct damon_walk_private walk_result;
> +	pte_t *pte;
> +	pmd_t *pmd;
>   	spinlock_t *ptl;
>   
> -	if (follow_invalidate_pte(mm, addr, NULL, &pte, &pmd, &ptl))
> +	mmap_write_lock(mm);

Can you elaborate on why mmap_read_lock() isn't sufficient for your use 
case? Write mode might heavily affect DAMON's performance and its impact 
on the workload.


Also, I wonder if it wouldn't be much easier and cleaner to just handle 
it completely in the .pmd_entry callback, instead of returning PMDs, 
PTEs, LOCKs, ... here.

You could have

static struct mm_walk_ops damon_mkold_ops = {
	.pmd_entry = damon_mkold_pmd_entry,
};

and

static struct mm_walk_ops damon_young_ops = {
	.pmd_entry = damon_young_pmd_entry,
};

And then just handle everything completely inside the callback, avoiding 
having to return locked PTEs, PMDs, .... and instead handling it at a 
single location. Simply forward the page_sz pointer in the latter case 
to damon_young_ops.


damon_va_mkold()/damon_va_young() would mostly only call 
walk_page_range() with the right ops and eventually convert some return 
values.

-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] mm/damon/vaddr: Safely walk page table
  2021-08-31  9:53 ` David Hildenbrand
@ 2021-08-31 10:49   ` SeongJae Park
  2021-08-31 11:46     ` David Hildenbrand
  0 siblings, 1 reply; 6+ messages in thread
From: SeongJae Park @ 2021-08-31 10:49 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: SeongJae Park, akpm, markubo, linux-mm, linux-kernel, SeongJae Park

From: SeongJae Park <sjpark@amazon.de>

On Tue, 31 Aug 2021 11:53:05 +0200 David Hildenbrand <david@redhat.com> wrote:

> On 27.08.21 17:04, SeongJae Park wrote:
> > From: SeongJae Park <sjpark@amazon.de>
> > 
> > Commit d7f647622761 ("mm/damon: implement primitives for the virtual
> > memory address spaces") of linux-mm[1] tries to find PTE or PMD for
> > arbitrary virtual address using 'follow_invalidate_pte()' without proper
> > locking[2].  This commit fixes the issue by using another page table
> > walk function for more general use case under proper locking.
> > 
> > [1] https://github.com/hnaz/linux-mm/commit/d7f647622761
> > [2] https://lore.kernel.org/linux-mm/3b094493-9c1e-6024-bfd5-7eca66399b7e@redhat.com
> > 
> > Fixes: d7f647622761 ("mm/damon: implement primitives for the virtual memory address spaces")
> > Reported-by: David Hildenbrand <david@redhat.com>
> > Signed-off-by: SeongJae Park <sjpark@amazon.de>
> > ---
> >   mm/damon/vaddr.c | 81 +++++++++++++++++++++++++++++++++++++++++++-----
> >   1 file changed, 74 insertions(+), 7 deletions(-)
> > 
> > diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
> > index 230db7413278..b3677f2ef54b 100644
> > --- a/mm/damon/vaddr.c
> > +++ b/mm/damon/vaddr.c
> > @@ -8,10 +8,12 @@
> >   #define pr_fmt(fmt) "damon-va: " fmt
> >   
> >   #include <linux/damon.h>
> > +#include <linux/hugetlb.h>
> >   #include <linux/mm.h>
> >   #include <linux/mmu_notifier.h>
> >   #include <linux/highmem.h>
> >   #include <linux/page_idle.h>
> > +#include <linux/pagewalk.h>
> >   #include <linux/random.h>
> >   #include <linux/sched/mm.h>
> >   #include <linux/slab.h>
> > @@ -446,14 +448,69 @@ static void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm,
> >   #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
> >   }
> >   
> > +struct damon_walk_private {
> > +	pmd_t *pmd;
> > +	pte_t *pte;
> > +	spinlock_t *ptl;
> > +};
> > +
> > +static int damon_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
> > +		struct mm_walk *walk)
> > +{
> > +	struct damon_walk_private *priv = walk->private;
> > +
> > +	if (pmd_huge(*pmd)) {
> > +		priv->ptl = pmd_lock(walk->mm, pmd);
> > +		if (pmd_huge(*pmd)) {
> > +			priv->pmd = pmd;
> > +			return 0;
> > +		}
> > +		spin_unlock(priv->ptl);
> > +	}
> > +
> > +	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
> > +		return -EINVAL;
> > +	priv->pte = pte_offset_map_lock(walk->mm, pmd, addr, &priv->ptl);
> > +	if (!pte_present(*priv->pte)) {
> > +		pte_unmap_unlock(priv->pte, priv->ptl);
> > +		priv->pte = NULL;
> > +		return -EINVAL;
> > +	}
> > +	return 0;
> > +}
> > +
> > +static struct mm_walk_ops damon_walk_ops = {
> > +	.pmd_entry = damon_pmd_entry,
> > +};
> > +
> > +int damon_follow_pte_pmd(struct mm_struct *mm, unsigned long addr,
> > +		struct damon_walk_private *private)
> > +{
> > +	int rc;
> > +
> > +	private->pte = NULL;
> > +	private->pmd = NULL;
> > +	rc = walk_page_range(mm, addr, addr + 1, &damon_walk_ops, private);
> > +	if (!rc && !private->pte && !private->pmd)
> > +		return -EINVAL;
> > +	return rc;
> > +}
> > +
> >   static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
> >   {
> > -	pte_t *pte = NULL;
> > -	pmd_t *pmd = NULL;
> > +	struct damon_walk_private walk_result;
> > +	pte_t *pte;
> > +	pmd_t *pmd;
> >   	spinlock_t *ptl;
> >   
> > -	if (follow_invalidate_pte(mm, addr, NULL, &pte, &pmd, &ptl))
> > +	mmap_write_lock(mm);
> 
> Can you elaborate why mmap_read_lock() isn't sufficient for your use 
> case? The write mode might heavily affect damon performance and workload 
> impact.

Because, as you also mentioned in the previous mail, 'we can walk page tables
ignoring VMAs with the mmap semaphore held in write mode', and in this case we
don't know which VMA the address belongs to.  I thought the link to the mail
would help people understand the reason.  But, as you are suggesting, I now
think putting a more detailed explanation here would be much better.  I will
also add a warning about the possible performance impact.

> 
> 
> Also, I wonder if it wouldn't be much easier and cleaner to just handle 
> it completely in the .pmd_entry callback, instead of returning PMDs, 
> PTEs, LOCKs, ... here.
> 
> You could have
> 
> static struct mm_walk_ops damon_mkold_ops = {
> 	.pmd_entry = damon_mkold_pmd_entry,
> };
> 
> and
> 
> static struct mm_walk_ops damon_young_ops = {
> 	.pmd_entry = damon_young_pmd_entry,
> };
> 
> And then just handle everything completely inside the callback, avoiding 
> having to return locked PTEs, PMDs, .... and instead handling it at a 
> single location. Simply forward the page_sz pointer in the latter case 
> to damon_young_ops.
> 
> 
> damon_va_mkold()/damon_va_young() would mostly only call 
> walk_page_range() with the right ops and eventually convert some return 
> values.

I just wanted to make the change as small as possible, but you're right.  That
must be much cleaner.  I will post the next version soon.


Thanks,
SeongJae Park

> 
> -- 
> Thanks,
> 
> David / dhildenb

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] mm/damon/vaddr: Safely walk page table
  2021-08-31 10:49   ` SeongJae Park
@ 2021-08-31 11:46     ` David Hildenbrand
  2021-08-31 11:56       ` SeongJae Park
  0 siblings, 1 reply; 6+ messages in thread
From: David Hildenbrand @ 2021-08-31 11:46 UTC (permalink / raw)
  To: SeongJae Park; +Cc: akpm, markubo, linux-mm, linux-kernel, SeongJae Park

On 31.08.21 12:49, SeongJae Park wrote:
> From: SeongJae Park <sjpark@amazon.de>
> 
> On Tue, 31 Aug 2021 11:53:05 +0200 David Hildenbrand <david@redhat.com> wrote:
> 
>> On 27.08.21 17:04, SeongJae Park wrote:
>>> From: SeongJae Park <sjpark@amazon.de>
>>>
>>> Commit d7f647622761 ("mm/damon: implement primitives for the virtual
>>> memory address spaces") of linux-mm[1] tries to find PTE or PMD for
>>> arbitrary virtual address using 'follow_invalidate_pte()' without proper
>>> locking[2].  This commit fixes the issue by using another page table
>>> walk function for more general use case under proper locking.
>>>
>>> [1] https://github.com/hnaz/linux-mm/commit/d7f647622761
>>> [2] https://lore.kernel.org/linux-mm/3b094493-9c1e-6024-bfd5-7eca66399b7e@redhat.com
>>>
>>> Fixes: d7f647622761 ("mm/damon: implement primitives for the virtual memory address spaces")
>>> Reported-by: David Hildenbrand <david@redhat.com>
>>> Signed-off-by: SeongJae Park <sjpark@amazon.de>
>>> ---
>>>    mm/damon/vaddr.c | 81 +++++++++++++++++++++++++++++++++++++++++++-----
>>>    1 file changed, 74 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
>>> index 230db7413278..b3677f2ef54b 100644
>>> --- a/mm/damon/vaddr.c
>>> +++ b/mm/damon/vaddr.c
>>> @@ -8,10 +8,12 @@
>>>    #define pr_fmt(fmt) "damon-va: " fmt
>>>    
>>>    #include <linux/damon.h>
>>> +#include <linux/hugetlb.h>
>>>    #include <linux/mm.h>
>>>    #include <linux/mmu_notifier.h>
>>>    #include <linux/highmem.h>
>>>    #include <linux/page_idle.h>
>>> +#include <linux/pagewalk.h>
>>>    #include <linux/random.h>
>>>    #include <linux/sched/mm.h>
>>>    #include <linux/slab.h>
>>> @@ -446,14 +448,69 @@ static void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm,
>>>    #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>>>    }
>>>    
>>> +struct damon_walk_private {
>>> +	pmd_t *pmd;
>>> +	pte_t *pte;
>>> +	spinlock_t *ptl;
>>> +};
>>> +
>>> +static int damon_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
>>> +		struct mm_walk *walk)
>>> +{
>>> +	struct damon_walk_private *priv = walk->private;
>>> +
>>> +	if (pmd_huge(*pmd)) {
>>> +		priv->ptl = pmd_lock(walk->mm, pmd);
>>> +		if (pmd_huge(*pmd)) {
>>> +			priv->pmd = pmd;
>>> +			return 0;
>>> +		}
>>> +		spin_unlock(priv->ptl);
>>> +	}
>>> +
>>> +	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
>>> +		return -EINVAL;
>>> +	priv->pte = pte_offset_map_lock(walk->mm, pmd, addr, &priv->ptl);
>>> +	if (!pte_present(*priv->pte)) {
>>> +		pte_unmap_unlock(priv->pte, priv->ptl);
>>> +		priv->pte = NULL;
>>> +		return -EINVAL;
>>> +	}
>>> +	return 0;
>>> +}
>>> +
>>> +static struct mm_walk_ops damon_walk_ops = {
>>> +	.pmd_entry = damon_pmd_entry,
>>> +};
>>> +
>>> +int damon_follow_pte_pmd(struct mm_struct *mm, unsigned long addr,
>>> +		struct damon_walk_private *private)
>>> +{
>>> +	int rc;
>>> +
>>> +	private->pte = NULL;
>>> +	private->pmd = NULL;
>>> +	rc = walk_page_range(mm, addr, addr + 1, &damon_walk_ops, private);
>>> +	if (!rc && !private->pte && !private->pmd)
>>> +		return -EINVAL;
>>> +	return rc;
>>> +}
>>> +
>>>    static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
>>>    {
>>> -	pte_t *pte = NULL;
>>> -	pmd_t *pmd = NULL;
>>> +	struct damon_walk_private walk_result;
>>> +	pte_t *pte;
>>> +	pmd_t *pmd;
>>>    	spinlock_t *ptl;
>>>    
>>> -	if (follow_invalidate_pte(mm, addr, NULL, &pte, &pmd, &ptl))
>>> +	mmap_write_lock(mm);
>>
>> Can you elaborate why mmap_read_lock() isn't sufficient for your use
>> case? The write mode might heavily affect damon performance and workload
>> impact.
> 
> Because as you also mentioned in the previous mail, 'we can walk page tables
> ignoring VMAs with the mmap semaphore held in write mode', and in this case we
> don't know to which VMA the address is belong.  I thought the link to the mail
> can help people understanding the reason.  But, as you are suggesting, I now
> think putting an elaborated explanation here would be much better.  I will also
> put a warning for the possible performance impact.

walk_page_range() makes sure to skip any VMA holes and only walks ranges 
within VMAs. With the mmap sem in read mode, the VMA layout (mostly) 
cannot change, so calling walk_page_range() is fine. So pagewalk.c 
properly takes care of VMAs.

As an example, take a look at MADV_COLD handling in mm/madvise.c.

madvise_need_mmap_write() returns "0", and we end up calling 
madvise_cold()->...->walk_page_range() with mmap_read_lock() held.

You can exclude any VMAs you don't care about in the test_walk() 
callback, if required.

-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] mm/damon/vaddr: Safely walk page table
  2021-08-31 11:46     ` David Hildenbrand
@ 2021-08-31 11:56       ` SeongJae Park
  0 siblings, 0 replies; 6+ messages in thread
From: SeongJae Park @ 2021-08-31 11:56 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: SeongJae Park, akpm, markubo, linux-mm, linux-kernel, SeongJae Park

From: SeongJae Park <sjpark@amazon.de>

On Tue, 31 Aug 2021 13:46:42 +0200 David Hildenbrand <david@redhat.com> wrote:

> On 31.08.21 12:49, SeongJae Park wrote:
> > From: SeongJae Park <sjpark@amazon.de>
> > 
> > On Tue, 31 Aug 2021 11:53:05 +0200 David Hildenbrand <david@redhat.com> wrote:
> > 
> >> On 27.08.21 17:04, SeongJae Park wrote:
> >>> From: SeongJae Park <sjpark@amazon.de>
> >>>
> >>> Commit d7f647622761 ("mm/damon: implement primitives for the virtual
> >>> memory address spaces") of linux-mm[1] tries to find PTE or PMD for
> >>> arbitrary virtual address using 'follow_invalidate_pte()' without proper
> >>> locking[2].  This commit fixes the issue by using another page table
> >>> walk function for more general use case under proper locking.
> >>>
> >>> [1] https://github.com/hnaz/linux-mm/commit/d7f647622761
> >>> [2] https://lore.kernel.org/linux-mm/3b094493-9c1e-6024-bfd5-7eca66399b7e@redhat.com
> >>>
> >>> Fixes: d7f647622761 ("mm/damon: implement primitives for the virtual memory address spaces")
> >>> Reported-by: David Hildenbrand <david@redhat.com>
> >>> Signed-off-by: SeongJae Park <sjpark@amazon.de>
> >>> ---
> >>>    mm/damon/vaddr.c | 81 +++++++++++++++++++++++++++++++++++++++++++-----
> >>>    1 file changed, 74 insertions(+), 7 deletions(-)
> >>>
> >>> diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
> >>> index 230db7413278..b3677f2ef54b 100644
> >>> --- a/mm/damon/vaddr.c
> >>> +++ b/mm/damon/vaddr.c
> >>> @@ -8,10 +8,12 @@
> >>>    #define pr_fmt(fmt) "damon-va: " fmt
> >>>    
> >>>    #include <linux/damon.h>
> >>> +#include <linux/hugetlb.h>
> >>>    #include <linux/mm.h>
> >>>    #include <linux/mmu_notifier.h>
> >>>    #include <linux/highmem.h>
> >>>    #include <linux/page_idle.h>
> >>> +#include <linux/pagewalk.h>
> >>>    #include <linux/random.h>
> >>>    #include <linux/sched/mm.h>
> >>>    #include <linux/slab.h>
> >>> @@ -446,14 +448,69 @@ static void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm,
> >>>    #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
> >>>    }
> >>>    
> >>> +struct damon_walk_private {
> >>> +	pmd_t *pmd;
> >>> +	pte_t *pte;
> >>> +	spinlock_t *ptl;
> >>> +};
> >>> +
> >>> +static int damon_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
> >>> +		struct mm_walk *walk)
> >>> +{
> >>> +	struct damon_walk_private *priv = walk->private;
> >>> +
> >>> +	if (pmd_huge(*pmd)) {
> >>> +		priv->ptl = pmd_lock(walk->mm, pmd);
> >>> +		if (pmd_huge(*pmd)) {
> >>> +			priv->pmd = pmd;
> >>> +			return 0;
> >>> +		}
> >>> +		spin_unlock(priv->ptl);
> >>> +	}
> >>> +
> >>> +	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
> >>> +		return -EINVAL;
> >>> +	priv->pte = pte_offset_map_lock(walk->mm, pmd, addr, &priv->ptl);
> >>> +	if (!pte_present(*priv->pte)) {
> >>> +		pte_unmap_unlock(priv->pte, priv->ptl);
> >>> +		priv->pte = NULL;
> >>> +		return -EINVAL;
> >>> +	}
> >>> +	return 0;
> >>> +}
> >>> +
> >>> +static struct mm_walk_ops damon_walk_ops = {
> >>> +	.pmd_entry = damon_pmd_entry,
> >>> +};
> >>> +
> >>> +int damon_follow_pte_pmd(struct mm_struct *mm, unsigned long addr,
> >>> +		struct damon_walk_private *private)
> >>> +{
> >>> +	int rc;
> >>> +
> >>> +	private->pte = NULL;
> >>> +	private->pmd = NULL;
> >>> +	rc = walk_page_range(mm, addr, addr + 1, &damon_walk_ops, private);
> >>> +	if (!rc && !private->pte && !private->pmd)
> >>> +		return -EINVAL;
> >>> +	return rc;
> >>> +}
> >>> +
> >>>    static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
> >>>    {
> >>> -	pte_t *pte = NULL;
> >>> -	pmd_t *pmd = NULL;
> >>> +	struct damon_walk_private walk_result;
> >>> +	pte_t *pte;
> >>> +	pmd_t *pmd;
> >>>    	spinlock_t *ptl;
> >>>    
> >>> -	if (follow_invalidate_pte(mm, addr, NULL, &pte, &pmd, &ptl))
> >>> +	mmap_write_lock(mm);
> >>
> >> Can you elaborate why mmap_read_lock() isn't sufficient for your use
> >> case? The write mode might heavily affect damon performance and workload
> >> impact.
> > 
> > Because as you also mentioned in the previous mail, 'we can walk page tables
> > ignoring VMAs with the mmap semaphore held in write mode', and in this case we
> > don't know to which VMA the address is belong.  I thought the link to the mail
> > can help people understanding the reason.  But, as you are suggesting, I now
> > think putting an elaborated explanation here would be much better.  I will also
> > put a warning for the possible performance impact.
> 
> walk_page_range() make sure to skip any VMA holes and only walks ranges 
> within VMAs. With the mmap sem in read mode, the VMA layout (mostly) 
> cannot change, so calling walk_page_range() is fine. So pagewalk.c 
> properly takes care of VMAs.
> 
> As an example, take a look at MADV_COLD handling in mm/madvise.c.
> 
> madvise_need_mmap_write() returns "0", and we end up calling 
> madvise_cold()->...->walk_page_range() with mmap_read_lock() held.

Oops, can't believe how I missed that.  I will hold only mmap read lock...


Thanks,
SJ

> 
> You can exclude any VMAs you don't care about in the test_walk() 
> callback, if required.
> 
> -- 
> Thanks,
> 
> David / dhildenb

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2021-08-31 11:56 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-27 15:04 [PATCH] mm/damon/vaddr: Safely walk page table SeongJae Park
2021-08-30 11:52 ` Boehme, Markus
2021-08-31  9:53 ` David Hildenbrand
2021-08-31 10:49   ` SeongJae Park
2021-08-31 11:46     ` David Hildenbrand
2021-08-31 11:56       ` SeongJae Park

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).