LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [patch 1/3 take2] smaps: extract pte walker from smaps code
@ 2007-02-07  6:15 David Rientjes
  2007-02-07  6:15 ` [patch 2/3 take2] smaps: add pages referenced count to smaps David Rientjes
                   ` (2 more replies)
  0 siblings, 3 replies; 12+ messages in thread
From: David Rientjes @ 2007-02-07  6:15 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Hugh Dickins, Paul Mundt, Christoph Lameter, linux-kernel

Extracts the page table entry walker from the smaps-specific code in
fs/proc/task_mmu.c.  This will be used later for clearing the reference
bits on pages to measure the number of pages accessed over a time period
through /proc/pid/smaps.

The new struct pte_walker includes the struct vm_area_struct of the memory
to walk over.  Iteration begins at the start address and completes at the
end address.  A pointer to another data structure may be stored in the
private field such as the struct mem_size_stats, which acts as the smaps
accumulator.  For each page table entry in the VMA, the func function is
called with the corresponding struct pte_walker, the pte_t, and its
address.

Since the PTE walker is now extracted from the smaps code,
smaps_pte_func() is invoked for each PTE in the VMA.  Its behavior is
identical to the existing implementation, except it is slightly slower
because each PTE now invokes a function call.

Cc: Hugh Dickins <hugh@veritas.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Christoph Lameter <clameter@sgi.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 fs/proc/task_mmu.c |  126 ++++++++++++++++++++++++++++++++++------------------
 1 files changed, 82 insertions(+), 44 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 55ade0d..e87824b 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -122,6 +122,15 @@ struct mem_size_stats
 	unsigned long private_dirty;
 };
 
+struct pte_walker {
+	struct vm_area_struct *vma;	/* VMA */
+	unsigned long start;		/* start address */
+	unsigned long end;		/* end address */
+	void *private;			/* private data */
+	/* function to invoke for each pte in the above range */
+	void (*func)(struct pte_walker*, pte_t*, unsigned long);
+};
+
 static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss)
 {
 	struct proc_maps_private *priv = m->private;
@@ -204,98 +213,127 @@ static int show_map(struct seq_file *m, void *v)
 	return show_map_internal(m, v, NULL);
 }
 
-static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
-				unsigned long addr, unsigned long end,
-				struct mem_size_stats *mss)
+/*
+ * Walks each PTE in the struct pte_walker address range and calls
+ * walker->func() for each entry.
+ */
+static void walk_ptes(struct pte_walker *walker, pmd_t *pmd)
 {
-	pte_t *pte, ptent;
+	struct vm_area_struct *vma = walker->vma;
+	unsigned long addr = walker->start;
 	spinlock_t *ptl;
-	struct page *page;
+	pte_t *pte;
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	do {
-		ptent = *pte;
-		if (!pte_present(ptent))
-			continue;
-
-		mss->resident += PAGE_SIZE;
-
-		page = vm_normal_page(vma, addr, ptent);
-		if (!page)
-			continue;
-
-		if (page_mapcount(page) >= 2) {
-			if (pte_dirty(ptent))
-				mss->shared_dirty += PAGE_SIZE;
-			else
-				mss->shared_clean += PAGE_SIZE;
-		} else {
-			if (pte_dirty(ptent))
-				mss->private_dirty += PAGE_SIZE;
-			else
-				mss->private_clean += PAGE_SIZE;
-		}
-	} while (pte++, addr += PAGE_SIZE, addr != end);
+		walker->func(walker, pte, addr);
+	} while (pte++, addr += PAGE_SIZE, addr != walker->end);
 	pte_unmap_unlock(pte - 1, ptl);
 	cond_resched();
 }
 
-static inline void smaps_pmd_range(struct vm_area_struct *vma, pud_t *pud,
-				unsigned long addr, unsigned long end,
-				struct mem_size_stats *mss)
+static inline void walk_pmds(struct pte_walker *walker, pud_t *pud)
 {
-	pmd_t *pmd;
+	unsigned long addr = walker->start;
+	unsigned long end = walker->end;
 	unsigned long next;
+	pmd_t *pmd;
 
 	pmd = pmd_offset(pud, addr);
 	do {
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		smaps_pte_range(vma, pmd, addr, next, mss);
+		walk_ptes(walker, pmd);
 	} while (pmd++, addr = next, addr != end);
 }
 
-static inline void smaps_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
-				unsigned long addr, unsigned long end,
-				struct mem_size_stats *mss)
+static inline void walk_puds(struct pte_walker *walker, pgd_t *pgd)
 {
-	pud_t *pud;
+	unsigned long addr = walker->start;
+	unsigned long end = walker->end;
 	unsigned long next;
+	pud_t *pud;
 
 	pud = pud_offset(pgd, addr);
 	do {
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		smaps_pmd_range(vma, pud, addr, next, mss);
+		walk_pmds(walker, pud);
 	} while (pud++, addr = next, addr != end);
 }
 
-static inline void smaps_pgd_range(struct vm_area_struct *vma,
-				unsigned long addr, unsigned long end,
-				struct mem_size_stats *mss)
+static inline void walk_pgds(struct pte_walker *walker)
 {
-	pgd_t *pgd;
+	unsigned long addr = walker->start;
+	unsigned long end = walker->end;
 	unsigned long next;
+	pgd_t *pgd;
 
-	pgd = pgd_offset(vma->vm_mm, addr);
+	pgd = pgd_offset(walker->vma->vm_mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		smaps_pud_range(vma, pgd, addr, next, mss);
+		walk_puds(walker, pgd);
 	} while (pgd++, addr = next, addr != end);
 }
 
+/*
+ * Called for each PTE in the struct pte_walker address range.  For all normal,
+ * present pages, we accumulate the size (in pages) grouped by shared and
+ * private attributes and dirty bits.
+ */
+static void smaps_pte_func(struct pte_walker *walker, pte_t *pte,
+			   unsigned long addr)
+{
+	struct mem_size_stats *mss = walker->private;
+	struct page *page;
+	pte_t ptent;
+
+	ptent = *pte;
+	if (!pte_present(ptent))
+		return;
+
+	mss->resident += PAGE_SIZE;
+
+	page = vm_normal_page(walker->vma, addr, ptent);
+	if (!page)
+		return;
+
+	if (page_mapcount(page) >= 2) {
+		if (pte_dirty(ptent))
+			mss->shared_dirty += PAGE_SIZE;
+		else
+			mss->shared_clean += PAGE_SIZE;
+	} else {
+		if (pte_dirty(ptent))
+			mss->private_dirty += PAGE_SIZE;
+		else
+			mss->private_clean += PAGE_SIZE;
+	}
+}
+
+/*
+ * Displays the smap for the process.  smaps_pte_func() is called for each PTE
+ * in the range from vma->vm_start to vma->vm_end.
+ */
 static int show_smap(struct seq_file *m, void *v)
 {
 	struct vm_area_struct *vma = v;
 	struct mem_size_stats mss;
+	struct pte_walker walker = {
+		.vma		= vma,
+		.start		= vma->vm_start,
+		.end		= vma->vm_end,
+		.private	= &mss,
+		.func		= smaps_pte_func,
+	};
 
 	memset(&mss, 0, sizeof mss);
 	if (vma->vm_mm && !is_vm_hugetlb_page(vma))
-		smaps_pgd_range(vma, vma->vm_start, vma->vm_end, &mss);
+		walk_pgds(&walker);
 	return show_map_internal(m, v, &mss);
 }
 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [patch 2/3 take2] smaps: add pages referenced count to smaps
  2007-02-07  6:15 [patch 1/3 take2] smaps: extract pte walker from smaps code David Rientjes
@ 2007-02-07  6:15 ` David Rientjes
  2007-02-07  6:15   ` [patch 3/3 take2] smaps: add clear_refs file to clear reference David Rientjes
  2007-02-07  6:29 ` [patch 1/3 take2] smaps: extract pte walker from smaps code Paul Mundt
  2007-02-07  8:30 ` Christoph Lameter
  2 siblings, 1 reply; 12+ messages in thread
From: David Rientjes @ 2007-02-07  6:15 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Hugh Dickins, Paul Mundt, Christoph Lameter, linux-kernel

Adds an additional unsigned long field to struct mem_size_stats:
referenced.  For each page table entry that is walked for the VMA in the
smaps code, this field is incremented by PAGE_SIZE if it has pte-reference
bits.

An additional line was added to the /proc/pid/smaps output for each VMA
to indicate how many pages within it are currently marked as accessed
(referenced).

Cc: Hugh Dickins <hugh@veritas.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Christoph Lameter <clameter@sgi.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 fs/proc/task_mmu.c |   20 +++++++++++++-------
 1 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index e87824b..50bd004 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -120,6 +120,7 @@ struct mem_size_stats
 	unsigned long shared_dirty;
 	unsigned long private_clean;
 	unsigned long private_dirty;
+	unsigned long referenced;
 };
 
 struct pte_walker {
@@ -190,18 +191,20 @@ static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats
 
 	if (mss)
 		seq_printf(m,
-			   "Size:          %8lu kB\n"
-			   "Rss:           %8lu kB\n"
-			   "Shared_Clean:  %8lu kB\n"
-			   "Shared_Dirty:  %8lu kB\n"
-			   "Private_Clean: %8lu kB\n"
-			   "Private_Dirty: %8lu kB\n",
+			   "Size:           %8lu kB\n"
+			   "Rss:            %8lu kB\n"
+			   "Shared_Clean:   %8lu kB\n"
+			   "Shared_Dirty:   %8lu kB\n"
+			   "Private_Clean:  %8lu kB\n"
+			   "Private_Dirty:  %8lu kB\n"
+			   "Pgs_Referenced: %8lu kB\n",
 			   (vma->vm_end - vma->vm_start) >> 10,
 			   mss->resident >> 10,
 			   mss->shared_clean  >> 10,
 			   mss->shared_dirty  >> 10,
 			   mss->private_clean >> 10,
-			   mss->private_dirty >> 10);
+			   mss->private_dirty >> 10,
+			   mss->referenced >> 10);
 
 	if (m->count < m->size)  /* vma is copied successfully */
 		m->version = (vma != get_gate_vma(task))? vma->vm_start: 0;
@@ -302,6 +305,9 @@ static void smaps_pte_func(struct pte_walker *walker, pte_t *pte,
 	if (!page)
 		return;
 
+	/* Accumulate the number of pages that have been accessed. */
+	if (pte_young(ptent) || PageReferenced(page))
+		mss->referenced += PAGE_SIZE;
 	if (page_mapcount(page) >= 2) {
 		if (pte_dirty(ptent))
 			mss->shared_dirty += PAGE_SIZE;

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [patch 3/3 take2] smaps: add clear_refs file to clear reference
  2007-02-07  6:15 ` [patch 2/3 take2] smaps: add pages referenced count to smaps David Rientjes
@ 2007-02-07  6:15   ` David Rientjes
  2007-02-07  6:33     ` Andrew Morton
  0 siblings, 1 reply; 12+ messages in thread
From: David Rientjes @ 2007-02-07  6:15 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Hugh Dickins, Paul Mundt, Christoph Lameter, linux-kernel

Adds an additional file to /proc/pid: clear_refs.  When any non-zero
number is written to this file, all the PG_referenced flags and
PAGE_ACCESSED (meaning the page has been accessed) are cleared within each
VMA for the corresponding task.

It is now possible to measure how much memory a task is using by clearing
the reference bits with

	echo 1 > /proc/pid/clear_refs

and checking the reference count for each VMA from the /proc/pid/smaps
output at a time interval later.

The /proc/pid/clear_refs file is only writable by the user who owns the
task.

Cc: Hugh Dickins <hugh@veritas.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Christoph Lameter <clameter@sgi.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 fs/proc/base.c          |   31 +++++++++++++++++++++++++++++++
 fs/proc/task_mmu.c      |   37 +++++++++++++++++++++++++++++++++++++
 include/linux/proc_fs.h |    1 +
 3 files changed, 69 insertions(+), 0 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 1a979ea..b50315f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -715,6 +715,35 @@ static struct file_operations proc_oom_adjust_operations = {
 	.write		= oom_adjust_write,
 };
 
+static ssize_t clear_refs_write(struct file *file, const char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	struct task_struct *task;
+	char buffer[PROC_NUMBUF], *end;
+
+	memset(buffer, 0, sizeof(buffer));
+	if (count > sizeof(buffer) - 1)
+		count = sizeof(buffer) - 1;
+	if (copy_from_user(buffer, buf, count))
+		return -EFAULT;
+	if (!simple_strtol(buffer, &end, 0))
+		return -EINVAL;
+	if (*end == '\n')
+		end++;
+	task = get_proc_task(file->f_path.dentry->d_inode);
+	if (!task)
+		return -ESRCH;
+	clear_refs_smap(task->mm->mmap);
+	put_task_struct(task);
+	if (end - buffer == 0)
+		return -EIO;
+	return end - buffer;
+}
+
+static struct file_operations proc_clear_refs_operations = {
+	.write		= clear_refs_write,
+};
+
 #ifdef CONFIG_AUDITSYSCALL
 #define TMPBUFLEN 21
 static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
@@ -1856,6 +1885,7 @@ static struct pid_entry tgid_base_stuff[] = {
 	REG("mounts",     S_IRUGO, mounts),
 	REG("mountstats", S_IRUSR, mountstats),
 #ifdef CONFIG_MMU
+	REG("clear_refs", S_IWUSR, clear_refs),
 	REG("smaps",      S_IRUGO, smaps),
 #endif
 #ifdef CONFIG_SECURITY
@@ -2137,6 +2167,7 @@ static struct pid_entry tid_base_stuff[] = {
 	LNK("exe",       exe),
 	REG("mounts",    S_IRUGO, mounts),
 #ifdef CONFIG_MMU
+	REG("clear_refs", S_IWUSR, clear_refs),
 	REG("smaps",     S_IRUGO, smaps),
 #endif
 #ifdef CONFIG_SECURITY
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 50bd004..b689a92 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -322,6 +322,27 @@ static void smaps_pte_func(struct pte_walker *walker, pte_t *pte,
 }
 
 /*
+ * Called for each PTE in the struct pte_walker address range.  For all normal,
+ * present pages, we clear their referenced bits.
+ */
+static void clear_refs_pte_func(struct pte_walker *walker, pte_t *pte,
+				unsigned long addr)
+{
+	struct page *page;
+	pte_t ptent;
+
+	ptent = *pte;
+	if (!pte_present(ptent))
+		return;
+
+	page = vm_normal_page(walker->vma, addr, ptent);
+	if (!page)
+		return;
+	pte_mkold(ptent);
+	ClearPageReferenced(page);
+}
+
+/*
  * Displays the smap for the process.  smaps_pte_func() is called for each PTE
  * in the range from vma->vm_start to vma->vm_end.
  */
@@ -343,6 +364,22 @@ static int show_smap(struct seq_file *m, void *v)
 	return show_map_internal(m, v, &mss);
 }
 
+void clear_refs_smap(struct vm_area_struct *vma)
+{
+	for (; vma; vma = vma->vm_next) {
+		struct pte_walker walker = {
+			.vma		= vma,
+			.start		= vma->vm_start,
+			.end		= vma->vm_end,
+			.private	= NULL,
+			.func		= clear_refs_pte_func,
+		};
+
+		if (vma->vm_mm && !is_vm_hugetlb_page(vma))
+			walk_pgds(&walker);
+	};
+}
+
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
 	struct proc_maps_private *priv = m->private;
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 87dec8f..f3d426b 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -104,6 +104,7 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir);
 unsigned long task_vsize(struct mm_struct *);
 int task_statm(struct mm_struct *, int *, int *, int *, int *);
 char *task_mem(struct mm_struct *, char *);
+void clear_refs_smap(struct vm_area_struct *);
 
 extern struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
 						struct proc_dir_entry *parent);

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [patch 1/3 take2] smaps: extract pte walker from smaps code
  2007-02-07  6:15 [patch 1/3 take2] smaps: extract pte walker from smaps code David Rientjes
  2007-02-07  6:15 ` [patch 2/3 take2] smaps: add pages referenced count to smaps David Rientjes
@ 2007-02-07  6:29 ` Paul Mundt
  2007-02-07  6:49   ` Andrew Morton
  2007-02-09  2:00   ` Matt Mackall
  2007-02-07  8:30 ` Christoph Lameter
  2 siblings, 2 replies; 12+ messages in thread
From: Paul Mundt @ 2007-02-07  6:29 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Hugh Dickins, Christoph Lameter, linux-kernel

On Tue, Feb 06, 2007 at 10:15:47PM -0800, David Rientjes wrote:
> Extracts the page table entry walker from the smaps-specific code in
> fs/proc/task_mmu.c.  This will be used later for clearing the reference
> bits on pages to measure the number of pages accessed over a time period
> through /proc/pid/smaps.
> 
I like the general idea of this patch set, however..

> Since the PTE walker is now extracted from the smaps code,
> smaps_pte_func() is invoked for each PTE in the VMA.  Its behavior is
> identical to the existing implementation, except it is slightly slower
> because each PTE now invokes a function call.
> 
Perhaps this is something that needs to be looked at more closely and
made more generic? There are many ranged page table walkers that aren't
so performance critical that the function call cost would cause too much
pain. ioremap_page_range() comes to mind, and there's bound to be others.
This would also help people to get the pte map/unmap right, which seems
to pop up from time to time as well..

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [patch 3/3 take2] smaps: add clear_refs file to clear reference
  2007-02-07  6:15   ` [patch 3/3 take2] smaps: add clear_refs file to clear reference David Rientjes
@ 2007-02-07  6:33     ` Andrew Morton
  0 siblings, 0 replies; 12+ messages in thread
From: Andrew Morton @ 2007-02-07  6:33 UTC (permalink / raw)
  To: David Rientjes; +Cc: Hugh Dickins, Paul Mundt, Christoph Lameter, linux-kernel

On Tue, 6 Feb 2007 22:15:59 -0800 (PST) David Rientjes <rientjes@google.com> wrote:

> +static void clear_refs_pte_func(struct pte_walker *walker, pte_t *pte,
> +				unsigned long addr)
> +{
> +	struct page *page;
> +	pte_t ptent;
> +
> +	ptent = *pte;
> +	if (!pte_present(ptent))
> +		return;
> +
> +	page = vm_normal_page(walker->vma, addr, ptent);
> +	if (!page)
> +		return;
> +	pte_mkold(ptent);

That only changed the local variable, and not the pagetable entry.

> +	ClearPageReferenced(page);
> +}

Please, do some good runtime testing.  Write a script to monitor the
overall working set, run it for some sample workloads (your X server or
mozilla would be good choices).  Share the results with us so we can marvel
at the efficiency of modern applications.  Include these examples in the
changelog.

Also, we will need to do tlb writeback and invalidation prior to this
operation.  Documentation/cachetlb.txt has details.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [patch 1/3 take2] smaps: extract pte walker from smaps code
  2007-02-07  6:29 ` [patch 1/3 take2] smaps: extract pte walker from smaps code Paul Mundt
@ 2007-02-07  6:49   ` Andrew Morton
  2007-02-07  7:24     ` Paul Mundt
  2007-02-09  2:00   ` Matt Mackall
  1 sibling, 1 reply; 12+ messages in thread
From: Andrew Morton @ 2007-02-07  6:49 UTC (permalink / raw)
  To: Paul Mundt; +Cc: David Rientjes, Hugh Dickins, Christoph Lameter, linux-kernel

On Wed, 7 Feb 2007 15:29:34 +0900 Paul Mundt <lethal@linux-sh.org> wrote:

> On Tue, Feb 06, 2007 at 10:15:47PM -0800, David Rientjes wrote:
> > Extracts the page table entry walker from the smaps-specific code in
> > fs/proc/task_mmu.c.  This will be used later for clearing the reference
> > bits on pages to measure the number of pages accessed over a time period
> > through /proc/pid/smaps.
> > 
> I like the general idea of this patch set, however..

David didn't really spell out the rationale.  Userspace people ask "how
much memory is my application using".  For system planning and such.  We
don't know, really.  So the approach taken here is to nuke all the
referenced bits and to then monitor them coming back over time.

Obviously it doesn't work very well if the page scanner is doing things,
but one hopes that when someone is instrumenting their application they'll
ensure that sufficient memory is available to prevent this inaccuracy.

The other part of the question is of course pagecache.  Hopefully /proc/pid/io
will suffice for that, probably in combination with /proc/sys/vm/drop_caches.

I don't really have a sense for how much stuff we want to put into the kernel
to support this sort of thing.  The proposed patches are, I think, minimal.
Perhaps it needs more.  If so, opinions are solicited before we go and add
(and hence be forced to maintain) this new interface.

> > Since the PTE walker is now extracted from the smaps code,
> > smaps_pte_func() is invoked for each PTE in the VMA.  Its behavior is
> > identical to the existing implementation, except it is slightly slower
> > because each PTE now invokes a function call.
> > 
> Perhaps this is something that needs to be looked at more closely and
> made more generic? There are many ranged page table walkers that aren't
> so performance critical that the function call cost would cause too much
> pain. ioremap_page_range() comes to mind, and there's bound to be others.
> This would also help people to get the pte map/unmap right, which seems
> to pop up from time to time as well..

That's what Paul Davies' patches are aimed at:

http://marc.theaimsgroup.com/?l=linux-mm&m=115276500100695&w=2

I promised Paul that I'd take a serious look at those patches next time
they pop up.   It would be good if others could help out in that.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [patch 1/3 take2] smaps: extract pte walker from smaps code
  2007-02-07  6:49   ` Andrew Morton
@ 2007-02-07  7:24     ` Paul Mundt
  0 siblings, 0 replies; 12+ messages in thread
From: Paul Mundt @ 2007-02-07  7:24 UTC (permalink / raw)
  To: Andrew Morton
  Cc: David Rientjes, Hugh Dickins, Christoph Lameter, linux-kernel

On Tue, Feb 06, 2007 at 10:49:14PM -0800, Andrew Morton wrote:
> On Wed, 7 Feb 2007 15:29:34 +0900 Paul Mundt <lethal@linux-sh.org> wrote:
> > I like the general idea of this patch set, however..
> 
> David didn't really spell out the rationale.  Userspace people ask "how
> much memory is my application using".  For system planning and such.  We
> don't know, really.  So the approach taken here is to nuke all the
> referenced bits and to then monitor them coming back over time.
> 
> Obviously it doesn't work very well if the page scanner is doing things,
> but one hopes that when someone is instrumenting their application they'll
> ensure that sufficient memory is available to prevent this inaccuracy.
> 
This was the problem we kept running in to as well, people that wanted to
instrument their applications on the desktop where all of the required
infrastructure already brought the system to its knees long before
instrumentation could begin didn't really provide the most consistent
metrics, especially when the application depended heavily on overcommit.

This resulted in a lot of extra hacks on top of the smaps code trying to
balance out the numbers, this just ended up being a massive headache.
smaps seems to be an appealing target for piling additional semantics on,
unfortunately.

> I don't really have a sense for how much stuff we want to put into the kernel
> to support this sort of thing.  The proposed patches are, I think, minimal.
> Perhaps it needs more.  If so, opinions are solicited before we go and add
> (and hence be forced to maintain) this new interface.
> 
This sort of minimalist interface is not necessarily a bad approach if it
helps in a way that works for the people that really want it. The issue
with any of the smaps extensions is being able to provide people with a
"good enough" metric without falling in to interface hell. This really
depends on how one defines "good enough", though. It would be nice to
hear from application folks if this ends up being useful for them or not.

> > Perhaps this is something that needs to be looked at more closely and
> > made more generic? There are many ranged page table walkers that aren't
> > so performance critical that the function call cost would cause too much
> > pain. ioremap_page_range() comes to mind, and there's bound to be others.
> > This would also help people to get the pte map/unmap right, which seems
> > to pop up from time to time as well..
> 
> That's what Paul Davies' patches are aimed at:
> 
> http://marc.theaimsgroup.com/?l=linux-mm&m=115276500100695&w=2
> 
> I promised Paul that I'd take a serious look at those patches next time
> they pop up.   It would be good if others could help out in that.

It would be nice to see this smaps patchset rebased on something like
that rather than introducing another walker abstraction simply to have it
overhauled later. I realize this isn't really David's problem, but we
shouldn't be trying to solve the same problem multiple times.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [patch 1/3 take2] smaps: extract pte walker from smaps code
  2007-02-07  6:15 [patch 1/3 take2] smaps: extract pte walker from smaps code David Rientjes
  2007-02-07  6:15 ` [patch 2/3 take2] smaps: add pages referenced count to smaps David Rientjes
  2007-02-07  6:29 ` [patch 1/3 take2] smaps: extract pte walker from smaps code Paul Mundt
@ 2007-02-07  8:30 ` Christoph Lameter
  2007-02-07  9:35   ` David Rientjes
  2007-02-09 16:15   ` Matt Mackall
  2 siblings, 2 replies; 12+ messages in thread
From: Christoph Lameter @ 2007-02-07  8:30 UTC (permalink / raw)
  To: David Rientjes; +Cc: Andrew Morton, Hugh Dickins, Paul Mundt, linux-kernel

On Tue, 6 Feb 2007, David Rientjes wrote:

> Extracts the page table entry walker from the smaps-specific code in
> fs/proc/task_mmu.c.  This will be used later for clearing the reference
> bits on pages to measure the number of pages accessed over a time period
> through /proc/pid/smaps.

Clearing reference bits? Ummm... That is a pretty inaccurate measure since 
reclaim can remove pages and revert the reference bits. It can never work 
reliably.

> The new struct pte_walker includes the struct vm_area_struct of the memory
> to walk over.  Iteration begins at the start address and completes at the
> the end address.  A pointer to another data structure may be stored in the
> private field such as the struct mem_size_stats, which acts as the smaps
> accumulator.  For each page table entry in the VMA, the func function is
> called with the corresponding struct pte_walker, the pte_t, and its
> address.

Would it be possible to sync up with the people doing the page table 
interface?

Could we somehow consolidate smaps and numa_maps?


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [patch 1/3 take2] smaps: extract pte walker from smaps code
  2007-02-07  8:30 ` Christoph Lameter
@ 2007-02-07  9:35   ` David Rientjes
  2007-02-09 16:15   ` Matt Mackall
  1 sibling, 0 replies; 12+ messages in thread
From: David Rientjes @ 2007-02-07  9:35 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andrew Morton, Hugh Dickins, Paul Mundt, Paul Davies, linux-kernel

On Wed, 7 Feb 2007, Christoph Lameter wrote:

> Clearing reference bits? Ummm... That is a pretty inaccurate measure since 
> reclaim can remove pages and revert the reference bits. It can never work 
> reliably.
> 

It's not intended to work precisely, it's intended to give a good estimate 
of task memory footprint.  There are several different scenarios that can 
interfere with getting a precise measurement via this method; then again 
/proc/pid/smaps is so expensive already because of the iteration through 
VMA's that it's not something you'd do regularly.

Any other suggestions on how to obtain this data is certainly welcome.

> Would it be possible to sync up with the people doing the page table 
> interface?
> 

(+pauld)

Is there an update on Paul Davies' implementation that would allow us to 
iterate through a set of pte's in a vm_area_struct range?

> Could we somehow consolidate smaps and numa_maps?
> 

Sure, but that's beyond the scope of this patchset.  My intention in 
extracting away a new pte_walker was to prevent having two identical 
implementations and it could easily be extracted even further to lib for 
the ioremap case that Paul mentioned.

		David

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [patch 1/3 take2] smaps: extract pte walker from smaps code
  2007-02-07  6:29 ` [patch 1/3 take2] smaps: extract pte walker from smaps code Paul Mundt
  2007-02-07  6:49   ` Andrew Morton
@ 2007-02-09  2:00   ` Matt Mackall
  2007-02-09  2:25     ` David Rientjes
  1 sibling, 1 reply; 12+ messages in thread
From: Matt Mackall @ 2007-02-09  2:00 UTC (permalink / raw)
  To: Paul Mundt, David Rientjes, Andrew Morton, Hugh Dickins,
	Christoph Lameter, linux-kernel

On Wed, Feb 07, 2007 at 03:29:34PM +0900, Paul Mundt wrote:
> On Tue, Feb 06, 2007 at 10:15:47PM -0800, David Rientjes wrote:
> > Extracts the page table entry walker from the smaps-specific code in
> > fs/proc/task_mmu.c.  This will be used later for clearing the reference
> > bits on pages to measure the number of pages accessed over a time period
> > through /proc/pid/smaps.
> > 
> I like the general idea of this patch set, however..
> 
> > Since the PTE walker is now extracted from the smaps code,
> > smaps_pte_func() is invoked for each PTE in the VMA.  Its behavior is
> > identical to the existing implementation, except it is slightly slower
> > because each PTE now invokes a function call.
> > 
> Perhaps this is something that needs to be looked at more closely and
> made more generic? There are many ranged page table walkers that aren't
> so performance critical that the function call cost would cause too much
> pain. ioremap_page_range() comes to mind, and there's bound to be others.
> This would also help people to get the pte map/unmap right, which seems
> to pop up from time to time as well..

I've been looking at a similar refactoring of other code and I think
the way to go is a callback per block-of-PTEs with start and end
pointers. That gets rid of most of the call indirection overhead.

-- 
Mathematics is the supreme nostalgia of our time.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [patch 1/3 take2] smaps: extract pte walker from smaps code
  2007-02-09  2:00   ` Matt Mackall
@ 2007-02-09  2:25     ` David Rientjes
  0 siblings, 0 replies; 12+ messages in thread
From: David Rientjes @ 2007-02-09  2:25 UTC (permalink / raw)
  To: Matt Mackall
  Cc: Paul Mundt, Andrew Morton, Hugh Dickins, Christoph Lameter, linux-kernel

On Thu, 8 Feb 2007, Matt Mackall wrote:

> I've been looking at a similar refactoring of other code and I think
> the way to go is a callback per block-of-PTEs with start and end
> pointers. That gets rid of most of the call indirection overhead.
> 

Yes, but only in a limited number of cases (including smaps).  It is not 
always sufficient to do it on the block-of-ptes level.  Many of the 
pte iterators in the code right now are slightly different on each level: 
the ioremap code, for example, allocates the various directories as it 
progresses in depth.

To have one library pte iterator would only be possible with callback 
functions on each pgd, pud, and pmd level that (at least) ioremap would 
require to be implemented on top of.  That isn't very appealing.

The PTI patchset from July moved these iterators into .h files but did not 
implement a single pte iterator to build on.  It just turned out nicely in 
the smaps case that there was no difference on the pgd, pud, or pmd level 
between the two required iterators.  I added a void *private to carry the 
struct for statistics accumulating in the smaps case but this is worthless 
in the clear_refs case.

		David

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [patch 1/3 take2] smaps: extract pte walker from smaps code
  2007-02-07  8:30 ` Christoph Lameter
  2007-02-07  9:35   ` David Rientjes
@ 2007-02-09 16:15   ` Matt Mackall
  1 sibling, 0 replies; 12+ messages in thread
From: Matt Mackall @ 2007-02-09 16:15 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: David Rientjes, Andrew Morton, Hugh Dickins, Paul Mundt, linux-kernel

On Wed, Feb 07, 2007 at 12:30:38AM -0800, Christoph Lameter wrote:
> On Tue, 6 Feb 2007, David Rientjes wrote:
> 
> > Extracts the page table entry walker from the smaps-specific code in
> > fs/proc/task_mmu.c.  This will be used later for clearing the reference
> > bits on pages to measure the number of pages accessed over a time period
> > through /proc/pid/smaps.
> 
> Clearing reference bits? Ummm... That is a pretty inaccurate measure since 
> reclaim can remove pages and revert the reference bits. It can never work 
> reliably.
> 
> > The new struct pte_walker includes the struct vm_area_struct of the memory
> > to walk over.  Iteration begins at the start address and completes at the
> > end address.  A pointer to another data structure may be stored in the
> > private field such as the struct mem_size_stats, which acts as the smaps
> > accumulator.  For each page table entry in the VMA, the func function is
> > called with the corresponding struct pte_walker, the pte_t, and its
> > address.
> 
> Would it be possible to sync up with the people doing the page table 
> interface?

Have a pointer to this?

-- 
Mathematics is the supreme nostalgia of our time.

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2007-02-09 16:28 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-02-07  6:15 [patch 1/3 take2] smaps: extract pte walker from smaps code David Rientjes
2007-02-07  6:15 ` [patch 2/3 take2] smaps: add pages referenced count to smaps David Rientjes
2007-02-07  6:15   ` [patch 3/3 take2] smaps: add clear_refs file to clear reference David Rientjes
2007-02-07  6:33     ` Andrew Morton
2007-02-07  6:29 ` [patch 1/3 take2] smaps: extract pte walker from smaps code Paul Mundt
2007-02-07  6:49   ` Andrew Morton
2007-02-07  7:24     ` Paul Mundt
2007-02-09  2:00   ` Matt Mackall
2007-02-09  2:25     ` David Rientjes
2007-02-07  8:30 ` Christoph Lameter
2007-02-07  9:35   ` David Rientjes
2007-02-09 16:15   ` Matt Mackall

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).