LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH 1/2] Generic Virtual Memmap suport for SPARSEMEM
@ 2007-04-01  7:10 Christoph Lameter
  2007-04-01  7:10 ` [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL Christoph Lameter
  2007-04-02 20:50 ` [PATCH 1/2] Generic Virtual Memmap suport for SPARSEMEM Dave Hansen
  0 siblings, 2 replies; 46+ messages in thread
From: Christoph Lameter @ 2007-04-01  7:10 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-arch, Martin Bligh, linux-mm, Christoph Lameter,
	Andi Kleen, KAMEZAWA Hiroyuki

Sparse Virtual: Virtual Memmap support for SPARSEMEM

SPARSEMEM is a pretty nice framework that unifies quite a bit of
code over all the arches. It would be great if it could be the default
so that we can get rid of various forms of DISCONTIG and other variations
on memory maps. So far what has hindered this are the additional lookups
that SPARSEMEM introduces for virt_to_page and page_address. This goes
so far that the code to do this has to be kept in a separate function
and cannot be used inline.

This patch introduces virtual memmap support for sparsemem. virt_to_page
page_address and consorts become simple shift/add operations. No page flag
fields, no table lookups nothing involving memory is required.

The two key operations pfn_to_page and page_to_pfn become:

#define pfn_to_page(pfn)     (vmemmap + (pfn))
#define page_to_pfn(page)    ((page) - vmemmap)

In order for this to work we will have to use a virtual mapping.
These are usually for free since kernel memory is already mapped
via a 1-1 mapping requiring a page table. The virtual mapping must
be big enough to span all of memory that an arch supports, which may
make a virtual memmap difficult to use on funky 32 bit platforms
that support 36 address bits.

However, if there is enough virtual space available and the arch
already maps its 1-1 kernel space using TLBs (f.e. true of IA64
and x86_64) then this technique makes sparsemem lookups as efficient
as CONFIG_FLATMEM.

Maybe this patch will allow us to make SPARSEMEM the default
configuration that will work on UP, SMP and NUMA on most platforms?
Then we may hopefully be able to remove the various forms of support
for FLATMEM, DISCONTIG etc etc.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

Index: linux-2.6.21-rc5-mm2/include/asm-generic/memory_model.h
===================================================================
--- linux-2.6.21-rc5-mm2.orig/include/asm-generic/memory_model.h	2007-03-31 22:47:14.000000000 -0700
+++ linux-2.6.21-rc5-mm2/include/asm-generic/memory_model.h	2007-03-31 22:59:35.000000000 -0700
@@ -47,6 +47,13 @@
 })
 
 #elif defined(CONFIG_SPARSEMEM)
+#ifdef CONFIG_SPARSE_VIRTUAL
+/*
+ * We have a virtual memmap that makes lookups very simple
+ */
+#define __pfn_to_page(pfn)	(vmemmap + (pfn))
+#define __page_to_pfn(page)	((page) - vmemmap)
+#else
 /*
  * Note: section's mem_map is encorded to reflect its start_pfn.
  * section[i].section_mem_map == mem_map's address - start_pfn;
@@ -62,6 +69,7 @@
 	struct mem_section *__sec = __pfn_to_section(__pfn);	\
 	__section_mem_map_addr(__sec) + __pfn;		\
 })
+#endif
 #endif /* CONFIG_FLATMEM/DISCONTIGMEM/SPARSEMEM */
 
 #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
Index: linux-2.6.21-rc5-mm2/mm/sparse.c
===================================================================
--- linux-2.6.21-rc5-mm2.orig/mm/sparse.c	2007-03-31 22:47:14.000000000 -0700
+++ linux-2.6.21-rc5-mm2/mm/sparse.c	2007-03-31 22:59:35.000000000 -0700
@@ -9,6 +9,7 @@
 #include <linux/spinlock.h>
 #include <linux/vmalloc.h>
 #include <asm/dma.h>
+#include <asm/pgalloc.h>
 
 /*
  * Permanent SPARSEMEM data:
@@ -101,7 +102,7 @@ static inline int sparse_index_init(unsi
 
 /*
  * Although written for the SPARSEMEM_EXTREME case, this happens
- * to also work for the flat array case becase
+ * to also work for the flat array case because
  * NR_SECTION_ROOTS==NR_MEM_SECTIONS.
  */
 int __section_nr(struct mem_section* ms)
@@ -211,6 +212,90 @@ static int sparse_init_one_section(struc
 	return 1;
 }
 
+#ifdef CONFIG_SPARSE_VIRTUAL
+
+void *vmemmap_alloc_block(unsigned long size, int node)
+{
+	if (slab_is_available()) {
+		struct page *page =
+			alloc_pages_node(node, GFP_KERNEL,
+				get_order(size));
+
+		BUG_ON(!page);
+		return page_address(page);
+	} else
+		return __alloc_bootmem_node(NODE_DATA(node), size, size,
+					__pa(MAX_DMA_ADDRESS));
+}
+
+
+#ifndef ARCH_POPULATES_VIRTUAL_MEMMAP
+/*
+ * Virtual memmap populate functionality for architectures that support
+ * PMDs for huge pages like i386, x86_64 etc.
+ */
+static void vmemmap_pop_pmd(pud_t *pud, unsigned long addr,
+				unsigned long end, int node)
+{
+	pmd_t *pmd;
+
+	end = pmd_addr_end(addr, end);
+
+	for (pmd = pmd_offset(pud, addr); addr < end;
+			pmd++, addr += PMD_SIZE) {
+  		if (pmd_none(*pmd)) {
+  			void *block;
+			pte_t pte;
+
+			block = vmemmap_alloc_block(PMD_SIZE, node);
+			pte = pfn_pte(__pa(block) >> PAGE_SHIFT,
+						PAGE_KERNEL);
+			pte_mkdirty(pte);
+			pte_mkwrite(pte);
+			pte_mkyoung(pte);
+			mk_pte_huge(pte);
+			set_pmd(pmd, __pmd(pte_val(pte)));
+		}
+	}
+}
+
+static void vmemmap_pop_pud(pgd_t *pgd, unsigned long addr,
+					unsigned long end, int node)
+{
+	pud_t *pud;
+
+	end = pud_addr_end(addr, end);
+	for (pud = pud_offset(pgd, addr); addr < end;
+				pud++, addr += PUD_SIZE) {
+
+		if (pud_none(*pud))
+			pud_populate(&init_mm, pud,
+				vmemmap_alloc_block(PAGE_SIZE, node));
+
+		vmemmap_pop_pmd(pud, addr, end, node);
+	}
+}
+
+static void vmemmap_populate(struct page *start_page, unsigned long nr,
+								int node)
+{
+	pgd_t *pgd;
+	unsigned long addr = (unsigned long)(start_page);
+	unsigned long end = pgd_addr_end(addr,
+			(unsigned long)((start_page + nr)));
+
+	for (pgd = pgd_offset_k(addr); addr < end;
+				pgd++, addr += PGDIR_SIZE) {
+
+		if (pgd_none(*pgd))
+			pgd_populate(&init_mm, pgd,
+				vmemmap_alloc_block(PAGE_SIZE, node));
+		vmemmap_pop_pud(pgd, addr, end, node);
+	}
+}
+#endif
+#endif /* CONFIG_SPARSE_VIRTUAL */
+
 static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
 {
 	struct page *map;
@@ -221,8 +306,13 @@ static struct page *sparse_early_mem_map
 	if (map)
 		return map;
 
+#ifdef CONFIG_SPARSE_VIRTUAL
+	map = pfn_to_page(pnum * PAGES_PER_SECTION);
+	vmemmap_populate(map, PAGES_PER_SECTION, nid);
+#else
 	map = alloc_bootmem_node(NODE_DATA(nid),
 			sizeof(struct page) * PAGES_PER_SECTION);
+#endif
 	if (map)
 		return map;
 
Index: linux-2.6.21-rc5-mm2/include/linux/mmzone.h
===================================================================
--- linux-2.6.21-rc5-mm2.orig/include/linux/mmzone.h	2007-03-31 22:47:14.000000000 -0700
+++ linux-2.6.21-rc5-mm2/include/linux/mmzone.h	2007-03-31 23:01:03.000000000 -0700
@@ -836,6 +836,8 @@ void sparse_init(void);
 
 void memory_present(int nid, unsigned long start, unsigned long end);
 unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long);
+void vmemmap_populate(struct page *start_page, unsigned long pages, int node);
+void *vmemmap_alloc_block(unsigned long size, int node);
 
 /*
  * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we

^ permalink raw reply	[flat|nested] 46+ messages in thread

* [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-01  7:10 [PATCH 1/2] Generic Virtual Memmap suport for SPARSEMEM Christoph Lameter
@ 2007-04-01  7:10 ` Christoph Lameter
  2007-04-01  7:11   ` Christoph Lameter
  2007-04-01 10:46   ` Andi Kleen
  2007-04-02 20:50 ` [PATCH 1/2] Generic Virtual Memmap suport for SPARSEMEM Dave Hansen
  1 sibling, 2 replies; 46+ messages in thread
From: Christoph Lameter @ 2007-04-01  7:10 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-arch, Martin Bligh, linux-mm, Andi Kleen,
	Christoph Lameter, KAMEZAWA Hiroyuki

x86_64 make SPARSE_VIRTUAL the default

x86_64 is using 2M page table entries to map its 1-1 kernel space.
We implement the virtual memmap also using 2M page table entries.
So there is no difference at all to FLATMEM. Both schemes require
a page table and a TLB.

Thus the SPARSEMEM becomes the most efficient way of handling
virt_to_page, pfn_to_page and friends for UP, SMP and NUMA.

So change the Kconfig for x86_64 to make SPARSE_VIRTUAL the
default and switch off all other memory models.

Oh. And PFN_TO_PAGE used to be out of line. Since it is now
so simple switch it back to inline.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

Index: linux-2.6.21-rc5-mm2/arch/x86_64/Kconfig
===================================================================
--- linux-2.6.21-rc5-mm2.orig/arch/x86_64/Kconfig	2007-03-31 23:47:58.000000000 -0700
+++ linux-2.6.21-rc5-mm2/arch/x86_64/Kconfig	2007-03-31 23:48:41.000000000 -0700
@@ -380,25 +380,29 @@ config NUMA_EMU
 	  number of nodes. This is only useful for debugging.
 
 config ARCH_DISCONTIGMEM_ENABLE
-       bool
-       depends on NUMA
-       default y
+       def_bool n
 
 config ARCH_DISCONTIGMEM_DEFAULT
-	def_bool y
-	depends on NUMA
+	def_bool n
 
 config ARCH_SPARSEMEM_ENABLE
 	def_bool y
-	depends on (NUMA || EXPERIMENTAL)
+
+config SPARSEMEM_MANUAL
+	def_bool y
+
+config ARCH_SPARSE_VIRTUAL
+	def_bool y
+
+config SELECT_MEMORY_MODEL
+	def_bool n
 
 config ARCH_MEMORY_PROBE
 	def_bool y
 	depends on MEMORY_HOTPLUG
 
 config ARCH_FLATMEM_ENABLE
-	def_bool y
-	depends on !NUMA
+	def_bool n
 
 source "mm/Kconfig"
 
@@ -411,8 +415,7 @@ config HAVE_ARCH_EARLY_PFN_TO_NID
 	depends on NUMA
 
 config OUT_OF_LINE_PFN_TO_PAGE
-	def_bool y
-	depends on DISCONTIGMEM
+	def_bool n
 
 config NR_CPUS
 	int "Maximum number of CPUs (2-255)"
Index: linux-2.6.21-rc5-mm2/include/asm-x86_64/page.h
===================================================================
--- linux-2.6.21-rc5-mm2.orig/include/asm-x86_64/page.h	2007-03-31 23:47:58.000000000 -0700
+++ linux-2.6.21-rc5-mm2/include/asm-x86_64/page.h	2007-03-31 23:48:41.000000000 -0700
@@ -135,6 +135,7 @@ typedef struct { unsigned long pgprot; }
 	 VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
 
 #define __HAVE_ARCH_GATE_AREA 1	
+#define vmemmap ((struct page *)0xffffe20000000000UL)
 
 #include <asm-generic/memory_model.h>
 #include <asm-generic/page.h>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-01  7:10 ` [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL Christoph Lameter
@ 2007-04-01  7:11   ` Christoph Lameter
  2007-04-01 10:46   ` Andi Kleen
  1 sibling, 0 replies; 46+ messages in thread
From: Christoph Lameter @ 2007-04-01  7:11 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-arch, Martin Bligh, linux-mm, Andi Kleen, KAMEZAWA Hiroyuki

Argh. This should have been Patch 2/2. There is nothing in between.




^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-01  7:10 ` [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL Christoph Lameter
  2007-04-01  7:11   ` Christoph Lameter
@ 2007-04-01 10:46   ` Andi Kleen
  2007-04-02 15:37     ` Christoph Lameter
                       ` (2 more replies)
  1 sibling, 3 replies; 46+ messages in thread
From: Andi Kleen @ 2007-04-01 10:46 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: linux-kernel, linux-arch, Martin Bligh, linux-mm, KAMEZAWA Hiroyuki

On Sunday 01 April 2007 09:10, Christoph Lameter wrote:
> x86_64 make SPARSE_VIRTUAL the default
> 
> x86_64 is using 2M page table entries to map its 1-1 kernel space.
> We implement the virtual memmap also using 2M page table entries.
> So there is no difference at all to FLATMEM. Both schemes require
> a page table and a TLB.

Hmm, this means there is at least 2MB worth of struct page on every node?
Or do you have overlaps with other memory (I think you have)
In that case you have to handle the overlap in change_page_attr()

Also your "generic" vmemmap code doesn't look very generic, but
rather x86 specific. I didn't think huge pages could be easily
set up this way in many other architectures.  

And when you reserve virtual space somewhere you should 
update Documentation/x86_64/mm.txt. Also you didn't adjust 
the end of the vmalloc area so in theory vmalloc could run
into your vmemmap.

> Thus the SPARSEMEM becomes the most efficient way of handling
> virt_to_page, pfn_to_page and friends for UP, SMP and NUMA.

Do you have any benchmarks numbers to prove it? There seem to be a few
benchmarks where the discontig virt_to_page is a problem
(although I know ways to make it more efficient), and sparsemem
is normally slower. Still some numbers would be good.

-Andi


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-01 10:46   ` Andi Kleen
@ 2007-04-02 15:37     ` Christoph Lameter
  2007-04-02 15:44       ` Andi Kleen
                         ` (2 more replies)
  2007-04-02 15:44     ` Christoph Lameter
  2007-04-02 16:18     ` Christoph Lameter
  2 siblings, 3 replies; 46+ messages in thread
From: Christoph Lameter @ 2007-04-02 15:37 UTC (permalink / raw)
  To: Andi Kleen
  Cc: linux-kernel, linux-arch, Martin Bligh, linux-mm, KAMEZAWA Hiroyuki

On Sun, 1 Apr 2007, Andi Kleen wrote:

> Hmm, this means there is at least 2MB worth of struct page on every node?
> Or do you have overlaps with other memory (I think you have)
> In that case you have to handle the overlap in change_page_attr()

Correct. 2MB worth of struct page is 128 mb of memory. Are there nodes 
with smaller amounts of memory? Note also that the default sparsemem
section size is (include/asm-x86_64/sparsemem.h)

#define SECTION_SIZE_BITS       27 /* matt - 128 is convenient right now */

128MB ....

So you currently cannot have smaller sections of memory anyways.

> Also your "generic" vmemmap code doesn't look very generic, but
> rather x86 specific. I didn't think huge pages could be easily
> set up this way in many other architectures.  

We do this pmd special casing in other parts of the core VM. I have also a 
patch for IA64 that works with this.

> Do you have any benchmarks numbers to prove it? There seem to be a few
> benchmarks where the discontig virt_to_page is a problem
> (although I know ways to make it more efficient), and sparsemem
> is normally slower. Still some numbers would be good.

You want a benchmark to prove that the removal of memory references and 
code improves performance?


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-01 10:46   ` Andi Kleen
  2007-04-02 15:37     ` Christoph Lameter
@ 2007-04-02 15:44     ` Christoph Lameter
  2007-04-02 16:18     ` Christoph Lameter
  2 siblings, 0 replies; 46+ messages in thread
From: Christoph Lameter @ 2007-04-02 15:44 UTC (permalink / raw)
  To: Andi Kleen
  Cc: linux-kernel, linux-arch, Martin Bligh, linux-mm, KAMEZAWA Hiroyuki

On Sun, 1 Apr 2007, Andi Kleen wrote:

> And when you reserve virtual space somewhere you should 
> update Documentation/x86_64/mm.txt. Also you didn't adjust 
> the end of the vmalloc area so in theory vmalloc could run
> into your vmemmap.

Ok. will add to the doc in the next release.

No need to adjust the end of the vmalloc area because
the vmemmap starts at the end of it:

include/asm-x86_64/pgtable.h:

#define VMALLOC_START    0xffffc20000000000UL
#define VMALLOC_END      0xffffe1ffffffffffUL

Index: linux-2.6.21-rc5-mm2/include/asm-x86_64/page.h

#define vmemmap ((struct page *)0xffffe20000000000UL)

According to Documentation/x86_64/mm.txt this is an unused hole:

ffffc20000000000 - ffffe1ffffffffff (=45 bits) vmalloc/ioremap space
... unused hole ...
ffffffff80000000 - ffffffff82800000 (=40 MB)   kernel text mapping, from phys 0
... unused hole ...


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-02 15:37     ` Christoph Lameter
@ 2007-04-02 15:44       ` Andi Kleen
  2007-04-02 15:51         ` Martin J. Bligh
  2007-04-02 15:54         ` Christoph Lameter
  2007-04-02 20:13       ` Dave Hansen
  2007-04-02 23:03       ` Bjorn Helgaas
  2 siblings, 2 replies; 46+ messages in thread
From: Andi Kleen @ 2007-04-02 15:44 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: linux-kernel, linux-arch, Martin Bligh, linux-mm, KAMEZAWA Hiroyuki

On Monday 02 April 2007 17:37, Christoph Lameter wrote:
> On Sun, 1 Apr 2007, Andi Kleen wrote:
> 
> > Hmm, this means there is at least 2MB worth of struct page on every node?
> > Or do you have overlaps with other memory (I think you have)
> > In that case you have to handle the overlap in change_page_attr()
> 
> Correct. 2MB worth of struct page is 128 mb of memory. Are there nodes 
> with smaller amounts of memory? 

Yes the discontigmem minimum is 64MB and there are some setups
(mostly with numa emulation) where you end up with nodes that small.

BTW there is no guarantee the node size is a multiple of 128MB so
you likely need to handle the overlap case. Otherwise we can 
get cache corruptions

> Note also that the default sparsemem 
> section size is (include/asm-x86_64/sparsemem.h)
> 
> #define SECTION_SIZE_BITS       27 /* matt - 128 is convenient right now */
> 
> 128MB ....
> 
> So you currently cannot have smaller sections of memory anyways.

Sparsemem is still quite experimental; discontigmem is the default
on x86-64.

> 
> > Also your "generic" vmemmap code doesn't look very generic, but
> > rather x86 specific. I didn't think huge pages could be easily
> > set up this way in many other architectures.  
> 
> We do this pmd special casing in other parts of the core VM. I have also a 
> patch for IA64 that workks with this.
> 
> > Do you have any benchmarks numbers to prove it? There seem to be a few
> > benchmarks where the discontig virt_to_page is a problem
> > (although I know ways to make it more efficient), and sparsemem
> > is normally slower. Still some numbers would be good.
> 
> You want a benchmark to prove that the removal of memory references and 
> code improves performance?

You're just moving them into MMU, not really removing it.  And need more TLB entries.
It might be faster or it might not. There are some unexpected issues, like most x86-64 
CPUs have a quite small number of large TLBs so you can get thrashing etc.

So numbers with TLB intensive workloads would be good. 

-Andi

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-02 15:44       ` Andi Kleen
@ 2007-04-02 15:51         ` Martin J. Bligh
  2007-04-02 15:54         ` Christoph Lameter
  1 sibling, 0 replies; 46+ messages in thread
From: Martin J. Bligh @ 2007-04-02 15:51 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Christoph Lameter, linux-kernel, linux-arch, linux-mm,
	KAMEZAWA Hiroyuki, Andy Whitcroft

cc: apw ... seeing as he wrote sparsemem in the first place, please copy
him on this stuff ?

Andi Kleen wrote:
> On Monday 02 April 2007 17:37, Christoph Lameter wrote:
>> On Sun, 1 Apr 2007, Andi Kleen wrote:
>>
>>> Hmm, this means there is at least 2MB worth of struct page on every node?
>>> Or do you have overlaps with other memory (I think you have)
>>> In that case you have to handle the overlap in change_page_attr()
>> Correct. 2MB worth of struct page is 128 mb of memory. Are there nodes 
>> with smaller amounts of memory? 
> 
> Yes the discontigmem minimum is 64MB and there are some setups
> (mostly with numa emulation) where you end up with nodes that small.

We're actually using numa emulation to do real (container) things with.
However, 128MB is still pretty small for that ... and worst case, we
just waste 1MB for a 64MB node, right? Which isn't beautiful, but
doesn't seem like the end of the world for an obscure corner case.

>>> Do you have any benchmarks numbers to prove it? There seem to be a few
>>> benchmarks where the discontig virt_to_page is a problem
>>> (although I know ways to make it more efficient), and sparsemem
>>> is normally slower. Still some numbers would be good.
>> You want a benchmark to prove that the removal of memory references and 
>> code improves performance?
> 
> You're just moving them into MMU, not really removing it.  And need more TLB entries.
> It might be faster or it might not. There are some unexpected issues, like most x86-64 
> CPUs have a quite small number of large TLBs so you can get thrashing etc.
> 
> So numbers with TLB intensive workloads would be good. 

There's also the possibility it just doesn't make enough difference
to affect a real benchmark ...

M.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-02 15:44       ` Andi Kleen
  2007-04-02 15:51         ` Martin J. Bligh
@ 2007-04-02 15:54         ` Christoph Lameter
  2007-04-02 17:14           ` Andi Kleen
  2007-04-02 19:54           ` Dave Hansen
  1 sibling, 2 replies; 46+ messages in thread
From: Christoph Lameter @ 2007-04-02 15:54 UTC (permalink / raw)
  To: Andi Kleen
  Cc: linux-kernel, linux-arch, Martin Bligh, linux-mm, KAMEZAWA Hiroyuki

On Mon, 2 Apr 2007, Andi Kleen wrote:

> > Correct. 2MB worth of struct page is 128 mb of memory. Are there nodes 
> > with smaller amounts of memory? 
> 
> Yes the discontigmem minimum is 64MB and there are some setups
> (mostly with numa emulation) where you end up with nodes that small.

Ok. Then I guess we cannot remove discontigmem.
 
> BTW there is no guarantee the node size is a multiple of 128MB so
> you likely need to handle the overlap case. Otherwise we can 
> get cache corruptions

How does sparsemem handle that?

> Sparsemem is still quite experimental; discontigmem is the default
> on x86-64.

Ok more fodder for preserving the choice.

> > > Do you have any benchmarks numbers to prove it? There seem to be a few
> > > benchmarks where the discontig virt_to_page is a problem
> > > (although I know ways to make it more efficient), and sparsemem
> > > is normally slower. Still some numbers would be good.
> > 
> > You want a benchmark to prove that the removal of memory references and 
> > code improves performance?
> 
> You're just moving them into MMU, not really removing it.  And need more TLB entries.

No no no. For the gazillions time: All of 1-1 mapped kernel memory on 
x86_64 needs a 2 MB page table entry. The virtual memmap uses the same. 
There are *no* additional TLBs used.

> It might be faster or it might not. There are some unexpected issues, like most x86-64 
> CPUs have a quite small number of large TLBs so you can get thrashing etc.
> 
> So numbers with TLB intensive workloads would be good. 

You did not read the descriptions. Sigh.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-01 10:46   ` Andi Kleen
  2007-04-02 15:37     ` Christoph Lameter
  2007-04-02 15:44     ` Christoph Lameter
@ 2007-04-02 16:18     ` Christoph Lameter
  2 siblings, 0 replies; 46+ messages in thread
From: Christoph Lameter @ 2007-04-02 16:18 UTC (permalink / raw)
  To: Andi Kleen
  Cc: linux-kernel, linux-arch, Martin Bligh, linux-mm, KAMEZAWA Hiroyuki

On Sun, 1 Apr 2007, Andi Kleen wrote:

> Or do you have overlaps with other memory (I think you have)

We may get into a case where some page structs are physically located
on other nodes if there are no holes between nodes. This would be 
particularly significant for 64MB node sizes using numa emulation.
But there it really does not matter where the real memory is located.
(this is really inherent in sparsemems way of mapping memory).

> In that case you have to handle the overlap in change_page_attr()

Why would we use change_page_attr on page structs? I can see that the 
pages themselves should be subject to change_page_attr but why the 
page_structs? I think the system would panic if one would set page structs 
to read only.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-02 15:54         ` Christoph Lameter
@ 2007-04-02 17:14           ` Andi Kleen
  2007-04-02 17:34             ` Christoph Lameter
  2007-04-02 19:54           ` Dave Hansen
  1 sibling, 1 reply; 46+ messages in thread
From: Andi Kleen @ 2007-04-02 17:14 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: linux-kernel, linux-arch, Martin Bligh, linux-mm, KAMEZAWA Hiroyuki


> No no no. For the gazillions time: All of 1-1 mapped kernel memory on 
> x86_64 needs a 2 MB page table entry. The virtual memmap uses the same. 
> There are *no* additional TLBs used.

But why do you reserve an own virtual area then if you claim to not use any
additional mappings? 

-Andi

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-02 17:14           ` Andi Kleen
@ 2007-04-02 17:34             ` Christoph Lameter
  0 siblings, 0 replies; 46+ messages in thread
From: Christoph Lameter @ 2007-04-02 17:34 UTC (permalink / raw)
  To: Andi Kleen
  Cc: linux-kernel, linux-arch, Martin Bligh, linux-mm, KAMEZAWA Hiroyuki

On Mon, 2 Apr 2007, Andi Kleen wrote:

> > No no no. For the gazillions time: All of 1-1 mapped kernel memory on 
> > x86_64 needs a 2 MB page table entry. The virtual memmap uses the same. 
> > There are *no* additional TLBs used.
> 
> But why do you reserve an own virtual area then if you claim to not use any
> additional mappings? 

The 1-1 area using mappings for 2MB pages right? So it uses a virtual 1-1 
area. It already has a virtual mapping.

What we do for virtual memmap here is also use 2MB pages but order the 
pages a bit different so that they provide a linear memory map.

So the number of TLBs in use stays the same. There are a few additional 
higher level page table pages that are needed to provide the alternate 
view that generates the linear mapping but that is just a couple of pages.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-02 15:54         ` Christoph Lameter
  2007-04-02 17:14           ` Andi Kleen
@ 2007-04-02 19:54           ` Dave Hansen
  2007-04-02 20:11             ` Christoph Lameter
  1 sibling, 1 reply; 46+ messages in thread
From: Dave Hansen @ 2007-04-02 19:54 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andi Kleen, linux-kernel, linux-arch, Martin Bligh, linux-mm,
	KAMEZAWA Hiroyuki

On Mon, 2007-04-02 at 08:54 -0700, Christoph Lameter wrote:
> > BTW there is no guarantee the node size is a multiple of 128MB so
> > you likely need to handle the overlap case. Otherwise we can 
> > get cache corruptions
> 
> How does sparsemem handle that? 

It doesn't. :)

In practice, this situation never happens because we don't have any
actual architectures that have any node boundaries on less than
MAX_ORDER, and the section size is at least MAX_ORDER.  If we *did* have
this, then the page allocator would already be broken for these
nodes. ;)

So, this SPARSE_VIRTUAL does introduce a new dependency, which Andi
calculated above.  But, in reality, I don't think it's a big deal.  Just
to spell it out a bit more, if this:

	VMEMMAP_MAPPING_SIZE/sizeof(struct page) * PAGE_SIZE

(where VMEMMAP_MAPPING_SIZE is PMD_SIZE in your case) is any larger than
the granularity on which your NUMA nodes are divided, then you might
have a problem with mem_map for one NUMA node getting allocated on
another.  

It might be worth a comment, or at least some kind of WARN_ON().
Perhaps we can stick something in online_page() to check if:

	page_to_nid(page) == page_to_nid(virt_to_page(page))

-- Dave


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-02 19:54           ` Dave Hansen
@ 2007-04-02 20:11             ` Christoph Lameter
  0 siblings, 0 replies; 46+ messages in thread
From: Christoph Lameter @ 2007-04-02 20:11 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Andi Kleen, linux-kernel, linux-arch, Martin Bligh, linux-mm,
	KAMEZAWA Hiroyuki

On Mon, 2 Apr 2007, Dave Hansen wrote:

> MAX_ORDER, and the section size is at least MAX_ORDER.  If we *did* have
> this, then the page allocator would already be broken for these
> nodes. ;)

Ahh... Ok.
 
> So, this SPARSE_VIRTUAL does introduce a new dependency, which Andi
> calculated above.  But, in reality, I don't think it's a big deal.  Just
> to spell it out a bit more, if this:
> 
> 	VMEMMAP_MAPPING_SIZE/sizeof(struct page) * PAGE_SIZE
> 
> (where VMEMMAP_MAPPING_SIZE is PMD_SIZE in your case) is any larger than
> the granularity on which your NUMA nodes are divided, then you might
> have a problem with mem_map for one NUMA node getting allocated on
> another.  

This is only a problem if

1. We are not on NUMA emulation. In that case: Who cares. The SPARSEMEM
   sections make sure that the MAX_ORDER blocks do not overlap.

2. There is a hole less than 128 MB between the nodes.

3. The maximum overlap can then be theoretically less than 2M in terms of
   page structs. That is less than 128MB can overlap at the beginning of a 
   node. Typically the start of a node gets used for allocation system
   control areas. I.e. node data, vmemmap 2M blocks etc. For those we
   only use the page structs during bootstrap. They are not performance
   critical. We can just ignore the problem for those.

   In order for this to become a problem the overlap would need to
   more than the management data at the front of a node. The larger 
   the zone is the more 2M blocks will be allocated from the beginning a 
   node and the less we can actually get into this situation.

If this is an actual problem then we could take out this particular
2M page and replace it with single 4K pages that can be individually
placed. Yaw.... Too complex.

I think we can ignore this. The only problem could be reduced performance
accessing page structs of some small portion of a node.

> It might be worth a comment, or at least some kind of WARN_ON().
> Perhaps we can stick something in online_page() to check if:
> 
> 	page_to_nid(page) == page_to_nid(virt_to_page(page))

Could do that but the check is going to be too aggressive. Check would 
have to be done after all control information has been allocated. WARN_ON 
would be sufficient since this is not going to impact functionality.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-02 15:37     ` Christoph Lameter
  2007-04-02 15:44       ` Andi Kleen
@ 2007-04-02 20:13       ` Dave Hansen
  2007-04-02 20:30         ` Christoph Lameter
  2007-04-02 23:03       ` Bjorn Helgaas
  2 siblings, 1 reply; 46+ messages in thread
From: Dave Hansen @ 2007-04-02 20:13 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andi Kleen, linux-kernel, linux-arch, Martin Bligh, linux-mm,
	KAMEZAWA Hiroyuki

On Mon, 2007-04-02 at 08:37 -0700, Christoph Lameter wrote:
> You want a benchmark to prove that the removal of memory references and 
> code improves performance? 

Yes, please. ;)

I completely agree, it looks like it should be faster.  The code
certainly has potential benefits.  But, to add this neato, apparently
more performant feature, we unfortunately have to add code.  Adding the
code has a cost: code maintenance.  This isn't a runtime cost, but it is
a real, honest to goodness tradeoff.

So, let's get some kind of concrete idea what the tradeoffs are.  Is it,
400 lines of code gets us a 10% performance boost across the board, or
that 400,000 lines gets us 0.1% on one specialized benchmark?

BTW, I like the patches.  Very nice and clean.  

-- Dave


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-02 20:13       ` Dave Hansen
@ 2007-04-02 20:30         ` Christoph Lameter
  2007-04-02 20:38           ` Martin Bligh
  2007-04-02 21:08           ` [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL Dave Hansen
  0 siblings, 2 replies; 46+ messages in thread
From: Christoph Lameter @ 2007-04-02 20:30 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Andi Kleen, linux-kernel, linux-arch, Martin Bligh, linux-mm,
	KAMEZAWA Hiroyuki

On Mon, 2 Apr 2007, Dave Hansen wrote:

> I completely agree, it looks like it should be faster.  The code
> certainly has potential benefits.  But, to add this neato, apparently
> more performant feature, we unfortunately have to add code.  Adding the
> code has a cost: code maintenance.  This isn't a runtime cost, but it is
> a real, honest to goodness tradeoff.

It's just the opposite. The vmemmap code is so efficient that we can remove 
lots of other code and gobs of these alternate implementations. On x86_64 
its even superior to FLATMEM since FLATMEM still needs a memory reference 
for the mem_map area. So if we make SPARSE standard for all 
configurations then there is no need anymore for FLATMEM DISCONTIG etc 
etc. Can we not cleanup all this mess? Get rid of all the gazillions 
of #ifdefs please? This would ease code maintenance significantly. I hate 
having to constantly navigate my way through all the alternatives.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-02 20:30         ` Christoph Lameter
@ 2007-04-02 20:38           ` Martin Bligh
  2007-04-02 20:51             ` Christoph Lameter
  2007-04-02 21:08           ` [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL Dave Hansen
  1 sibling, 1 reply; 46+ messages in thread
From: Martin Bligh @ 2007-04-02 20:38 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Dave Hansen, Andi Kleen, linux-kernel, linux-arch, linux-mm,
	KAMEZAWA Hiroyuki, Andy Whitcroft

Christoph Lameter wrote:
> On Mon, 2 Apr 2007, Dave Hansen wrote:
> 
>> I completely agree, it looks like it should be faster.  The code
>> certainly has potential benefits.  But, to add this neato, apparently
>> more performant feature, we unfortunately have to add code.  Adding the
>> code has a cost: code maintenance.  This isn't a runtime cost, but it is
>> a real, honest to goodness tradeoff.
> 
> Its just the opposite. The vmemmap code is so efficient that we can remove 
> lots of other code and gops of these alternate implementations. On x86_64 
> its even superior to FLATMEM since FLATMEM still needs a memory reference 
> for the mem_map area. So if we make SPARSE standard for all 
> configurations then there is no need anymore for FLATMEM DISCONTIG etc 
> etc. Can we not cleanup all this mess? Get rid of all the gazillions 
> of #ifdefs please? This would ease code maintenance significantly. I hate 
> having to constantly navigate my way through all the alternatives.

The original plan when this was first merged was pretty much that -
for sparsemem to replace discontigmem once it was well tested. Seems
to have got stalled halfway through ;-(

Not sure we'll get away with replacing flatmem for all arches, but
we could at least get rid of discontigmem, it seems.

M.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/2] Generic Virtual Memmap suport for SPARSEMEM
  2007-04-01  7:10 [PATCH 1/2] Generic Virtual Memmap suport for SPARSEMEM Christoph Lameter
  2007-04-01  7:10 ` [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL Christoph Lameter
@ 2007-04-02 20:50 ` Dave Hansen
  2007-04-02 21:00   ` Christoph Lameter
  1 sibling, 1 reply; 46+ messages in thread
From: Dave Hansen @ 2007-04-02 20:50 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: linux-kernel, linux-arch, Martin Bligh, linux-mm, Andi Kleen,
	KAMEZAWA Hiroyuki

First of all, nice set of patches.

On Sat, 2007-03-31 at 23:10 -0800, Christoph Lameter wrote:
> --- linux-2.6.21-rc5-mm2.orig/include/asm-generic/memory_model.h	2007-03-31 22:47:14.000000000 -0700
> +++ linux-2.6.21-rc5-mm2/include/asm-generic/memory_model.h	2007-03-31 22:59:35.000000000 -0700
> @@ -47,6 +47,13 @@
>  })
> 
>  #elif defined(CONFIG_SPARSEMEM)
> +#ifdef CONFIG_SPARSE_VIRTUAL
> +/*
> + * We have a virtual memmap that makes lookups very simple
> + */
> +#define __pfn_to_page(pfn)	(vmemmap + (pfn))
> +#define __page_to_pfn(page)	((page) - vmemmap)
> +#else
>  /*
>   * Note: section's mem_map is encorded to reflect its start_pfn.
>   * section[i].section_mem_map == mem_map's address - start_pfn;
> @@ -62,6 +69,7 @@
>  	struct mem_section *__sec = __pfn_to_section(__pfn);	\
>  	__section_mem_map_addr(__sec) + __pfn;		\
>  })
> +#endif
>  #endif /* CONFIG_FLATMEM/DISCONTIGMEM/SPARSEMEM */

Any chance this can be done without embedding this inside another
#ifdef?  I really hate untangling the mess when an #endif goes
missing.  

Any reason this can't just be another #elif?

>  #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
> Index: linux-2.6.21-rc5-mm2/mm/sparse.c
> ===================================================================
> --- linux-2.6.21-rc5-mm2.orig/mm/sparse.c	2007-03-31 22:47:14.000000000 -0700
> +++ linux-2.6.21-rc5-mm2/mm/sparse.c	2007-03-31 22:59:35.000000000 -0700
> @@ -9,6 +9,7 @@
>  #include <linux/spinlock.h>
>  #include <linux/vmalloc.h>
>  #include <asm/dma.h>
> +#include <asm/pgalloc.h>
> 
>  /*
>   * Permanent SPARSEMEM data:
> @@ -101,7 +102,7 @@ static inline int sparse_index_init(unsi
> 
>  /*
>   * Although written for the SPARSEMEM_EXTREME case, this happens
> - * to also work for the flat array case becase
> + * to also work for the flat array case because
>   * NR_SECTION_ROOTS==NR_MEM_SECTIONS.
>   */
>  int __section_nr(struct mem_section* ms)
> @@ -211,6 +212,90 @@ static int sparse_init_one_section(struc
>  	return 1;
>  }
> 
> +#ifdef CONFIG_SPARSE_VIRTUAL
> +
> +void *vmemmap_alloc_block(unsigned long size, int node)
> +{
> +	if (slab_is_available()) {
> +		struct page *page =
> +			alloc_pages_node(node, GFP_KERNEL,
> +				get_order(size));
> +
> +		BUG_ON(!page);
> +		return page_address(page);
> +	} else
> +		return __alloc_bootmem_node(NODE_DATA(node), size, size,
> +					__pa(MAX_DMA_ADDRESS));
> +}

Hmmmmmmm.  Can we combine this with sparse_index_alloc()?  Also, why not
just use the slab for this?

Let's get rid of the _block() part, too.  I'm not sure it does any good.
At least make it _bytes() so that we know what the units are.  Also, if
you're just going to round up internally and _not_ use the slab, can you
just make the argument in pages, or even order?

Can you think of any times when we'd want that BUG_ON() to be a
WARN_ON(), instead?  I can see preferring having my mem_map[] on the
wrong node than hitting a BUG().

> +#ifndef ARCH_POPULATES_VIRTUAL_MEMMAP
> +/*
> + * Virtual memmap populate functionality for architectures that support
> + * PMDs for huge pages like i386, x86_64 etc.
> + */

How about:

/*
 * Virtual memmap support for architectures that use Linux pagetables
 * natively in hardware, and support mapping huge pages with PMD
 * entries.
 */

It wouldn't make sense to map the vmemmap area with Linux pagetables on
an arch that didn't use them in hardware, right?  So, perhaps this
doesn't quite belong in mm/sparse.c.  Perhaps we need
arch/x86/sparse.c. ;)

> +static void vmemmap_pop_pmd(pud_t *pud, unsigned long addr,
> +				unsigned long end, int node)
> +{
> +	pmd_t *pmd;
> +
> +	end = pmd_addr_end(addr, end);
> +
> +	for (pmd = pmd_offset(pud, addr); addr < end;
> +			pmd++, addr += PMD_SIZE) {
> +  		if (pmd_none(*pmd)) {
> +  			void *block;
> +			pte_t pte;
> +
> +			block = vmemmap_alloc_block(PMD_SIZE, node);
> +			pte = pfn_pte(__pa(block) >> PAGE_SHIFT,
> +						PAGE_KERNEL);
> +			pte_mkdirty(pte);
> +			pte_mkwrite(pte);
> +			pte_mkyoung(pte);
> +			mk_pte_huge(pte);
> +			set_pmd(pmd, __pmd(pte_val(pte)));
> +		}
> +	}
> +}

Nitpick: I think this would look quite a bit neater with a little less
indentation.

How about making the loop start with

	if (!pmd_none(*pmd))
		continue;

It should bring the rest of the code in a bit and make that long line
more readable.

> +static void vmemmap_pop_pud(pgd_t *pgd, unsigned long addr,
> +					unsigned long end, int node)
> +{
> +	pud_t *pud;
> +
> +	end = pud_addr_end(addr, end);
> +	for (pud = pud_offset(pgd, addr); addr < end;
> +				pud++, addr += PUD_SIZE) {
> +
> +		if (pud_none(*pud))
> +			pud_populate(&init_mm, pud,
> +				vmemmap_alloc_block(PAGE_SIZE, node));
> +
> +		vmemmap_pop_pmd(pud, addr, end, node);
> +	}
> +}
> +
> +static void vmemmap_populate(struct page *start_page, unsigned long nr,
> +								int node)
> +{
> +	pgd_t *pgd;
> +	unsigned long addr = (unsigned long)(start_page);
> +	unsigned long end = pgd_addr_end(addr,
> +			(unsigned long)((start_page + nr)));

There appear to be a few extra parentheses on these lines.

> +	for (pgd = pgd_offset_k(addr); addr < end;
> +				pgd++, addr += PGDIR_SIZE) {
> +
> +		if (pgd_none(*pgd))
> +			pgd_populate(&init_mm, pgd,
> +				vmemmap_alloc_block(PAGE_SIZE, node));
> +		vmemmap_pop_pud(pgd, addr, end, node);
> +	}
> +}
> +#endif
> +#endif /* CONFIG_SPARSE_VIRTUAL */

We don't really need these #ifdefs embedded inside of each other,
either, right?  Kconfig should take care of enforcing the dependency.

>   static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
>  {
>  	struct page *map;
> @@ -221,8 +306,13 @@ static struct page *sparse_early_mem_map
>  	if (map)
>  		return map;
> 
> +#ifdef CONFIG_SPARSE_VIRTUAL
> +	map = pfn_to_page(pnum * PAGES_PER_SECTION);
> +	vmemmap_populate(map, PAGES_PER_SECTION, nid);
> +#else
>  	map = alloc_bootmem_node(NODE_DATA(nid),
>  			sizeof(struct page) * PAGES_PER_SECTION);
> +#endif

We really worked hard to keep #ifdefs out of the code flow in that file
and keep it as clean as possible.  Could we hide this behind a helper?  

         map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
         if (map)
                 return map;

+        map = alloc_vmemmap(map, PAGES_PER_SECTION, nid);
+        if (map)
+                return map;
+
         map = alloc_bootmem_node(NODE_DATA(nid),
                         sizeof(struct page) * PAGES_PER_SECTION);
         if (map)
                 return map;

Then, do whatever magic you want in alloc_vmemmap().

-- Dave


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-02 20:38           ` Martin Bligh
@ 2007-04-02 20:51             ` Christoph Lameter
  2007-04-05 11:50               ` Andy Whitcroft
  0 siblings, 1 reply; 46+ messages in thread
From: Christoph Lameter @ 2007-04-02 20:51 UTC (permalink / raw)
  To: Martin Bligh
  Cc: Dave Hansen, Andi Kleen, linux-kernel, linux-arch, linux-mm,
	KAMEZAWA Hiroyuki, Andy Whitcroft

On Mon, 2 Apr 2007, Martin Bligh wrote:

> > Its just the opposite. The vmemmap code is so efficient that we can remove
> > lots of other code and gops of these alternate implementations. On x86_64
> > its even superior to FLATMEM since FLATMEM still needs a memory reference
> > for the mem_map area. So if we make SPARSE standard for all configurations
> > then there is no need anymore for FLATMEM DISCONTIG etc etc. Can we not
> > cleanup all this mess? Get rid of all the gazillions of #ifdefs please? This
> > would ease code maintenance significantly. I hate having to constantly
> > navigate my way through all the alternatives.
> 
> The original plan when this was first merged was pretty much that -
> for sparsemem to replace discontigmem once it was well tested. Seems
> to have got stalled halfway through ;-(

But you made a big boo-boo in SPARSEMEM. Virtual memmap is a textbook case 
that was not covered. Instead this horrible stuff that involves calling 
functions in VM primitives. We could have been there years ago...

> Not sure we'll get away with replacing flatmem for all arches, but
> we could at least get rid of discontigmem, it seems.

I think we could start with x86_64 and ia64. Both will work fine with 
SPARSE VIRTUAL (and SGI's concerns about performance are addressed) and we 
could remove the other alternatives. That is going to throw out lots of 
stuff. Then proceed to other arches

Could the SPARSEMEM folks take over this patch? You have more 
resources and I am all alone on this. I will post another patchset today 
that also includes an IA64 implementation.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/2] Generic Virtual Memmap suport for SPARSEMEM
  2007-04-02 20:50 ` [PATCH 1/2] Generic Virtual Memmap suport for SPARSEMEM Dave Hansen
@ 2007-04-02 21:00   ` Christoph Lameter
  2007-04-02 21:22     ` Dave Hansen
  0 siblings, 1 reply; 46+ messages in thread
From: Christoph Lameter @ 2007-04-02 21:00 UTC (permalink / raw)
  To: Dave Hansen
  Cc: linux-kernel, linux-arch, Martin Bligh, linux-mm, Andi Kleen,
	KAMEZAWA Hiroyuki

On Mon, 2 Apr 2007, Dave Hansen wrote:

> First of all, nice set of patches.
> 
> On Sat, 2007-03-31 at 23:10 -0800, Christoph Lameter wrote:
> > --- linux-2.6.21-rc5-mm2.orig/include/asm-generic/memory_model.h	2007-03-31 22:47:14.000000000 -0700
> > +++ linux-2.6.21-rc5-mm2/include/asm-generic/memory_model.h	2007-03-31 22:59:35.000000000 -0700
> > @@ -47,6 +47,13 @@
> >  })
> > 
> >  #elif defined(CONFIG_SPARSEMEM)
> > +#ifdef CONFIG_SPARSE_VIRTUAL
> > +/*
> > + * We have a virtual memmap that makes lookups very simple
> > + */
> > +#define __pfn_to_page(pfn)	(vmemmap + (pfn))
> > +#define __page_to_pfn(page)	((page) - vmemmap)
> > +#else
> >  /*
> >   * Note: section's mem_map is encorded to reflect its start_pfn.
> >   * section[i].section_mem_map == mem_map's address - start_pfn;
> > @@ -62,6 +69,7 @@
> >  	struct mem_section *__sec = __pfn_to_section(__pfn);	\
> >  	__section_mem_map_addr(__sec) + __pfn;		\
> >  })
> > +#endif
> >  #endif /* CONFIG_FLATMEM/DISCONTIGMEM/SPARSEMEM */
> 
> Any chance this can be done without embedding this inside another
> #ifdef?  I really hate untangling the mess when an #endif goes
> missing.  
> 
> Any reason this can't just be another #elif?

Sure.

> > +	} else
> > +		return __alloc_bootmem_node(NODE_DATA(node), size, size,
> > +					__pa(MAX_DMA_ADDRESS));
> > +}
> 
> Hmmmmmmm.  Can we combine this with sparse_index_alloc()?  Also, why not
> just use the slab for this?

Use a slab for page sized allocations? No.

> Let's get rid of the _block() part, too.  I'm not sure it does any good.
> At least make it _bytes() so that we know what the units are.  Also, if
> you're just going to round up internally and _not_ use the slab, can you
> just make the argument in pages, or even order?

Its used for page sized allocations.
 
> Can you think of any times when we'd want that BUG_ON() to be a
> WARN_ON(), instead?  I can see preferring having my mem_map[] on the
> wrong node than hitting a BUG().

We should probably have some error handling there instead of the BUG.

> > +#ifndef ARCH_POPULATES_VIRTUAL_MEMMAP
> > +/*
> > + * Virtual memmap populate functionality for architectures that support
> > + * PMDs for huge pages like i386, x86_64 etc.
> > + */
> 
> How about:
> 
> /*
>  * Virtual memmap support for architectures that use Linux pagetables
>  * natively in hardware, and support mapping huge pages with PMD
>  * entries.
>  */
> 
> It wouldn't make sense to map the vmemmap area with Linux pagetables on
> an arch that didn't use them in hardware, right?  So, perhaps this
> doesn't quite belong in mm/sparse.c.  Perhaps we need
> arch/x86/sparse.c. ;)

I just extended this in V2 to also work on IA64. Its pretty generic.

> > +static void vmemmap_pop_pmd(pud_t *pud, unsigned long addr,
> > +				unsigned long end, int node)
> > +{
> > +	pmd_t *pmd;
> > +
> > +	end = pmd_addr_end(addr, end);
> > +
> > +	for (pmd = pmd_offset(pud, addr); addr < end;
> > +			pmd++, addr += PMD_SIZE) {
> > +  		if (pmd_none(*pmd)) {
> > +  			void *block;
> > +			pte_t pte;
> > +
> > +			block = vmemmap_alloc_block(PMD_SIZE, node);
> > +			pte = pfn_pte(__pa(block) >> PAGE_SHIFT,
> > +						PAGE_KERNEL);
> > +			pte_mkdirty(pte);
> > +			pte_mkwrite(pte);
> > +			pte_mkyoung(pte);
> > +			mk_pte_huge(pte);
> > +			set_pmd(pmd, __pmd(pte_val(pte)));
> > +		}
> > +	}
> > +}
> 
> Nitpick: I think this would look quite a bit neater with a little less
> indentation.
> 
> How about making the loop start with
> 
> 	if (!pmd_none(*pmd))
> 		continue;
> 
> It should bring the rest of the code in a bit and make that long line
> more readable.

V2 has the pmd setup separated out in a separate function.

> > +{
> > +	pgd_t *pgd;
> > +	unsigned long addr = (unsigned long)(start_page);
> > +	unsigned long end = pgd_addr_end(addr,
> > +			(unsigned long)((start_page + nr)));
> 
> There appear to be a few extra parentheses on these lines.

Right. Fixed.

> > +	for (pgd = pgd_offset_k(addr); addr < end;
> > +				pgd++, addr += PGDIR_SIZE) {
> > +
> > +		if (pgd_none(*pgd))
> > +			pgd_populate(&init_mm, pgd,
> > +				vmemmap_alloc_block(PAGE_SIZE, node));
> > +		vmemmap_pop_pud(pgd, addr, end, node);
> > +	}
> > +}
> > +#endif
> > +#endif /* CONFIG_SPARSE_VIRTUAL */
> 
> We don't really need these #ifdefs embedded inside of each other,
> either, right?  Kconfig should take care of enforcing the dependency.

Ok.

> > +#ifdef CONFIG_SPARSE_VIRTUAL
> > +	map = pfn_to_page(pnum * PAGES_PER_SECTION);
> > +	vmemmap_populate(map, PAGES_PER_SECTION, nid);
> > +#else
> >  	map = alloc_bootmem_node(NODE_DATA(nid),
> >  			sizeof(struct page) * PAGES_PER_SECTION);
> > +#endif
> 
> We really worked hard to keep #ifdefs out of the code flow in that file
> and keep it as clean as possible.  Could we hide this behind a helper?  

This is a major difference in how sparsemem works. I think this needs to 
stay.

> 
>          map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
>          if (map)
>                  return map;
> 
> +        map = alloc_vmemmap(map, PAGES_PER_SECTION, nid);
> +        if (map)
> +                return map;
> +
>          map = alloc_bootmem_node(NODE_DATA(nid),
>                          sizeof(struct page) * PAGES_PER_SECTION);
>          if (map)
>                  return map;
> 
> Then, do whatever magic you want in alloc_vmemmap().

That would break if alloc_vmemmap returns NULL because it cannot allocate 
memory.



^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-02 20:30         ` Christoph Lameter
  2007-04-02 20:38           ` Martin Bligh
@ 2007-04-02 21:08           ` Dave Hansen
  2007-04-02 21:28             ` Christoph Lameter
  1 sibling, 1 reply; 46+ messages in thread
From: Dave Hansen @ 2007-04-02 21:08 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andi Kleen, linux-kernel, linux-arch, Martin Bligh, linux-mm,
	KAMEZAWA Hiroyuki

On Mon, 2007-04-02 at 13:30 -0700, Christoph Lameter wrote:
> On Mon, 2 Apr 2007, Dave Hansen wrote:
> > I completely agree, it looks like it should be faster.  The code
> > certainly has potential benefits.  But, to add this neato, apparently
> > more performant feature, we unfortunately have to add code.  Adding the
> > code has a cost: code maintenance.  This isn't a runtime cost, but it is
> > a real, honest to goodness tradeoff.
> 
> Its just the opposite. The vmemmap code is so efficient that we can remove 
> lots of other code and gops of these alternate implementations.

We do want to make sure that there isn't anyone relying on these.  Are
you thinking of simple sparsemem vs. extreme vs. sparsemem vmemmap?  Or,
are you thinking of sparsemem vs. discontig?

> On x86_64 
> its even superior to FLATMEM since FLATMEM still needs a memory reference 
> for the mem_map area. So if we make SPARSE standard for all 
> configurations then there is no need anymore for FLATMEM DISCONTIG etc 
> etc. Can we not cleanup all this mess? Get rid of all the gazillions 
> of #ifdefs please? This would ease code maintenance significantly. I hate 
> having to constantly navigate my way through all the alternatives.

Amen, brother.  I'd love to see DISCONTIG die, with sufficient testing,
of course.  Andi, do you have any ideas on how to get sparsemem out of
the 'experimental' phase?

I have noticed before that sparsemem should be able to cover the flatmem
case if we make MAX_PHYSMEM_BITS == SECTION_SIZE_BITS and massage from
there.  
 
-- Dave


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/2] Generic Virtual Memmap suport for SPARSEMEM
  2007-04-02 21:00   ` Christoph Lameter
@ 2007-04-02 21:22     ` Dave Hansen
  2007-04-02 21:31       ` Christoph Lameter
  0 siblings, 1 reply; 46+ messages in thread
From: Dave Hansen @ 2007-04-02 21:22 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: linux-kernel, linux-arch, Martin Bligh, linux-mm, Andi Kleen,
	KAMEZAWA Hiroyuki

On Mon, 2007-04-02 at 14:00 -0700, Christoph Lameter wrote:
> On Mon, 2 Apr 2007, Dave Hansen wrote:
> > > +	} else
> > > +		return __alloc_bootmem_node(NODE_DATA(node), size, size,
> > > +					__pa(MAX_DMA_ADDRESS));
> > > +}
> > 
> > Hmmmmmmm.  Can we combine this with sparse_index_alloc()?  Also, why not
> > just use the slab for this?
> 
> Use a slab for page sized allocations? No.

Why not?  We use it above for sparse_index_alloc() and if it is doing
something wrong, I'd love to fix it.  Can you elaborate?

> > Let's get rid of the _block() part, too.  I'm not sure it does any good.
> > At least make it _bytes() so that we know what the units are.  Also, if
> > you're just going to round up internally and _not_ use the slab, can you
> > just make the argument in pages, or even order?
> 
> Its used for page sized allocations.

Ok, then let's make it take pages of some kind or its argument.  

> > Can you think of any times when we'd want that BUG_ON() to be a
> > WARN_ON(), instead?  I can see preferring having my mem_map[] on the
> > wrong node than hitting a BUG().
> 
> We should probably have some error handling there instead of the BUG.
> 
> > > +#ifndef ARCH_POPULATES_VIRTUAL_MEMMAP
> > > +/*
> > > + * Virtual memmap populate functionality for architectures that support
> > > + * PMDs for huge pages like i386, x86_64 etc.
> > > + */
> > 
> > How about:
> > 
> > /*
> >  * Virtual memmap support for architectures that use Linux pagetables
> >  * natively in hardware, and support mapping huge pages with PMD
> >  * entries.
> >  */
> > 
> > It wouldn't make sense to map the vmemmap area with Linux pagetables on
> > an arch that didn't use them in hardware, right?  So, perhaps this
> > doesn't quite belong in mm/sparse.c.  Perhaps we need
> > arch/x86/sparse.c. ;)
> 
> I just extended this in V2 to also work on IA64. Its pretty generic.

Can you extend it to work on ppc? ;)

You haven't posted V2, right?

> > 
> >          map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
> >          if (map)
> >                  return map;
> > 
> > +        map = alloc_vmemmap(map, PAGES_PER_SECTION, nid);
> > +        if (map)
> > +                return map;
> > +
> >          map = alloc_bootmem_node(NODE_DATA(nid),
> >                          sizeof(struct page) * PAGES_PER_SECTION);
> >          if (map)
> >                  return map;
> > 
> > Then, do whatever magic you want in alloc_vmemmap().
> 
> That would break if alloc_vmemmap returns NULL because it cannot allocate 
> memory.

OK, that makes sense.  However, it would still be nice to hide that
#ifdef somewhere that people are a bit less likely to run into it.  It's
just one #ifdef, so if you can kill it, great.  Otherwise, they pile up
over time and _do_ cause real readability problems.

-- Dave


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-02 21:08           ` [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL Dave Hansen
@ 2007-04-02 21:28             ` Christoph Lameter
  2007-04-02 21:43               ` Martin Bligh
                                 ` (2 more replies)
  0 siblings, 3 replies; 46+ messages in thread
From: Christoph Lameter @ 2007-04-02 21:28 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Andi Kleen, linux-kernel, linux-arch, Martin Bligh, linux-mm,
	KAMEZAWA Hiroyuki

On Mon, 2 Apr 2007, Dave Hansen wrote:

> On Mon, 2007-04-02 at 13:30 -0700, Christoph Lameter wrote:
> > On Mon, 2 Apr 2007, Dave Hansen wrote:
> > > I completely agree, it looks like it should be faster.  The code
> > > certainly has potential benefits.  But, to add this neato, apparently
> > > more performant feature, we unfortunately have to add code.  Adding the
> > > code has a cost: code maintenance.  This isn't a runtime cost, but it is
> > > a real, honest to goodness tradeoff.
> > 
> > Its just the opposite. The vmemmap code is so efficient that we can remove 
> > lots of other code and gops of these alternate implementations.
> 
> We do want to make sure that there isn't anyone relying on these.  Are
> you thinking of simple sparsemem vs. extreme vs. sparsemem vmemmap?  Or,
> are you thinking of sparsemem vs. discontig?

I am thinking sparsemem default and then get rid of discontig, flatmem etc.
On many platforms this will work. Flatmem for embedded could just be a 
variation on sparse_virtual.

> Amen, brother.  I'd love to see DISCONTIG die, with sufficient testing,
> of course.  Andi, do you have any ideas on how to get sparsemem out of
> the 'experimental' phase?

Note that these arguments on DISCONTIG are flame bait for many SGIers. 
We usually see this as an attack on DISCONTIG/VMEMMAP which is the 
existing best performing implementation for page_to_pfn and vice 
versa. Please lets stop the polarization. We want one consistent scheme 
to manage memory everywhere. I do not care what its called as long as it 
covers all the bases and is not a glaring performance regression (like 
SPARSEMEM so far).

> I have noticed before that sparsemem should be able to cover the flatmem
> case if we make MAX_PHYSMEM_BITS == SECTION_SIZE_BITS and massage from
> there.  

Right. But for embedded the memorymap base cannot be constant because 
they may not be able to have a fixed address in memory. So memory map 
needs to become a variable.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/2] Generic Virtual Memmap suport for SPARSEMEM
  2007-04-02 21:22     ` Dave Hansen
@ 2007-04-02 21:31       ` Christoph Lameter
  2007-04-02 21:42         ` Dave Hansen
  0 siblings, 1 reply; 46+ messages in thread
From: Christoph Lameter @ 2007-04-02 21:31 UTC (permalink / raw)
  To: Dave Hansen
  Cc: linux-kernel, linux-arch, Martin Bligh, linux-mm, Andi Kleen,
	KAMEZAWA Hiroyuki

On Mon, 2 Apr 2007, Dave Hansen wrote:

> > > Hmmmmmmm.  Can we combine this with sparse_index_alloc()?  Also, why not
> > > just use the slab for this?
> > 
> > Use a slab for page sized allocations? No.
> 
> Why not?  We use it above for sparse_index_alloc() and if it is doing
> something wrong, I'd love to fix it.  Can you elaborate?

The slab allocator's purpose is to deliver small sub-page-sized chunks.
The page allocator is there to allocate pages. Each is optimized for its 
own purpose.

> > I just extended this in V2 to also work on IA64. Its pretty generic.
> 
> Can you extend it to work on ppc? ;)

I do not know enough about how ppc handles large pages.

> You haven't posted V2, right?

No, tests are still running.

> > > Then, do whatever magic you want in alloc_vmemmap().
> > 
> > That would break if alloc_vmemmap returns NULL because it cannot allocate 
> > memory.
> 
> OK, that makes sense.  However, it would still be nice to hide that
> #ifdef somewhere that people are a bit less likely to run into it.  It's
> just one #ifdef, so if you can kill it, great.  Otherwise, they pile up
> over time and _do_ cause real readability problems.

Well, think about how to handle the case that the allocation of a page 
table page or a vmemmap block fails. Once we have that sorted out then we 
can cleanup the higher layers.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/2] Generic Virtual Memmap suport for SPARSEMEM
  2007-04-02 21:31       ` Christoph Lameter
@ 2007-04-02 21:42         ` Dave Hansen
  2007-04-02 21:53           ` Christoph Lameter
  0 siblings, 1 reply; 46+ messages in thread
From: Dave Hansen @ 2007-04-02 21:42 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: linux-kernel, linux-arch, Martin Bligh, linux-mm, Andi Kleen,
	KAMEZAWA Hiroyuki

On Mon, 2007-04-02 at 14:31 -0700, Christoph Lameter wrote:
> On Mon, 2 Apr 2007, Dave Hansen wrote:
> 
> > > > Hmmmmmmm.  Can we combine this with sparse_index_alloc()?  Also, why not
> > > > just use the slab for this?
> > > 
> > > Use a slab for page sized allocations? No.
> > 
> > Why not?  We use it above for sparse_index_alloc() and if it is doing
> > something wrong, I'd love to fix it.  Can you elaborate?
> 
> The slab allocator purposes is to deliver small sub page sized chunks.
> The page allocator is there to allocate pages. Both are optimized for its 
> purpose.

I understand that, in general, but how does that optimization help,
here, exactly?  My argument is that if we use the slab, it means more
code sharing with existing code in sparse.c.  Other than ideology, what
practical reasons are there in this case that keep us from doing that
otherwise attractive code sharing.

> > > I just extended this in V2 to also work on IA64. Its pretty generic.
> > 
> > Can you extend it to work on ppc? ;)
> 
> I do not know enough about how ppc handles large pages.

I don't think the pagetable walks are generic enough to ever get used on
ppc, unless they start walking the Linux pagetables for the kernel
virtual address area.  I was trying to poke you into getting the
pagetable walks out of sparse.c. ;)

> > > > Then, do whatever magic you want in alloc_vmemmap().
> > > 
> > > That would break if alloc_vmemmap returns NULL because it cannot allocate 
> > > memory.
> > 
> > OK, that makes sense.  However, it would still be nice to hide that
> > #ifdef somewhere that people are a bit less likely to run into it.  It's
> > just one #ifdef, so if you can kill it, great.  Otherwise, they pile up
> > over time and _do_ cause real readability problems.
> 
> Well think about how to handle the case that the allocatiopn of a page 
> table page or a vmemmap block fails. Once we have that sorted out then we 
> can cleanup the higher layers.

I think it is best to just completely replace
sparse_early_mem_map_alloc() for the vmemmap case.  It really is a
completely different beast.  You'd never, for instance, have
alloc_remap() come into play.

-- Dave


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-02 21:28             ` Christoph Lameter
@ 2007-04-02 21:43               ` Martin Bligh
  2007-04-02 21:56               ` Dave Hansen
  2007-04-04 21:27               ` Bob Picco
  2 siblings, 0 replies; 46+ messages in thread
From: Martin Bligh @ 2007-04-02 21:43 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Dave Hansen, Andi Kleen, linux-kernel, linux-arch, linux-mm,
	KAMEZAWA Hiroyuki, Andy Whitcroft

> Note that these arguments on DISCONTIG are flame bait for many SGIers. 
> We usually see this as an attack on DISCONTIG/VMEMMAP which is the 
> existing best performing implementation for page_to_pfn and vice 
> versa. Please lets stop the polarization. We want one consistent scheme 
> to manage memory everywhere. I do not care what its called as long as it 
> covers all the bases and is not a glaring performance regresssion (like 
> SPARSEMEM so far).

The main conceptual difference (in my mind) was not having one
bastardized data structure (pg_data_t) that meant different
things in different situations (is it a node, or a section
of discontig mem?). Also we didn't support discontig mem within
a node (at least with the old discontigmem), which was partly
the result of that hybridization.

Beyond that, it's just naming really.

M.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/2] Generic Virtual Memmap suport for SPARSEMEM
  2007-04-02 21:42         ` Dave Hansen
@ 2007-04-02 21:53           ` Christoph Lameter
  2007-04-02 22:05             ` Dave Hansen
  0 siblings, 1 reply; 46+ messages in thread
From: Christoph Lameter @ 2007-04-02 21:53 UTC (permalink / raw)
  To: Dave Hansen
  Cc: linux-kernel, linux-arch, Martin Bligh, linux-mm, Andi Kleen,
	KAMEZAWA Hiroyuki

On Mon, 2 Apr 2007, Dave Hansen wrote:

> > The slab allocator purposes is to deliver small sub page sized chunks.
> > The page allocator is there to allocate pages. Both are optimized for its 
> > purpose.
> 
> I understand that, in general, but how does that optimization help,
> here, exactly?  My argument is that if we use the slab, it means more
> code sharing with existing code in sparse.c.  Other than ideology, what
> practical reasons are there in this case that keep us from doing that
> otherwise attractive code sharing.

F.e. you are wasting memory that the slab needs to uselessly track these 
allocations. You would need to create a special slab that has page sized
(or huge page sized) alignment to have the proper allocation behavior. Not 
good.

The rest of sparse is not MMU bound. So you may be fine. I'd recommend 
though to use the page allocator if you are doing large allocations. I do 
not see the point of using slab there.

> I don't think the pagetable walks are generic enough to ever get used on
> ppc, unless they start walking the Linux pagetables for the kernel
> virtual address area.  I was trying to poke you into getting the
> pagetable walks out of sparse.c. ;)

If I would be doing that then we would end up adding these pagetable walks
to multiple architectures. I already need to cover IA64 and x86_64 and 
this will also do i386. Lets try to keep them generic. PPC may need to 
disable these walks and do its own thing.

> > Well think about how to handle the case that the allocatiopn of a page 
> > table page or a vmemmap block fails. Once we have that sorted out then we 
> > can cleanup the higher layers.
> 
> I think it is best to just completely replace
> sparse_early_mem_map_alloc() for the vmemmap case.  It really is a
> completely different beast.  You'd never, for instance, have
> alloc_remap() come into play.

What is the purpose of alloc_remap? Could not figure that one out.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-02 21:28             ` Christoph Lameter
  2007-04-02 21:43               ` Martin Bligh
@ 2007-04-02 21:56               ` Dave Hansen
  2007-04-02 22:29                 ` Christoph Lameter
  2007-04-02 22:31                 ` Andi Kleen
  2007-04-04 21:27               ` Bob Picco
  2 siblings, 2 replies; 46+ messages in thread
From: Dave Hansen @ 2007-04-02 21:56 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andi Kleen, linux-kernel, linux-arch, Martin Bligh, linux-mm,
	KAMEZAWA Hiroyuki

On Mon, 2007-04-02 at 14:28 -0700, Christoph Lameter wrote:
> I do not care what its called as long as it 
> covers all the bases and is not a glaring performance regresssion (like 
> SPARSEMEM so far). 

I honestly don't doubt that there are regressions, somewhere.  Could you
elaborate, and perhaps actually show us some numbers on this?  Perhaps
instead of adding a completely new model, we can adapt the existing ones
somehow.

But, without some cold, hard, data, we mere mortals without the 1024-way
machines can only guess. ;)

-- Dave


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/2] Generic Virtual Memmap suport for SPARSEMEM
  2007-04-02 21:53           ` Christoph Lameter
@ 2007-04-02 22:05             ` Dave Hansen
  0 siblings, 0 replies; 46+ messages in thread
From: Dave Hansen @ 2007-04-02 22:05 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: linux-kernel, linux-arch, Martin Bligh, linux-mm, Andi Kleen,
	KAMEZAWA Hiroyuki

On Mon, 2007-04-02 at 14:53 -0700, Christoph Lameter wrote:
> > > Well think about how to handle the case that the allocatiopn of a page 
> > > table page or a vmemmap block fails. Once we have that sorted out then we 
> > > can cleanup the higher layers.
> > 
> > I think it is best to just completely replace
> > sparse_early_mem_map_alloc() for the vmemmap case.  It really is a
> > completely different beast.  You'd never, for instance, have
> > alloc_remap() come into play.
> 
> What is the purpose of alloc_remap? Could not figure that one out.

That's what we use on i386 to get some lowmem area for non-zero NUMA
nodes.  Otherwise, all of ZONE_NORMAL is on node 0.  It's a bit hokey,
and stuff like virt_to_phys() probably doesn't work on it, but it has
worked pretty well for a long time.  

-- Dave


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-02 21:56               ` Dave Hansen
@ 2007-04-02 22:29                 ` Christoph Lameter
  2007-04-02 22:31                 ` Andi Kleen
  1 sibling, 0 replies; 46+ messages in thread
From: Christoph Lameter @ 2007-04-02 22:29 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Andi Kleen, linux-kernel, linux-arch, Martin Bligh, linux-mm,
	KAMEZAWA Hiroyuki

On Mon, 2 Apr 2007, Dave Hansen wrote:

> On Mon, 2007-04-02 at 14:28 -0700, Christoph Lameter wrote:
> > I do not care what its called as long as it 
> > covers all the bases and is not a glaring performance regresssion (like 
> > SPARSEMEM so far). 
> 
> I honestly don't doubt that there are regressions, somewhere.  Could you
> elaborate, and perhaps actually show us some numbers on this?  Perhaps
> instead of adding a completely new model, we can adapt the existing ones
> somehow.

Just look at pfn_to_page and friends on ia64 and see what atrocities 
sparsemem does with those if you enable it.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-02 21:56               ` Dave Hansen
  2007-04-02 22:29                 ` Christoph Lameter
@ 2007-04-02 22:31                 ` Andi Kleen
  2007-04-02 22:37                   ` Christoph Lameter
  2007-04-05 12:07                   ` Andy Whitcroft
  1 sibling, 2 replies; 46+ messages in thread
From: Andi Kleen @ 2007-04-02 22:31 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Christoph Lameter, linux-kernel, linux-arch, Martin Bligh,
	linux-mm, KAMEZAWA Hiroyuki

On Monday 02 April 2007 23:56:08 Dave Hansen wrote:
> On Mon, 2007-04-02 at 14:28 -0700, Christoph Lameter wrote:
> > I do not care what its called as long as it 
> > covers all the bases and is not a glaring performance regresssion (like 
> > SPARSEMEM so far). 
> 
> I honestly don't doubt that there are regressions, somewhere.  Could you
> elaborate, and perhaps actually show us some numbers on this?  Perhaps
> instead of adding a completely new model, we can adapt the existing ones
> somehow.

If it works I would be inclined to replaced old sparsemem with Christoph's
new one on x86-64. Perhaps that could cut down the bewildering sparsemem
ifdef jungle that is there currently.

But I presume it won't work on 32bit because of the limited address space?

> But, without some cold, hard, data, we mere mortals without the 1024-way
> machines can only guess. ;)

Yep.

-Andi (who would be scared of a 1024 way x86 machine)
 

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-02 22:31                 ` Andi Kleen
@ 2007-04-02 22:37                   ` Christoph Lameter
  2007-04-02 22:41                     ` Martin Bligh
  2007-04-05 12:07                   ` Andy Whitcroft
  1 sibling, 1 reply; 46+ messages in thread
From: Christoph Lameter @ 2007-04-02 22:37 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Dave Hansen, linux-kernel, linux-arch, Martin Bligh, linux-mm,
	KAMEZAWA Hiroyuki

On Tue, 3 Apr 2007, Andi Kleen wrote:

> If it works I would be inclined to replaced old sparsemem with Christoph's
> new one on x86-64. Perhaps that could cut down the bewildering sparsemem
> ifdef jungle that is there currently.
> 
> But I presume it won't work on 32bit because of the limited address space?

Not in general but it will work in non PAE mode. 4GB need 2^(32-21+4) = 
16MB pages. This would require the mapping on 4 4MB pages.

For 64GB you'd need 256M which would be a quarter of low mem. Probably 
takes up too much of low mem.

> > But, without some cold, hard, data, we mere mortals without the 1024-way
> > machines can only guess. ;)
> 
> Yep.
> 
> -Andi (who would be scared of a 1024 way x86 machine)

- Christoph (who does not believe that Andi has to wait long for 
something bigger)


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-02 22:37                   ` Christoph Lameter
@ 2007-04-02 22:41                     ` Martin Bligh
  2007-04-02 22:49                       ` Christoph Lameter
  0 siblings, 1 reply; 46+ messages in thread
From: Martin Bligh @ 2007-04-02 22:41 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andi Kleen, Dave Hansen, linux-kernel, linux-arch, linux-mm,
	KAMEZAWA Hiroyuki

Christoph Lameter wrote:
> On Tue, 3 Apr 2007, Andi Kleen wrote:
> 
>> If it works I would be inclined to replaced old sparsemem with Christoph's
>> new one on x86-64. Perhaps that could cut down the bewildering sparsemem
>> ifdef jungle that is there currently.
>>
>> But I presume it won't work on 32bit because of the limited address space?
> 
> Not in general but it will work in non PAE mode. 4GB need 2^(32-21+4) = 
> 16MB pages. This would require the mapping on 4 4MB pages.

IIRC NUMA isn't supported (or useful anyway) without PAE.

> For 64GB you'd need 256M which would be a quarter of low mem. Probably 
> takes up too much of low mem.

Yup.


M.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-02 22:41                     ` Martin Bligh
@ 2007-04-02 22:49                       ` Christoph Lameter
  2007-04-02 22:52                         ` Martin Bligh
  0 siblings, 1 reply; 46+ messages in thread
From: Christoph Lameter @ 2007-04-02 22:49 UTC (permalink / raw)
  To: Martin Bligh
  Cc: Andi Kleen, Dave Hansen, linux-kernel, linux-arch, linux-mm,
	KAMEZAWA Hiroyuki

On Mon, 2 Apr 2007, Martin Bligh wrote:

> > For 64GB you'd need 256M which would be a quarter of low mem. Probably takes
> > up too much of low mem.
> 
> Yup.

We could move whatever you currently use to handle that into i386 arch 
code. Or are there other platforms that do similar tricks with highmem?

We already have special hooks for node lookups in sparsemem. Move all of 
that off into some arch dir?


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-02 22:49                       ` Christoph Lameter
@ 2007-04-02 22:52                         ` Martin Bligh
  0 siblings, 0 replies; 46+ messages in thread
From: Martin Bligh @ 2007-04-02 22:52 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andi Kleen, Dave Hansen, linux-kernel, linux-arch, linux-mm,
	KAMEZAWA Hiroyuki

Christoph Lameter wrote:
> On Mon, 2 Apr 2007, Martin Bligh wrote:
> 
>>> For 64GB you'd need 256M which would be a quarter of low mem. Probably takes
>>> up too much of low mem.
>> Yup.
> 
> We could move whatever you currently use to handle that into i386 arch 
> code. Or are there other platforms that do similar tricks with highmem?
> 
> We already have special hooks for node lookups in sparsemem. Move all of 
> that off into some arch dir?

Well, all I did was basically an early vmalloc kind of thing.

You only need to allocate enough virtual space for how much memory
you actually *have*, not the full set. The problem on i386 is that
you just need to reserve that space early, in order to shuffle
everything else into fit. It's messy, but not hard.

M.



^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-02 15:37     ` Christoph Lameter
  2007-04-02 15:44       ` Andi Kleen
  2007-04-02 20:13       ` Dave Hansen
@ 2007-04-02 23:03       ` Bjorn Helgaas
  2 siblings, 0 replies; 46+ messages in thread
From: Bjorn Helgaas @ 2007-04-02 23:03 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andi Kleen, linux-kernel, linux-arch, Martin Bligh, linux-mm,
	KAMEZAWA Hiroyuki

On Monday 02 April 2007 09:37, Christoph Lameter wrote:
> On Sun, 1 Apr 2007, Andi Kleen wrote:
> > Hmm, this means there is at least 2MB worth of struct page on every node?
> > Or do you have overlaps with other memory (I think you have)
> > In that case you have to handle the overlap in change_page_attr()
> 
> Correct. 2MB worth of struct page is 128 mb of memory. Are there nodes 
> with smaller amounts of memory?

Do you deal with max_addr= and mem=?

RHEL4 (2.6.9) blows up if max_addr= happens to leave you with CPU-only
nodes.  So hopefully you can deal with arbitrary-sized nodes caused by
max_addr= or mem=.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-02 21:28             ` Christoph Lameter
  2007-04-02 21:43               ` Martin Bligh
  2007-04-02 21:56               ` Dave Hansen
@ 2007-04-04 21:27               ` Bob Picco
  2007-04-04 22:38                 ` Christoph Lameter
  2 siblings, 1 reply; 46+ messages in thread
From: Bob Picco @ 2007-04-04 21:27 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Dave Hansen, Andi Kleen, linux-kernel, linux-arch, Martin Bligh,
	linux-mm, KAMEZAWA Hiroyuki

Christoph Lameter wrote:	[Mon Apr 02 2007, 05:28:30PM EDT]
> On Mon, 2 Apr 2007, Dave Hansen wrote:
> 
> > On Mon, 2007-04-02 at 13:30 -0700, Christoph Lameter wrote:
> > > On Mon, 2 Apr 2007, Dave Hansen wrote:
> > > > I completely agree, it looks like it should be faster.  The code
> > > > certainly has potential benefits.  But, to add this neato, apparently
> > > > more performant feature, we unfortunately have to add code.  Adding the
> > > > code has a cost: code maintenance.  This isn't a runtime cost, but it is
> > > > a real, honest to goodness tradeoff.
> > > 
> > > Its just the opposite. The vmemmap code is so efficient that we can remove 
> > > lots of other code and gops of these alternate implementations.
> > 
> > We do want to make sure that there isn't anyone relying on these.  Are
> > you thinking of simple sparsemem vs. extreme vs. sparsemem vmemmap?  Or,
> > are you thinking of sparsemem vs. discontig?
> 
> I am thinking sparsemem default and then get rid discontig, flatmem etc.
> On many platforms this will work. Flatmem for embedded could just be a 
> variation on sparse_virtual.
> 
> > Amen, brother.  I'd love to see DISCONTIG die, with sufficient testing,
> > of course.  Andi, do you have any ideas on how to get sparsemem out of
> > the 'experimental' phase?
> 
> Note that these arguments on DISCONTIG are flame bait for many SGIers. 
> We usually see this as an attack on DISCONTIG/VMEMMAP which is the 
> existing best performing implementation for page_to_pfn and vice 
> versa. Please lets stop the polarization. We want one consistent scheme 
> to manage memory everywhere. I do not care what its called as long as it 
> covers all the bases and is not a glaring performance regresssion (like 
> SPARSEMEM so far).
Well you must have forgotten about these two postings in regards to
performance numbers:
http://marc.info/?l=linux-ia64&m=111990276501051&w=2
and
http://marc.info/?l=linux-kernel&m=116664638611634&w=2
.

I took your first patchset and ran some numbers on an amd64 machine with
two dual core sockets and 4Gb of memory. More iterations should be done
and perhaps larger number of tasks. The aim7 numbers are below.

bob

2.6.21-rc5+sparsemem
Benchmark	Version	Machine	Run Date
AIM Multiuser Benchmark - Suite VII	"1.1"	rcc5	Apr  2 05:04:33 2007

Tasks	Jobs/Min	JTI	Real	CPU	Jobs/sec/task
1	13.8		100	421.3	2.2	0.2303
101	527.8		97	1113.8	111.5	0.0871
201	565.0		97	2070.6	222.7	0.0468
301	570.9		96	3068.7	334.7	0.0316
401	573.0		97	4072.7	445.6	0.0238
501	583.3		99	4998.5	558.6	0.0194
601	583.8		99	5991.1	672.9	0.0162

2.6.21-rc5+sparsemem+patchset
Benchmark	Version	Machine	Run Date
AIM Multiuser Benchmark - Suite VII	"1.1"	vmem	Apr  4 02:22:24 2007

Tasks	Jobs/Min	JTI	Real	CPU	Jobs/sec/task
1	13.7		100	424.0	2.1	0.2288
101	500.3		97	1175.0	112.0	0.0826
201	554.2		97	2111.0	223.6	0.0460
301	578.5		97	3028.3	334.9	0.0320
401	586.2		97	3981.3	448.1	0.0244
501	584.2		99	4990.8	561.8	0.0194
601	584.4		98	5985.2	675.5	0.0162

> 
> > I have noticed before that sparsemem should be able to cover the flatmem
> > case if we make MAX_PHYSMEM_BITS == SECTION_SIZE_BITS and massage from
> > there.  
> 
> Right. But for embedded the memorymap base cannot be constant because 
> they may not be able to have a fixed address in memory. So memory map 
> needs to become a variable.
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-04 21:27               ` Bob Picco
@ 2007-04-04 22:38                 ` Christoph Lameter
  0 siblings, 0 replies; 46+ messages in thread
From: Christoph Lameter @ 2007-04-04 22:38 UTC (permalink / raw)
  To: Bob Picco
  Cc: Dave Hansen, Andi Kleen, linux-kernel, linux-arch, Martin Bligh,
	linux-mm, KAMEZAWA Hiroyuki

On Wed, 4 Apr 2007, Bob Picco wrote:

> Well you must have forgotten about these two postings in regards to
> performance numbers:
> http://marc.info/?l=linux-ia64&m=111990276501051&w=2
> and
> http://marc.info/?l=linux-kernel&m=116664638611634&w=2

I am well aware of those but those were done with a PAGE_SIZE vmemmap 
which is particularly bad on IA64 given the TLB fault overhead. You 
eliminated the TLB fault overhead. Virtual Memmaps need to be designed in 
such a way that they do not create additional overhead. The x86_64 version 
here has no such overhead that you could eliminate with lookup tables.

The bad thing is that this benchmark then was used to justify 
sparsemem on other platforms where such overhead does not exist.

One needs to be careful with benchmarks..... Its better to review the 
code.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-02 20:51             ` Christoph Lameter
@ 2007-04-05 11:50               ` Andy Whitcroft
  2007-04-05 18:27                 ` Christoph Lameter
  0 siblings, 1 reply; 46+ messages in thread
From: Andy Whitcroft @ 2007-04-05 11:50 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Martin Bligh, Dave Hansen, Andi Kleen, linux-kernel, linux-arch,
	linux-mm, KAMEZAWA Hiroyuki

Christoph Lameter wrote:
> On Mon, 2 Apr 2007, Martin Bligh wrote:
> 
>>> Its just the opposite. The vmemmap code is so efficient that we can remove
>>> lots of other code and gops of these alternate implementations. On x86_64
>>> its even superior to FLATMEM since FLATMEM still needs a memory reference
>>> for the mem_map area. So if we make SPARSE standard for all configurations
>>> then there is no need anymore for FLATMEM DISCONTIG etc etc. Can we not
>>> cleanup all this mess? Get rid of all the gazillions of #ifdefs please? This
>>> would ease code maintenance significantly. I hate having to constantly
>>> navigate my way through all the alternatives.
>> The original plan when this was first merged was pretty much that -
>> for sparsemem to replace discontigmem once it was well tested. Seems
>> to have got stalled halfway through ;-(
> 
> But you made big boboo in SPARSEMEM. Virtual memmap is a textbook case 
> that was not covered. Instead this horrible stuff that involves calling 
> functions in VM primitives. We could have been there years ago...
> 
>> Not sure we'll get away with replacing flatmem for all arches, but
>> we could at least get rid of discontigmem, it seems.
> 
> I think we could start with x86_64 and ia64. Both will work fine with 
> SPARSE VIRTUAL (and SGIs concerns about performance are addressed) and we 
> could remove the other alternatives. That is going to throw out lots of 
> stuff. Then proceed to other arches
> 
> Could the SPARSEMEM folks take this over this patch? You have more 
> resources and I am all alone on this. I will post another patchset today 
> that also includes an IA64 implementation.

That would be me.  I have been offline writing for OLS and did not get
to respond before this.

When we first saw these patches the reaction was a general positive
despite skepticism on the general utility of vmemmmap.  The patches
appear to provide an architecture neutral implementation.  At that time
s390 was just starting to use vmemmap bringing 2x implementations into
the kernel.  Now we have three.  Clearly, even if vemmemap was a net
performance loss having only one implementation has to be a good thing
for maintainability/coverage.

Without some benchmarking and some general testing it certainly is
premature to be removing the other memory models, but for sure that was
the original plan.  The original assumption that as time went on the
other models would wither on the vine, this has only happened on powerpc
so far.

So I will go and:

1) talk to s390 and find out if they can use this same form, as one
implementation of vmemmap only should be the goal, and

2) take the current patches and try and get some benchmarks for them
against other memory models

Christoph if you could let us know which benchmarks you are seeing gains
with that would be a help.

-apw

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-02 22:31                 ` Andi Kleen
  2007-04-02 22:37                   ` Christoph Lameter
@ 2007-04-05 12:07                   ` Andy Whitcroft
  1 sibling, 0 replies; 46+ messages in thread
From: Andy Whitcroft @ 2007-04-05 12:07 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Dave Hansen, Christoph Lameter, linux-kernel, linux-arch,
	Martin Bligh, linux-mm, KAMEZAWA Hiroyuki

Andi Kleen wrote:
> On Monday 02 April 2007 23:56:08 Dave Hansen wrote:
>> On Mon, 2007-04-02 at 14:28 -0700, Christoph Lameter wrote:
>>> I do not care what its called as long as it 
>>> covers all the bases and is not a glaring performance regresssion (like 
>>> SPARSEMEM so far). 
>> I honestly don't doubt that there are regressions, somewhere.  Could you
>> elaborate, and perhaps actually show us some numbers on this?  Perhaps
>> instead of adding a completely new model, we can adapt the existing ones
>> somehow.
> 
> If it works I would be inclined to replaced old sparsemem with Christoph's
> new one on x86-64. Perhaps that could cut down the bewildering sparsemem
> ifdef jungle that is there currently.
> 
> But I presume it won't work on 32bit because of the limited address space?

Right.  But we might be able to do switch SPARSEMEM_EXTREME users here
if performance is better and no other regressions are detected.

There seems to be a theme, we need to get some numbers.  I will try and
get what I can with the hardware I have and see whats missing.

> 
>> But, without some cold, hard, data, we mere mortals without the 1024-way
>> machines can only guess. ;)


-apw

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL
  2007-04-05 11:50               ` Andy Whitcroft
@ 2007-04-05 18:27                 ` Christoph Lameter
  2007-04-07 22:06                   ` [PATCH 1/4] x86_64: (SPARSE_VIRTUAL doubles sparsemem speed) Christoph Lameter
  0 siblings, 1 reply; 46+ messages in thread
From: Christoph Lameter @ 2007-04-05 18:27 UTC (permalink / raw)
  To: Andy Whitcroft
  Cc: Martin Bligh, Dave Hansen, Andi Kleen, linux-kernel, linux-arch,
	linux-mm, KAMEZAWA Hiroyuki

On Thu, 5 Apr 2007, Andy Whitcroft wrote:

> Christoph if you could let us know which benchmarks you are seeing gains
> with that would be a help.

You saw the numbers that Ken got with the pipe test right?

Then there are some minor improvements if you run AIM7.

I got into this because I saw the code that was generated by sparsemem and 
discontig. Sparsemem was doing a series of lookups and so did pure 
discontig on x86_64. Both were really too complex for inlining. 
Discontig/Vmemmap on IA64 was sort of cleaner but there was still a 
reference to a variable plus it was creating the needs for lots of TLBs.

These VM primitives are pretty critical and the code for those needs to be 
minimal and inline. So I removed the variable ref from discontig/Vmemmap 
and reduced the work to be done to the simple formula (that I also found 
in textbooks on memmap management). Avoided the TLB pressure by using the 
1-1 mappings page size.

I could get real some performance numbers for this by sticking in a 
performance counter before and after virt_to_page and page_address. But I 
am pretty sure about the result just looking at the code.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: (SPARSE_VIRTUAL doubles sparsemem speed)
  2007-04-05 18:27                 ` Christoph Lameter
@ 2007-04-07 22:06                   ` Christoph Lameter
  2007-04-07 22:18                     ` Andi Kleen
  2007-04-09 16:40                     ` William Lee Irwin III
  0 siblings, 2 replies; 46+ messages in thread
From: Christoph Lameter @ 2007-04-07 22:06 UTC (permalink / raw)
  To: Andy Whitcroft
  Cc: Martin Bligh, Dave Hansen, Andi Kleen, linux-kernel, linux-arch,
	linux-mm, KAMEZAWA Hiroyuki

On Thu, 5 Apr 2007, Christoph Lameter wrote:

> On Thu, 5 Apr 2007, Andy Whitcroft wrote:
> > Christoph if you could let us know which benchmarks you are seeing gains
> > with that would be a help.
> 
> You saw the numbers that Ken got with the pipe test right?
> 
> Then there are some minor improvements if you run AIM7.
> 
> I could get real some performance numbers for this by sticking in a 
> performance counter before and after virt_to_page and page_address. But I 
> am pretty sure about the result just looking at the code.

Ok. Since I keep being asked, I stuck a performance counter in kfree on 
x86_64 to see the difference in performance:

Results:

x86_64 boot with virtual memmap

Format:               #events totaltime (min/avg/max)

kfree_virt_to_page       598430 5.6ms(3ns/9ns/322ns)

x86_64 boot regular sparsemem

kfree_virt_to_page       596360 10.5ms(4ns/18ns/28.7us)


On average sparsemem virtual takes half the time of regular sparsemem.

Note that the maximum time for regular sparsemem is way higher than
sparse virtual. This reflects the possibility that regular sparsemem may
once in a while have to deal with a cache miss whereas sparsemem virtual 
has no memory reference. Thus the numbers stay consistently low. 

Patch that was used to get these results (this is not very clean sorry 
but it should be enough to verify the results):



Simple Performance Counters

This patch allows the use of simple performance counters to measure time
intervals in the kernel source code. This allows a detailed analysis of the
time spend and the amount of data processed in specific code sections of the
kernel.

Time is measured using the cycle counter (TSC on IA32, ITC on IA64) which has
a very low latency.

To use add #include <linux/perf.h> to the header of the file where the
measurement needs to take place.

Then add the following to the code:

To declare a time stamp do

	struct pc pc;

To mark the beginning of the time measurement do

	pc_start(&pc, <counter>)

(If measurement from the beginning of a function is desired one may use
INIT_PC(var, item) instead).

To mark the end of the time frame do:

	pc_stop(&pc);

or if the amount of data transferred needs to be measured as well:

	pc_throughput(&pc, number-of-bytes);


The measurements will show up in /proc/perf/all.
Processor specific statistics
may be obtained via /proc/perf/<nr-of-processor>.
Writing to /proc/perf/reset will reset all counters. F.e.

echo >/proc/perf/reset

The first counter is the number of times that the time measurement was
performed. (+ xx) is the number of samples that were thrown away since
the processor on which the process is running changed. Cycle counters
may not be consistent across different processors.

Then follows the sum of the time spent in the code segment followed in
parentheses by the minimum / average / maximum time spent there.
The second block are the sizes of data processed.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

Index: linux-2.6.21-rc5-mm4/kernel/Makefile
===================================================================
--- linux-2.6.21-rc5-mm4.orig/kernel/Makefile	2007-04-07 14:04:32.000000000 -0700
+++ linux-2.6.21-rc5-mm4/kernel/Makefile	2007-04-07 14:05:12.000000000 -0700
@@ -55,6 +55,7 @@
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_UTRACE) += utrace.o
 obj-$(CONFIG_PTRACE) += ptrace.o
+obj-y += perf.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
Index: linux-2.6.21-rc5-mm4/include/linux/perf.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.21-rc5-mm4/include/linux/perf.h	2007-04-07 14:05:12.000000000 -0700
@@ -0,0 +1,49 @@
+/*
+ * Performance Counters and Measurement macros
+ * (C) 2005 Silicon Graphics Incorporated
+ * by Christoph Lameter <clameter@sgi.com>, April 2005
+ *
+ * Counters are calculated using the cycle counter. If a process
+ * is migrated to another cpu during the measurement then the measurement
+ * is invalid.
+ *
+ * We cannot disable preemption during measurement since that may interfere
+ * with other things in the kernel and limit the usefulness of the counters.
+ */
+
+enum pc_item {
+	PC_KFREE_VIRT_TO_PAGE,
+	PC_PTE_ALLOC,
+	PC_PTE_FREE,
+	PC_PMD_ALLOC,
+	PC_PMD_FREE,
+	PC_PUD_ALLOC,
+	PC_PUD_FREE,
+	PC_PGD_ALLOC,
+	PC_PGD_FREE,
+	NR_PC_ITEMS
+};
+
+/*
+ * Information about the start of the measurement
+ */
+struct pc {
+	unsigned long time;	/* cycle counter value recorded by pc_start() */
+	int processor;		/* cpu the measurement started on */
+	enum pc_item item;	/* which counter this sample belongs to */
+};
+
+/*
+ * Begin a measurement for counter @nr.  The cycle count is read last so
+ * that the bookkeeping stores above it are not part of the measured
+ * interval.  The cpu is recorded so pc_throughput() can discard the
+ * sample if the task migrated (cycle counters differ across cpus).
+ */
+static inline void pc_start(struct pc *pc, enum pc_item nr)
+{
+	pc->item = nr;
+	pc->processor = smp_processor_id();
+	pc->time = get_cycles();
+}
+
+/*
+ * INIT_PC(__var, __item): declare and start a measurement in one step,
+ * for intervals that begin at function entry.  The initializer order
+ * must match the field order of struct pc (time, processor, item).
+ */
+#define INIT_PC(__var, __item) struct pc __var = \
+		{ get_cycles(), smp_processor_id(), __item }
+
+/* End a measurement and fold in a byte count; defined in kernel/perf.c. */
+void pc_throughput(struct pc *pc, unsigned long bytes);
+
+/* End a time-only measurement (no throughput data). */
+#define pc_stop(__pc) pc_throughput(__pc, 0)
+
Index: linux-2.6.21-rc5-mm4/kernel/perf.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.21-rc5-mm4/kernel/perf.c	2007-04-07 14:05:12.000000000 -0700
@@ -0,0 +1,345 @@
+/*
+ * Simple Performance Counter subsystem
+ *
+ * (C) 2007 sgi.
+ *
+ * April 2007, Christoph Lameter <clameter@sgi.com>
+ */
+
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/fs.h>
+#include <linux/proc_fs.h>
+#include <linux/cpumask.h>
+#include <linux/perf.h>
+/* For the hash function */
+/* NOTE(review): nothing below uses dcache hashing -- confirm this
+   include is still needed before dropping it. */
+#include <linux/dcache.h>
+
+/* Convert a cycle-counter (TSC/ITC) delta to nanoseconds */
+#ifdef CONFIG_IA64
+#define cycles_to_ns(x) (((x) * local_cpu_data->nsec_per_cyc) >> IA64_NSEC_PER_CYC_SHIFT)
+#elif defined(CONFIG_X86_64)
+#define cycles_to_ns(x) cycles_2_ns(x)
+#else
+#error "cycles_to_ns not defined for this architecture"
+#endif
+
+/*
+ * Names shown in /proc/perf/*.  Order must match enum pc_item.
+ * static: not referenced outside this file.
+ */
+static const char *var_id[NR_PC_ITEMS] = {
+	"kfree_virt_to_page",
+	"pte_alloc",
+	"pte_free",
+	"pmd_alloc",
+	"pmd_free",
+	"pud_alloc",
+	"pud_free",
+	"pgd_alloc",
+	"pgd_free"
+};
+
+/* One set of statistics per cpu and per pc_item */
+struct perf_counter {
+	u32 events;		/* number of valid measurements */
+	u32 mintime;		/* shortest single measurement (ns) */
+	u32 maxtime;		/* longest single measurement (ns) */
+	u32 minbytes;
+	u32 maxbytes;
+	u32 skipped;		/* samples dropped due to cpu migration */
+	u64 time;		/* accumulated time (ns) */
+	u64 bytes;		/* accumulated throughput bytes */
+};
+
+static DEFINE_PER_CPU(struct perf_counter, perf_counters)[NR_PC_ITEMS];
+
+/*
+ * Finish a measurement started by pc_start()/INIT_PC and account it.
+ * bytes != 0 additionally records throughput statistics.
+ */
+void pc_throughput(struct pc *pc, unsigned long bytes)
+{
+	unsigned long time = get_cycles();
+	unsigned long ns;
+	struct perf_counter *p;
+
+	/*
+	 * Validate the item before using it as an index.  The original
+	 * computed &get_cpu_var(perf_counters)[pc->item] first, which is
+	 * out-of-bounds arithmetic for a corrupted pc->item.
+	 */
+	if (unlikely(pc->item >= NR_PC_ITEMS)) {
+		printk(KERN_CRIT "pc_throughput: item (%d) out of range\n",
+			pc->item);
+		dump_stack();
+		return;
+	}
+
+	/* get_cpu_var() disables preemption until put_cpu_var() */
+	p = &get_cpu_var(perf_counters)[pc->item];
+
+	if (unlikely(pc->processor != smp_processor_id())) {
+		/* On different processor. TSC measurement not possible. */
+		p->skipped++;
+		goto out;
+	}
+
+	ns = cycles_to_ns(time - pc->time);
+	if (unlikely(ns > (1UL << (BITS_PER_LONG - 2)))) {
+		printk(KERN_ERR "perfcount %s: invalid time difference.\n",
+			var_id[pc->item]);
+		goto out;
+	}
+
+	p->time += ns;
+	p->events++;
+
+	if (ns > p->maxtime)
+		p->maxtime = ns;
+
+	if (p->mintime == 0 || ns < p->mintime)
+		p->mintime = ns;
+
+	if (bytes) {
+		p->bytes += bytes;
+		if (bytes > p->maxbytes)
+			p->maxbytes = bytes;
+		if (p->minbytes == 0 || bytes < p->minbytes)
+			p->minbytes = bytes;
+	}
+out:
+	/* put_cpu_var() takes the variable name; bare "()" does not compile */
+	put_cpu_var(perf_counters);
+}
+EXPORT_SYMBOL(pc_throughput);
+
+/*
+ * Clear one counter.  Use memset so that every field -- including
+ * "skipped", which the original version forgot and which therefore
+ * survived a /proc/perf/reset -- is zeroed.
+ */
+static void reset_perfcount_item(struct perf_counter *c)
+{
+	memset(c, 0, sizeof(*c));
+}
+
+/*
+ * Zero the counters of every cpu.  Walk the possible map rather than
+ * the online map: /proc/perf/<cpu> files are created for all possible
+ * cpus, so counters of a currently-offline cpu remain visible and
+ * should be cleared as well.
+ */
+static void perfcount_reset(void)
+{
+	int cpu;
+	enum pc_item i;
+
+	for_each_possible_cpu(cpu)
+		for (i = 0; i < NR_PC_ITEMS; i++)
+			reset_perfcount_item(
+				&per_cpu(perf_counters, cpu)[i]);
+}
+
+/*
+ * Unit tables used by pval() to scale raw values for display.  Each
+ * entry holds the divisor to the next-larger magnitude and the suffix
+ * printed for the current one.
+ */
+struct unit {
+	unsigned int n;		/* divisor to advance to the next unit */
+	const char * s;		/* suffix printed at this magnitude */
+};
+
+/* Plain SI scaling for event counts; "XX" terminates the table */
+static const struct unit event_units[] = {
+	{ 1000, "" },
+	{ 1000, "K" },
+	{ 1000, "M" },
+	{ 1000, "G" },
+	{ 1000, "T" },
+	{ 1000, "P" },
+	{ 1000, "XX" },
+};
+
+
+/* Times start in nanoseconds: ns -> us -> ms -> s -> m -> h -> d -> y */
+static const struct unit time_units[] = {
+	{ 1000, "ns" },
+	{ 1000, "us" },
+	{ 1000, "ms" },
+	{ 60, "s" },
+	{ 60, "m" },
+	{ 24, "h" },
+	{ 365, "d" },
+	{ 1000, "y" },
+};
+
+/* Byte throughput, decimal (1000-based) scaling */
+static const struct unit byte_units[] = {
+	{ 1000, "b" },
+	{ 1000, "kb" },
+	{ 1000, "mb" },
+	{ 1000, "gb" },
+	{ 1000, "tb" },
+	{ 1000, "pb" },
+	{ 1000, "xb" }
+};
+
+/*
+ * Print a value using the given array of units and scale it properly,
+ * e.g. "1.5ms".  One fractional digit is shown, rounded to nearest.
+ *
+ * NOTE(review): the scaling loop assumes the table's last entry is
+ * never exceeded; an absurdly large x would walk past the array.
+ * Unreachable for realistic counter values -- confirm for new tables.
+ */
+static void pval(struct seq_file *s, unsigned long x, const struct unit *u)
+{
+	unsigned n = 0;
+	unsigned rem = 0;
+	unsigned last_divisor = 0;
+
+	while (x >= u[n].n) {
+		last_divisor = u[n].n;
+		rem = x % last_divisor;
+		x = x / last_divisor;
+		n++;
+	}
+
+	/* Reduce the remainder to a single, rounded decimal digit */
+	if (last_divisor)
+		rem = (rem * 10 + last_divisor / 2) / last_divisor;
+	else
+		rem = 0;
+
+	/*
+	 * Rounding may have resulted in the need to go
+	 * to the next number
+	 */
+	if (rem == 10) {
+		x++;
+		rem = 0;
+	}
+
+	seq_printf(s, "%lu", x);
+	if (rem) {
+		seq_putc(s, '.');
+		seq_putc(s, '0' + rem);
+	}
+	seq_puts(s, u[n].s);
+}
+
+/*
+ * Print a set of statistical values in the form sum(min/avg/max).
+ * (The original comment claimed max/avg/min; the code -- and the
+ * sample output elsewhere in this thread -- emit min first.)
+ * count is the number of events the average is taken over.
+ */
+static void pc_print(struct seq_file *s, const struct unit *u,
+	unsigned long count, unsigned long sum,
+	unsigned long min, unsigned long max)
+{
+	pval(s, sum, u);
+	seq_putc(s,'(');
+	pval(s, min, u);
+	seq_putc(s,'/');
+	if (count)
+		pval(s, (sum + count / 2 ) / count, u);	/* rounded average */
+	else
+		pval(s, 0, u);
+	seq_putc(s,'/');
+	pval(s, max, u);
+	seq_putc(s,')');
+}
+
+
+/*
+ * Emit one line for a counter: either the values of a single cpu
+ * (s->private >= 0, the /proc/perf/<cpu> files) or a summary over all
+ * online cpus (s->private == -1, the /proc/perf/all file).
+ */
+static int perf_show(struct seq_file *s, void *v)
+{
+	int cpu = (unsigned long)s->private;
+	enum pc_item counter = (unsigned long)v - 1;
+	struct perf_counter summary, *x;
+
+	if (cpu >= 0)
+		x = &per_cpu(perf_counters, cpu)[counter];
+	else {
+		/*
+		 * Start from zero and accumulate every online cpu.  The
+		 * original seeded summary with cpu 0's values and then
+		 * added cpu 0 again inside the loop (double count), and
+		 * read per_cpu(..., 0) instead of per_cpu(..., cpu) in
+		 * the loop body, so the other cpus were never summed.
+		 */
+		memset(&summary, 0, sizeof(summary));
+		for_each_online_cpu(cpu) {
+			struct perf_counter *c =
+				&per_cpu(perf_counters, cpu)[counter];
+
+			summary.events += c->events;
+			summary.skipped += c->skipped;
+			summary.time += c->time;
+			summary.bytes += c->bytes;
+
+			if (summary.maxtime < c->maxtime)
+				summary.maxtime = c->maxtime;
+
+			/* 0 means "no sample yet", not a real minimum */
+			if (summary.mintime == 0 ||
+				(c->mintime != 0 &&
+				summary.mintime > c->mintime))
+					summary.mintime = c->mintime;
+
+			if (summary.maxbytes < c->maxbytes)
+				summary.maxbytes = c->maxbytes;
+
+			if (summary.minbytes == 0 ||
+				(c->minbytes != 0 &&
+				summary.minbytes > c->minbytes))
+					summary.minbytes = c->minbytes;
+
+		}
+		x = &summary;
+	}
+
+	seq_printf(s, "%-20s %10u ", var_id[counter], x->events);
+	if (x->skipped)
+		seq_printf(s, "(+%3u) ", x->skipped);
+	pc_print(s, time_units, x->events, x->time, x->mintime, x->maxtime);
+	if (x->bytes) {
+		seq_putc(s,' ');
+		pc_print(s, byte_units, x->events, x->bytes,
+			x->minbytes, x->maxbytes);
+	}
+	seq_putc(s, '\n');
+	return 0;
+}
+
+/*
+ * seq_file iterator: positions 0..NR_PC_ITEMS-1 are returned offset
+ * by one, because seq_file treats a NULL (== 0) cursor as "stop".
+ * perf_show() undoes the offset.
+ */
+static void *perf_start(struct seq_file *m, loff_t *pos)
+{
+	return (*pos < NR_PC_ITEMS) ? (void *)(*pos +1) : NULL;
+}
+
+static void *perf_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	++*pos;
+	return perf_start(m, pos);
+}
+
+static void perf_stop(struct seq_file *m, void *v)
+{
+}
+
+/* static: only referenced through perf_data_open() in this file */
+static struct seq_operations perf_data_ops = {
+	.start  = perf_start,
+	.next   = perf_next,
+	.stop   = perf_stop,
+	.show   = perf_show,
+};
+
+/*
+ * open() handler shared by /proc/perf/all and /proc/perf/<cpu>.
+ * Copies the proc entry's data pointer (cpu number, or -1 for the
+ * all-cpus summary) into the seq_file so perf_show() can see it.
+ */
+static int perf_data_open(struct inode *inode, struct file *file)
+{
+	int res;
+
+	res = seq_open(file, &perf_data_ops);
+	if (!res)
+		/* PDE(inode)->data was set up in init_perfcounter() */
+		((struct seq_file *)file->private_data)->private = PDE(inode)->data;
+
+	return res;
+}
+
+static struct file_operations perf_data_fops = {
+	.open		= perf_data_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+/*
+ * write() handler for /proc/perf/reset.  Any write, regardless of
+ * content, zeroes all counters; the whole count is claimed as
+ * consumed so the writer does not retry.
+ */
+static int perf_reset_write(struct file *file, const char __user *buffer,
+	unsigned long count, void *data)
+{
+	perfcount_reset();
+	return count;
+}
+
+/*
+ * Boot-time setup: create /proc/perf/ with a "reset" control file,
+ * an "all" summary file and one data file per possible cpu.
+ * The original dereferenced each create_proc_entry() result without
+ * checking for NULL, oopsing if procfs allocation fails.
+ */
+static __init int init_perfcounter(void)
+{
+	int cpu;
+
+	struct proc_dir_entry *proc_perf, *perf_reset, *perf_all;
+
+	proc_perf = proc_mkdir("perf", NULL);
+	if (!proc_perf)
+		return -ENOMEM;
+
+	perf_reset = create_proc_entry("reset", S_IWUGO, proc_perf);
+	if (!perf_reset)
+		return -ENOMEM;
+	perf_reset->write_proc = perf_reset_write;
+
+	perf_all = create_proc_entry("all", S_IRUGO, proc_perf);
+	if (!perf_all)
+		return -ENOMEM;
+	perf_all->proc_fops = &perf_data_fops;
+	perf_all->data = (void *)-1;	/* -1 == summary over all cpus */
+
+	for_each_possible_cpu(cpu) {
+		char name[20];
+		struct proc_dir_entry *p;
+
+		sprintf(name, "%d", cpu);
+		p = create_proc_entry(name, S_IRUGO, proc_perf);
+		if (!p)
+			continue;	/* skip this cpu, keep the rest */
+
+		p->proc_fops = &perf_data_fops;
+		p->data = (void *)(unsigned long)cpu;
+	}
+
+	perfcount_reset();
+	return 0;
+}
+
+__initcall(init_perfcounter);
+
Index: linux-2.6.21-rc5-mm4/arch/x86_64/kernel/tsc.c
===================================================================
--- linux-2.6.21-rc5-mm4.orig/arch/x86_64/kernel/tsc.c	2007-04-07 14:04:32.000000000 -0700
+++ linux-2.6.21-rc5-mm4/arch/x86_64/kernel/tsc.c	2007-04-07 14:05:12.000000000 -0700
@@ -23,7 +23,7 @@
 	cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / khz;
 }
 
-static unsigned long long cycles_2_ns(unsigned long long cyc)
+unsigned long long cycles_2_ns(unsigned long long cyc)
 {
 	return (cyc * cyc2ns_scale) >> NS_SCALE;
 }
Index: linux-2.6.21-rc5-mm4/include/asm-x86_64/timex.h
===================================================================
--- linux-2.6.21-rc5-mm4.orig/include/asm-x86_64/timex.h	2007-04-07 14:04:32.000000000 -0700
+++ linux-2.6.21-rc5-mm4/include/asm-x86_64/timex.h	2007-04-07 14:05:12.000000000 -0700
@@ -29,4 +29,5 @@
 
 extern void mark_tsc_unstable(void);
 extern void set_cyc2ns_scale(unsigned long khz);
+unsigned long long cycles_2_ns(unsigned long long cyc);
 #endif
Index: linux-2.6.21-rc5-mm4/mm/slub.c
===================================================================
--- linux-2.6.21-rc5-mm4.orig/mm/slub.c	2007-04-07 14:05:11.000000000 -0700
+++ linux-2.6.21-rc5-mm4/mm/slub.c	2007-04-07 14:05:15.000000000 -0700
@@ -20,6 +20,7 @@
 #include <linux/mempolicy.h>
 #include <linux/ctype.h>
 #include <linux/kallsyms.h>
+#include <linux/perf.h>
 
 /*
  * Lock order:
@@ -1299,8 +1302,11 @@
 void kmem_cache_free(struct kmem_cache *s, void *x)
 {
 	struct page * page;
+	struct pc pc;
 
+	pc_start(&pc, PC_KFREE_VIRT_TO_PAGE);
 	page = virt_to_page(x);
+	pc_stop(&pc);
 
 	page = compound_head(page);
 
@@ -1917,11 +1923,16 @@
 {
 	struct kmem_cache *s;
 	struct page * page;
+	struct pc pc;
 
 	if (!x)
 		return;
 
-	page = compound_head(virt_to_page(x));
+	pc_start(&pc, PC_KFREE_VIRT_TO_PAGE);
+	page = virt_to_page(x);
+	pc_stop(&pc);
+
+	page = compound_head(page);
 
 	s = page->slab;
 
Index: linux-2.6.21-rc5-mm4/kernel/fork.c
===================================================================
--- linux-2.6.21-rc5-mm4.orig/kernel/fork.c	2007-04-07 14:04:32.000000000 -0700
+++ linux-2.6.21-rc5-mm4/kernel/fork.c	2007-04-07 14:05:12.000000000 -0700
@@ -58,6 +58,8 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
+#include <linux/perf.h>
+
 /*
  * Protected counters by write_lock_irq(&tasklist_lock)
  */
@@ -304,7 +306,10 @@
 
 static inline int mm_alloc_pgd(struct mm_struct * mm)
 {
+	INIT_PC(pc, PC_PGD_ALLOC);
+
 	mm->pgd = pgd_alloc(mm);
+	pc_stop(&pc);
 	if (unlikely(!mm->pgd))
 		return -ENOMEM;
 	return 0;
@@ -312,7 +317,9 @@
 
 static inline void mm_free_pgd(struct mm_struct * mm)
 {
+	INIT_PC(pc, PC_PGD_FREE);
 	pgd_free(mm->pgd);
+	pc_stop(&pc);
 }
 #else
 #define dup_mmap(mm, oldmm)	(0)
Index: linux-2.6.21-rc5-mm4/mm/memory.c
===================================================================
--- linux-2.6.21-rc5-mm4.orig/mm/memory.c	2007-04-07 14:04:32.000000000 -0700
+++ linux-2.6.21-rc5-mm4/mm/memory.c	2007-04-07 14:05:12.000000000 -0700
@@ -60,6 +60,8 @@
 #include <linux/swapops.h>
 #include <linux/elf.h>
 
+#include <linux/perf.h>
+
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 /* use the per-pgdat data instead for discontigmem - mbligh */
 unsigned long max_mapnr;
@@ -302,15 +304,22 @@
 
 int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 {
+	INIT_PC(pc, PC_PTE_ALLOC);
 	struct page *new = pte_alloc_one(mm, address);
+
+	pc_stop(&pc);
 	if (!new)
 		return -ENOMEM;
 
 	pte_lock_init(new);
 	spin_lock(&mm->page_table_lock);
 	if (pmd_present(*pmd)) {	/* Another has populated it */
+		struct pc pc;
+
 		pte_lock_deinit(new);
+		pc_start(&pc, PC_PTE_FREE);
 		pte_free(new);
+		pc_stop(&pc);
 	} else {
 		mm->nr_ptes++;
 		inc_zone_page_state(new, NR_PAGETABLE);
@@ -2509,14 +2518,20 @@
  */
 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
 {
+	INIT_PC(pc, PC_PUD_ALLOC);
 	pud_t *new = pud_alloc_one(mm, address);
+
+	pc_stop(&pc);
 	if (!new)
 		return -ENOMEM;
 
 	spin_lock(&mm->page_table_lock);
 	if (pgd_present(*pgd))		/* Another has populated it */
+	{
+		INIT_PC(pc, PC_PUD_FREE);
 		pud_free(new);
-	else
+		pc_stop(&pc);
+	} else
 		pgd_populate(mm, pgd, new);
 	spin_unlock(&mm->page_table_lock);
 	return 0;
@@ -2530,20 +2545,29 @@
  */
 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 {
+	INIT_PC(pc, PC_PMD_ALLOC);
 	pmd_t *new = pmd_alloc_one(mm, address);
+
+	pc_stop(&pc);
 	if (!new)
 		return -ENOMEM;
 
 	spin_lock(&mm->page_table_lock);
 #ifndef __ARCH_HAS_4LEVEL_HACK
 	if (pud_present(*pud))		/* Another has populated it */
+	{
+		INIT_PC(pc, PC_PMD_FREE);
 		pmd_free(new);
-	else
+		pc_stop(&pc);
+	} else
 		pud_populate(mm, pud, new);
 #else
 	if (pgd_present(*pud))		/* Another has populated it */
+	{
+		INIT_PC(pc, PC_PMD_FREE);
 		pmd_free(new);
-	else
+		pc_stop(&pc);
+	} else
 		pgd_populate(mm, pud, new);
 #endif /* __ARCH_HAS_4LEVEL_HACK */
 	spin_unlock(&mm->page_table_lock);
Index: linux-2.6.21-rc5-mm4/mm/slab.c
===================================================================
--- linux-2.6.21-rc5-mm4.orig/mm/slab.c	2007-04-07 14:22:11.000000000 -0700
+++ linux-2.6.21-rc5-mm4/mm/slab.c	2007-04-07 14:24:14.000000000 -0700
@@ -621,7 +621,10 @@
 
 static inline struct kmem_cache *virt_to_cache(const void *obj)
 {
+	INIT_PC(pc, PC_KFREE_VIRT_TO_PAGE);
 	struct page *page = virt_to_page(obj);
+
+	pc_stop(&pc);
 	return page_get_cache(page);
 }
 

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: (SPARSE_VIRTUAL doubles sparsemem speed)
  2007-04-07 22:06                   ` [PATCH 1/4] x86_64: (SPARSE_VIRTUAL doubles sparsemem speed) Christoph Lameter
@ 2007-04-07 22:18                     ` Andi Kleen
  2007-04-09 16:40                     ` William Lee Irwin III
  1 sibling, 0 replies; 46+ messages in thread
From: Andi Kleen @ 2007-04-07 22:18 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andy Whitcroft, Martin Bligh, Dave Hansen, linux-kernel,
	linux-arch, linux-mm, KAMEZAWA Hiroyuki

On Sunday 08 April 2007 00:06:13 Christoph Lameter wrote:

> Results:
> 
> x86_64 boot with virtual memmap
> 
> Format:               #events totaltime (min/avg/max)
> 
> kfree_virt_to_page       598430 5.6ms(3ns/9ns/322ns)
> 
> x86_64 boot regular sparsemem
> 
> kfree_virt_to_page       596360 10.5ms(4ns/18ns/28.7us)
> 
> 
> On average sparsemem virtual takes half the time of regular sparsemem.

Nice.  But on what workloads? 

Anyways it looks promising. I hope we can just
replace old style sparsemem support with this for x86-64.

> Time is measured using the cycle counter (TSC on IA32, ITC on IA64) which has
> a very low latency.

Sorry that triggered my usual RDTSC rant...

Not on NetBurst (hundreds of cycles). And on the others (C2, K8) it is a bit dangerous 
to measure short code blocks because RDTSC is not guaranteed to be ordered with the surrounding 
instructions.

-Andi
 

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: (SPARSE_VIRTUAL doubles sparsemem speed)
  2007-04-07 22:06                   ` [PATCH 1/4] x86_64: (SPARSE_VIRTUAL doubles sparsemem speed) Christoph Lameter
  2007-04-07 22:18                     ` Andi Kleen
@ 2007-04-09 16:40                     ` William Lee Irwin III
  2007-04-09 17:16                       ` Christoph Lameter
  1 sibling, 1 reply; 46+ messages in thread
From: William Lee Irwin III @ 2007-04-09 16:40 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andy Whitcroft, Martin Bligh, Dave Hansen, Andi Kleen,
	linux-kernel, linux-arch, linux-mm, KAMEZAWA Hiroyuki

On Sat, Apr 07, 2007 at 03:06:13PM -0700, Christoph Lameter wrote:
> +/*
> + * Performance Counters and Measurement macros
> + * (C) 2005 Silicon Graphics Incorporated
> + * by Christoph Lameter <clameter@sgi.com>, April 2005
> + *
> + * Counters are calculated using the cycle counter. If a process
> + * is migrated to another cpu during the measurement then the measurement
> + * is invalid.
> + *
> + * We cannot disable preemption during measurement since that may interfere
> + * with other things in the kernel and limit the usefulness of the counters.
> + */

Whatever's going on with the rest of this, I really like this
instrumentation patch. It may be worthwhile to allow pc_start() to be
overridden so things like performance counter MSR's are usable, but
the framework looks very useful.


-- wli

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: (SPARSE_VIRTUAL doubles sparsemem speed)
  2007-04-09 16:40                     ` William Lee Irwin III
@ 2007-04-09 17:16                       ` Christoph Lameter
  2007-04-09 18:20                         ` William Lee Irwin III
  0 siblings, 1 reply; 46+ messages in thread
From: Christoph Lameter @ 2007-04-09 17:16 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Andy Whitcroft, Martin Bligh, Dave Hansen, Andi Kleen,
	linux-kernel, linux-arch, linux-mm, KAMEZAWA Hiroyuki

On Mon, 9 Apr 2007, William Lee Irwin III wrote:

> Whatever's going on with the rest of this, I really like this
> instrumentation patch. It may be worthwhile to allow pc_start() to be
> overridden so things like performance counter MSR's are usable, but
> the framework looks very useful.

Yeah. I also did some measurements on quicklists on x86_64 and it seems 
that caching page table pages is also useful:

no quicklist

pte_alloc               1569048 4.3s(401ns/2.7us/179.7us)
pmd_alloc                780988 2.1s(337ns/2.7us/86.1us)
pud_alloc                780072 2.2s(424ns/2.8us/300.6us)
pgd_alloc                260022 1s(920ns/4us/263.1us)

quicklist:

pte_alloc                452436 573.4ms(8ns/1.3us/121.1us)
pmd_alloc                196204 174.5ms(7ns/889ns/46.1us)
pud_alloc                195688 172.4ms(7ns/881ns/151.3us)
pgd_alloc                 65228 9.8ms(8ns/150ns/6.1us)



^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/4] x86_64: (SPARSE_VIRTUAL doubles sparsemem speed)
  2007-04-09 17:16                       ` Christoph Lameter
@ 2007-04-09 18:20                         ` William Lee Irwin III
  0 siblings, 0 replies; 46+ messages in thread
From: William Lee Irwin III @ 2007-04-09 18:20 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andy Whitcroft, Martin Bligh, Dave Hansen, Andi Kleen,
	linux-kernel, linux-arch, linux-mm, KAMEZAWA Hiroyuki

On Mon, 9 Apr 2007, William Lee Irwin III wrote:
>> Whatever's going on with the rest of this, I really like this
>> instrumentation patch. It may be worthwhile to allow pc_start() to be
>> overridden so things like performance counter MSR's are usable, but
>> the framework looks very useful.

On Mon, Apr 09, 2007 at 10:16:06AM -0700, Christoph Lameter wrote:
> Yeah. I also did some measurements on quicklists on x86_64 and it seems 
> that caching page table pages is also useful:
[...]

Sadly these timings will not address the arguments made on behalf of
eager zeroing, which are essentially that "precharging the cache" will
benefit fork(). I personally still find them meaningful.

I think a demonstration that will deal with more of others' concerns
might be instrumenting fault latency after a fresh execve() and
comparing fork() timings. This is clearly awkward given the scheduling
opportunities in these areas, but a virtual time measurement may suffice
to cope with those.

The fault latency after a fresh execve() is particularly relevant
because it's a scenario where incremental pagetable construction occurs
in "real life." It's actually less common for processes to merely fork()
than to fork() then execve() and then perform most of their work in the
context set up in execve(). The pagetables set up for fork() are most
commonly short-lived and the utility of caching them or even
constructing them so questionable that pagetable sharing and other
methods of avoiding the copy are quite plausible, though perhaps in
need of some heuristics to avoid new faults for the purposes of merely
copying pagetables where possible (AIUI such are considered or
implemented in pagetable sharing patches; of course, Dave McCracken has
far more detailed knowledge of the performance considerations there).

I daresay it's highly dubious to consider pagetable construction for
fork() in isolation when the common case is to almost immediately flush
all those pagetables down the toilet via execve().

Basically, if we can establish that (1) pagetable caching doesn't hurt
fork() or maybe even speeds it up then (2) it speeds up post-execve()
faults, then things look good. Raw timings on the pagetable allocation
primitives are unfortunately too micro to adequately make our case.


-- wli

^ permalink raw reply	[flat|nested] 46+ messages in thread

end of thread, other threads:[~2007-04-09 18:26 UTC | newest]

Thread overview: 46+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-04-01  7:10 [PATCH 1/2] Generic Virtual Memmap suport for SPARSEMEM Christoph Lameter
2007-04-01  7:10 ` [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL Christoph Lameter
2007-04-01  7:11   ` Christoph Lameter
2007-04-01 10:46   ` Andi Kleen
2007-04-02 15:37     ` Christoph Lameter
2007-04-02 15:44       ` Andi Kleen
2007-04-02 15:51         ` Martin J. Bligh
2007-04-02 15:54         ` Christoph Lameter
2007-04-02 17:14           ` Andi Kleen
2007-04-02 17:34             ` Christoph Lameter
2007-04-02 19:54           ` Dave Hansen
2007-04-02 20:11             ` Christoph Lameter
2007-04-02 20:13       ` Dave Hansen
2007-04-02 20:30         ` Christoph Lameter
2007-04-02 20:38           ` Martin Bligh
2007-04-02 20:51             ` Christoph Lameter
2007-04-05 11:50               ` Andy Whitcroft
2007-04-05 18:27                 ` Christoph Lameter
2007-04-07 22:06                   ` [PATCH 1/4] x86_64: (SPARSE_VIRTUAL doubles sparsemem speed) Christoph Lameter
2007-04-07 22:18                     ` Andi Kleen
2007-04-09 16:40                     ` William Lee Irwin III
2007-04-09 17:16                       ` Christoph Lameter
2007-04-09 18:20                         ` William Lee Irwin III
2007-04-02 21:08           ` [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL Dave Hansen
2007-04-02 21:28             ` Christoph Lameter
2007-04-02 21:43               ` Martin Bligh
2007-04-02 21:56               ` Dave Hansen
2007-04-02 22:29                 ` Christoph Lameter
2007-04-02 22:31                 ` Andi Kleen
2007-04-02 22:37                   ` Christoph Lameter
2007-04-02 22:41                     ` Martin Bligh
2007-04-02 22:49                       ` Christoph Lameter
2007-04-02 22:52                         ` Martin Bligh
2007-04-05 12:07                   ` Andy Whitcroft
2007-04-04 21:27               ` Bob Picco
2007-04-04 22:38                 ` Christoph Lameter
2007-04-02 23:03       ` Bjorn Helgaas
2007-04-02 15:44     ` Christoph Lameter
2007-04-02 16:18     ` Christoph Lameter
2007-04-02 20:50 ` [PATCH 1/2] Generic Virtual Memmap suport for SPARSEMEM Dave Hansen
2007-04-02 21:00   ` Christoph Lameter
2007-04-02 21:22     ` Dave Hansen
2007-04-02 21:31       ` Christoph Lameter
2007-04-02 21:42         ` Dave Hansen
2007-04-02 21:53           ` Christoph Lameter
2007-04-02 22:05             ` Dave Hansen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).