LKML Archive on lore.kernel.org
* [PATCH] [1/7] Implement true end_pfn_mapped for 32bit
@ 2008-03-12  2:53 Andi Kleen
  2008-03-12  2:53 ` [PATCH] [2/7] Account overlapped mappings in end_pfn_map Andi Kleen
                   ` (5 more replies)
  0 siblings, 6 replies; 34+ messages in thread
From: Andi Kleen @ 2008-03-12  2:53 UTC (permalink / raw)
  To: andreas.herrmann3, tglx, mingo, linux-kernel


[Old patch; repost, but needed for further patches in the series]

Even on 32bit, 2MB pages can map more memory than is in the true
max_low_pfn if end_pfn is not in highmem and not aligned to 2MB.
Add an end_pfn_map, similar to x86-64, that accounts for this
fact. This is important for code that really needs to know about
all mapping aliases.
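
As an illustration (not part of the patch; a minimal sketch assuming
the usual 512-entry page tables), the pfn the last 2MB PMD really maps
up to is just end_pfn rounded up to a 512-page boundary:

	/* hypothetical helper, showing only the rounding: a 2MB PMD
	 * maps PTRS_PER_PTE (512) 4k pages at once, so the mapping
	 * extends to the next 512-page boundary above end_pfn */
	static unsigned long pmd_mapped_end(unsigned long end_pfn)
	{
		return (end_pfn + PTRS_PER_PTE - 1) & ~(PTRS_PER_PTE - 1UL);
	}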

Signed-off-by: Andi Kleen <ak@suse.de>

---
 arch/x86/mm/init_32.c     |    4 ++++
 include/asm-x86/page.h    |    4 +++-
 include/asm-x86/page_64.h |    1 -
 3 files changed, 7 insertions(+), 2 deletions(-)

Index: linux/arch/x86/mm/init_32.c
===================================================================
--- linux.orig/arch/x86/mm/init_32.c
+++ linux/arch/x86/mm/init_32.c
@@ -50,6 +50,8 @@
 
 unsigned int __VMALLOC_RESERVE = 128 << 20;
 
+unsigned long end_pfn_map;
+
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 unsigned long highstart_pfn, highend_pfn;
 
@@ -193,6 +195,7 @@ static void __init kernel_physical_mappi
 				set_pmd(pmd, pfn_pmd(pfn, prot));
 
 				pfn += PTRS_PER_PTE;
+				end_pfn_map = pfn;
 				continue;
 			}
 			pte = one_page_table_init(pmd);
@@ -207,6 +210,7 @@ static void __init kernel_physical_mappi
 
 				set_pte(pte, pfn_pte(pfn, prot));
 			}
+			end_pfn_map = pfn;
 		}
 	}
 }
Index: linux/include/asm-x86/page.h
===================================================================
--- linux.orig/include/asm-x86/page.h
+++ linux/include/asm-x86/page.h
@@ -36,7 +36,7 @@
 #define max_pfn_mapped		end_pfn_map
 #else
 #include <asm/page_32.h>
-#define max_pfn_mapped		max_low_pfn
+#define max_pfn_mapped		end_pfn_map
 #endif	/* CONFIG_X86_64 */
 
 #define PAGE_OFFSET		((unsigned long)__PAGE_OFFSET)
@@ -51,6 +51,8 @@
 extern int page_is_ram(unsigned long pagenr);
 extern int devmem_is_allowed(unsigned long pagenr);
 
+extern unsigned long end_pfn_map;
+
 struct page;
 
 static void inline clear_user_page(void *page, unsigned long vaddr,
Index: linux/include/asm-x86/page_64.h
===================================================================
--- linux.orig/include/asm-x86/page_64.h
+++ linux/include/asm-x86/page_64.h
@@ -55,7 +55,6 @@ void clear_page(void *page);
 void copy_page(void *to, void *from);
 
 extern unsigned long end_pfn;
-extern unsigned long end_pfn_map;
 extern unsigned long phys_base;
 
 extern unsigned long __phys_addr(unsigned long);


* [PATCH] [2/7] Account overlapped mappings in end_pfn_map
  2008-03-12  2:53 [PATCH] [1/7] Implement true end_pfn_mapped for 32bit Andi Kleen
@ 2008-03-12  2:53 ` Andi Kleen
  2008-03-12  2:53 ` [PATCH] [3/7] Add set_memory_4k to pageattr.c Andi Kleen
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 34+ messages in thread
From: Andi Kleen @ 2008-03-12  2:53 UTC (permalink / raw)
  To: andreas.herrmann3, tglx, mingo, linux-kernel


[Old patch repost, needed for further patches in the series]

When end_pfn is not aligned to 2MB (or 1GB), the kernel might
map more memory than end_pfn. Account for this in end_pfn_map.
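
For illustration (not part of the patch; 'end' stands for the physical
end address passed into the mapping code), the mapping really extends
to the next large page boundary above an unaligned end:

	/* sketch only: round an unaligned end up to the area actually
	 * covered by the last large page */
	unsigned long mapped_end_2m = (end + PMD_SIZE - 1) & PMD_MASK;
	unsigned long mapped_end_1g = (end + PUD_SIZE - 1) & PUD_MASK;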

Signed-off-by: Andi Kleen <ak@suse.de>

---
 arch/x86/kernel/setup_64.c |    2 +-
 arch/x86/mm/init_64.c      |   33 +++++++++++++++++++++++----------
 include/asm-x86/proto.h    |    3 ++-
 3 files changed, 26 insertions(+), 12 deletions(-)

Index: linux/arch/x86/mm/init_64.c
===================================================================
--- linux.orig/arch/x86/mm/init_64.c
+++ linux/arch/x86/mm/init_64.c
@@ -296,7 +296,7 @@ __meminit void early_iounmap(void *addr,
 	__flush_tlb_all();
 }
 
-static void __meminit
+static unsigned long __meminit
 phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
 {
 	int i = pmd_index(address);
@@ -318,21 +318,25 @@ phys_pmd_init(pmd_t *pmd_page, unsigned 
 		set_pte((pte_t *)pmd,
 			pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
 	}
+	return address;
 }
 
-static void __meminit
+static unsigned long __meminit
 phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
 {
+	unsigned long true_end;
 	pmd_t *pmd = pmd_offset(pud, 0);
 	spin_lock(&init_mm.page_table_lock);
-	phys_pmd_init(pmd, address, end);
+	true_end = phys_pmd_init(pmd, address, end);
 	spin_unlock(&init_mm.page_table_lock);
 	__flush_tlb_all();
+	return true_end;
 }
 
-static void __meminit
+static unsigned long __meminit
 phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
 {
+	unsigned long true_end = end;
 	int i = pud_index(addr);
 
 	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
@@ -351,13 +355,14 @@ phys_pud_init(pud_t *pud_page, unsigned 
 
 		if (pud_val(*pud)) {
 			if (!pud_large(*pud))
-				phys_pmd_update(pud, addr, end);
+				true_end = phys_pmd_update(pud, addr, end);
 			continue;
 		}
 
 		if (direct_gbpages) {
 			set_pte((pte_t *)pud,
 				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+			true_end = (addr & PUD_MASK) + PUD_SIZE;
 			continue;
 		}
 
@@ -365,12 +370,14 @@ phys_pud_init(pud_t *pud_page, unsigned 
 
 		spin_lock(&init_mm.page_table_lock);
 		set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
-		phys_pmd_init(pmd, addr, end);
+		true_end = phys_pmd_init(pmd, addr, end);
 		spin_unlock(&init_mm.page_table_lock);
 
 		unmap_low_page(pmd);
 	}
 	__flush_tlb_all();
+
+	return true_end >> PAGE_SHIFT;
 }
 
 static void __init find_early_table_space(unsigned long end)
@@ -415,9 +422,10 @@ static void __init init_gbpages(void)
  * This runs before bootmem is initialized and gets pages directly from
  * the physical memory. To access them they are temporarily mapped.
  */
-void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
+unsigned long __init_refok
+init_memory_mapping(unsigned long start, unsigned long end)
 {
-	unsigned long next;
+	unsigned long next, true_end = end;
 
 	pr_debug("init_memory_mapping\n");
 
@@ -449,7 +457,7 @@ void __init_refok init_memory_mapping(un
 		next = start + PGDIR_SIZE;
 		if (next > end)
 			next = end;
-		phys_pud_init(pud, __pa(start), __pa(next));
+		true_end = phys_pud_init(pud, __pa(start), __pa(next));
 		if (!after_bootmem)
 			set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
 		unmap_low_page(pud);
@@ -462,6 +470,8 @@ void __init_refok init_memory_mapping(un
 	if (!after_bootmem)
 		reserve_early(table_start << PAGE_SHIFT,
 				 table_end << PAGE_SHIFT, "PGTABLE");
+
+	return true_end;
 }
 
 #ifndef CONFIG_NUMA
@@ -503,9 +513,12 @@ int arch_add_memory(int nid, u64 start, 
 	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
+	unsigned long true_end_pfn;
 	int ret;
 
-	init_memory_mapping(start, start + size-1);
+	true_end_pfn = init_memory_mapping(start, start + size-1);
+	if (true_end_pfn > end_pfn_map)
+		end_pfn_map = true_end_pfn;
 
 	ret = __add_pages(zone, start_pfn, nr_pages);
 	WARN_ON(1);
Index: linux/include/asm-x86/proto.h
===================================================================
--- linux.orig/include/asm-x86/proto.h
+++ linux/include/asm-x86/proto.h
@@ -7,7 +7,8 @@
 
 extern void early_idt_handler(void);
 
-extern void init_memory_mapping(unsigned long start, unsigned long end);
+extern unsigned long init_memory_mapping(unsigned long start,
+					 unsigned long end);
 
 extern void system_call(void);
 extern void syscall_init(void);
Index: linux/arch/x86/kernel/setup_64.c
===================================================================
--- linux.orig/arch/x86/kernel/setup_64.c
+++ linux/arch/x86/kernel/setup_64.c
@@ -341,7 +341,7 @@ void __init setup_arch(char **cmdline_p)
 
 	check_efer();
 
-	init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
+	end_pfn_map = init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
 	if (efi_enabled)
 		efi_init();
 


* [PATCH] [3/7] Add set_memory_4k to pageattr.c
  2008-03-12  2:53 [PATCH] [1/7] Implement true end_pfn_mapped for 32bit Andi Kleen
  2008-03-12  2:53 ` [PATCH] [2/7] Account overlapped mappings in end_pfn_map Andi Kleen
@ 2008-03-12  2:53 ` Andi Kleen
  2008-03-12  2:53 ` [PATCH] [4/7] Don't use large pages to map the first 2/4MB of memory Andi Kleen
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 34+ messages in thread
From: Andi Kleen @ 2008-03-12  2:53 UTC (permalink / raw)
  To: andreas.herrmann3, tglx, mingo, linux-kernel


Add a new function to force large pages to be split into 4k pages.
This is needed for some followup optimizations.

I had to add a new field to cpa_data to pass down the information
that try_preserve_large_page should not run.

There is no set_page_4k() for now because I didn't need it, and all the
specialized users I have in mind would be more comfortable with
pure addresses. I also didn't export it because it's unlikely that
external code needs it.
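
For illustration, a caller splits the mapping of a single page like
this (mirroring how the later patches in this series use it):

	/* force the large page backing the first page of the direct
	 * mapping to be split into 4k pages; the second argument is
	 * the number of 4k pages */
	set_memory_4k((unsigned long)__va(0), 1);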

Signed-off-by: Andi Kleen <ak@suse.de>

Index: linux/arch/x86/mm/pageattr.c
===================================================================
--- linux.orig/arch/x86/mm/pageattr.c
+++ linux/arch/x86/mm/pageattr.c
@@ -28,6 +28,7 @@ struct cpa_data {
 	int		numpages;
 	int		flushtlb;
 	unsigned long	pfn;
+	unsigned	force_split : 1;
 };
 
 #ifdef CONFIG_X86_64
@@ -259,6 +260,9 @@ try_preserve_large_page(pte_t *kpte, uns
 	int i, do_split = 1;
 	unsigned int level;
 
+	if (cpa->force_split)
+		return 1;
+
 	spin_lock_irqsave(&pgd_lock, flags);
 	/*
 	 * Check for races, another CPU might have split this page
@@ -693,7 +697,8 @@ static inline int cache_attr(pgprot_t at
 }
 
 static int change_page_attr_set_clr(unsigned long addr, int numpages,
-				    pgprot_t mask_set, pgprot_t mask_clr)
+				    pgprot_t mask_set, pgprot_t mask_clr,
+				    int force_split)
 {
 	struct cpa_data cpa;
 	int ret, cache, checkalias;
@@ -704,7 +709,7 @@ static int change_page_attr_set_clr(unsi
 	 */
 	mask_set = canon_pgprot(mask_set);
 	mask_clr = canon_pgprot(mask_clr);
-	if (!pgprot_val(mask_set) && !pgprot_val(mask_clr))
+	if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
 		return 0;
 
 	/* Ensure we are PAGE_SIZE aligned */
@@ -721,6 +726,7 @@ static int change_page_attr_set_clr(unsi
 	cpa.mask_set = mask_set;
 	cpa.mask_clr = mask_clr;
 	cpa.flushtlb = 0;
+	cpa.force_split = force_split;
 
 	/* No alias checking for _NX bit modifications */
 	checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
@@ -759,13 +765,13 @@ out:
 static inline int change_page_attr_set(unsigned long addr, int numpages,
 				       pgprot_t mask)
 {
-	return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
+	return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0);
 }
 
 static inline int change_page_attr_clear(unsigned long addr, int numpages,
 					 pgprot_t mask)
 {
-	return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
+	return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0);
 }
 
 int set_memory_uc(unsigned long addr, int numpages)
@@ -809,6 +815,12 @@ int set_memory_np(unsigned long addr, in
 	return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
 }
 
+int set_memory_4k(unsigned long addr, int numpages)
+{
+	return change_page_attr_set_clr(addr, numpages, __pgprot(0),
+					__pgprot(0), 1);
+}
+
 int set_pages_uc(struct page *page, int numpages)
 {
 	unsigned long addr = (unsigned long)page_address(page);
Index: linux/include/asm-x86/cacheflush.h
===================================================================
--- linux.orig/include/asm-x86/cacheflush.h
+++ linux/include/asm-x86/cacheflush.h
@@ -41,6 +41,7 @@ int set_memory_nx(unsigned long addr, in
 int set_memory_ro(unsigned long addr, int numpages);
 int set_memory_rw(unsigned long addr, int numpages);
 int set_memory_np(unsigned long addr, int numpages);
+int set_memory_4k(unsigned long addr, int numpages);
 
 void clflush_cache_range(void *addr, unsigned int size);
 


* [PATCH] [4/7] Don't use large pages to map the first 2/4MB of memory
  2008-03-12  2:53 [PATCH] [1/7] Implement true end_pfn_mapped for 32bit Andi Kleen
  2008-03-12  2:53 ` [PATCH] [2/7] Account overlapped mappings in end_pfn_map Andi Kleen
  2008-03-12  2:53 ` [PATCH] [3/7] Add set_memory_4k to pageattr.c Andi Kleen
@ 2008-03-12  2:53 ` Andi Kleen
  2008-03-12  5:38   ` Eric Dumazet
                     ` (2 more replies)
  2008-03-12  2:53 ` [PATCH] [5/7] Readd rdmsrl_safe Andi Kleen
                   ` (2 subsequent siblings)
  5 siblings, 3 replies; 34+ messages in thread
From: Andi Kleen @ 2008-03-12  2:53 UTC (permalink / raw)
  To: andreas.herrmann3, tglx, mingo, linux-kernel


Intel recommends not to use large pages for the first 1MB
of physical memory because there are fixed-size MTRRs there,
which cause splitups in the TLBs.

On AMD doing so is also a good idea.

The implementation differs a little between 32bit and 64bit.
On 32bit I just taught the initial page table setup about this
because it was very simple to do. This also has the advantage
that it minimizes the risk of a prefetch ever seeing the large
page, even if it only exists for a short time.

On 64bit that is not quite possible, so use set_memory_4k() a little
later (in check_bugs) instead.
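
For background (from the architecture manuals, not part of this
patch): the fixed-size MTRRs mentioned above cover the first 1MB in
fixed blocks:

	/*
	 * Fixed-range MTRRs over physical 0-1MB:
	 *   MTRRfix64K_00000  0x00000-0x7ffff  (8 x 64k)
	 *   MTRRfix16K_80000  0x80000-0x9ffff  (8 x 16k)
	 *   MTRRfix16K_A0000  0xa0000-0xbffff  (8 x 16k)
	 *   MTRRfix4K_C0000..MTRRfix4K_F8000
	 *                     0xc0000-0xfffff  (64 x 4k)
	 * A large kernel mapping overlapping these is split up
	 * internally by the CPU, hence the 4k mappings here.
	 */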

Signed-off-by: Andi Kleen <ak@suse.de>

---
 arch/x86/kernel/bugs_64.c |   12 ++++++++++++
 arch/x86/mm/init_32.c     |    6 +++++-
 2 files changed, 17 insertions(+), 1 deletion(-)

Index: linux/arch/x86/kernel/bugs_64.c
===================================================================
--- linux.orig/arch/x86/kernel/bugs_64.c
+++ linux/arch/x86/kernel/bugs_64.c
@@ -9,6 +9,7 @@
 #include <asm/bugs.h>
 #include <asm/processor.h>
 #include <asm/mtrr.h>
+#include <asm/cacheflush.h>
 
 void __init check_bugs(void)
 {
@@ -18,4 +19,15 @@ void __init check_bugs(void)
 	print_cpu_info(&boot_cpu_data);
 #endif
 	alternative_instructions();
+
+	/*
+	 * Make sure the first 2MB area is not mapped by huge pages
+	 * There are typically fixed size MTRRs in there and overlapping
+	 * MTRRs into large pages causes slow downs.
+	 *
+	 * Right now we don't do that with gbpages because there seems
+	 * very little benefit for that case.
+	 */
+	if (!direct_gbpages)
+		set_memory_4k((unsigned long)__va(0), 1);
 }
Index: linux/arch/x86/mm/init_32.c
===================================================================
--- linux.orig/arch/x86/mm/init_32.c
+++ linux/arch/x86/mm/init_32.c
@@ -181,8 +181,13 @@ static void __init kernel_physical_mappi
 			/*
 			 * Map with big pages if possible, otherwise
 			 * create normal page tables:
+			 *
+			 * Don't use a large page for the first 2/4MB of memory
+			 * because there are often fixed size MTRRs in there
+			 * and overlapping MTRRs into large pages can cause
+			 * slowdowns.
 			 */
-			if (cpu_has_pse) {
+			if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) {
 				unsigned int addr2;
 				pgprot_t prot = PAGE_KERNEL_LARGE;
 


* [PATCH] [5/7] Readd rdmsrl_safe
  2008-03-12  2:53 [PATCH] [1/7] Implement true end_pfn_mapped for 32bit Andi Kleen
                   ` (2 preceding siblings ...)
  2008-03-12  2:53 ` [PATCH] [4/7] Don't use large pages to map the first 2/4MB of memory Andi Kleen
@ 2008-03-12  2:53 ` Andi Kleen
  2008-03-21 17:06   ` Thomas Gleixner
  2008-03-12  2:53 ` [PATCH] [6/7] Split large page mapping for AMD TSEG Andi Kleen
  2008-03-12  2:53 ` [PATCH] [7/7] CPA: Add statistics about state of direct mapping v2 Andi Kleen
  5 siblings, 1 reply; 34+ messages in thread
From: Andi Kleen @ 2008-03-12  2:53 UTC (permalink / raw)
  To: andreas.herrmann3, tglx, mingo, linux-kernel


RDMSR for 64bit values with exception handling.

Makes it easier to deal with 64bit valued MSRs. The old 64bit code
base had that too as checking_rdmsrl(), but it got dropped somehow. 

Needed for followup patch.
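
A minimal usage sketch (MSR_K8_TSEG_ADDR only appears in patch 6/7):

	unsigned long tseg;

	/* rdmsrl_safe() returns non-zero when the RDMSR faulted,
	 * e.g. because the MSR does not exist on this CPU */
	if (rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) == 0)
		printk(KERN_DEBUG "TSEG base %lx\n", tseg);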

Signed-off-by: Andi Kleen <ak@suse.de>

---
 include/asm-x86/msr.h      |    3 +++
 include/asm-x86/paravirt.h |    4 ++++
 2 files changed, 7 insertions(+)

Index: linux/include/asm-x86/msr.h
===================================================================
--- linux.orig/include/asm-x86/msr.h
+++ linux/include/asm-x86/msr.h
@@ -150,6 +150,9 @@ static inline int wrmsr_safe(unsigned ms
 		__err;							\
 	})
 
+#define rdmsrl_safe(msr,p) \
+	({ int __err; *(p) = native_read_msr_safe(msr, &__err); __err; })
+
 #define rdtscl(low)						\
 	((low) = (u32)native_read_tsc())
 
Index: linux/include/asm-x86/paravirt.h
===================================================================
--- linux.orig/include/asm-x86/paravirt.h
+++ linux/include/asm-x86/paravirt.h
@@ -687,6 +687,10 @@ static inline int paravirt_write_msr(uns
 	(*b) = _l >> 32;			\
 	_err; })
 
+#define rdmsrl_safe(msr, p) ({			\
+	int _err;				\
+	*(p) = paravirt_read_msr(msr, &_err);	\
+	_err; })
 
 static inline u64 paravirt_read_tsc(void)
 {


* [PATCH] [6/7] Split large page mapping for AMD TSEG
  2008-03-12  2:53 [PATCH] [1/7] Implement true end_pfn_mapped for 32bit Andi Kleen
                   ` (3 preceding siblings ...)
  2008-03-12  2:53 ` [PATCH] [5/7] Readd rdmsrl_safe Andi Kleen
@ 2008-03-12  2:53 ` Andi Kleen
  2008-03-21 17:55   ` Thomas Gleixner
                     ` (2 more replies)
  2008-03-12  2:53 ` [PATCH] [7/7] CPA: Add statistics about state of direct mapping v2 Andi Kleen
  5 siblings, 3 replies; 34+ messages in thread
From: Andi Kleen @ 2008-03-12  2:53 UTC (permalink / raw)
  To: andreas.herrmann3, tglx, mingo, linux-kernel


On AMD, SMM protected memory is part of the address map but handled
internally like an MTRR. That leads to large pages getting split
internally, which has some performance implications. Check for the
AMD TSEG MSR and split the large page mapping in that area
explicitly if it is part of the direct mapping.

There is also SMM ASEG, but it is in the first 1MB and already covered by 
the earlier split first page patch.

The idea for this came from an earlier patch by Andreas Herrmann.

On a RevF dual-socket Opteron system kernbench shows a clear
improvement from this (together with the earlier patches in this
series, especially the split-first-2MB patch):

[lower is better]
              no split stddev         split  stddev    delta
Elapsed Time   87.146 (0.727516)     84.296 (1.09098)  -3.2%
User Time     274.537 (4.05226)     273.692 (3.34344)  -0.3%
System Time    34.907 (0.42492)      34.508 (0.26832)  -1.1%
Percent CPU   322.5   (38.3007)     326.5   (44.5128)  +1.2%

=> About 3.2% improvement in elapsed time for kernbench.

With GB pages on AMD Fam10h the impact of splitting is much higher of
course, since it would split two full GB pages (together with the first
1MB split patch) instead of two 2MB pages.  I could not benchmark
a clear difference in kernbench on gbpages, so I kept it disabled
for that case.

That was only limited benchmarking of course, so if someone
is interested in running more tests for the gbpages case
this could be revisited (contributions welcome).

I didn't bother implementing this for 32bit because it is very
unlikely that the 32bit lowmem mapping overlaps into the TSEG near
4GB, and the 2MB low split is already handled for both.
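
The overlap check in the hunk below compares everything in 2MB units;
spelled out with named temporaries (illustration only, equivalent to
the patch):

	unsigned long tseg_2m = tseg >> PMD_SHIFT;	/* address -> 2MB unit */
	unsigned long map_2m = end_pfn_map >> (PMD_SHIFT - PAGE_SHIFT);	/* pfn -> 2MB unit */

	if (tseg_2m < map_2m)	/* TSEG lies inside the direct mapping */
		set_memory_4k((unsigned long)__va(tseg), 1);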

Signed-off-by: Andi Kleen <ak@suse.de>

---
 arch/x86/kernel/setup_64.c  |   13 +++++++++++++
 include/asm-x86/msr-index.h |    1 +
 2 files changed, 14 insertions(+)

Index: linux/arch/x86/kernel/setup_64.c
===================================================================
--- linux.orig/arch/x86/kernel/setup_64.c
+++ linux/arch/x86/kernel/setup_64.c
@@ -721,6 +721,20 @@ static void __cpuinit init_amd(struct cp
 
 	if (amd_apic_timer_broken())
 		disable_apic_timer = 1;
+
+	if (!direct_gbpages &&
+		c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
+		unsigned long tseg;
+
+		/*
+		 * Split up direct mapping around the TSEG SMM area.
+		 * Don't do it for gbpages because there seems very little
+		 * benefit in doing so.
+		 */
+		if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
+		(tseg >> PMD_SHIFT) < (end_pfn_map >> (PMD_SHIFT-PAGE_SHIFT)))
+			set_memory_4k((unsigned long)__va(tseg), 1);
+	}
 }
 
 void __cpuinit detect_ht(struct cpuinfo_x86 *c)
Index: linux/include/asm-x86/msr-index.h
===================================================================
--- linux.orig/include/asm-x86/msr-index.h
+++ linux/include/asm-x86/msr-index.h
@@ -109,6 +109,7 @@
 #define MSR_K8_SYSCFG			0xc0010010
 #define MSR_K8_HWCR			0xc0010015
 #define MSR_K8_ENABLE_C1E		0xc0010055
+#define MSR_K8_TSEG_ADDR		0xc0010112
 #define K8_MTRRFIXRANGE_DRAM_ENABLE	0x00040000 /* MtrrFixDramEn bit    */
 #define K8_MTRRFIXRANGE_DRAM_MODIFY	0x00080000 /* MtrrFixDramModEn bit */
 #define K8_MTRR_RDMEM_WRMEM_MASK	0x18181818 /* Mask: RdMem|WrMem    */


* [PATCH] [7/7] CPA: Add statistics about state of direct mapping v2
  2008-03-12  2:53 [PATCH] [1/7] Implement true end_pfn_mapped for 32bit Andi Kleen
                   ` (4 preceding siblings ...)
  2008-03-12  2:53 ` [PATCH] [6/7] Split large page mapping for AMD TSEG Andi Kleen
@ 2008-03-12  2:53 ` Andi Kleen
  2008-03-21 17:41   ` Thomas Gleixner
  5 siblings, 1 reply; 34+ messages in thread
From: Andi Kleen @ 2008-03-12  2:53 UTC (permalink / raw)
  To: andreas.herrmann3, tglx, mingo, linux-kernel


Add information about the mapping state of the direct mapping to 
/proc/meminfo.

This way we can see how many large pages are really used for it and how
many are split.

Useful for debugging and general insight into the kernel.
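
On a 64bit kernel the new /proc/meminfo entries then look like this
(the numbers are made up for illustration):

	DirectMap4k:      2216
	DirectMap2M:      4086
	DirectMap1G:         0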

v2: Add hotplug locking to 64bit to plug a very obscure theoretical race. 
    32bit doesn't need it because it doesn't support hotadd for lowmem.
    Fix some typos

Signed-off-by: Andi Kleen <ak@suse.de>

---
 arch/x86/mm/init_32.c     |    2 ++
 arch/x86/mm/init_64.c     |    2 ++
 arch/x86/mm/pageattr.c    |   24 ++++++++++++++++++++++++
 fs/proc/proc_misc.c       |    7 +++++++
 include/asm-x86/pgtable.h |    3 +++
 5 files changed, 38 insertions(+)

Index: linux/arch/x86/mm/init_64.c
===================================================================
--- linux.orig/arch/x86/mm/init_64.c
+++ linux/arch/x86/mm/init_64.c
@@ -319,6 +319,8 @@ __meminit void early_iounmap(void *addr,
 static unsigned long __meminit
 phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
 {
+	unsigned long flags;
+	unsigned pages = 0;
 	int i = pmd_index(address);
 
 	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
@@ -335,9 +337,15 @@ phys_pmd_init(pmd_t *pmd_page, unsigned 
 		if (pmd_val(*pmd))
 			continue;
 
+		pages++;
 		set_pte((pte_t *)pmd,
 			pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
 	}
+
+	/* Protect against CPA */
+	spin_lock_irqsave(&pgd_lock, flags);
+	dpages_cnt[PG_LEVEL_2M] += pages;
+	spin_unlock_irqrestore(&pgd_lock, flags);
 	return address;
 }
 
@@ -356,6 +364,8 @@ phys_pmd_update(pud_t *pud, unsigned lon
 static unsigned long __meminit
 phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
 {
+	unsigned long flags;
+	unsigned pages = 0;
 	unsigned long true_end = end;
 	int i = pud_index(addr);
 
@@ -380,6 +390,7 @@ phys_pud_init(pud_t *pud_page, unsigned 
 		}
 
 		if (direct_gbpages) {
+			dpages_cnt[PG_LEVEL_1G]++;
 			set_pte((pte_t *)pud,
 				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
 			true_end = (addr & PUD_MASK) + PUD_SIZE;
@@ -397,6 +408,11 @@ phys_pud_init(pud_t *pud_page, unsigned 
 	}
 	__flush_tlb_all();
 
+	/* Protect against CPA */
+	spin_lock_irqsave(&pgd_lock, flags);
+	dpages_cnt[PG_LEVEL_1G] += pages;
+	spin_unlock_irqrestore(&pgd_lock, flags);
+
 	return true_end >> PAGE_SHIFT;
 }
 
Index: linux/arch/x86/mm/pageattr.c
===================================================================
--- linux.orig/arch/x86/mm/pageattr.c
+++ linux/arch/x86/mm/pageattr.c
@@ -18,6 +18,8 @@
 #include <asm/pgalloc.h>
 #include <asm/proto.h>
 
+unsigned long dpages_cnt[PG_LEVEL_NUM];
+
 /*
  * The current flushing context - we pass it instead of 5 arguments:
  */
@@ -499,6 +501,12 @@ static int split_large_page(pte_t *kpte,
 	for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
 		set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
 
+	if (address >= (unsigned long)__va(0) &&
+		address < (unsigned long)__va(end_pfn_map << PAGE_SHIFT)) {
+		dpages_cnt[level]--;
+		dpages_cnt[level - 1] += PTRS_PER_PTE;
+	}
+
 	/*
 	 * Install the new, split up pagetable. Important details here:
 	 *
@@ -948,6 +956,22 @@ bool kernel_page_present(struct page *pa
 
 #endif /* CONFIG_DEBUG_PAGEALLOC */
 
+#ifdef CONFIG_PROC_FS
+int arch_report_meminfo(char *page)
+{
+	int n;
+	n = sprintf(page, "DirectMap4k:  %8lu\n"
+			  "DirectMap2M:  %8lu\n",
+			dpages_cnt[PG_LEVEL_4K],
+			dpages_cnt[PG_LEVEL_2M]);
+#ifdef CONFIG_X86_64
+	n += sprintf(page + n, "DirectMap1G:  %8lu\n",
+			dpages_cnt[PG_LEVEL_1G]);
+#endif
+	return n;
+}
+#endif
+
 /*
  * The testcases use internal knowledge of the implementation that shouldn't
  * be exposed to the rest of the kernel. Include these directly here.
Index: linux/include/asm-x86/pgtable.h
===================================================================
--- linux.orig/include/asm-x86/pgtable.h
+++ linux/include/asm-x86/pgtable.h
@@ -247,8 +247,11 @@ enum {
 	PG_LEVEL_4K,
 	PG_LEVEL_2M,
 	PG_LEVEL_1G,
+	PG_LEVEL_NUM
 };
 
+extern unsigned long dpages_cnt[PG_LEVEL_NUM];
+
 /*
  * Helper function that returns the kernel pagetable entry controlling
  * the virtual address 'address'. NULL means no pagetable entry present.
Index: linux/arch/x86/mm/init_32.c
===================================================================
--- linux.orig/arch/x86/mm/init_32.c
+++ linux/arch/x86/mm/init_32.c
@@ -198,6 +198,7 @@ static void __init kernel_physical_mappi
 				    is_kernel_text(addr2))
 					prot = PAGE_KERNEL_LARGE_EXEC;
 
+				dpages_cnt[PG_LEVEL_2M]++;
 				set_pmd(pmd, pfn_pmd(pfn, prot));
 
 				pfn += PTRS_PER_PTE;
@@ -214,6 +215,7 @@ static void __init kernel_physical_mappi
 				if (is_kernel_text(addr))
 					prot = PAGE_KERNEL_EXEC;
 
+				dpages_cnt[PG_LEVEL_4K]++;
 				set_pte(pte, pfn_pte(pfn, prot));
 			}
 			end_pfn_map = pfn;
Index: linux/fs/proc/proc_misc.c
===================================================================
--- linux.orig/fs/proc/proc_misc.c
+++ linux/fs/proc/proc_misc.c
@@ -123,6 +123,11 @@ static int uptime_read_proc(char *page, 
 	return proc_calc_metrics(page, start, off, count, eof, len);
 }
 
+int __attribute__((weak)) arch_report_meminfo(char *page)
+{
+	return 0;
+}
+
 static int meminfo_read_proc(char *page, char **start, off_t off,
 				 int count, int *eof, void *data)
 {
@@ -219,6 +224,8 @@ static int meminfo_read_proc(char *page,
 
 		len += hugetlb_report_meminfo(page + len);
 
+	len += arch_report_meminfo(page + len);
+
 	return proc_calc_metrics(page, start, off, count, eof, len);
 #undef K
 }


* Re: [PATCH] [4/7] Don't use large pages to map the first 2/4MB of memory
  2008-03-12  2:53 ` [PATCH] [4/7] Don't use large pages to map the first 2/4MB of memory Andi Kleen
@ 2008-03-12  5:38   ` Eric Dumazet
  2008-03-12  9:19     ` Andi Kleen
  2008-03-21 17:45   ` Thomas Gleixner
  2008-03-25 11:31   ` Joerg Roedel
  2 siblings, 1 reply; 34+ messages in thread
From: Eric Dumazet @ 2008-03-12  5:38 UTC (permalink / raw)
  To: Andi Kleen; +Cc: andreas.herrmann3, tglx, mingo, linux-kernel

Andi Kleen wrote:
> Intel recommends not to use large pages for the first 1MB
> of physical memory because there are fixed-size MTRRs there,
> which cause splitups in the TLBs.
> 
> On AMD doing so is also a good idea.
> 
> The implementation differs a little between 32bit and 64bit.
> On 32bit I just taught the initial page table setup about this
> because it was very simple to do. This also has the advantage
> that it minimizes the risk of a prefetch ever seeing the large
> page, even if it only exists for a short time.
> 
> On 64bit that is not quite possible, so use set_memory_4k() a little
> later (in check_bugs) instead.
> 
> Signed-off-by: Andi Kleen <ak@suse.de>
> 
> ---
>  arch/x86/kernel/bugs_64.c |   12 ++++++++++++
>  arch/x86/mm/init_32.c     |    6 +++++-
>  2 files changed, 17 insertions(+), 1 deletion(-)
> 

Should we then change CONFIG_PHYSICAL_START from 0x100000 to 0x400000?

Thank you


* Re: [PATCH] [4/7] Don't use large pages to map the first 2/4MB of memory
  2008-03-12  5:38   ` Eric Dumazet
@ 2008-03-12  9:19     ` Andi Kleen
  0 siblings, 0 replies; 34+ messages in thread
From: Andi Kleen @ 2008-03-12  9:19 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Andi Kleen, andreas.herrmann3, tglx, mingo, linux-kernel

> Should we then change CONFIG_PHYSICAL_START from 0x100000 to 0x400000?

Yes, that would probably be a good idea. This means for PAE and 64bit
kernels 2MB is OK too; for non-PAE i386 it is 4MB. The SUSE 64bit
kernels have been using that for quite some time.
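
i.e. roughly this configuration fragment (hypothetical example):

	# keep the kernel out of the first large page: 0x200000 is
	# enough for PAE and 64bit kernels, 0x400000 for non-PAE i386
	CONFIG_PHYSICAL_START=0x400000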

-Andi


* Re: [PATCH] [5/7] Readd rdmsrl_safe
  2008-03-12  2:53 ` [PATCH] [5/7] Readd rdmsrl_safe Andi Kleen
@ 2008-03-21 17:06   ` Thomas Gleixner
  2008-03-21 17:16     ` Andi Kleen
  2008-03-22  9:59     ` [PATCH] Readd rdmsrl_safe v2 Andi Kleen
  0 siblings, 2 replies; 34+ messages in thread
From: Thomas Gleixner @ 2008-03-21 17:06 UTC (permalink / raw)
  To: Andi Kleen; +Cc: andreas.herrmann3, mingo, linux-kernel

On Wed, 12 Mar 2008, Andi Kleen wrote:
> RDMSR for 64bit values with exception handling.
> 
> Makes it easier to deal with 64bit valued MSRs. The old 64bit code
> base had that too as checking_rdmsrl(), but it got dropped somehow. 

Yup, no users.

> +#define rdmsrl_safe(msr,p) \
> +	({ int __err; *(p) = native_read_msr_safe(msr, &__err); __err; })
> +

static inline please

>  #define rdtscl(low)						\
>  	((low) = (u32)native_read_tsc())
>  
> Index: linux/include/asm-x86/paravirt.h
> ===================================================================
> --- linux.orig/include/asm-x86/paravirt.h
> +++ linux/include/asm-x86/paravirt.h
> @@ -687,6 +687,10 @@ static inline int paravirt_write_msr(uns
>  	(*b) = _l >> 32;			\
>  	_err; })
>  
> +#define rdmsrl_safe(msr, p) ({			\
> +	int _err;				\
> +	*(p) = paravirt_read_msr(msr, &_err);	\
> +	_err; })

ditto

Thanks,

	tglx


* Re: [PATCH] [5/7] Readd rdmsrl_safe
  2008-03-21 17:06   ` Thomas Gleixner
@ 2008-03-21 17:16     ` Andi Kleen
  2008-03-21 17:58       ` Thomas Gleixner
  2008-03-22  9:59     ` [PATCH] Readd rdmsrl_safe v2 Andi Kleen
  1 sibling, 1 reply; 34+ messages in thread
From: Andi Kleen @ 2008-03-21 17:16 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Andi Kleen, andreas.herrmann3, mingo, linux-kernel

On Fri, Mar 21, 2008 at 06:06:13PM +0100, Thomas Gleixner wrote:
> On Wed, 12 Mar 2008, Andi Kleen wrote:
> > RDMSR for 64bit values with exception handling.
> > 
> > Makes it easier to deal with 64bit valued MSRs. The old 64bit code
> > base had that too as checking_rdmsrl(), but it got dropped somehow. 
> 
> Yup, no users.
> 
> > +#define rdmsrl_safe(msr,p) \
> > +	({ int __err; *(p) = native_read_msr_safe(msr, &__err); __err; })
> > +
> 
> static inline please

Well all of paravirt.h uses macros. I did the same for consistency. If you want 
inlines it would be better to just convert it all in one go (but please only
after this patch was applied)

-Andi


* Re: [PATCH] [7/7] CPA: Add statistics about state of direct mapping v2
  2008-03-12  2:53 ` [PATCH] [7/7] CPA: Add statistics about state of direct mapping v2 Andi Kleen
@ 2008-03-21 17:41   ` Thomas Gleixner
  2008-03-21 17:55     ` Andi Kleen
  0 siblings, 1 reply; 34+ messages in thread
From: Thomas Gleixner @ 2008-03-21 17:41 UTC (permalink / raw)
  To: Andi Kleen; +Cc: andreas.herrmann3, mingo, linux-kernel

On Wed, 12 Mar 2008, Andi Kleen wrote:
> Add information about the mapping state of the direct mapping to 
> /proc/meminfo.

Please use debugfs for this.
 
> This way we can see how many large pages are really used for it and how
> many are split.
> 
> Useful for debugging and general insight into the kernel.
> 
> v2: Add hotplug locking to 64bit to plug a very obscure theoretical race. 
>     32bit doesn't need it because it doesn't support hotadd for lowmem.
>     Fix some typos
> 
> Signed-off-by: Andi Kleen <ak@suse.de>
> 
> ---
>  arch/x86/mm/init_32.c     |    2 ++
>  arch/x86/mm/init_64.c     |    2 ++
>  arch/x86/mm/pageattr.c    |   24 ++++++++++++++++++++++++
>  fs/proc/proc_misc.c       |    7 +++++++
>  include/asm-x86/pgtable.h |    3 +++
>  5 files changed, 38 insertions(+)
> 
> Index: linux/arch/x86/mm/init_64.c
> ===================================================================
> --- linux.orig/arch/x86/mm/init_64.c
> +++ linux/arch/x86/mm/init_64.c
> @@ -319,6 +319,8 @@ __meminit void early_iounmap(void *addr,
>  static unsigned long __meminit
>  phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
>  {
> +	unsigned long flags;
> +	unsigned pages = 0;

Can we use unsigned long for both please, and save one line?

>  	int i = pmd_index(address);
>  
>  	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
> @@ -335,9 +337,15 @@ phys_pmd_init(pmd_t *pmd_page, unsigned 
>  		if (pmd_val(*pmd))
>  			continue;
>  
> +		pages++;
>  		set_pte((pte_t *)pmd,
>  			pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
>  	}
> +
> +	/* Protect against CPA */
> +	spin_lock_irqsave(&pgd_lock, flags);
> +	dpages_cnt[PG_LEVEL_2M] += pages;
> +	spin_unlock_irqrestore(&pgd_lock, flags);

Please make the update a debugfs-conditional function in the CPA
code. That way it can be compiled out and the statistics internals are
not scattered all over the place.

> @@ -356,6 +364,8 @@ phys_pmd_update(pud_t *pud, unsigned lon
>  static unsigned long __meminit
>  phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
>  {
> +	unsigned long flags;
> +	unsigned pages = 0;
>  	unsigned long true_end = end;

See above.

>  	int i = pud_index(addr);
>  
> @@ -380,6 +390,7 @@ phys_pud_init(pud_t *pud_page, unsigned 
>  		}
>  
>  		if (direct_gbpages) {
> +			dpages_cnt[PG_LEVEL_1G]++;
>  			set_pte((pte_t *)pud,
>  				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
>  			true_end = (addr & PUD_MASK) + PUD_SIZE;
> @@ -397,6 +408,11 @@ phys_pud_init(pud_t *pud_page, unsigned 
>  	}
>  	__flush_tlb_all();
>  
> +	/* Protect against CPA */
> +	spin_lock_irqsave(&pgd_lock, flags);
> +	dpages_cnt[PG_LEVEL_1G] += pages;
> +	spin_unlock_irqrestore(&pgd_lock, flags);
> +

See above

>  	return true_end >> PAGE_SHIFT;
>  }
>  
> Index: linux/arch/x86/mm/pageattr.c
> ===================================================================
> --- linux.orig/arch/x86/mm/pageattr.c
> +++ linux/arch/x86/mm/pageattr.c
> @@ -18,6 +18,8 @@
>  #include <asm/pgalloc.h>
>  #include <asm/proto.h>
>  
> +unsigned long dpages_cnt[PG_LEVEL_NUM];

Can we have some intuitive name for that like direct_pages_stats, so
it's clear that it is debug/statistics info?

>  /*
>   * The current flushing context - we pass it instead of 5 arguments:
>   */
> @@ -499,6 +501,12 @@ static int split_large_page(pte_t *kpte,
>  	for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
>  		set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
>  
> +	if (address >= (unsigned long)__va(0) &&
> +		address < (unsigned long)__va(end_pfn_map << PAGE_SHIFT)) {
> +		dpages_cnt[level]--;
> +		dpages_cnt[level - 1] += PTRS_PER_PTE;

  inline conditional on DEBUGFS please

> +	}
> +
>  	/*
>  	 * Install the new, split up pagetable. Important details here:
>  	 *
> @@ -948,6 +956,22 @@ bool kernel_page_present(struct page *pa
>  
>  #endif /* CONFIG_DEBUG_PAGEALLOC */
>  
> +#ifdef CONFIG_PROC_FS

debugfs please

Thanks,

	tglx


* Re: [PATCH] [4/7] Don't use large pages to map the first 2/4MB of memory
  2008-03-12  2:53 ` [PATCH] [4/7] Don't use large pages to map the first 2/4MB of memory Andi Kleen
  2008-03-12  5:38   ` Eric Dumazet
@ 2008-03-21 17:45   ` Thomas Gleixner
  2008-03-21 17:59     ` Andi Kleen
  2008-03-25 11:31   ` Joerg Roedel
  2 siblings, 1 reply; 34+ messages in thread
From: Thomas Gleixner @ 2008-03-21 17:45 UTC (permalink / raw)
  To: Andi Kleen; +Cc: andreas.herrmann3, mingo, linux-kernel

On Wed, 12 Mar 2008, Andi Kleen wrote:
> Intel recommends not to use large pages for the first 1MB
> of physical memory because there are fixed-size MTRRs there,
> which cause splitups in the TLBs.
> 
> On AMD doing so is also a good idea.
> 
> The implementation differs a little between 32bit and 64bit.
> On 32bit I just taught the initial page table setup about this
> because it was very simple to do. This also has the advantage
> that it minimizes the risk of a prefetch ever seeing the large
> page, even if it only exists for a short time.
> 
> On 64bit that is not quite possible, so use set_memory_4k() a little
> later (in check_bugs) instead.


> +	/*
> +	 * Make sure the first 2MB area is not mapped by huge pages
> +	 * There are typically fixed size MTRRs in there and overlapping
> +	 * MTRRs into large pages causes slow downs.
> +	 *
> +	 * Right now we don't do that with gbpages because there seems
> +	 * very little benefit for that case.

And why exactly? Does slowdown not matter with gbpages?

Also we split the first GB mapping anyway due to the various regions
(NX, RO, UC) in there.

Thanks,

	tglx


* Re: [PATCH] [7/7] CPA: Add statistics about state of direct mapping v2
  2008-03-21 17:41   ` Thomas Gleixner
@ 2008-03-21 17:55     ` Andi Kleen
  2008-03-22  9:50       ` [PATCH] CPA: Add statistics about state of direct mapping v3 Andi Kleen
  0 siblings, 1 reply; 34+ messages in thread
From: Andi Kleen @ 2008-03-21 17:55 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Andi Kleen, andreas.herrmann3, mingo, linux-kernel

On Fri, Mar 21, 2008 at 06:41:59PM +0100, Thomas Gleixner wrote:
> On Wed, 12 Mar 2008, Andi Kleen wrote:
> > Add information about the mapping state of the direct mapping to 
> > /proc/meminfo.
> 
> Please use debugfs for this.

But it's not debugging. Or are you saying all memory state information
in /proc/meminfo is debugging information?  I would say it is generic
memory state tracking, similar to VmallocUsed, Slab: or NFS_Unstable:.
E.g. I would expect a machine to go slower if all direct mapping pages
are split, due to increased TLB misses, and that information is
generally useful and should not be forced into some debugging ghetto.

-Andi



* Re: [PATCH] [6/7] Split large page mapping for AMD TSEG
  2008-03-12  2:53 ` [PATCH] [6/7] Split large page mapping for AMD TSEG Andi Kleen
@ 2008-03-21 17:55   ` Thomas Gleixner
  2008-03-25 11:56   ` Joerg Roedel
  2008-03-25 16:44   ` Thomas Gleixner
  2 siblings, 0 replies; 34+ messages in thread
From: Thomas Gleixner @ 2008-03-21 17:55 UTC (permalink / raw)
  To: Andi Kleen; +Cc: andreas.herrmann3, mingo, linux-kernel

On Wed, 12 Mar 2008, Andi Kleen wrote:
> On AMD, SMM protected memory is part of the address map but handled
> internally like an MTRR. That leads to large pages getting split
> internally, which has some performance implications. Check for the
> AMD TSEG MSR and split the large page mapping in that area
> explicitly if it is part of the direct mapping.
> 
> There is also SMM ASEG, but it is in the first 1MB and already covered by 
> the earlier split first page patch.
> 
> The idea for this came from an earlier patch by Andreas Herrmann.
> 
> On a RevF dual-socket Opteron system kernbench shows a clear
> improvement from this (together with the earlier patches in this
> series, especially the split-first-2MB patch):
> 
> [lower is better]
>               no split stddev         split  stddev    delta
> Elapsed Time   87.146 (0.727516)     84.296 (1.09098)  -3.2%
> User Time     274.537 (4.05226)     273.692 (3.34344)  -0.3%
> System Time    34.907 (0.42492)      34.508 (0.26832)  -1.1%
> Percent CPU   322.5   (38.3007)     326.5   (44.5128)  +1.2%
> 
> => About 3.2% improvement in elapsed time for kernbench.
> 
> With GB pages on AMD Fam10h the impact of splitting is much higher of
> course, since it would split two full GB pages (together with the first
> 1MB split patch) instead of two 2MB pages.  I could not benchmark
> a clear difference in kernbench on gbpages, so I kept it disabled
> for that case.

Hmm. Where is this SMM memory usually located?
 
> That was only limited benchmarking of course, so if someone
> is interested in running more tests for the gbpages case
> this could be revisited (contributions welcome).
> 
> I didn't bother implementing this for 32bit because it is very
> unlikely that the 32bit lowmem mapping overlaps into the TSEG near
> 4GB, and the 2MB low split is already handled for both.

Please keep 32bit and 64bit in sync. The setup code has been merged
and we don't want to have needless separation.

Thanks,
	tglx


> Signed-off-by: Andi Kleen <ak@suse.de>
> 
> ---
>  arch/x86/kernel/setup_64.c  |   13 +++++++++++++
>  include/asm-x86/msr-index.h |    1 +
>  2 files changed, 14 insertions(+)
> 
> Index: linux/arch/x86/kernel/setup_64.c
> ===================================================================
> --- linux.orig/arch/x86/kernel/setup_64.c
> +++ linux/arch/x86/kernel/setup_64.c
> @@ -721,6 +721,20 @@ static void __cpuinit init_amd(struct cp
>  
>  	if (amd_apic_timer_broken())
>  		disable_apic_timer = 1;
> +
> +	if (!direct_gbpages &&
> +		c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
> +		unsigned long tseg;
> +
> +		/*
> +		 * Split up direct mapping around the TSEG SMM area.
> +		 * Don't do it for gbpages because there seems very little
> +		 * benefit in doing so.
> +		 */
> +		if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
> +		(tseg >> PMD_SHIFT) < (end_pfn_map >> (PMD_SHIFT-PAGE_SHIFT)))
> +			set_memory_4k((unsigned long)__va(tseg), 1);
> +	}
>  }
>  
>  void __cpuinit detect_ht(struct cpuinfo_x86 *c)
> Index: linux/include/asm-x86/msr-index.h
> ===================================================================
> --- linux.orig/include/asm-x86/msr-index.h
> +++ linux/include/asm-x86/msr-index.h
> @@ -109,6 +109,7 @@
>  #define MSR_K8_SYSCFG			0xc0010010
>  #define MSR_K8_HWCR			0xc0010015
>  #define MSR_K8_ENABLE_C1E		0xc0010055
> +#define MSR_K8_TSEG_ADDR		0xc0010112
>  #define K8_MTRRFIXRANGE_DRAM_ENABLE	0x00040000 /* MtrrFixDramEn bit    */
>  #define K8_MTRRFIXRANGE_DRAM_MODIFY	0x00080000 /* MtrrFixDramModEn bit */
>  #define K8_MTRR_RDMEM_WRMEM_MASK	0x18181818 /* Mask: RdMem|WrMem    */
> 


* Re: [PATCH] [5/7] Readd rdmsrl_safe
  2008-03-21 17:16     ` Andi Kleen
@ 2008-03-21 17:58       ` Thomas Gleixner
  2008-03-21 18:06         ` Andi Kleen
  0 siblings, 1 reply; 34+ messages in thread
From: Thomas Gleixner @ 2008-03-21 17:58 UTC (permalink / raw)
  To: Andi Kleen; +Cc: andreas.herrmann3, mingo, linux-kernel

On Fri, 21 Mar 2008, Andi Kleen wrote:

> On Fri, Mar 21, 2008 at 06:06:13PM +0100, Thomas Gleixner wrote:
> > On Wed, 12 Mar 2008, Andi Kleen wrote:
> > > RDMSR for 64bit values with exception handling.
> > > 
> > > Makes it easier to deal with 64bit valued MSRs. The old 64bit code
> > > base had that too as checking_rdmsrl(), but it got dropped somehow. 
> > 
> > Yup, no users.
> > 
> > > +#define rdmsrl_safe(msr,p) \
> > > +	({ int __err; *(p) = native_read_msr_safe(msr, &__err); __err; })
> > > +
> > 
> > static inline please
> 
> Well all of paravirt.h uses macros. I did the same for consistency.

consistency?

> If you want inlines it would be better to just convert it all in one
> go (but please only after this patch was applied)

Of course. We don't want to burden work on your shoulders.

Thanks

	tglx


* Re: [PATCH] [4/7] Don't use large pages to map the first 2/4MB of memory
  2008-03-21 17:45   ` Thomas Gleixner
@ 2008-03-21 17:59     ` Andi Kleen
  2008-03-21 18:03       ` Thomas Gleixner
  0 siblings, 1 reply; 34+ messages in thread
From: Andi Kleen @ 2008-03-21 17:59 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Andi Kleen, andreas.herrmann3, mingo, linux-kernel

> Also we split the first GB mapping anyway due to the various regions
> (NX, RO, UC) in there.

I didn't think so unless you have DEBUG_RODATA enabled?  Also there
should be no UC region there as known by the kernel. There might
be a WC region there from the frame buffer code, but that is an MTRR,
not a pageattr.

-Andi


* Re: [PATCH] [4/7] Don't use large pages to map the first 2/4MB of memory
  2008-03-21 17:59     ` Andi Kleen
@ 2008-03-21 18:03       ` Thomas Gleixner
  2008-03-21 18:44         ` Andi Kleen
  0 siblings, 1 reply; 34+ messages in thread
From: Thomas Gleixner @ 2008-03-21 18:03 UTC (permalink / raw)
  To: Andi Kleen; +Cc: andreas.herrmann3, mingo, linux-kernel

On Fri, 21 Mar 2008, Andi Kleen wrote:
> > Also we split the first GB mapping anyway due to the various regions
> > (NX, RO, UC) in there.
> 
> I didn't think so unless you have DEBUG_RODATA enabled? 

NX is independent of DEBUG_RODATA and the RODATA protection should be
made unconditional anyway.

> Also there
> should be no UC region there as known by the kernel. There might
> be a WC region there from the frame buffer code, but that is an MTRR,
> not a pageattr.

The first ioremap of the PCI space splits the GB page as well.

Thanks,

	tglx


* Re: [PATCH] [5/7] Readd rdmsrl_safe
  2008-03-21 17:58       ` Thomas Gleixner
@ 2008-03-21 18:06         ` Andi Kleen
  2008-03-21 18:14           ` Thomas Gleixner
  0 siblings, 1 reply; 34+ messages in thread
From: Andi Kleen @ 2008-03-21 18:06 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Andi Kleen, andreas.herrmann3, mingo, linux-kernel

On Fri, Mar 21, 2008 at 06:58:08PM +0100, Thomas Gleixner wrote:
> On Fri, 21 Mar 2008, Andi Kleen wrote:
> 
> > On Fri, Mar 21, 2008 at 06:06:13PM +0100, Thomas Gleixner wrote:
> > > On Wed, 12 Mar 2008, Andi Kleen wrote:
> > > > RDMSR for 64bit values with exception handling.
> > > > 
> > > > Makes it easier to deal with 64bit valued MSRs. The old 64bit code
> > > > base had that too as checking_rdmsrl(), but it got dropped somehow. 
> > > 
> > > Yup, no users.
> > > 
> > > > +#define rdmsrl_safe(msr,p) \
> > > > +	({ int __err; *(p) = native_read_msr_safe(msr, &__err); __err; })
> > > > +
> > > 
> > > static inline please
> > 
> > Well all of paravirt.h uses macros. I did the same for consistency.
> 
> consistency?

At least I consider it clean and consistent to use similar
style as the other code in that file for new code I add.

> 
> > If you want inlines it would be better to just convert it all in one
> > go (but please only after this patch was applied)
> 
> Of course. We don't want to burden work on your shoulders.

Well you signed up for the work yourself last year ;-) It was your choice.

-Andi


* Re: [PATCH] [5/7] Readd rdmsrl_safe
  2008-03-21 18:06         ` Andi Kleen
@ 2008-03-21 18:14           ` Thomas Gleixner
  2008-03-21 18:46             ` Andi Kleen
  0 siblings, 1 reply; 34+ messages in thread
From: Thomas Gleixner @ 2008-03-21 18:14 UTC (permalink / raw)
  To: Andi Kleen; +Cc: andreas.herrmann3, mingo, linux-kernel

On Fri, 21 Mar 2008, Andi Kleen wrote:
> > > Well all of paravirt.h uses macros. I did the same for consistency.
> > 
> > consistency ?
> 
> At least I consider it clean and consistent to use similar
> style as the other code in that file for new code I add.

inlines are generally preferred and the existing macro pile is no
excuse to add more.

> > > If you want inlines it would be better to just convert it all in one
> > > go (but please only after this patch was applied)
> > 
> > Of course. We don't want to burden work on your shoulders.
> 
> Well you signed up for the work yourself last year ;-) It was your choice.

Yeah, I signed up for maintaining. And review is one of the tasks of a
maintainer. So when I do a review and ask for a macro -> inline change
I don't see that this means that I have to do the change myself.

Thanks,

	tglx


* Re: [PATCH] [4/7] Don't use large pages to map the first 2/4MB of memory
  2008-03-21 18:03       ` Thomas Gleixner
@ 2008-03-21 18:44         ` Andi Kleen
  0 siblings, 0 replies; 34+ messages in thread
From: Andi Kleen @ 2008-03-21 18:44 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Andi Kleen, andreas.herrmann3, mingo, linux-kernel

On Fri, Mar 21, 2008 at 07:03:19PM +0100, Thomas Gleixner wrote:
> On Fri, 21 Mar 2008, Andi Kleen wrote:
> > > Also we split the first GB mapping anyway due to the various regions
> > > (NX, RO, UC) in there.
> > 
> > I didn't think so unless you have DEBUG_RODATA enabled? 
> 
> NX is independent of DEBUG_RODATA 

Sure, but it is still not split by default. At least I don't see any code
for that anywhere except in my own patchkit.

> and the RODATA protection should be
> made unconditional anyway.

Requiring hundreds instead of two TLB entries for the kernel text? 

I must say I personally cannot remember any bug ever caught by RODATA
anyway, so I am a bit dubious about its value.

> > Also there
> > should be no UC region there as known by the kernel. There might
> > be a WC region there from the frame buffer code, but that is an MTRR,
> > not a pageattr.
> 
> The first ioremap of the PCI space splits the GB page as well.

PCI space is normally in the fourth or sometimes third GB page,
not in the first.  I am not aware of any system that has the PCI hole 
in the first GB.

-Andi



* Re: [PATCH] [5/7] Readd rdmsrl_safe
  2008-03-21 18:14           ` Thomas Gleixner
@ 2008-03-21 18:46             ` Andi Kleen
  2008-03-21 18:48               ` [PATCH] [5/7] Readd rdmsrl_safe II Andi Kleen
  0 siblings, 1 reply; 34+ messages in thread
From: Andi Kleen @ 2008-03-21 18:46 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Andi Kleen, andreas.herrmann3, mingo, linux-kernel

Never mind, I retract the cleanup patch. While doing the actual
reordering is not that difficult, I would need to retest and I don't
have time for that now. And since reordering tends to require
typing in everything again, and I make occasional typos (and I added
bugs in the past during such reordering exercises), the retesting
would be needed. But for that cleanup it is probably not worth it.
After all it just makes the code a little nicer, but doesn't fix
or improve anything.

Please still consider the other patches in the series.

-Andi


* Re: [PATCH] [5/7] Readd rdmsrl_safe II
  2008-03-21 18:46             ` Andi Kleen
@ 2008-03-21 18:48               ` Andi Kleen
  0 siblings, 0 replies; 34+ messages in thread
From: Andi Kleen @ 2008-03-21 18:48 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Thomas Gleixner, andreas.herrmann3, mingo, linux-kernel

On Fri, Mar 21, 2008 at 07:46:56PM +0100, Andi Kleen wrote:
> Never mind, I retract the cleanup patch. While doing the actual
> reordering is not that difficult, I would need to retest and I don't
> have time for that now. And since reordering tends to require
> typing in everything again, and I make occasional typos (and I added
> bugs in the past during such reordering exercises), the retesting
> would be needed. But for that cleanup it is probably not worth it.
> After all it just makes the code a little nicer, but doesn't fix
> or improve anything.
> 
> Please still consider the other patches in the series.

Oops, that ended up in the wrong thread, sorry. The comment
was regarding the early exception recursion -> iteration change,
not rdmsrl_safe (which is not a cleanup).

-Andi


* [PATCH] CPA: Add statistics about state of direct mapping v3
  2008-03-21 17:55     ` Andi Kleen
@ 2008-03-22  9:50       ` Andi Kleen
  2008-03-25 15:40         ` Thomas Gleixner
  0 siblings, 1 reply; 34+ messages in thread
From: Andi Kleen @ 2008-03-22  9:50 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Thomas Gleixner, andreas.herrmann3, mingo, linux-kernel

[I didn't switch to debugfs because I strongly disagreed with that
suggestion. But all the other points you made are addressed.]

---

CPA: Add statistics about state of direct mapping v3

Add information about the mapping state of the direct mapping to 
/proc/meminfo. I chose /proc/meminfo because that is where all the other
memory statistics are too and it is a generally useful metric even
outside debugging situations. A lot of split kernel pages means the 
kernel will run slower.

This way we can see how many large pages are really used for it and how
many are split.

Useful for general insight into the kernel.

v2: Add hotplug locking to 64bit to plug a very obscure theoretical race. 
    32bit doesn't need it because it doesn't support hotadd for lowmem.
    Fix some typos
v3: Rename dpages_cnt
    Add CONFIG ifdef for count update as requested by tglx
    Expand description

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Andi Kleen <andi@firstfloor.org>

---
 arch/x86/mm/init_32.c     |    2 ++
 arch/x86/mm/init_64.c     |    2 ++
 arch/x86/mm/pageattr.c    |   24 ++++++++++++++++++++++++
 fs/proc/proc_misc.c       |    7 +++++++
 include/asm-x86/pgtable.h |    3 +++
 5 files changed, 38 insertions(+)

Index: linux/arch/x86/mm/init_64.c
===================================================================
--- linux.orig/arch/x86/mm/init_64.c
+++ linux/arch/x86/mm/init_64.c
@@ -316,9 +316,22 @@ __meminit void early_iounmap(void *addr,
 	__flush_tlb_all();
 }
 
+static void update_page_count(int level, unsigned long pages)
+{
+#ifdef CONFIG_PROC_FS
+	unsigned long flags;
+	/* Protect against CPA */
+	spin_lock_irqsave(&pgd_lock, flags);
+	direct_pages_count[level] += pages;
+	spin_unlock_irqrestore(&pgd_lock, flags);
+#endif
+}
+
 static unsigned long __meminit
 phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
 {
+	unsigned long pages = 0;
+
 	int i = pmd_index(address);
 
 	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
@@ -335,9 +348,11 @@ phys_pmd_init(pmd_t *pmd_page, unsigned 
 		if (pmd_val(*pmd))
 			continue;
 
+		pages++;
 		set_pte((pte_t *)pmd,
 			pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
 	}
+	update_page_count(PG_LEVEL_2M, pages);
 	return address;
 }
 
@@ -356,6 +371,7 @@ phys_pmd_update(pud_t *pud, unsigned lon
 static unsigned long __meminit
 phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
 {
+	unsigned long pages = 0;
 	unsigned long true_end = end;
 	int i = pud_index(addr);
 
@@ -380,6 +396,7 @@ phys_pud_init(pud_t *pud_page, unsigned 
 		}
 
 		if (direct_gbpages) {
+			pages++;
 			set_pte((pte_t *)pud,
 				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
 			true_end = (addr & PUD_MASK) + PUD_SIZE;
@@ -397,6 +414,8 @@ phys_pud_init(pud_t *pud_page, unsigned 
 	}
 	__flush_tlb_all();
 
+	update_page_count(PG_LEVEL_1G, pages);
+
 	return true_end >> PAGE_SHIFT;
 }
 
Index: linux/arch/x86/mm/pageattr.c
===================================================================
--- linux.orig/arch/x86/mm/pageattr.c
+++ linux/arch/x86/mm/pageattr.c
@@ -18,6 +18,8 @@
 #include <asm/pgalloc.h>
 #include <asm/proto.h>
 
+unsigned long direct_pages_count[PG_LEVEL_NUM];
+
 /*
  * The current flushing context - we pass it instead of 5 arguments:
  */
@@ -499,6 +501,12 @@ static int split_large_page(pte_t *kpte,
 	for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
 		set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
 
+	if (address >= (unsigned long)__va(0) &&
+		address < (unsigned long)__va(end_pfn_map << PAGE_SHIFT)) {
+		direct_pages_count[level]--;
+		direct_pages_count[level - 1] += PTRS_PER_PTE;
+	}
+
 	/*
 	 * Install the new, split up pagetable. Important details here:
 	 *
@@ -948,6 +956,22 @@ bool kernel_page_present(struct page *pa
 
 #endif /* CONFIG_DEBUG_PAGEALLOC */
 
+#ifdef CONFIG_PROC_FS
+int arch_report_meminfo(char *page)
+{
+	int n;
+	n = sprintf(page, "DirectMap4k:  %8lu\n"
+			  "DirectMap2M:  %8lu\n",
+			direct_pages_count[PG_LEVEL_4K],
+			direct_pages_count[PG_LEVEL_2M]);
+#ifdef CONFIG_X86_64
+	n += sprintf(page + n, "DirectMap1G:  %8lu\n",
+			direct_pages_count[PG_LEVEL_1G]);
+#endif
+	return n;
+}
+#endif
+
 /*
  * The testcases use internal knowledge of the implementation that shouldn't
  * be exposed to the rest of the kernel. Include these directly here.
Index: linux/include/asm-x86/pgtable.h
===================================================================
--- linux.orig/include/asm-x86/pgtable.h
+++ linux/include/asm-x86/pgtable.h
@@ -247,8 +247,11 @@ enum {
 	PG_LEVEL_4K,
 	PG_LEVEL_2M,
 	PG_LEVEL_1G,
+	PG_LEVEL_NUM
 };
 
+extern unsigned long direct_pages_count[PG_LEVEL_NUM];
+
 /*
  * Helper function that returns the kernel pagetable entry controlling
  * the virtual address 'address'. NULL means no pagetable entry present.
Index: linux/arch/x86/mm/init_32.c
===================================================================
--- linux.orig/arch/x86/mm/init_32.c
+++ linux/arch/x86/mm/init_32.c
@@ -198,6 +198,7 @@ static void __init kernel_physical_mappi
 				    is_kernel_text(addr2))
 					prot = PAGE_KERNEL_LARGE_EXEC;
 
+				direct_pages_count[PG_LEVEL_2M]++;
 				set_pmd(pmd, pfn_pmd(pfn, prot));
 
 				pfn += PTRS_PER_PTE;
@@ -214,6 +215,7 @@ static void __init kernel_physical_mappi
 				if (is_kernel_text(addr))
 					prot = PAGE_KERNEL_EXEC;
 
+				direct_pages_count[PG_LEVEL_4K]++;
 				set_pte(pte, pfn_pte(pfn, prot));
 			}
 			end_pfn_map = pfn;
Index: linux/fs/proc/proc_misc.c
===================================================================
--- linux.orig/fs/proc/proc_misc.c
+++ linux/fs/proc/proc_misc.c
@@ -123,6 +123,11 @@ static int uptime_read_proc(char *page, 
 	return proc_calc_metrics(page, start, off, count, eof, len);
 }
 
+int __attribute__((weak)) arch_report_meminfo(char *page)
+{
+	return 0;
+}
+
 static int meminfo_read_proc(char *page, char **start, off_t off,
 				 int count, int *eof, void *data)
 {
@@ -219,6 +224,8 @@ static int meminfo_read_proc(char *page,
 
 		len += hugetlb_report_meminfo(page + len);
 
+	len += arch_report_meminfo(page + len);
+
 	return proc_calc_metrics(page, start, off, count, eof, len);
 #undef K
 }

^ permalink raw reply	[flat|nested] 34+ messages in thread

* [PATCH] Readd rdmsrl_safe v2
  2008-03-21 17:06   ` Thomas Gleixner
  2008-03-21 17:16     ` Andi Kleen
@ 2008-03-22  9:59     ` Andi Kleen
  1 sibling, 0 replies; 34+ messages in thread
From: Andi Kleen @ 2008-03-22  9:59 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Andi Kleen, andreas.herrmann3, mingo, linux-kernel

Readd rdmsrl_safe v2

RDMSR for 64bit values with exception handling.

Makes it easier to deal with 64bit-valued MSRs. The old 64bit code
base had this too, as checking_rdmsrl(), but it got dropped somehow.

Needed for followup patch.
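
For illustration, a caller checks the return value before trusting the
output; a minimal sketch (the real user is the TSEG patch later in this
series):

	unsigned long long val;

	/* returns 0 on success, non-zero if the RDMSR faulted */
	if (rdmsrl_safe(MSR_K8_TSEG_ADDR, &val) == 0) {
		/* MSR is present; val holds the full 64bit contents */
	}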

v2: switch to inline

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Andi Kleen <andi@firstfloor.org>

---
 include/asm-x86/msr.h      |    3 +++
 include/asm-x86/paravirt.h |    4 ++++
 2 files changed, 7 insertions(+)

Index: linux/include/asm-x86/msr.h
===================================================================
--- linux.orig/include/asm-x86/msr.h
+++ linux/include/asm-x86/msr.h
@@ -150,6 +150,13 @@ static inline int wrmsr_safe(unsigned ms
 		__err;							\
 	})
 
+static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
+{
+	int err;
+	*p = native_read_msr_safe(msr, &err);
+	return err;
+}
+
 #define rdtscl(low)						\
 	((low) = (u32)native_read_tsc())
 
Index: linux/include/asm-x86/paravirt.h
===================================================================
--- linux.orig/include/asm-x86/paravirt.h
+++ linux/include/asm-x86/paravirt.h
@@ -687,6 +687,12 @@ static inline int paravirt_write_msr(uns
 	(*b) = _l >> 32;			\
 	_err; })
 
+static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
+{
+	int err;
+	*p = paravirt_read_msr(msr, &err);
+	return err;
+}
 
 static inline u64 paravirt_read_tsc(void)
 {

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] [4/7] Don't use large pages to map the first 2/4MB of memory
  2008-03-12  2:53 ` [PATCH] [4/7] Don't use large pages to map the first 2/4MB of memory Andi Kleen
  2008-03-12  5:38   ` Eric Dumazet
  2008-03-21 17:45   ` Thomas Gleixner
@ 2008-03-25 11:31   ` Joerg Roedel
  2008-03-25 11:39     ` Andi Kleen
  2 siblings, 1 reply; 34+ messages in thread
From: Joerg Roedel @ 2008-03-25 11:31 UTC (permalink / raw)
  To: Andi Kleen; +Cc: andreas.herrmann3, tglx, mingo, linux-kernel

On Wed, Mar 12, 2008 at 03:53:30AM +0100, Andi Kleen wrote:
> 
> Intel recommends not to use large pages for the first 1MB
> of physical memory because there are fixed-size MTRRs there
> which cause splitups in the TLBs.
> 
> On AMD doing so is also a good idea.

This should especially boost performance on 32 bit. Do you have numbers
for that?

> The implementation is a little different between 32bit and 64bit.
> On 32bit I just taught the initial page table setup about this
> because it was very simple to do. This also has the advantage
> that the risk of a prefetch ever seeing the page, even
> if it only exists for a short time, is minimized.
> 
> On 64bit that is not quite possible, so use set_memory_4k() a little
> later (in check_bugs) instead.
> 
> Signed-off-by: Andi Kleen <ak@suse.de>
Acked-by: Joerg Roedel <joerg.roedel@amd.com>

> ---
>  arch/x86/kernel/bugs_64.c |   12 ++++++++++++
>  arch/x86/mm/init_32.c     |    6 +++++-
>  2 files changed, 17 insertions(+), 1 deletion(-)
> 
> Index: linux/arch/x86/kernel/bugs_64.c
> ===================================================================
> --- linux.orig/arch/x86/kernel/bugs_64.c
> +++ linux/arch/x86/kernel/bugs_64.c
> @@ -9,6 +9,7 @@
>  #include <asm/bugs.h>
>  #include <asm/processor.h>
>  #include <asm/mtrr.h>
> +#include <asm/cacheflush.h>
>  
>  void __init check_bugs(void)
>  {
> @@ -18,4 +19,15 @@ void __init check_bugs(void)
>  	print_cpu_info(&boot_cpu_data);
>  #endif
>  	alternative_instructions();
> +
> +	/*
> +	 * Make sure the first 2MB area is not mapped by huge pages
> +	 * There are typically fixed size MTRRs in there and overlapping
> +	 * MTRRs into large pages causes slow downs.
> +	 *
> +	 * Right now we don't do that with gbpages because there seems
> +	 * very little benefit for that case.
> +	 */
> +	if (!direct_gbpages)
> +		set_memory_4k((unsigned long)__va(0), 1);
>  }
> Index: linux/arch/x86/mm/init_32.c
> ===================================================================
> --- linux.orig/arch/x86/mm/init_32.c
> +++ linux/arch/x86/mm/init_32.c
> @@ -181,8 +181,13 @@ static void __init kernel_physical_mappi
>  			/*
>  			 * Map with big pages if possible, otherwise
>  			 * create normal page tables:
> +			 *
> +			 * Don't use a large page for the first 2/4MB of memory
> +			 * because there are often fixed size MTRRs in there
> +			 * and overlapping MTRRs into large pages can cause
> +			 * slowdowns.
>  			 */
> -			if (cpu_has_pse) {
> +			if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) {
>  				unsigned int addr2;
>  				pgprot_t prot = PAGE_KERNEL_LARGE;
>  

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] [4/7] Don't use large pages to map the first 2/4MB of memory
  2008-03-25 11:31   ` Joerg Roedel
@ 2008-03-25 11:39     ` Andi Kleen
  0 siblings, 0 replies; 34+ messages in thread
From: Andi Kleen @ 2008-03-25 11:39 UTC (permalink / raw)
  To: Joerg Roedel; +Cc: Andi Kleen, andreas.herrmann3, tglx, mingo, linux-kernel

> This should especially boost performance on 32 bit. Do you have numbers
> for that?

No, no numbers for 32bit.

-Andi

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] [6/7] Split large page mapping for AMD TSEG
  2008-03-12  2:53 ` [PATCH] [6/7] Split large page mapping for AMD TSEG Andi Kleen
  2008-03-21 17:55   ` Thomas Gleixner
@ 2008-03-25 11:56   ` Joerg Roedel
  2008-03-25 16:44   ` Thomas Gleixner
  2 siblings, 0 replies; 34+ messages in thread
From: Joerg Roedel @ 2008-03-25 11:56 UTC (permalink / raw)
  To: Andi Kleen; +Cc: andreas.herrmann3, tglx, mingo, linux-kernel

On Wed, Mar 12, 2008 at 03:53:32AM +0100, Andi Kleen wrote:
> 
> On AMD, SMM protected memory is part of the address map, but handled
> internally like an MTRR. That leads to large pages getting split
> internally, which has some performance implications. Check for the
> AMD TSEG MSR and split the large page mapping in that area
> explicitly if it is part of the direct mapping.
> 
> There is also SMM ASEG, but it is in the first 1MB and already covered by 
> the earlier split first page patch.
> 
> Idea for this came from an earlier patch by Andreas Herrmann
> 
> On a RevF dual Socket Opteron system kernbench shows a clear
> improvement from this:
> (together with the earlier patches in this series, especially the 
> split first 2MB patch) 
> 
> [lower is better]
>               no split stddev         split  stddev    delta
> Elapsed Time   87.146 (0.727516)     84.296 (1.09098)  -3.2%
> User Time     274.537 (4.05226)     273.692 (3.34344)  -0.3%
> System Time    34.907 (0.42492)      34.508 (0.26832)  -1.1%
> Percent CPU   322.5   (38.3007)     326.5   (44.5128)  +1.2%
> 
> => About 3.2% improvement in elapsed time for kernbench.
> 
> With GB pages on AMD Fam10h the impact of splitting is much higher of course,
> since it would split two full GB pages (together with the first
> 1MB split patch) instead of two 2MB pages.  I could not benchmark
> a clear difference in kernbench with gbpages, so I kept it disabled
> for that case.
> 
> That was only limited benchmarking of course, so if someone
> is interested in running more tests for the gbpages case,
> that could be revisited (contributions welcome).
> 
> I didn't bother implementing this for 32bit because it is very
> unlikely the 32bit lowmem mapping overlaps into the TSEG near 4GB
> and the 2MB low split is already handled for both.
> 
> Signed-off-by: Andi Kleen <ak@suse.de>
Acked-by: Joerg Roedel <joerg.roedel@amd.com>


> ---
>  arch/x86/kernel/setup_64.c  |   13 +++++++++++++
>  include/asm-x86/msr-index.h |    1 +
>  2 files changed, 14 insertions(+)
> 
> Index: linux/arch/x86/kernel/setup_64.c
> ===================================================================
> --- linux.orig/arch/x86/kernel/setup_64.c
> +++ linux/arch/x86/kernel/setup_64.c
> @@ -721,6 +721,20 @@ static void __cpuinit init_amd(struct cp
>  
>  	if (amd_apic_timer_broken())
>  		disable_apic_timer = 1;
> +
> +	if (!direct_gbpages &&
> +		c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
> +		unsigned long tseg;
> +
> +		/*
> +		 * Split up direct mapping around the TSEG SMM area.
> +		 * Don't do it for gbpages because there seems very little
> +		 * benefit in doing so.
> +		 */
> +		if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
> +		(tseg >> PMD_SHIFT) < (end_pfn_map >> (PMD_SHIFT-PAGE_SHIFT)))
> +			set_memory_4k((unsigned long)__va(tseg), 1);
> +	}
>  }
>  
>  void __cpuinit detect_ht(struct cpuinfo_x86 *c)
> Index: linux/include/asm-x86/msr-index.h
> ===================================================================
> --- linux.orig/include/asm-x86/msr-index.h
> +++ linux/include/asm-x86/msr-index.h
> @@ -109,6 +109,7 @@
>  #define MSR_K8_SYSCFG			0xc0010010
>  #define MSR_K8_HWCR			0xc0010015
>  #define MSR_K8_ENABLE_C1E		0xc0010055
> +#define MSR_K8_TSEG_ADDR		0xc0010112
>  #define K8_MTRRFIXRANGE_DRAM_ENABLE	0x00040000 /* MtrrFixDramEn bit    */
>  #define K8_MTRRFIXRANGE_DRAM_MODIFY	0x00080000 /* MtrrFixDramModEn bit */
>  #define K8_MTRR_RDMEM_WRMEM_MASK	0x18181818 /* Mask: RdMem|WrMem    */
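
To spell out the unit conversion in the rdmsrl_safe() check quoted above
(a reading of the code, not part of the patch): with 4k pages and 2MB
PMDs, PMD_SHIFT is 21 and PAGE_SHIFT is 12, so

	tseg >> PMD_SHIFT                        /* phys address -> 2MB units */
	end_pfn_map >> (PMD_SHIFT - PAGE_SHIFT)  /* 4k frames    -> 2MB units */

both sides end up counting 2MB blocks, and the mapping is only split when
the TSEG base actually falls inside the directly mapped range.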

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] CPA: Add statistics about state of direct mapping v3
  2008-03-22  9:50       ` [PATCH] CPA: Add statistics about state of direct mapping v3 Andi Kleen
@ 2008-03-25 15:40         ` Thomas Gleixner
  2008-03-25 16:14           ` Andi Kleen
  0 siblings, 1 reply; 34+ messages in thread
From: Thomas Gleixner @ 2008-03-25 15:40 UTC (permalink / raw)
  To: Andi Kleen; +Cc: andreas.herrmann3, mingo, linux-kernel

On Sat, 22 Mar 2008, Andi Kleen wrote:

> [I didn't switch to debugfs because I strongly disagreed with that
> suggestion. But all the other points you made are addressed.]

Halfways :)
  
> +static void update_page_count(int level, unsigned long pages)
> +{
> +#ifdef CONFIG_PROC_FS
> +	unsigned long flags;
> +	/* Protect against CPA */
> +	spin_lock_irqsave(&pgd_lock, flags);
> +	direct_pages_count[PG_LEVEL_2M] += pages;

Shouldn't this update direct_pages_count[level], perhaps?

> +	spin_unlock_irqrestore(&pgd_lock, flags);
> +#endif
> +}

Also, I asked to move the update function to pageattr.c, where we have
the variable, so we don't need to make it global and can change the
statistics implementation w/o changing the callers.

> @@ -356,6 +371,7 @@ phys_pmd_update(pud_t *pud, unsigned lon
>  static unsigned long __meminit
>  phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
>  {
> +	unsigned long pages = 0;
>  	unsigned long true_end = end;
>  	int i = pud_index(addr);
>  
> @@ -380,6 +396,7 @@ phys_pud_init(pud_t *pud_page, unsigned 
>  		}
>  
>  		if (direct_gbpages) {
> +			direct_pages_count[PG_LEVEL_1G]++;

  direct update

>  			set_pte((pte_t *)pud,
>  				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
>  			true_end = (addr & PUD_MASK) + PUD_SIZE;
> @@ -397,6 +414,8 @@ phys_pud_init(pud_t *pud_page, unsigned 
>  	}
>  	__flush_tlb_all();
>  
> +	update_page_count(PG_LEVEL_1G, pages);
> +

  update via function call with pages = 0?

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] CPA: Add statistics about state of direct mapping v3
  2008-03-25 15:40         ` Thomas Gleixner
@ 2008-03-25 16:14           ` Andi Kleen
  2008-03-25 16:16             ` Thomas Gleixner
  0 siblings, 1 reply; 34+ messages in thread
From: Andi Kleen @ 2008-03-25 16:14 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Andi Kleen, andreas.herrmann3, mingo, linux-kernel

On Tue, Mar 25, 2008 at 04:40:43PM +0100, Thomas Gleixner wrote:
> On Sat, 22 Mar 2008, Andi Kleen wrote:
> 
> > [I didn't switch to debugfs because I strongly disagreed with that
> > suggestion. But all the other points you made are addressed.]
> 
> Halfways :)

Ok.

> > +	update_page_count(PG_LEVEL_1G, pages);
> > +
> 
>   update via function call with pages = 0 ?

Didn't get that one. Can you clarify?

-Andi

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] CPA: Add statistics about state of direct mapping v3
  2008-03-25 16:14           ` Andi Kleen
@ 2008-03-25 16:16             ` Thomas Gleixner
  2008-03-25 17:01               ` [PATCH] CPA: Add statistics about state of direct mapping v4 Andi Kleen
  0 siblings, 1 reply; 34+ messages in thread
From: Thomas Gleixner @ 2008-03-25 16:16 UTC (permalink / raw)
  To: Andi Kleen; +Cc: andreas.herrmann3, mingo, linux-kernel



On Tue, 25 Mar 2008, Andi Kleen wrote:

> On Tue, Mar 25, 2008 at 04:40:43PM +0100, Thomas Gleixner wrote:
> > On Sat, 22 Mar 2008, Andi Kleen wrote:
> > 
> > > [I didn't switch to debugfs because I strongly disagreed with that
> > > suggestion. But all the other points you made are addressed.]
> > 
> > Halfways :)
> 
> Ok.
> 
> > > +	update_page_count(PG_LEVEL_1G, pages);
> > > +
> > 
> >   update via function call with pages = 0 ?
> 
> Didn't get that one. Can you clarify?

You have the direct increment and the function call in
phys_pud_init. The function is always called with pages=0 because
nothing ever increments pages.
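
Condensed from the v3 hunks above (a paraphrase of the posted diff, not
the literal code), the pattern in question:

	unsigned long pages = 0;

	if (direct_gbpages) {
		direct_pages_count[PG_LEVEL_1G]++;	/* direct increment */
		set_pte((pte_t *)pud,
			pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
	}

	update_page_count(PG_LEVEL_1G, pages);	/* pages is still 0 here */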

Thanks,
	tglx

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] [6/7] Split large page mapping for AMD TSEG
  2008-03-12  2:53 ` [PATCH] [6/7] Split large page mapping for AMD TSEG Andi Kleen
  2008-03-21 17:55   ` Thomas Gleixner
  2008-03-25 11:56   ` Joerg Roedel
@ 2008-03-25 16:44   ` Thomas Gleixner
  2008-03-25 16:54     ` Andi Kleen
  2 siblings, 1 reply; 34+ messages in thread
From: Thomas Gleixner @ 2008-03-25 16:44 UTC (permalink / raw)
  To: Andi Kleen; +Cc: andreas.herrmann3, mingo, linux-kernel

On Wed, 12 Mar 2008, Andi Kleen wrote:
> +
> +	if (!direct_gbpages &&
> +		c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
> +		unsigned long tseg;

Can we do this at some place which is only called once ?

> +		/*
> +		 * Split up direct mapping around the TSEG SMM area.
> +		 * Don't do it for gbpages because there seems very little
> +		 * benefit in doing so.
> +		 */
> +		if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&

 warning: passing argument 2 of 'rdmsrl_safe' from incompatible pointer type  

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] [6/7] Split large page mapping for AMD TSEG
  2008-03-25 16:44   ` Thomas Gleixner
@ 2008-03-25 16:54     ` Andi Kleen
  0 siblings, 0 replies; 34+ messages in thread
From: Andi Kleen @ 2008-03-25 16:54 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Andi Kleen, andreas.herrmann3, mingo, linux-kernel

On Tue, Mar 25, 2008 at 05:44:21PM +0100, Thomas Gleixner wrote:
> On Wed, 12 Mar 2008, Andi Kleen wrote:
> > +
> > +	if (!direct_gbpages &&
> > +		c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
> > +		unsigned long tseg;
> 
> Can we do this at some place which is only called once ?

Nothing obvious. We could add it, but then we would need to add an (imho
ugly) vendor check there first.

I think it is better to have the CPU-specific code all in one place.

> 
> > +		/*
> > +		 * Split up direct mapping around the TSEG SMM area.
> > +		 * Don't do it for gbpages because there seems very little
> > +		 * benefit in doing so.
> > +		 */
> > +		if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
> 
>  warning: passing argument 2 of 'rdmsrl_safe' from incompatible pointer type  

Yes, the type has to be updated after the earlier inline change.
Easiest is if you just do the trivial change yourself.

-Andi


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] CPA: Add statistics about state of direct mapping v4
  2008-03-25 16:16             ` Thomas Gleixner
@ 2008-03-25 17:01               ` Andi Kleen
  0 siblings, 0 replies; 34+ messages in thread
From: Andi Kleen @ 2008-03-25 17:01 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Andi Kleen, andreas.herrmann3, mingo, linux-kernel

All comments addressed now hopefully.

-Andi

---

CPA: Add statistics about state of direct mapping v4

Add information about the mapping state of the direct mapping to 
/proc/meminfo. I chose /proc/meminfo because that is where all the other
memory statistics live, and it is a generally useful metric even
outside debugging situations. A lot of split kernel pages means the
kernel will run slower.

This way we can see how many large pages are really used for it and how
many are split.

Useful for general insight into the kernel.

v2: Add hotplug locking to 64bit to plug a very obscure theoretical race. 
    32bit doesn't need it because it doesn't support hotadd for lowmem.
    Fix some typos
v3: Rename dpages_cnt
    Add CONFIG ifdef for count update as requested by tglx
    Expand description
v4: Fix stupid bugs added in v3
    Move update_page_count to pageattr.c

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Andi Kleen <andi@firstfloor.org>

---
 arch/x86/mm/init_32.c     |    2 ++
 arch/x86/mm/init_64.c     |    2 ++
 arch/x86/mm/pageattr.c    |   24 ++++++++++++++++++++++++
 fs/proc/proc_misc.c       |    7 +++++++
 include/asm-x86/pgtable.h |    3 +++
 5 files changed, 38 insertions(+)

Index: linux/arch/x86/mm/init_64.c
===================================================================
--- linux.orig/arch/x86/mm/init_64.c
+++ linux/arch/x86/mm/init_64.c
@@ -319,6 +319,8 @@ __meminit void early_iounmap(void *addr,
 static unsigned long __meminit
 phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
 {
+	unsigned long pages = 0;
+
 	int i = pmd_index(address);
 
 	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
@@ -335,9 +337,11 @@ phys_pmd_init(pmd_t *pmd_page, unsigned 
 		if (pmd_val(*pmd))
 			continue;
 
+		pages++;
 		set_pte((pte_t *)pmd,
 			pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
 	}
+	update_page_count(PG_LEVEL_2M, pages);
 	return address;
 }
 
@@ -356,6 +360,7 @@ phys_pmd_update(pud_t *pud, unsigned lon
 static unsigned long __meminit
 phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
 {
+	unsigned long pages = 0;
 	unsigned long true_end = end;
 	int i = pud_index(addr);
 
@@ -380,6 +385,7 @@ phys_pud_init(pud_t *pud_page, unsigned 
 		}
 
 		if (direct_gbpages) {
+			pages++;
 			set_pte((pte_t *)pud,
 				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
 			true_end = (addr & PUD_MASK) + PUD_SIZE;
@@ -397,6 +403,8 @@ phys_pud_init(pud_t *pud_page, unsigned 
 	}
 	__flush_tlb_all();
 
+	update_page_count(PG_LEVEL_1G, pages);
+
 	return true_end >> PAGE_SHIFT;
 }
 
Index: linux/arch/x86/mm/pageattr.c
===================================================================
--- linux.orig/arch/x86/mm/pageattr.c
+++ linux/arch/x86/mm/pageattr.c
@@ -31,6 +31,19 @@ struct cpa_data {
 	unsigned	force_split : 1;
 };
 
+static unsigned long direct_pages_count[PG_LEVEL_NUM];
+
+void __meminit update_page_count(int level, unsigned long pages)
+{
+#ifdef CONFIG_PROC_FS
+	unsigned long flags;
+	/* Protect against CPA */
+	spin_lock_irqsave(&pgd_lock, flags);
+	direct_pages_count[level] += pages;
+	spin_unlock_irqrestore(&pgd_lock, flags);
+#endif
+}
+
 #ifdef CONFIG_X86_64
 
 static inline unsigned long highmap_start_pfn(void)
@@ -499,6 +512,12 @@ static int split_large_page(pte_t *kpte,
 	for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
 		set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
 
+	if (address >= (unsigned long)__va(0) &&
+		address < (unsigned long)__va(end_pfn_map << PAGE_SHIFT)) {
+		direct_pages_count[level]--;
+		direct_pages_count[level - 1] += PTRS_PER_PTE;
+	}
+
 	/*
 	 * Install the new, split up pagetable. Important details here:
 	 *
@@ -948,6 +967,22 @@ bool kernel_page_present(struct page *pa
 
 #endif /* CONFIG_DEBUG_PAGEALLOC */
 
+#ifdef CONFIG_PROC_FS
+int arch_report_meminfo(char *page)
+{
+	int n;
+	n = sprintf(page, "DirectMap4k:  %8lu\n"
+			  "DirectMap2M:  %8lu\n",
+			direct_pages_count[PG_LEVEL_4K],
+			direct_pages_count[PG_LEVEL_2M]);
+#ifdef CONFIG_X86_64
+	n += sprintf(page + n, "DirectMap1G:  %8lu\n",
+			direct_pages_count[PG_LEVEL_1G]);
+#endif
+	return n;
+}
+#endif
+
 /*
  * The testcases use internal knowledge of the implementation that shouldn't
  * be exposed to the rest of the kernel. Include these directly here.
Index: linux/include/asm-x86/pgtable.h
===================================================================
--- linux.orig/include/asm-x86/pgtable.h
+++ linux/include/asm-x86/pgtable.h
@@ -247,8 +247,11 @@ enum {
 	PG_LEVEL_4K,
 	PG_LEVEL_2M,
 	PG_LEVEL_1G,
+	PG_LEVEL_NUM
 };
 
+void update_page_count(int level, unsigned long pages);
+
 /*
  * Helper function that returns the kernel pagetable entry controlling
  * the virtual address 'address'. NULL means no pagetable entry present.
Index: linux/arch/x86/mm/init_32.c
===================================================================
--- linux.orig/arch/x86/mm/init_32.c
+++ linux/arch/x86/mm/init_32.c
@@ -163,6 +163,7 @@ static void __init kernel_physical_mappi
 	pgd_t *pgd;
 	pmd_t *pmd;
 	pte_t *pte;
+	unsigned pages_2m = 0, pages_4k = 0;
 
 	pgd_idx = pgd_index(PAGE_OFFSET);
 	pgd = pgd_base + pgd_idx;
@@ -198,6 +199,7 @@ static void __init kernel_physical_mappi
 				    is_kernel_text(addr2))
 					prot = PAGE_KERNEL_LARGE_EXEC;
 
+				pages_2m++;
 				set_pmd(pmd, pfn_pmd(pfn, prot));
 
 				pfn += PTRS_PER_PTE;
@@ -214,11 +216,14 @@ static void __init kernel_physical_mappi
 				if (is_kernel_text(addr))
 					prot = PAGE_KERNEL_EXEC;
 
+				pages_4k++;
 				set_pte(pte, pfn_pte(pfn, prot));
 			}
 			end_pfn_map = pfn;
 		}
 	}
+	update_page_count(PG_LEVEL_2M, pages_2m);
+	update_page_count(PG_LEVEL_4K, pages_4k);
 }
 
 static inline int page_kills_ppro(unsigned long pagenr)
Index: linux/fs/proc/proc_misc.c
===================================================================
--- linux.orig/fs/proc/proc_misc.c
+++ linux/fs/proc/proc_misc.c
@@ -123,6 +123,11 @@ static int uptime_read_proc(char *page, 
 	return proc_calc_metrics(page, start, off, count, eof, len);
 }
 
+int __attribute__((weak)) arch_report_meminfo(char *page)
+{
+	return 0;
+}
+
 static int meminfo_read_proc(char *page, char **start, off_t off,
 				 int count, int *eof, void *data)
 {
@@ -219,6 +224,8 @@ static int meminfo_read_proc(char *page,
 
 		len += hugetlb_report_meminfo(page + len);
 
+	len += arch_report_meminfo(page + len);
+
 	return proc_calc_metrics(page, start, off, count, eof, len);
 #undef K
 }

^ permalink raw reply	[flat|nested] 34+ messages in thread

end of thread, other threads:[~2008-03-25 17:02 UTC | newest]

Thread overview: 34+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-03-12  2:53 [PATCH] [1/7] Implement true end_pfn_mapped for 32bit Andi Kleen
2008-03-12  2:53 ` [PATCH] [2/7] Account overlapped mappings in end_pfn_map Andi Kleen
2008-03-12  2:53 ` [PATCH] [3/7] Add set_memory_4k to pageattr.c Andi Kleen
2008-03-12  2:53 ` [PATCH] [4/7] Don't use large pages to map the first 2/4MB of memory Andi Kleen
2008-03-12  5:38   ` Eric Dumazet
2008-03-12  9:19     ` Andi Kleen
2008-03-21 17:45   ` Thomas Gleixner
2008-03-21 17:59     ` Andi Kleen
2008-03-21 18:03       ` Thomas Gleixner
2008-03-21 18:44         ` Andi Kleen
2008-03-25 11:31   ` Joerg Roedel
2008-03-25 11:39     ` Andi Kleen
2008-03-12  2:53 ` [PATCH] [5/7] Readd rdmsrl_safe Andi Kleen
2008-03-21 17:06   ` Thomas Gleixner
2008-03-21 17:16     ` Andi Kleen
2008-03-21 17:58       ` Thomas Gleixner
2008-03-21 18:06         ` Andi Kleen
2008-03-21 18:14           ` Thomas Gleixner
2008-03-21 18:46             ` Andi Kleen
2008-03-21 18:48               ` [PATCH] [5/7] Readd rdmsrl_safe II Andi Kleen
2008-03-22  9:59     ` [PATCH] Readd rdmsrl_safe v2 Andi Kleen
2008-03-12  2:53 ` [PATCH] [6/7] Split large page mapping for AMD TSEG Andi Kleen
2008-03-21 17:55   ` Thomas Gleixner
2008-03-25 11:56   ` Joerg Roedel
2008-03-25 16:44   ` Thomas Gleixner
2008-03-25 16:54     ` Andi Kleen
2008-03-12  2:53 ` [PATCH] [7/7] CPA: Add statistics about state of direct mapping v2 Andi Kleen
2008-03-21 17:41   ` Thomas Gleixner
2008-03-21 17:55     ` Andi Kleen
2008-03-22  9:50       ` [PATCH] CPA: Add statistics about state of direct mapping v3 Andi Kleen
2008-03-25 15:40         ` Thomas Gleixner
2008-03-25 16:14           ` Andi Kleen
2008-03-25 16:16             ` Thomas Gleixner
2008-03-25 17:01               ` [PATCH] CPA: Add statistics about state of direct mapping v4 Andi Kleen
