LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH 2/3 V8] NX protection for kernel data
@ 2010-11-16 21:31 matthieu castet
  2010-11-18 14:08 ` [tip:x86/security] x86: Add " tip-bot for Matthieu Castet
  0 siblings, 1 reply; 21+ messages in thread
From: matthieu castet @ 2010-11-16 21:31 UTC (permalink / raw)
  To: linux-kernel, linux-security-module, linux-next
  Cc: Arjan van de Ven, James Morris, Andrew Morton, Andi Kleen,
	Thomas Gleixner, H. Peter Anvin, Ingo Molnar, Rusty Russell,
	Stephen Rothwell, Dave Jones, Siarhei Liakh, Kees Cook

[-- Attachment #1: Type: text/plain, Size: 3457 bytes --]

    Note: this patch depends on "Correct improper large page preservation" patch
    
    This patch expands functionality of CONFIG_DEBUG_RODATA to set main
    (static) kernel data area as NX.
    The following steps are taken to achieve this:
    1. Linker script is adjusted so .text always starts and ends on a page bound
    2. Linker script is adjusted so .rodata always start and
    end on a page boundary
    3. NX is set for all pages from _etext through _end in mark_rodata_ro.
    4. free_init_pages() sets released memory NX in arch/x86/mm/init.c
    5. bios rom is set to x when pcibios is used.
    
    The results of patch application may be observed in the diff of kernel page
    table dumps.
    pcibios :
    --- data_nx_pt_before.txt       2009-10-13 07:48:59.000000000 -0400
    +++ data_nx_pt_after.txt        2009-10-13 07:26:46.000000000 -0400
    @@ -2,8 +2,9 @@
     0x00000000-0xc0000000           3G                           pmd
     ---[ Kernel Mapping ]---
    -0xc0000000-0xc0100000           1M     RW             GLB x  pte
    +0xc0000000-0xc00a0000         640K     RW             GLB NX pte
    +0xc00a0000-0xc0100000         384K     RW             GLB x  pte
    -0xc0100000-0xc03d7000        2908K     ro             GLB x  pte
    +0xc0100000-0xc0318000        2144K     ro             GLB x  pte
    +0xc0318000-0xc03d7000         764K     ro             GLB NX pte
    -0xc03d7000-0xc0600000        2212K     RW             GLB x  pte
    +0xc03d7000-0xc0600000        2212K     RW             GLB NX pte
     0xc0600000-0xf7a00000         884M     RW         PSE GLB NX pmd
     0xf7a00000-0xf7bfe000        2040K     RW             GLB NX pte
     0xf7bfe000-0xf7c00000           8K                           pte
    
    no pcibios :
    --- data_nx_pt_before.txt       2009-10-13 07:48:59.000000000 -0400
    +++ data_nx_pt_after.txt        2009-10-13 07:26:46.000000000 -0400
    @@ -2,8 +2,9 @@
     0x00000000-0xc0000000           3G                           pmd
     ---[ Kernel Mapping ]---
    -0xc0000000-0xc0100000           1M     RW             GLB x  pte
    +0xc0000000-0xc0100000           1M     RW             GLB NX pte
    -0xc0100000-0xc03d7000        2908K     ro             GLB x  pte
    +0xc0100000-0xc0318000        2144K     ro             GLB x  pte
    +0xc0318000-0xc03d7000         764K     ro             GLB NX pte
    -0xc03d7000-0xc0600000        2212K     RW             GLB x  pte
    +0xc03d7000-0xc0600000        2212K     RW             GLB NX pte
     0xc0600000-0xf7a00000         884M     RW         PSE GLB NX pmd
     0xf7a00000-0xf7bfe000        2040K     RW             GLB NX pte
     0xf7bfe000-0xf7c00000           8K                           pte
    
    The patch have been developed for Linux 2.6.34-rc2 x86 by Siarhei Liakh
    <sliakh.lkml@gmail.com> and Xuxian Jiang <jiang@cs.ncsu.edu>.
    
    V1:  initial patch for 2.6.30
    V2:  patch for 2.6.31-rc7
    V3:  moved all code into arch/x86, adjusted credits
    V4:  fixed ifdef, removed credits from CREDITS
    V5:  fixed an address calculation bug in mark_nxdata_nx()
    V6:  added acked-by and PT dump diff to commit log
    V7:  minor adjustments for -tip
    V8:  rework with the merge of "Set first MB as RW+NX"
    
    Signed-off-by: Siarhei Liakh <sliakh.lkml@gmail.com>
    Signed-off-by: Xuxian Jiang <jiang@cs.ncsu.edu>
    Signed-off-by: Matthieu CASTET <castet.matthieu@free.fr>


[-- Attachment #2: x86_nx_kernel_data.diff --]
[-- Type: text/x-diff, Size: 6785 bytes --]

diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 404a880..d5cecf3 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -62,6 +62,7 @@ extern unsigned long pci_mem_start;
 
 #define PCIBIOS_MIN_CARDBUS_IO	0x4000
 
+extern int pcibios_enabled;
 void pcibios_config_init(void);
 struct pci_bus *pcibios_scan_root(int bus);
 
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index d0bb522..958ac45 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -69,7 +69,7 @@ jiffies_64 = jiffies;
 
 PHDRS {
 	text PT_LOAD FLAGS(5);          /* R_E */
-	data PT_LOAD FLAGS(7);          /* RWE */
+	data PT_LOAD FLAGS(6);          /* RW_ */
 #ifdef CONFIG_X86_64
 	user PT_LOAD FLAGS(5);          /* R_E */
 #ifdef CONFIG_SMP
@@ -116,6 +116,10 @@ SECTIONS
 
 	EXCEPTION_TABLE(16) :text = 0x9090
 
+#if defined(CONFIG_DEBUG_RODATA)
+	/* .text should occupy whole number of pages */
+	. = ALIGN(PAGE_SIZE);
+#endif
 	X64_ALIGN_DEBUG_RODATA_BEGIN
 	RO_DATA(PAGE_SIZE)
 	X64_ALIGN_DEBUG_RODATA_END
@@ -307,7 +311,7 @@ SECTIONS
 		__bss_start = .;
 		*(.bss..page_aligned)
 		*(.bss)
-		. = ALIGN(4);
+		. = ALIGN(PAGE_SIZE);
 		__bss_stop = .;
 	}
 
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index b278535..bfacdbe 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -362,8 +362,9 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 	/*
 	 * We just marked the kernel text read only above, now that
 	 * we are going to free part of that, we need to make that
-	 * writeable first.
+	 * writeable and non-executable first.
 	 */
+	set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
 	set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
 
 	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index bca7909..85a1a75 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -225,7 +225,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
 
 static inline int is_kernel_text(unsigned long addr)
 {
-	if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
+	if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end)
 		return 1;
 	return 0;
 }
@@ -1033,6 +1033,22 @@ void set_kernel_text_ro(void)
 	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
 }
 
+static void mark_nxdata_nx(void)
+{
+	/*
+	 * When this called, init has already been executed and released,
+	 * so everything past _etext sould be NX.
+	 */
+	unsigned long start = PFN_ALIGN(_etext);
+	/* this comes from is_kernel_text upper limit. Also HPAGE where used */
+	unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK)- start;
+
+	if (__supported_pte_mask & _PAGE_NX)
+		printk(KERN_INFO "NX-protecting the kernel data: %luk\n",
+			size >> 10);
+	set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT);
+}
+
 void mark_rodata_ro(void)
 {
 	unsigned long start = PFN_ALIGN(_text);
@@ -1067,6 +1083,7 @@ void mark_rodata_ro(void)
 	printk(KERN_INFO "Testing CPA: write protecting again\n");
 	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
 #endif
+	mark_nxdata_nx();
 }
 #endif
 
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 9a66746..6dfacec 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -762,6 +762,7 @@ void mark_rodata_ro(void)
 	unsigned long rodata_start =
 		((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
 	unsigned long end = (unsigned long) &__end_rodata_hpage_align;
+	unsigned long kernel_end = (((unsigned long)&__init_end + HPAGE_SIZE) & HPAGE_MASK);
 	unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
 	unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
 	unsigned long data_start = (unsigned long) &_sdata;
@@ -776,7 +777,7 @@ void mark_rodata_ro(void)
 	 * The rodata section (but not the kernel text!) should also be
 	 * not-executable.
 	 */
-	set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
+	set_memory_nx(rodata_start, (kernel_end - rodata_start) >> PAGE_SHIFT);
 
 	rodata_test();
 
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 91ce756..ab01c6d 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -13,6 +13,7 @@
 #include <linux/pfn.h>
 #include <linux/percpu.h>
 #include <linux/gfp.h>
+#include <linux/pci.h>
 
 #include <asm/e820.h>
 #include <asm/processor.h>
@@ -261,8 +262,11 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 	 * The BIOS area between 640k and 1Mb needs to be executable for
 	 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
 	 */
-	if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
+#ifdef CONFIG_PCI_BIOS
+	if (pcibios_enabled &&
+			within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
 		pgprot_val(forbidden) |= _PAGE_NX;
+#endif
 
 	/*
 	 * The kernel text needs to be executable for obvious reasons
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index 2492d16..e1f08b5 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -9,6 +9,7 @@
 #include <linux/uaccess.h>
 #include <asm/pci_x86.h>
 #include <asm/pci-functions.h>
+#include <asm/cacheflush.h>
 
 /* BIOS32 signature: "_32_" */
 #define BIOS32_SIGNATURE	(('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24))
@@ -25,6 +26,28 @@
 #define PCIBIOS_HW_TYPE1_SPEC		0x10
 #define PCIBIOS_HW_TYPE2_SPEC		0x20
 
+int pcibios_enabled;
+
+/* according some bios specification 
+ * http://members.datafast.net.au/dft0802/specs/bios21.pdf, we could 
+ * restrict the x zone to some pages and make it ro. But this may be
+ * broken on some bios, complex to handle with static_protections.
+ * We could make the 0xe0000-0x100000 range rox, but this can break
+ * some ISA mapping.
+ *
+ * So we let's an rw and x hole when pcibios is used. This shouldn't
+ * happen for modern system with mmconfig, and if you don't want it
+ * you could disable pcibios...
+ */
+static inline void set_bios_x(void)
+{
+	pcibios_enabled = 1;
+	set_memory_x(PAGE_OFFSET + BIOS_BEGIN, (BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT);
+	if (__supported_pte_mask & _PAGE_NX)
+		printk(KERN_INFO "PCI : PCI BIOS aera is rw and x. "
+				"Use pci=nobios if you want it NX.\n");
+}
+
 /*
  * This is the standard structure used to identify the entry point
  * to the BIOS32 Service Directory, as documented in
@@ -332,6 +355,7 @@ static struct pci_raw_ops * __devinit pci_find_bios(void)
 			DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n",
 					bios32_entry);
 			bios32_indirect.address = bios32_entry + PAGE_OFFSET;
+			set_bios_x();
 			if (check_pcibios())
 				return &pci_bios_access;
 		}


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [tip:x86/security] x86: Add NX protection for kernel data
  2010-11-16 21:31 [PATCH 2/3 V8] NX protection for kernel data matthieu castet
@ 2010-11-18 14:08 ` tip-bot for Matthieu Castet
  2011-01-11 23:31   ` Kees Cook
  0 siblings, 1 reply; 21+ messages in thread
From: tip-bot for Matthieu Castet @ 2010-11-18 14:08 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, jmorris, sliakh.lkml, hpa, mingo, torvalds, rusty,
	davej, ak, jiang, arjan, castet.matthieu, kees.cook, sfr, tglx,
	mingo

Commit-ID:  5bd5a452662bc37c54fb6828db1a3faf87e6511c
Gitweb:     http://git.kernel.org/tip/5bd5a452662bc37c54fb6828db1a3faf87e6511c
Author:     Matthieu Castet <castet.matthieu@free.fr>
AuthorDate: Tue, 16 Nov 2010 22:31:26 +0100
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Thu, 18 Nov 2010 12:52:04 +0100

x86: Add NX protection for kernel data

This patch expands functionality of CONFIG_DEBUG_RODATA to set main
(static) kernel data area as NX.

The following steps are taken to achieve this:

 1. Linker script is adjusted so .text always starts and ends on a page bound
 2. Linker script is adjusted so .rodata always start and end on a page boundary
 3. NX is set for all pages from _etext through _end in mark_rodata_ro.
 4. free_init_pages() sets released memory NX in arch/x86/mm/init.c
 5. bios rom is set to x when pcibios is used.

The results of patch application may be observed in the diff of kernel page
table dumps:

pcibios:

 -- data_nx_pt_before.txt       2009-10-13 07:48:59.000000000 -0400
 ++ data_nx_pt_after.txt        2009-10-13 07:26:46.000000000 -0400
  0x00000000-0xc0000000           3G                           pmd
  ---[ Kernel Mapping ]---
 -0xc0000000-0xc0100000           1M     RW             GLB x  pte
 +0xc0000000-0xc00a0000         640K     RW             GLB NX pte
 +0xc00a0000-0xc0100000         384K     RW             GLB x  pte
 -0xc0100000-0xc03d7000        2908K     ro             GLB x  pte
 +0xc0100000-0xc0318000        2144K     ro             GLB x  pte
 +0xc0318000-0xc03d7000         764K     ro             GLB NX pte
 -0xc03d7000-0xc0600000        2212K     RW             GLB x  pte
 +0xc03d7000-0xc0600000        2212K     RW             GLB NX pte
  0xc0600000-0xf7a00000         884M     RW         PSE GLB NX pmd
  0xf7a00000-0xf7bfe000        2040K     RW             GLB NX pte
  0xf7bfe000-0xf7c00000           8K                           pte

No pcibios:

 -- data_nx_pt_before.txt       2009-10-13 07:48:59.000000000 -0400
 ++ data_nx_pt_after.txt        2009-10-13 07:26:46.000000000 -0400
  0x00000000-0xc0000000           3G                           pmd
  ---[ Kernel Mapping ]---
 -0xc0000000-0xc0100000           1M     RW             GLB x  pte
 +0xc0000000-0xc0100000           1M     RW             GLB NX pte
 -0xc0100000-0xc03d7000        2908K     ro             GLB x  pte
 +0xc0100000-0xc0318000        2144K     ro             GLB x  pte
 +0xc0318000-0xc03d7000         764K     ro             GLB NX pte
 -0xc03d7000-0xc0600000        2212K     RW             GLB x  pte
 +0xc03d7000-0xc0600000        2212K     RW             GLB NX pte
  0xc0600000-0xf7a00000         884M     RW         PSE GLB NX pmd
  0xf7a00000-0xf7bfe000        2040K     RW             GLB NX pte
  0xf7bfe000-0xf7c00000           8K                           pte

The patch has been originally developed for Linux 2.6.34-rc2 x86 by
Siarhei Liakh <sliakh.lkml@gmail.com> and Xuxian Jiang <jiang@cs.ncsu.edu>.

 -v1:  initial patch for 2.6.30
 -v2:  patch for 2.6.31-rc7
 -v3:  moved all code into arch/x86, adjusted credits
 -v4:  fixed ifdef, removed credits from CREDITS
 -v5:  fixed an address calculation bug in mark_nxdata_nx()
 -v6:  added acked-by and PT dump diff to commit log
 -v7:  minor adjustments for -tip
 -v8:  rework with the merge of "Set first MB as RW+NX"

Signed-off-by: Siarhei Liakh <sliakh.lkml@gmail.com>
Signed-off-by: Xuxian Jiang <jiang@cs.ncsu.edu>
Signed-off-by: Matthieu CASTET <castet.matthieu@free.fr>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: James Morris <jmorris@namei.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Dave Jones <davej@redhat.com>
Cc: Kees Cook <kees.cook@canonical.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <4CE2F82E.60601@free.fr>
[ minor cleanliness edits ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/pci.h    |    1 +
 arch/x86/kernel/vmlinux.lds.S |    8 ++++++--
 arch/x86/mm/init.c            |    3 ++-
 arch/x86/mm/init_32.c         |   20 +++++++++++++++++++-
 arch/x86/mm/init_64.c         |    3 ++-
 arch/x86/mm/pageattr.c        |    5 ++++-
 arch/x86/pci/pcbios.c         |   23 +++++++++++++++++++++++
 7 files changed, 57 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index ca0437c..6761292 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -65,6 +65,7 @@ extern unsigned long pci_mem_start;
 
 #define PCIBIOS_MIN_CARDBUS_IO	0x4000
 
+extern int pcibios_enabled;
 void pcibios_config_init(void);
 struct pci_bus *pcibios_scan_root(int bus);
 
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index e03530a..bf47007 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -69,7 +69,7 @@ jiffies_64 = jiffies;
 
 PHDRS {
 	text PT_LOAD FLAGS(5);          /* R_E */
-	data PT_LOAD FLAGS(7);          /* RWE */
+	data PT_LOAD FLAGS(6);          /* RW_ */
 #ifdef CONFIG_X86_64
 	user PT_LOAD FLAGS(5);          /* R_E */
 #ifdef CONFIG_SMP
@@ -116,6 +116,10 @@ SECTIONS
 
 	EXCEPTION_TABLE(16) :text = 0x9090
 
+#if defined(CONFIG_DEBUG_RODATA)
+	/* .text should occupy whole number of pages */
+	. = ALIGN(PAGE_SIZE);
+#endif
 	X64_ALIGN_DEBUG_RODATA_BEGIN
 	RO_DATA(PAGE_SIZE)
 	X64_ALIGN_DEBUG_RODATA_END
@@ -335,7 +339,7 @@ SECTIONS
 		__bss_start = .;
 		*(.bss..page_aligned)
 		*(.bss)
-		. = ALIGN(4);
+		. = ALIGN(PAGE_SIZE);
 		__bss_stop = .;
 	}
 
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index c0e28a1..947f42a 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -364,8 +364,9 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 	/*
 	 * We just marked the kernel text read only above, now that
 	 * we are going to free part of that, we need to make that
-	 * writeable first.
+	 * writeable and non-executable first.
 	 */
+	set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
 	set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
 
 	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 0e969f9..f89b5bb 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -226,7 +226,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
 
 static inline int is_kernel_text(unsigned long addr)
 {
-	if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
+	if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end)
 		return 1;
 	return 0;
 }
@@ -912,6 +912,23 @@ void set_kernel_text_ro(void)
 	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
 }
 
+static void mark_nxdata_nx(void)
+{
+	/*
+	 * When this called, init has already been executed and released,
+	 * so everything past _etext sould be NX.
+	 */
+	unsigned long start = PFN_ALIGN(_etext);
+	/*
+	 * This comes from is_kernel_text upper limit. Also HPAGE where used:
+	 */
+	unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start;
+
+	if (__supported_pte_mask & _PAGE_NX)
+		printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10);
+	set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT);
+}
+
 void mark_rodata_ro(void)
 {
 	unsigned long start = PFN_ALIGN(_text);
@@ -946,6 +963,7 @@ void mark_rodata_ro(void)
 	printk(KERN_INFO "Testing CPA: write protecting again\n");
 	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
 #endif
+	mark_nxdata_nx();
 }
 #endif
 
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 71a5929..ce59c05 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -788,6 +788,7 @@ void mark_rodata_ro(void)
 	unsigned long rodata_start =
 		((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
 	unsigned long end = (unsigned long) &__end_rodata_hpage_align;
+	unsigned long kernel_end = (((unsigned long)&__init_end + HPAGE_SIZE) & HPAGE_MASK);
 	unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
 	unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
 	unsigned long data_start = (unsigned long) &_sdata;
@@ -802,7 +803,7 @@ void mark_rodata_ro(void)
 	 * The rodata section (but not the kernel text!) should also be
 	 * not-executable.
 	 */
-	set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
+	set_memory_nx(rodata_start, (kernel_end - rodata_start) >> PAGE_SHIFT);
 
 	rodata_test();
 
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 6f2a6b6..8b830ca 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -13,6 +13,7 @@
 #include <linux/pfn.h>
 #include <linux/percpu.h>
 #include <linux/gfp.h>
+#include <linux/pci.h>
 
 #include <asm/e820.h>
 #include <asm/processor.h>
@@ -261,8 +262,10 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 	 * The BIOS area between 640k and 1Mb needs to be executable for
 	 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
 	 */
-	if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
+#ifdef CONFIG_PCI_BIOS
+	if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
 		pgprot_val(forbidden) |= _PAGE_NX;
+#endif
 
 	/*
 	 * The kernel text needs to be executable for obvious reasons
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index 2492d16..a5f7d0d 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -9,6 +9,7 @@
 #include <linux/uaccess.h>
 #include <asm/pci_x86.h>
 #include <asm/pci-functions.h>
+#include <asm/cacheflush.h>
 
 /* BIOS32 signature: "_32_" */
 #define BIOS32_SIGNATURE	(('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24))
@@ -25,6 +26,27 @@
 #define PCIBIOS_HW_TYPE1_SPEC		0x10
 #define PCIBIOS_HW_TYPE2_SPEC		0x20
 
+int pcibios_enabled;
+
+/* According to the BIOS specification at:
+ * http://members.datafast.net.au/dft0802/specs/bios21.pdf, we could
+ * restrict the x zone to some pages and make it ro. But this may be
+ * broken on some bios, complex to handle with static_protections.
+ * We could make the 0xe0000-0x100000 range rox, but this can break
+ * some ISA mapping.
+ *
+ * So we let's an rw and x hole when pcibios is used. This shouldn't
+ * happen for modern system with mmconfig, and if you don't want it
+ * you could disable pcibios...
+ */
+static inline void set_bios_x(void)
+{
+	pcibios_enabled = 1;
+	set_memory_x(PAGE_OFFSET + BIOS_BEGIN, (BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT);
+	if (__supported_pte_mask & _PAGE_NX)
+		printk(KERN_INFO "PCI : PCI BIOS aera is rw and x. Use pci=nobios if you want it NX.\n");
+}
+
 /*
  * This is the standard structure used to identify the entry point
  * to the BIOS32 Service Directory, as documented in
@@ -332,6 +354,7 @@ static struct pci_raw_ops * __devinit pci_find_bios(void)
 			DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n",
 					bios32_entry);
 			bios32_indirect.address = bios32_entry + PAGE_OFFSET;
+			set_bios_x();
 			if (check_pcibios())
 				return &pci_bios_access;
 		}

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [tip:x86/security] x86: Add NX protection for kernel data
  2010-11-18 14:08 ` [tip:x86/security] x86: Add " tip-bot for Matthieu Castet
@ 2011-01-11 23:31   ` Kees Cook
  2011-01-14 20:15     ` Konrad Rzeszutek Wilk
  0 siblings, 1 reply; 21+ messages in thread
From: Kees Cook @ 2011-01-11 23:31 UTC (permalink / raw)
  To: mingo, hpa, sliakh.lkml, jmorris, linux-kernel, rusty, torvalds,
	ak, davej, jiang, arjan, castet.matthieu, tglx, sfr, mingo,
	Stefan Bader

On Thu, Nov 18, 2010 at 02:08:22PM +0000, tip-bot for Matthieu Castet wrote:
> Commit-ID:  5bd5a452662bc37c54fb6828db1a3faf87e6511c
> Gitweb:     http://git.kernel.org/tip/5bd5a452662bc37c54fb6828db1a3faf87e6511c
> Author:     Matthieu Castet <castet.matthieu@free.fr>
> AuthorDate: Tue, 16 Nov 2010 22:31:26 +0100
> Committer:  Ingo Molnar <mingo@elte.hu>
> CommitDate: Thu, 18 Nov 2010 12:52:04 +0100
> 
> x86: Add NX protection for kernel data

[I'd sent this in reply to the wrong patch before, resending now...]

Hi,

I was just shown this[1] on Xen from an Ubuntu bug report[2].

[    1.230382] NX-protecting the kernel data: 3884k
[    1.231002] BUG: unable to handle kernel paging request at c1782ae0
...
[    1.231145] Call Trace:
[    1.231152]  [<c0138481>] ? __change_page_attr+0x2c1/0x370
[    1.231161]  [<c02163a1>] ? __purge_vmap_area_lazy+0xc1/0x180
[    1.231169]  [<c013857c>] ? __change_page_attr_set_clr+0x4c/0xb0
[    1.231176]  [<c0138838>] ? change_page_attr_set_clr+0x128/0x300
[    1.231183]  [<c010798e>] ? __raw_callee_save_xen_restore_fl+0x6/0x8
[    1.231192]  [<c0159ca1>] ? vprintk+0x171/0x3f0
[    1.231198]  [<c0138bdf>] ? set_memory_nx+0x5f/0x70


Does Xen have different size page table allocations or something weird?

-Kees

[1] http://launchpadlibrarian.net/61853131/natty-failed-boot-ec2.txt
[2] https://bugs.launchpad.net/ubuntu/+source/linux/+bug/699828

-- 
Kees Cook
Ubuntu Security Team

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [tip:x86/security] x86: Add NX protection for kernel data
  2011-01-11 23:31   ` Kees Cook
@ 2011-01-14 20:15     ` Konrad Rzeszutek Wilk
  2011-01-19 21:14       ` Konrad Rzeszutek Wilk
  0 siblings, 1 reply; 21+ messages in thread
From: Konrad Rzeszutek Wilk @ 2011-01-14 20:15 UTC (permalink / raw)
  To: Kees Cook, Jeremy Fitzhardinge, keir.fraser
  Cc: mingo, hpa, sliakh.lkml, jmorris, linux-kernel, rusty, torvalds,
	ak, davej, jiang, arjan, castet.matthieu, tglx, sfr, mingo,
	Stefan Bader

On Tue, Jan 11, 2011 at 03:31:35PM -0800, Kees Cook wrote:
> On Thu, Nov 18, 2010 at 02:08:22PM +0000, tip-bot for Matthieu Castet wrote:
> > Commit-ID:  5bd5a452662bc37c54fb6828db1a3faf87e6511c
> > Gitweb:     http://git.kernel.org/tip/5bd5a452662bc37c54fb6828db1a3faf87e6511c
> > Author:     Matthieu Castet <castet.matthieu@free.fr>
> > AuthorDate: Tue, 16 Nov 2010 22:31:26 +0100
> > Committer:  Ingo Molnar <mingo@elte.hu>
> > CommitDate: Thu, 18 Nov 2010 12:52:04 +0100
> > 
> > x86: Add NX protection for kernel data
> 
> [I'd sent this in reply to the wrong patch before, resending now...]
> 
> Hi,
> 
> I was just shown this[1] on Xen from an Ubuntu bug report[2].
> 
> [    1.230382] NX-protecting the kernel data: 3884k
> [    1.231002] BUG: unable to handle kernel paging request at c1782ae0
> ...
> [    1.231145] Call Trace:
> [    1.231152]  [<c0138481>] ? __change_page_attr+0x2c1/0x370
> [    1.231161]  [<c02163a1>] ? __purge_vmap_area_lazy+0xc1/0x180
> [    1.231169]  [<c013857c>] ? __change_page_attr_set_clr+0x4c/0xb0
> [    1.231176]  [<c0138838>] ? change_page_attr_set_clr+0x128/0x300
> [    1.231183]  [<c010798e>] ? __raw_callee_save_xen_restore_fl+0x6/0x8
> [    1.231192]  [<c0159ca1>] ? vprintk+0x171/0x3f0
> [    1.231198]  [<c0138bdf>] ? set_memory_nx+0x5f/0x70

If you run it with Xen debugging enabled:

[    7.753329] NX-protecting the kernel data: 2400k
(XEN) mm.c:2389:d0 Bad type (saw 3c000003 != exp 70000000) for mfn 1355a5 (pfn 15a5)
(XEN) mm.c:889:d0 Error getting mfn 1355a5 (pfn 15a5) from L1 entry 80000001355a5063 for l1e_owner=0, pg_owner=0
(XEN) mm.c:4958:d0 ptwr_emulate: could not get_page_from_l1e()
[    7.759087] BUG: unable to handle kernel paging request at c82a4d28
[    7.759087] IP: [<c100608c>] xen_set_pte_atomic+0x21/0x2f
[    7.759087] *pdpt = 0000000001663001 *pde = 00000000082db067 *pte = 80000000082a4061 
.. and same stack trace.

> 
> 
> Does Xen have different size page table allocations or something weird?

The same page size. Not sure actually why it is being triggered. Let me copy
Keir on this. Keir, the region that is being marked as _NX is .bss one and
they are marked as RWE in the ELF phhdrs.

Reverting above git 5bd5a452662bc37c54fb6828db1a3faf87e6511c does make Linux
work under Xen.

> 
> -Kees
> 
> [1] http://launchpadlibrarian.net/61853131/natty-failed-boot-ec2.txt
> [2] https://bugs.launchpad.net/ubuntu/+source/linux/+bug/699828
> 
> -- 
> Kees Cook
> Ubuntu Security Team
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [tip:x86/security] x86: Add NX protection for kernel data
  2011-01-14 20:15     ` Konrad Rzeszutek Wilk
@ 2011-01-19 21:14       ` Konrad Rzeszutek Wilk
  2011-01-19 22:59         ` matthieu castet
  0 siblings, 1 reply; 21+ messages in thread
From: Konrad Rzeszutek Wilk @ 2011-01-19 21:14 UTC (permalink / raw)
  To: Kees Cook, Jeremy Fitzhardinge, keir.fraser, castet.matthieu
  Cc: mingo, hpa, sliakh.lkml, jmorris, linux-kernel, rusty, torvalds,
	ak, davej, jiang, arjan, tglx, sfr, mingo, Stefan Bader

On Fri, Jan 14, 2011 at 03:15:30PM -0500, Konrad Rzeszutek Wilk wrote:
> On Tue, Jan 11, 2011 at 03:31:35PM -0800, Kees Cook wrote:
> > On Thu, Nov 18, 2010 at 02:08:22PM +0000, tip-bot for Matthieu Castet wrote:
> > > Commit-ID:  5bd5a452662bc37c54fb6828db1a3faf87e6511c
> > > Gitweb:     http://git.kernel.org/tip/5bd5a452662bc37c54fb6828db1a3faf87e6511c
> > > Author:     Matthieu Castet <castet.matthieu@free.fr>
> > > AuthorDate: Tue, 16 Nov 2010 22:31:26 +0100
> > > Committer:  Ingo Molnar <mingo@elte.hu>
> > > CommitDate: Thu, 18 Nov 2010 12:52:04 +0100
> > > 
> > > x86: Add NX protection for kernel data
> > 
> > [I'd sent this in reply to the wrong patch before, resending now...]
> > 
> > Hi,
> > 
> > I was just shown this[1] on Xen from an Ubuntu bug report[2].
> > 
> > [    1.230382] NX-protecting the kernel data: 3884k
> > [    1.231002] BUG: unable to handle kernel paging request at c1782ae0
> > ...
> > [    1.231145] Call Trace:
> > [    1.231152]  [<c0138481>] ? __change_page_attr+0x2c1/0x370
> > [    1.231161]  [<c02163a1>] ? __purge_vmap_area_lazy+0xc1/0x180
> > [    1.231169]  [<c013857c>] ? __change_page_attr_set_clr+0x4c/0xb0
> > [    1.231176]  [<c0138838>] ? change_page_attr_set_clr+0x128/0x300
> > [    1.231183]  [<c010798e>] ? __raw_callee_save_xen_restore_fl+0x6/0x8
> > [    1.231192]  [<c0159ca1>] ? vprintk+0x171/0x3f0
> > [    1.231198]  [<c0138bdf>] ? set_memory_nx+0x5f/0x70
> 
> If you run it with Xen debugging enabled:
> 
> [    7.753329] NX-protecting the kernel data: 2400k
> (XEN) mm.c:2389:d0 Bad type (saw 3c000003 != exp 70000000) for mfn 1355a5 (pfn 15a5)
> (XEN) mm.c:889:d0 Error getting mfn 1355a5 (pfn 15a5) from L1 entry 80000001355a5063 for l1e_owner=0, pg_owner=0
> (XEN) mm.c:4958:d0 ptwr_emulate: could not get_page_from_l1e()
> [    7.759087] BUG: unable to handle kernel paging request at c82a4d28
> [    7.759087] IP: [<c100608c>] xen_set_pte_atomic+0x21/0x2f
> [    7.759087] *pdpt = 0000000001663001 *pde = 00000000082db067 *pte = 80000000082a4061 
> .. and same stack trace.
> 
> > 
> > 
> > Does Xen have different size page table allocations or something weird?
> 
> The same page size. Not sure actually why it is being triggered. Let me copy
> Keir on this. Keir, the region that is being marked as _NX is .bss one and

Um, it actually is from _etext -> __init_end + HPAGE_SIZE.

instrumenting the code a bit shows that setting of RW+NX from _etext throgh
__init_end works just fine. It just when you start at the PFN _past_ the
__init_end it dies. Any ideas? 


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [tip:x86/security] x86: Add NX protection for kernel data
  2011-01-19 21:14       ` Konrad Rzeszutek Wilk
@ 2011-01-19 22:59         ` matthieu castet
  2011-01-19 23:38           ` Konrad Rzeszutek Wilk
  0 siblings, 1 reply; 21+ messages in thread
From: matthieu castet @ 2011-01-19 22:59 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk
  Cc: Kees Cook, Jeremy Fitzhardinge, keir.fraser, mingo, hpa,
	sliakh.lkml, jmorris, linux-kernel, rusty, torvalds, ak, davej,
	jiang, arjan, tglx, sfr, mingo, Stefan Bader

Le Wed, 19 Jan 2011 16:14:32 -0500,
Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> a écrit :

> On Fri, Jan 14, 2011 at 03:15:30PM -0500, Konrad Rzeszutek Wilk wrote:
> > On Tue, Jan 11, 2011 at 03:31:35PM -0800, Kees Cook wrote:
> > > On Thu, Nov 18, 2010 at 02:08:22PM +0000, tip-bot for Matthieu
> > > Castet wrote:
> > > > Commit-ID:  5bd5a452662bc37c54fb6828db1a3faf87e6511c
> > > > Gitweb:
> > > > http://git.kernel.org/tip/5bd5a452662bc37c54fb6828db1a3faf87e6511c
> > > > Author:     Matthieu Castet <castet.matthieu@free.fr>
> > > > AuthorDate: Tue, 16 Nov 2010 22:31:26 +0100 Committer:  Ingo
> > > > Molnar <mingo@elte.hu> CommitDate: Thu, 18 Nov 2010 12:52:04
> > > > +0100
> > > > 
> > > > x86: Add NX protection for kernel data
> > > 
> > > [I'd sent this in reply to the wrong patch before, resending
> > > now...]
> > > 
> > > Hi,
> > > 
> > > I was just shown this[1] on Xen from an Ubuntu bug report[2].
> > > 
> > > [    1.230382] NX-protecting the kernel data: 3884k
> > > [    1.231002] BUG: unable to handle kernel paging request at
> > > c1782ae0 ...
> > > [    1.231145] Call Trace:
> > > [    1.231152]  [<c0138481>] ? __change_page_attr+0x2c1/0x370
> > > [    1.231161]  [<c02163a1>] ? __purge_vmap_area_lazy+0xc1/0x180
> > > [    1.231169]  [<c013857c>] ?
> > > __change_page_attr_set_clr+0x4c/0xb0 [    1.231176]
> > > [<c0138838>] ? change_page_attr_set_clr+0x128/0x300
> > > [    1.231183]  [<c010798e>] ?
> > > __raw_callee_save_xen_restore_fl+0x6/0x8 [    1.231192]
> > > [<c0159ca1>] ? vprintk+0x171/0x3f0 [    1.231198]  [<c0138bdf>] ?
> > > set_memory_nx+0x5f/0x70
> > 
> > If you run it with Xen debugging enabled:
> > 
> > [    7.753329] NX-protecting the kernel data: 2400k
> > (XEN) mm.c:2389:d0 Bad type (saw 3c000003 != exp 70000000) for mfn
this happen if (x & (PGT_type_mask|PGT_pae_xen_l2)) != type)

but
#define PGT_type_mask       (7U<<29) /* Bits 29-31. */
#define _PGT_pae_xen_l2     26
#define PGT_pae_xen_l2      (1U<<_PGT_pae_xen_l2)

but (exp type = 0x70000000) & (PGT_type_mask|PGT_pae_xen_l2) =
0x60000000

So the exp type look strange.
#define _PGT_pinned         28
#define PGT_pinned          (1U<<_PGT_pinned)

> > 1355a5 (pfn 15a5) (XEN) mm.c:889:d0 Error getting mfn 1355a5 (pfn
> > 15a5) from L1 entry 80000001355a5063 for l1e_owner=0, pg_owner=0
> > (XEN) mm.c:4958:d0 ptwr_emulate: could not get_page_from_l1e()
> > [    7.759087] BUG: unable to handle kernel paging request at
> > c82a4d28 [    7.759087] IP: [<c100608c>]
> > xen_set_pte_atomic+0x21/0x2f [    7.759087] *pdpt =
> > 0000000001663001 *pde = 00000000082db067 *pte = 80000000082a4061 ..
> > and same stack trace.
> > 
> > > 
> > > 
> > > Does Xen have different size page table allocations or something
> > > weird?
> > 
> > The same page size. Not sure actually why it is being triggered.
> > Let me copy Keir on this. Keir, the region that is being marked as
> > _NX is .bss one and
> 
> Um, it actually is from _etext -> __init_end + HPAGE_SIZE.
> 
> instrumenting the code a bit shows that setting of RW+NX from _etext
> throgh __init_end works just fine. It just when you start at the PFN
> _past_ the __init_end it dies. Any ideas? 
> 
Does this happen if you add ". = ALIGN(HPAGE_SIZE);" before bss section
in arch/x86/kernel/vmlinux.lds.S ?

Does it happen if you call set_pages_rw instead of set_pages_nx in
mark_nxdata_nx ?


What's the output of kernel_page_tables debugfs ?


Matthieu

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [tip:x86/security] x86: Add NX protection for kernel data
  2011-01-19 22:59         ` matthieu castet
@ 2011-01-19 23:38           ` Konrad Rzeszutek Wilk
  2011-01-20 11:18             ` castet.matthieu
  0 siblings, 1 reply; 21+ messages in thread
From: Konrad Rzeszutek Wilk @ 2011-01-19 23:38 UTC (permalink / raw)
  To: matthieu castet
  Cc: Kees Cook, Jeremy Fitzhardinge, keir.fraser, mingo, hpa,
	sliakh.lkml, jmorris, linux-kernel, rusty, torvalds, ak, davej,
	jiang, arjan, tglx, sfr, mingo, Stefan Bader

On Wed, Jan 19, 2011 at 11:59:57PM +0100, matthieu castet wrote:
> Le Wed, 19 Jan 2011 16:14:32 -0500,
> Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> a écrit :
> 
> > On Fri, Jan 14, 2011 at 03:15:30PM -0500, Konrad Rzeszutek Wilk wrote:
> > > On Tue, Jan 11, 2011 at 03:31:35PM -0800, Kees Cook wrote:
> > > > On Thu, Nov 18, 2010 at 02:08:22PM +0000, tip-bot for Matthieu
> > > > Castet wrote:
> > > > > Commit-ID:  5bd5a452662bc37c54fb6828db1a3faf87e6511c
> > > > > Gitweb:
> > > > > http://git.kernel.org/tip/5bd5a452662bc37c54fb6828db1a3faf87e6511c
> > > > > Author:     Matthieu Castet <castet.matthieu@free.fr>
> > > > > AuthorDate: Tue, 16 Nov 2010 22:31:26 +0100 Committer:  Ingo
> > > > > Molnar <mingo@elte.hu> CommitDate: Thu, 18 Nov 2010 12:52:04
> > > > > +0100
> > > > > 
> > > > > x86: Add NX protection for kernel data
> > > > 
> > > > [I'd sent this in reply to the wrong patch before, resending
> > > > now...]
> > > > 
> > > > Hi,
> > > > 
> > > > I was just shown this[1] on Xen from an Ubuntu bug report[2].
> > > > 
> > > > [    1.230382] NX-protecting the kernel data: 3884k
> > > > [    1.231002] BUG: unable to handle kernel paging request at
> > > > c1782ae0 ...
> > > > [    1.231145] Call Trace:
> > > > [    1.231152]  [<c0138481>] ? __change_page_attr+0x2c1/0x370
> > > > [    1.231161]  [<c02163a1>] ? __purge_vmap_area_lazy+0xc1/0x180
> > > > [    1.231169]  [<c013857c>] ?
> > > > __change_page_attr_set_clr+0x4c/0xb0 [    1.231176]
> > > > [<c0138838>] ? change_page_attr_set_clr+0x128/0x300
> > > > [    1.231183]  [<c010798e>] ?
> > > > __raw_callee_save_xen_restore_fl+0x6/0x8 [    1.231192]
> > > > [<c0159ca1>] ? vprintk+0x171/0x3f0 [    1.231198]  [<c0138bdf>] ?
> > > > set_memory_nx+0x5f/0x70
> > > 
> > > If you run it with Xen debugging enabled:
> > > 
> > > [    7.753329] NX-protecting the kernel data: 2400k
> > > (XEN) mm.c:2389:d0 Bad type (saw 3c000003 != exp 70000000) for mfn
> this happen if (x & (PGT_type_mask|PGT_pae_xen_l2)) != type)
> 
> but
> #define PGT_type_mask       (7U<<29) /* Bits 29-31. */
> #define _PGT_pae_xen_l2     26
> #define PGT_pae_xen_l2      (1U<<_PGT_pae_xen_l2)
> 
> but (exp type = 0x70000000) & (PGT_type_mask|PGT_pae_xen_l2) =
> 0x60000000
> 
> So the exp type look strange.
> #define _PGT_pinned         28
> #define PGT_pinned          (1U<<_PGT_pinned)
> 
> > > 1355a5 (pfn 15a5) (XEN) mm.c:889:d0 Error getting mfn 1355a5 (pfn
> > > 15a5) from L1 entry 80000001355a5063 for l1e_owner=0, pg_owner=0
> > > (XEN) mm.c:4958:d0 ptwr_emulate: could not get_page_from_l1e()
> > > [    7.759087] BUG: unable to handle kernel paging request at
> > > c82a4d28 [    7.759087] IP: [<c100608c>]
> > > xen_set_pte_atomic+0x21/0x2f [    7.759087] *pdpt =
> > > 0000000001663001 *pde = 00000000082db067 *pte = 80000000082a4061 ..
> > > and same stack trace.
> > > 
> > > > 
> > > > 
> > > > Does Xen have different size page table allocations or something
> > > > weird?
> > > 
> > > The same page size. Not sure actually why it is being triggered.
> > > Let me copy Keir on this. Keir, the region that is being marked as
> > > _NX is .bss one and
> > 
> > Um, it actually is from _etext -> __init_end + HPAGE_SIZE.
> > 
> > instrumenting the code a bit shows that setting of RW+NX from _etext
> > throgh __init_end works just fine. It just when you start at the PFN

It was actually the thrid PFN, and if I looked at the objdump linux -x
output it fall right on .bss and it was swapper_pg_dir..

> > _past_ the __init_end it dies. Any ideas? 
> > 
> Does this happen if you add ". = ALIGN(HPAGE_SIZE);" before bss section
> in arch/x86/kernel/vmlinux.lds.S ?

Like this?

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index b34ab80..e37d10f 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -341,7 +341,7 @@ SECTIONS
 #endif
 
 	/* BSS */
-	. = ALIGN(PAGE_SIZE);
+	. = ALIGN(HPAGE_SIZE);
 	.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
 		__bss_start = .;
 		*(.bss..page_aligned)

yeeeey...That made it boot.

> 
> Does it happen if you call set_pages_rw instead of set_pages_nx in
> mark_nxdata_nx ?

Did not make a difference (tried before the aligment).
> 
> 
> What's the output of kernel_page_tables debugfs ?

Shees.. I get

[   73.723105] BUG: unable to handle kernel paging request at 15555000
[   73.724061] IP: [<c102bef3>] ptdump_show+0x143/0x214 SMP 
[   73.724061] last sysfs file: /sys/devices/pci0000:00/0000:00:18.4/class
[   73.724061] Modules linked in: xen_evtchn sg sd_mod fbcon tileblit ata_generic font bitblit softcursor ttm sata_nv drm_kms_helper video libata scsi_mod xen_blkfront xen_netfront fb_sys_fops sysimgblt sysfillrect syscopyarea xenfs [last unloaded: dump_dma]
[   73.724061] 
[   73.724061] Pid: 2843, comm: cat Tainted: G      D W   2.6.38-rc1test-00126-g9b66acf-dirty #55 N61PB-M2S/N61PB-M2S
[   73.724061] EIP: 0061:[<c102bef3>] EFLAGS: 00010206 CPU: 0
[   73.724061] EIP is at ptdump_show+0x143/0x214
[   73.724061] EAX: 15556000 EBX: 15555000 ECX: fe800000 EDX: 00000555
[   73.724061] ESI: 00000003 EDI: c16c0fa0 EBP: cd057f34 ESP: cd057ef0
[   73.724061]  DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0069
[   73.724061] Process cat (pid: 2843, ti=cd056000 task=cea81400 task.ti=cd056000)
[   73.724061] Stack:
[   73.724061]  fdfff000 00000000 00000555 c16c0ff8 15556000 c0000000 fe800000 caee1cc0
[   73.724061]  00000003 00000000 00000000 fe000000 fe800000 c14fa040 00000001 caee1cc0
[   73.724061]  00000001 cd057f70 c10dd83a caee1ce8 c116de08 ce6a8b40 08628000 00008000
[   73.724061] Call Trace:
[   73.724061]  [<c10dd83a>] seq_read+0x16b/0x321
[   73.724061]  [<c116de08>] ? security_file_permission+0x22/0x26
[   73.724061]  [<c10dd6cf>] ? seq_read+0x0/0x321
[   73.724061]  [<c10c858c>] vfs_read+0x7d/0xdb
[   73.724061]  [<c10c8681>] sys_read+0x3b/0x60
[   73.724061]  [<c13a4009>] syscall_call+0x7/0xb
[   73.724061] Code: 25 ff 0f 00 00 81 e2 00 f0 ff ff 52 50 eb 68 ff 15 2c 9e 4f c1 8b 4d d4 25 00 f0 ff ff 8d 98 00 00 00 c0 2d 00 f0 ff 3f 89 45 cc <8b> 13 8b 43 04 83 c3 08 6a 04 89 55 c4 89 45 c0 89 d0 8b 55 c0 
[   73.724061] EIP: [<c102bef3>] ptdump_show+0x143/0x214 SS:ESP 0069:cd057ef0
[   73.724061] CR2: 0000000015555000
[   73.724061] ---[ end trace 4eaa2a86a8e2da25 ]---

with the patch and if I revert 5bd5a452662bc37c54fb6828db1a3faf87e6511c.. 

That looks to be another bug to hunt down.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [tip:x86/security] x86: Add NX protection for kernel data
  2011-01-19 23:38           ` Konrad Rzeszutek Wilk
@ 2011-01-20 11:18             ` castet.matthieu
  2011-01-20 15:06               ` Konrad Rzeszutek Wilk
  0 siblings, 1 reply; 21+ messages in thread
From: castet.matthieu @ 2011-01-20 11:18 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk
  Cc: matthieu castet, Kees Cook, Jeremy Fitzhardinge, keir.fraser,
	mingo, hpa, sliakh.lkml, jmorris, linux-kernel, rusty, torvalds,
	ak, davej, jiang, arjan, tglx, sfr, mingo, Stefan Bader

Quoting Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>:

> On Wed, Jan 19, 2011 at 11:59:57PM +0100, matthieu castet wrote:
> > Le Wed, 19 Jan 2011 16:14:32 -0500,
> > Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> a écrit :
> > > > >
> > > > > I was just shown this[1] on Xen from an Ubuntu bug report[2].
> > > > >
> > > > > [    1.230382] NX-protecting the kernel data: 3884k
> > > > > [    1.231002] BUG: unable to handle kernel paging request at
> > > > > c1782ae0 ...
> > > > > [    1.231145] Call Trace:
> > > > > [    1.231152]  [<c0138481>] ? __change_page_attr+0x2c1/0x370
> > > > > [    1.231161]  [<c02163a1>] ? __purge_vmap_area_lazy+0xc1/0x180
> > > > > [    1.231169]  [<c013857c>] ?
> > > > > __change_page_attr_set_clr+0x4c/0xb0 [    1.231176]
> > > > > [<c0138838>] ? change_page_attr_set_clr+0x128/0x300
> > > > > [    1.231183]  [<c010798e>] ?
> > > > > __raw_callee_save_xen_restore_fl+0x6/0x8 [    1.231192]
> > > > > [<c0159ca1>] ? vprintk+0x171/0x3f0 [    1.231198]  [<c0138bdf>] ?
> > > > > set_memory_nx+0x5f/0x70
> > > >
> > > > If you run it with Xen debugging enabled:
> > > >
> > > > [    7.753329] NX-protecting the kernel data: 2400k
> > > > (XEN) mm.c:2389:d0 Bad type (saw 3c000003 != exp 70000000) for mfn
> > this happen if (x & (PGT_type_mask|PGT_pae_xen_l2)) != type)
> >
> > but
> > #define PGT_type_mask       (7U<<29) /* Bits 29-31. */
> > #define _PGT_pae_xen_l2     26
> > #define PGT_pae_xen_l2      (1U<<_PGT_pae_xen_l2)
> >
> > but (exp type = 0x70000000) & (PGT_type_mask|PGT_pae_xen_l2) =
> > 0x60000000
> >
> > So the exp type look strange.
> > #define _PGT_pinned         28
> > #define PGT_pinned          (1U<<_PGT_pinned)
> >
> > > > 1355a5 (pfn 15a5) (XEN) mm.c:889:d0 Error getting mfn 1355a5 (pfn
> > > > 15a5) from L1 entry 80000001355a5063 for l1e_owner=0, pg_owner=0
> > > > (XEN) mm.c:4958:d0 ptwr_emulate: could not get_page_from_l1e()
> > > > [    7.759087] BUG: unable to handle kernel paging request at
> > > > c82a4d28 [    7.759087] IP: [<c100608c>]
> > > > xen_set_pte_atomic+0x21/0x2f [    7.759087] *pdpt =
> > > > 0000000001663001 *pde = 00000000082db067 *pte = 80000000082a4061 ..
> > > > and same stack trace.
> > > >
> > > > >
> > > > >
> > > > > Does Xen have different size page table allocations or something
> > > > > weird?
> > > >
> > > > The same page size. Not sure actually why it is being triggered.
> > > > Let me copy Keir on this. Keir, the region that is being marked as
> > > > _NX is .bss one and
> > >
>
> > > _past_ the __init_end it dies. Any ideas?
> > >
> > Does this happen if you add ". = ALIGN(HPAGE_SIZE);" before bss section
> > in arch/x86/kernel/vmlinux.lds.S ?
>
> Like this?
Yes
>
> yeeeey...That made it boot.
>
> >
> > What's the output of kernel_page_tables debugfs ?
>
> Shees.. I get
>
> [   73.723105] BUG: unable to handle kernel paging request at 15555000
[...]
> with the patch and if I revert 5bd5a452662bc37c54fb6828db1a3faf87e6511c..
>
> That looks to be another bug to hunt down.
>
No that the same bug : that the root cause.

For some reason with xen, accessing some page tables (bss and after) make the
system crash.

Resolving this one, should resolve the nx case.


Matthieu


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [tip:x86/security] x86: Add NX protection for kernel data
  2011-01-20 11:18             ` castet.matthieu
@ 2011-01-20 15:06               ` Konrad Rzeszutek Wilk
  2011-01-20 15:37                 ` Ian Campbell
  0 siblings, 1 reply; 21+ messages in thread
From: Konrad Rzeszutek Wilk @ 2011-01-20 15:06 UTC (permalink / raw)
  To: castet.matthieu, Ian Campbell
  Cc: Kees Cook, Jeremy Fitzhardinge, keir.fraser, mingo, hpa,
	sliakh.lkml, jmorris, linux-kernel, rusty, torvalds, ak, davej,
	jiang, arjan, tglx, sfr, mingo, Stefan Bader

On Thu, Jan 20, 2011 at 12:18:26PM +0100, castet.matthieu@free.fr wrote:
> Quoting Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>:
> 
> > On Wed, Jan 19, 2011 at 11:59:57PM +0100, matthieu castet wrote:
> > > Le Wed, 19 Jan 2011 16:14:32 -0500,
> > > Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> a écrit :
> > > > > >
> > > > > > I was just shown this[1] on Xen from an Ubuntu bug report[2].
> > > > > >
> > > > > > [    1.230382] NX-protecting the kernel data: 3884k
> > > > > > [    1.231002] BUG: unable to handle kernel paging request at
> > > > > > c1782ae0 ...
> > > > > > [    1.231145] Call Trace:
> > > > > > [    1.231152]  [<c0138481>] ? __change_page_attr+0x2c1/0x370
> > > > > > [    1.231161]  [<c02163a1>] ? __purge_vmap_area_lazy+0xc1/0x180
> > > > > > [    1.231169]  [<c013857c>] ?
> > > > > > __change_page_attr_set_clr+0x4c/0xb0 [    1.231176]
> > > > > > [<c0138838>] ? change_page_attr_set_clr+0x128/0x300
> > > > > > [    1.231183]  [<c010798e>] ?
> > > > > > __raw_callee_save_xen_restore_fl+0x6/0x8 [    1.231192]
> > > > > > [<c0159ca1>] ? vprintk+0x171/0x3f0 [    1.231198]  [<c0138bdf>] ?
> > > > > > set_memory_nx+0x5f/0x70
> > > > >
> > > > > If you run it with Xen debugging enabled:
> > > > >
> > > > > [    7.753329] NX-protecting the kernel data: 2400k
> > > > > (XEN) mm.c:2389:d0 Bad type (saw 3c000003 != exp 70000000) for mfn
> > > this happen if (x & (PGT_type_mask|PGT_pae_xen_l2)) != type)
> > >
> > > but
> > > #define PGT_type_mask       (7U<<29) /* Bits 29-31. */
> > > #define _PGT_pae_xen_l2     26
> > > #define PGT_pae_xen_l2      (1U<<_PGT_pae_xen_l2)
> > >
> > > but (exp type = 0x70000000) & (PGT_type_mask|PGT_pae_xen_l2) =
> > > 0x60000000
> > >
> > > So the exp type look strange.
> > > #define _PGT_pinned         28
> > > #define PGT_pinned          (1U<<_PGT_pinned)
> > >
> > > > > 1355a5 (pfn 15a5) (XEN) mm.c:889:d0 Error getting mfn 1355a5 (pfn
> > > > > 15a5) from L1 entry 80000001355a5063 for l1e_owner=0, pg_owner=0
> > > > > (XEN) mm.c:4958:d0 ptwr_emulate: could not get_page_from_l1e()
> > > > > [    7.759087] BUG: unable to handle kernel paging request at
> > > > > c82a4d28 [    7.759087] IP: [<c100608c>]
> > > > > xen_set_pte_atomic+0x21/0x2f [    7.759087] *pdpt =
> > > > > 0000000001663001 *pde = 00000000082db067 *pte = 80000000082a4061 ..
> > > > > and same stack trace.
> > > > >
> > > > > >
> > > > > >
> > > > > > Does Xen have different size page table allocations or something
> > > > > > weird?
> > > > >
> > > > > The same page size. Not sure actually why it is being triggered.
> > > > > Let me copy Keir on this. Keir, the region that is being marked as
> > > > > _NX is .bss one and
> > > >
> >
> > > > _past_ the __init_end it dies. Any ideas?
> > > >
> > > Does this happen if you add ". = ALIGN(HPAGE_SIZE);" before bss section
> > > in arch/x86/kernel/vmlinux.lds.S ?
> >
> > Like this?
> Yes
> >
> > yeeeey...That made it boot.
> >
> > >
> > > What's the output of kernel_page_tables debugfs ?
> >
> > Shees.. I get
> >
> > [   73.723105] BUG: unable to handle kernel paging request at 15555000
> [...]
> > with the patch and if I revert 5bd5a452662bc37c54fb6828db1a3faf87e6511c..
> >
> > That looks to be another bug to hunt down.
> >
> No that the same bug : that the root cause.
> 
> For some reason with xen, accessing some page tables (bss and after) make the
> system crash.

I think I know the failure in the first case - the swapper_pg_dir is marked as _RO
and you are not suppose to make it _RW (unless you first do a bit of dance and switch
over to another pagetable). The reason being that Xen has a symbiotic relationship
with PV domains where pagetables are marked _RO so that any update to
it will go through Xen so it can validate that we aren't doing anything stupid.

But accessing the page table should be OK, not sure why it crashed - we
aren't writting anything to it - just reading.

Let me copy Ian on this - he might have better ideas.
> 
> Resolving this one, should resolve the nx case.

That
> 
> 
> Matthieu

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [tip:x86/security] x86: Add NX protection for kernel data
  2011-01-20 15:06               ` Konrad Rzeszutek Wilk
@ 2011-01-20 15:37                 ` Ian Campbell
  2011-01-20 19:05                   ` Konrad Rzeszutek Wilk
  0 siblings, 1 reply; 21+ messages in thread
From: Ian Campbell @ 2011-01-20 15:37 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk
  Cc: castet.matthieu, Kees Cook, Jeremy Fitzhardinge, keir.fraser,
	mingo, hpa, sliakh.lkml, jmorris, linux-kernel, rusty, torvalds,
	ak, davej, jiang, arjan, tglx, sfr, mingo, Stefan Bader

On Thu, 2011-01-20 at 15:06 +0000, Konrad Rzeszutek Wilk wrote:
> On Thu, Jan 20, 2011 at 12:18:26PM +0100, castet.matthieu@free.fr wrote:
> > Quoting Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>:
> > 
> > > On Wed, Jan 19, 2011 at 11:59:57PM +0100, matthieu castet wrote:
> > > > Le Wed, 19 Jan 2011 16:14:32 -0500,
> > > > Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> a écrit :
> > > > > > >
> > > > > > > I was just shown this[1] on Xen from an Ubuntu bug report[2].
> > > > > > >
> > > > > > > [    1.230382] NX-protecting the kernel data: 3884k
> > > > > > > [    1.231002] BUG: unable to handle kernel paging request at
> > > > > > > c1782ae0 ...
> > > > > > > [    1.231145] Call Trace:
> > > > > > > [    1.231152]  [<c0138481>] ? __change_page_attr+0x2c1/0x370
> > > > > > > [    1.231161]  [<c02163a1>] ? __purge_vmap_area_lazy+0xc1/0x180
> > > > > > > [    1.231169]  [<c013857c>] ?
> > > > > > > __change_page_attr_set_clr+0x4c/0xb0 [    1.231176]
> > > > > > > [<c0138838>] ? change_page_attr_set_clr+0x128/0x300
> > > > > > > [    1.231183]  [<c010798e>] ?
> > > > > > > __raw_callee_save_xen_restore_fl+0x6/0x8 [    1.231192]
> > > > > > > [<c0159ca1>] ? vprintk+0x171/0x3f0 [    1.231198]  [<c0138bdf>] ?
> > > > > > > set_memory_nx+0x5f/0x70
> > > > > >
> > > > > > If you run it with Xen debugging enabled:
> > > > > >
> > > > > > [    7.753329] NX-protecting the kernel data: 2400k
> > > > > > (XEN) mm.c:2389:d0 Bad type (saw 3c000003 != exp 70000000) for mfn
> > > > this happen if (x & (PGT_type_mask|PGT_pae_xen_l2)) != type)
> > > >
> > > > but
> > > > #define PGT_type_mask       (7U<<29) /* Bits 29-31. */
> > > > #define _PGT_pae_xen_l2     26
> > > > #define PGT_pae_xen_l2      (1U<<_PGT_pae_xen_l2)
> > > >
> > > > but (exp type = 0x70000000) & (PGT_type_mask|PGT_pae_xen_l2) =
> > > > 0x60000000
> > > >
> > > > So the exp type look strange.
> > > > #define _PGT_pinned         28
> > > > #define PGT_pinned          (1U<<_PGT_pinned)
> > > >
> > > > > > 1355a5 (pfn 15a5) (XEN) mm.c:889:d0 Error getting mfn 1355a5 (pfn
> > > > > > 15a5) from L1 entry 80000001355a5063 for l1e_owner=0, pg_owner=0
> > > > > > (XEN) mm.c:4958:d0 ptwr_emulate: could not get_page_from_l1e()
> > > > > > [    7.759087] BUG: unable to handle kernel paging request at
> > > > > > c82a4d28 [    7.759087] IP: [<c100608c>]
> > > > > > xen_set_pte_atomic+0x21/0x2f [    7.759087] *pdpt =
> > > > > > 0000000001663001 *pde = 00000000082db067 *pte = 80000000082a4061 ..
> > > > > > and same stack trace.
> > > > > >
> > > > > > >
> > > > > > >
> > > > > > > Does Xen have different size page table allocations or something
> > > > > > > weird?
> > > > > >
> > > > > > The same page size. Not sure actually why it is being triggered.
> > > > > > Let me copy Keir on this. Keir, the region that is being marked as
> > > > > > _NX is .bss one and
> > > > >
> > >
> > > > > _past_ the __init_end it dies. Any ideas?
> > > > >
> > > > Does this happen if you add ". = ALIGN(HPAGE_SIZE);" before bss section
> > > > in arch/x86/kernel/vmlinux.lds.S ?
> > >
> > > Like this?
> > Yes
> > >
> > > yeeeey...That made it boot.
> > >
> > > >
> > > > What's the output of kernel_page_tables debugfs ?
> > >
> > > Shees.. I get
> > >
> > > [   73.723105] BUG: unable to handle kernel paging request at 15555000
> > [...]
> > > with the patch and if I revert 5bd5a452662bc37c54fb6828db1a3faf87e6511c..
> > >
> > > That looks to be another bug to hunt down.
> > >
> > No that the same bug : that the root cause.
> > 
> > For some reason with xen, accessing some page tables (bss and after) make the
> > system crash.
> 
> I think I know the failure in the first case - the swapper_pg_dir is marked as _RO
> and you are not suppose to make it _RW (unless you first do a bit of dance and switch
> over to another pagetable). The reason being that Xen has a symbiotic relationship
> with PV domains where pagetables are marked _RO so that any update to
> it will go through Xen so it can validate that we aren't doing anything stupid.
> 
> But accessing the page table should be OK, not sure why it crashed - we
> aren't writting anything to it - just reading.
> 
> Let me copy Ian on this - he might have better ideas.

It's pretty hard to follow the quoted context above but it certainly
seems plausible that set_memory_nx could inadvertently end up trying to
make a page which Xen made RO into a RW again.

For example the callchain appear to pass through static_protections()
which explicitly makes .data and .bss writeable, I think these regions
can potentially contain page table pages -- e.g. allocated from BRK
perhaps?

Ian.



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [tip:x86/security] x86: Add NX protection for kernel data
  2011-01-20 15:37                 ` Ian Campbell
@ 2011-01-20 19:05                   ` Konrad Rzeszutek Wilk
  2011-01-20 20:23                     ` matthieu castet
  0 siblings, 1 reply; 21+ messages in thread
From: Konrad Rzeszutek Wilk @ 2011-01-20 19:05 UTC (permalink / raw)
  To: Ian Campbell
  Cc: castet.matthieu, Kees Cook, Jeremy Fitzhardinge, keir.fraser,
	mingo, hpa, sliakh.lkml, jmorris, linux-kernel, rusty, torvalds,
	ak, davej, jiang, arjan, tglx, sfr, mingo, Stefan Bader

On Thu, Jan 20, 2011 at 03:37:36PM +0000, Ian Campbell wrote:
> On Thu, 2011-01-20 at 15:06 +0000, Konrad Rzeszutek Wilk wrote:
> > On Thu, Jan 20, 2011 at 12:18:26PM +0100, castet.matthieu@free.fr wrote:
> > > Quoting Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>:
> > > 
> > > > On Wed, Jan 19, 2011 at 11:59:57PM +0100, matthieu castet wrote:
> > > > > Le Wed, 19 Jan 2011 16:14:32 -0500,
> > > > > Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> a écrit :
> > > > > > > >
> > > > > > > > I was just shown this[1] on Xen from an Ubuntu bug report[2].
> > > > > > > >
> > > > > > > > [    1.230382] NX-protecting the kernel data: 3884k
> > > > > > > > [    1.231002] BUG: unable to handle kernel paging request at
> > > > > > > > c1782ae0 ...
> > > > > > > > [    1.231145] Call Trace:
> > > > > > > > [    1.231152]  [<c0138481>] ? __change_page_attr+0x2c1/0x370
> > > > > > > > [    1.231161]  [<c02163a1>] ? __purge_vmap_area_lazy+0xc1/0x180
> > > > > > > > [    1.231169]  [<c013857c>] ?
> > > > > > > > __change_page_attr_set_clr+0x4c/0xb0 [    1.231176]
> > > > > > > > [<c0138838>] ? change_page_attr_set_clr+0x128/0x300
> > > > > > > > [    1.231183]  [<c010798e>] ?
> > > > > > > > __raw_callee_save_xen_restore_fl+0x6/0x8 [    1.231192]
> > > > > > > > [<c0159ca1>] ? vprintk+0x171/0x3f0 [    1.231198]  [<c0138bdf>] ?
> > > > > > > > set_memory_nx+0x5f/0x70
> > > > > > >
> > > > > > > If you run it with Xen debugging enabled:
> > > > > > >
> > > > > > > [    7.753329] NX-protecting the kernel data: 2400k
> > > > > > > (XEN) mm.c:2389:d0 Bad type (saw 3c000003 != exp 70000000) for mfn
> > > > > this happen if (x & (PGT_type_mask|PGT_pae_xen_l2)) != type)
> > > > >
> > > > > but
> > > > > #define PGT_type_mask       (7U<<29) /* Bits 29-31. */
> > > > > #define _PGT_pae_xen_l2     26
> > > > > #define PGT_pae_xen_l2      (1U<<_PGT_pae_xen_l2)
> > > > >
> > > > > but (exp type = 0x70000000) & (PGT_type_mask|PGT_pae_xen_l2) =
> > > > > 0x60000000
> > > > >
> > > > > So the exp type look strange.
> > > > > #define _PGT_pinned         28
> > > > > #define PGT_pinned          (1U<<_PGT_pinned)
> > > > >
> > > > > > > 1355a5 (pfn 15a5) (XEN) mm.c:889:d0 Error getting mfn 1355a5 (pfn
> > > > > > > 15a5) from L1 entry 80000001355a5063 for l1e_owner=0, pg_owner=0
> > > > > > > (XEN) mm.c:4958:d0 ptwr_emulate: could not get_page_from_l1e()
> > > > > > > [    7.759087] BUG: unable to handle kernel paging request at
> > > > > > > c82a4d28 [    7.759087] IP: [<c100608c>]
> > > > > > > xen_set_pte_atomic+0x21/0x2f [    7.759087] *pdpt =
> > > > > > > 0000000001663001 *pde = 00000000082db067 *pte = 80000000082a4061 ..
> > > > > > > and same stack trace.
> > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > Does Xen have different size page table allocations or something
> > > > > > > > weird?
> > > > > > >
> > > > > > > The same page size. Not sure actually why it is being triggered.
> > > > > > > Let me copy Keir on this. Keir, the region that is being marked as
> > > > > > > _NX is .bss one and
> > > > > >
> > > >
> > > > > > _past_ the __init_end it dies. Any ideas?
> > > > > >
> > > > > Does this happen if you add ". = ALIGN(HPAGE_SIZE);" before bss section
> > > > > in arch/x86/kernel/vmlinux.lds.S ?
> > > >
> > > > Like this?
> > > Yes
> > > >
> > > > yeeeey...That made it boot.
> > > >
> > > > >
> > > > > What's the output of kernel_page_tables debugfs ?
> > > >
> > > > Shees.. I get
> > > >
> > > > [   73.723105] BUG: unable to handle kernel paging request at 15555000
> > > [...]
> > > > with the patch and if I revert 5bd5a452662bc37c54fb6828db1a3faf87e6511c..
> > > >
> > > > That looks to be another bug to hunt down.
> > > >
> > > No that the same bug : that the root cause.
> > > 
> > > For some reason with xen, accessing some page tables (bss and after) make the
> > > system crash.
> > 
> > I think I know the failure in the first case - the swapper_pg_dir is marked as _RO
> > and you are not suppose to make it _RW (unless you first do a bit of dance and switch
> > over to another pagetable). The reason being that Xen has a symbiotic relationship
> > with PV domains where pagetables are marked _RO so that any update to
> > it will go through Xen so it can validate that we aren't doing anything stupid.
> > 
> > But accessing the page table should be OK, not sure why it crashed - we
> > aren't writting anything to it - just reading.
> > 
> > Let me copy Ian on this - he might have better ideas.
> 
> It's pretty hard to follow the quoted context above but it certainly
> seems plausible that set_memory_nx could inadvertently end up trying to
> make a page which Xen made RO into a RW again.
> 
> For example the callchain appear to pass through static_protections()
> which explicitly makes .data and .bss writeable, I think these regions
> can potentially contain page table pages -- e.g. allocated from BRK
> perhaps?

They definitly do - it has the level1_ident_pgt, which is definitly used
during bootup.

Perhaps the fix is when marking NX, just do NX, don't try to set RW if they
are RO.

> 
> Ian.
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [tip:x86/security] x86: Add NX protection for kernel data
  2011-01-20 19:05                   ` Konrad Rzeszutek Wilk
@ 2011-01-20 20:23                     ` matthieu castet
  2011-01-20 21:04                       ` Konrad Rzeszutek Wilk
  0 siblings, 1 reply; 21+ messages in thread
From: matthieu castet @ 2011-01-20 20:23 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk
  Cc: Ian Campbell, Kees Cook, Jeremy Fitzhardinge, keir.fraser, mingo,
	hpa, sliakh.lkml, jmorris, linux-kernel, rusty, torvalds, ak,
	davej, jiang, arjan, tglx, sfr, mingo, Stefan Bader

[-- Attachment #1: Type: text/plain, Size: 4778 bytes --]

Konrad Rzeszutek Wilk a écrit :
> On Thu, Jan 20, 2011 at 03:37:36PM +0000, Ian Campbell wrote:
>> On Thu, 2011-01-20 at 15:06 +0000, Konrad Rzeszutek Wilk wrote:
>>> On Thu, Jan 20, 2011 at 12:18:26PM +0100, castet.matthieu@free.fr wrote:
>>>> Quoting Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>:
>>>>
>>>>> On Wed, Jan 19, 2011 at 11:59:57PM +0100, matthieu castet wrote:
>>>>>> Le Wed, 19 Jan 2011 16:14:32 -0500,
>>>>>> Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> a écrit :
>>>>>>>>> I was just shown this[1] on Xen from an Ubuntu bug report[2].
>>>>>>>>>
>>>>>>>>> [    1.230382] NX-protecting the kernel data: 3884k
>>>>>>>>> [    1.231002] BUG: unable to handle kernel paging request at
>>>>>>>>> c1782ae0 ...
>>>>>>>>> [    1.231145] Call Trace:
>>>>>>>>> [    1.231152]  [<c0138481>] ? __change_page_attr+0x2c1/0x370
>>>>>>>>> [    1.231161]  [<c02163a1>] ? __purge_vmap_area_lazy+0xc1/0x180
>>>>>>>>> [    1.231169]  [<c013857c>] ?
>>>>>>>>> __change_page_attr_set_clr+0x4c/0xb0 [    1.231176]
>>>>>>>>> [<c0138838>] ? change_page_attr_set_clr+0x128/0x300
>>>>>>>>> [    1.231183]  [<c010798e>] ?
>>>>>>>>> __raw_callee_save_xen_restore_fl+0x6/0x8 [    1.231192]
>>>>>>>>> [<c0159ca1>] ? vprintk+0x171/0x3f0 [    1.231198]  [<c0138bdf>] ?
>>>>>>>>> set_memory_nx+0x5f/0x70
>>>>>>>> If you run it with Xen debugging enabled:
>>>>>>>>
>>>>>>>> [    7.753329] NX-protecting the kernel data: 2400k
>>>>>>>> (XEN) mm.c:2389:d0 Bad type (saw 3c000003 != exp 70000000) for mfn
>>>>>> this happen if (x & (PGT_type_mask|PGT_pae_xen_l2)) != type)
>>>>>>
>>>>>> but
>>>>>> #define PGT_type_mask       (7U<<29) /* Bits 29-31. */
>>>>>> #define _PGT_pae_xen_l2     26
>>>>>> #define PGT_pae_xen_l2      (1U<<_PGT_pae_xen_l2)
>>>>>>
>>>>>> but (exp type = 0x70000000) & (PGT_type_mask|PGT_pae_xen_l2) =
>>>>>> 0x60000000
>>>>>>
>>>>>> So the exp type look strange.
>>>>>> #define _PGT_pinned         28
>>>>>> #define PGT_pinned          (1U<<_PGT_pinned)
>>>>>>
>>>>>>>> 1355a5 (pfn 15a5) (XEN) mm.c:889:d0 Error getting mfn 1355a5 (pfn
>>>>>>>> 15a5) from L1 entry 80000001355a5063 for l1e_owner=0, pg_owner=0
>>>>>>>> (XEN) mm.c:4958:d0 ptwr_emulate: could not get_page_from_l1e()
>>>>>>>> [    7.759087] BUG: unable to handle kernel paging request at
>>>>>>>> c82a4d28 [    7.759087] IP: [<c100608c>]
>>>>>>>> xen_set_pte_atomic+0x21/0x2f [    7.759087] *pdpt =
>>>>>>>> 0000000001663001 *pde = 00000000082db067 *pte = 80000000082a4061 ..
>>>>>>>> and same stack trace.
>>>>>>>>
>>>>>>>>>
>>>>>>>>> Does Xen have different size page table allocations or something
>>>>>>>>> weird?
>>>>>>>> The same page size. Not sure actually why it is being triggered.
>>>>>>>> Let me copy Keir on this. Keir, the region that is being marked as
>>>>>>>> _NX is .bss one and
>>>>>>> _past_ the __init_end it dies. Any ideas?
>>>>>>>
>>>>>> Does this happen if you add ". = ALIGN(HPAGE_SIZE);" before bss section
>>>>>> in arch/x86/kernel/vmlinux.lds.S ?
>>>>> Like this?
>>>> Yes
>>>>> yeeeey...That made it boot.
>>>>>
>>>>>> What's the output of kernel_page_tables debugfs ?
>>>>> Shees.. I get
>>>>>
>>>>> [   73.723105] BUG: unable to handle kernel paging request at 15555000
>>>> [...]
>>>>> with the patch and if I revert 5bd5a452662bc37c54fb6828db1a3faf87e6511c..
>>>>>
>>>>> That looks to be another bug to hunt down.
>>>>>
>>>> No that the same bug : that the root cause.
>>>>
>>>> For some reason with xen, accessing some page tables (bss and after) make the
>>>> system crash.
>>> I think I know the failure in the first case - the swapper_pg_dir is marked as _RO
>>> and you are not suppose to make it _RW (unless you first do a bit of dance and switch
>>> over to another pagetable). The reason being that Xen has a symbiotic relationship
>>> with PV domains where pagetables are marked _RO so that any update to
>>> it will go through Xen so it can validate that we aren't doing anything stupid.
>>>
>>> But accessing the page table should be OK, not sure why it crashed - we
>>> aren't writting anything to it - just reading.
>>>
>>> Let me copy Ian on this - he might have better ideas.
>> It's pretty hard to follow the quoted context above but it certainly
>> seems plausible that set_memory_nx could inadvertently end up trying to
>> make a page which Xen made RO into a RW again.
>>
>> For example the callchain appear to pass through static_protections()
>> which explicitly makes .data and .bss writeable, I think these regions
>> can potentially contain page table pages -- e.g. allocated from BRK
>> perhaps?
> 
> They definitly do - it has the level1_ident_pgt, which is definitly used
> during bootup.
> 
Ok that make sense
> Perhaps the fix is when marking NX, just do NX, don't try to set RW if they
> are RO.
> 
What do you think of this patch ?


Matthieu

[-- Attachment #2: 0001-NX-protection-for-kernel-data-support-xen.patch --]
[-- Type: text/x-diff, Size: 1365 bytes --]

>From 928dabe66cc5992587eb70410208ca9885c64a5c Mon Sep 17 00:00:00 2001
From: Matthieu CASTET <castet.matthieu@free.fr>
Date: Thu, 20 Jan 2011 21:11:45 +0100
Subject: [PATCH] NX protection for kernel data : support xen

Xen want page table pages read only.

But the initial page table (from head_*.S) live in .data or .bss.
Don't make static_protections enforce rw for .data/.bss in xen case.

Signed-off-by: Matthieu CASTET <castet.matthieu@free.fr>
---
 arch/x86/mm/pageattr.c |    5 ++++-
 1 files changed, 4 insertions(+), 1 deletions(-)

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 8b830ca..8698521 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -283,11 +283,14 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 		   __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
 		pgprot_val(forbidden) |= _PAGE_RW;
 	/*
-	 * .data and .bss should always be writable.
+	 * .data and .bss should always be writable, but xen won't like
+	 * if we make page table rw (that live in .data or .bss)
 	 */
+#ifndef CONFIG_XEN
 	if (within(address, (unsigned long)_sdata, (unsigned long)_edata) ||
 	    within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop))
 		pgprot_val(required) |= _PAGE_RW;
+#endif
 
 #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
 	/*
-- 
1.7.2.3


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [tip:x86/security] x86: Add NX protection for kernel data
  2011-01-20 20:23                     ` matthieu castet
@ 2011-01-20 21:04                       ` Konrad Rzeszutek Wilk
  2011-01-20 21:19                         ` Konrad Rzeszutek Wilk
  0 siblings, 1 reply; 21+ messages in thread
From: Konrad Rzeszutek Wilk @ 2011-01-20 21:04 UTC (permalink / raw)
  To: matthieu castet
  Cc: Ian Campbell, Kees Cook, Jeremy Fitzhardinge, keir.fraser, mingo,
	hpa, sliakh.lkml, jmorris, linux-kernel, rusty, torvalds, ak,
	davej, jiang, arjan, tglx, sfr, mingo, Stefan Bader

On Thu, Jan 20, 2011 at 09:23:07PM +0100, matthieu castet wrote:
> Konrad Rzeszutek Wilk a écrit :
> >On Thu, Jan 20, 2011 at 03:37:36PM +0000, Ian Campbell wrote:
> >>On Thu, 2011-01-20 at 15:06 +0000, Konrad Rzeszutek Wilk wrote:
> >>>On Thu, Jan 20, 2011 at 12:18:26PM +0100, castet.matthieu@free.fr wrote:
> >>>>Quoting Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>:
> >>>>
> >>>>>On Wed, Jan 19, 2011 at 11:59:57PM +0100, matthieu castet wrote:
> >>>>>>Le Wed, 19 Jan 2011 16:14:32 -0500,
> >>>>>>Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> a écrit :
> >>>>>>>>>I was just shown this[1] on Xen from an Ubuntu bug report[2].
> >>>>>>>>>
> >>>>>>>>>[    1.230382] NX-protecting the kernel data: 3884k
> >>>>>>>>>[    1.231002] BUG: unable to handle kernel paging request at
> >>>>>>>>>c1782ae0 ...
> >>>>>>>>>[    1.231145] Call Trace:
> >>>>>>>>>[    1.231152]  [<c0138481>] ? __change_page_attr+0x2c1/0x370
> >>>>>>>>>[    1.231161]  [<c02163a1>] ? __purge_vmap_area_lazy+0xc1/0x180
> >>>>>>>>>[    1.231169]  [<c013857c>] ?
> >>>>>>>>>__change_page_attr_set_clr+0x4c/0xb0 [    1.231176]
> >>>>>>>>>[<c0138838>] ? change_page_attr_set_clr+0x128/0x300
> >>>>>>>>>[    1.231183]  [<c010798e>] ?
> >>>>>>>>>__raw_callee_save_xen_restore_fl+0x6/0x8 [    1.231192]
> >>>>>>>>>[<c0159ca1>] ? vprintk+0x171/0x3f0 [    1.231198]  [<c0138bdf>] ?
> >>>>>>>>>set_memory_nx+0x5f/0x70
> >>>>>>>>If you run it with Xen debugging enabled:
> >>>>>>>>
> >>>>>>>>[    7.753329] NX-protecting the kernel data: 2400k
> >>>>>>>>(XEN) mm.c:2389:d0 Bad type (saw 3c000003 != exp 70000000) for mfn
> >>>>>>this happen if (x & (PGT_type_mask|PGT_pae_xen_l2)) != type)
> >>>>>>
> >>>>>>but
> >>>>>>#define PGT_type_mask       (7U<<29) /* Bits 29-31. */
> >>>>>>#define _PGT_pae_xen_l2     26
> >>>>>>#define PGT_pae_xen_l2      (1U<<_PGT_pae_xen_l2)
> >>>>>>
> >>>>>>but (exp type = 0x70000000) & (PGT_type_mask|PGT_pae_xen_l2) =
> >>>>>>0x60000000
> >>>>>>
> >>>>>>So the exp type look strange.
> >>>>>>#define _PGT_pinned         28
> >>>>>>#define PGT_pinned          (1U<<_PGT_pinned)
> >>>>>>
> >>>>>>>>1355a5 (pfn 15a5) (XEN) mm.c:889:d0 Error getting mfn 1355a5 (pfn
> >>>>>>>>15a5) from L1 entry 80000001355a5063 for l1e_owner=0, pg_owner=0
> >>>>>>>>(XEN) mm.c:4958:d0 ptwr_emulate: could not get_page_from_l1e()
> >>>>>>>>[    7.759087] BUG: unable to handle kernel paging request at
> >>>>>>>>c82a4d28 [    7.759087] IP: [<c100608c>]
> >>>>>>>>xen_set_pte_atomic+0x21/0x2f [    7.759087] *pdpt =
> >>>>>>>>0000000001663001 *pde = 00000000082db067 *pte = 80000000082a4061 ..
> >>>>>>>>and same stack trace.
> >>>>>>>>
> >>>>>>>>>
> >>>>>>>>>Does Xen have different size page table allocations or something
> >>>>>>>>>weird?
> >>>>>>>>The same page size. Not sure actually why it is being triggered.
> >>>>>>>>Let me copy Keir on this. Keir, the region that is being marked as
> >>>>>>>>_NX is .bss one and
> >>>>>>>_past_ the __init_end it dies. Any ideas?
> >>>>>>>
> >>>>>>Does this happen if you add ". = ALIGN(HPAGE_SIZE);" before bss section
> >>>>>>in arch/x86/kernel/vmlinux.lds.S ?
> >>>>>Like this?
> >>>>Yes
> >>>>>yeeeey...That made it boot.
> >>>>>
> >>>>>>What's the output of kernel_page_tables debugfs ?
> >>>>>Shees.. I get
> >>>>>
> >>>>>[   73.723105] BUG: unable to handle kernel paging request at 15555000
> >>>>[...]
> >>>>>with the patch and if I revert 5bd5a452662bc37c54fb6828db1a3faf87e6511c..
> >>>>>
> >>>>>That looks to be another bug to hunt down.
> >>>>>
> >>>>No that the same bug : that the root cause.
> >>>>
> >>>>For some reason with xen, accessing some page tables (bss and after) make the
> >>>>system crash.
> >>>I think I know the failure in the first case - the swapper_pg_dir is marked as _RO
> >>>and you are not suppose to make it _RW (unless you first do a bit of dance and switch
> >>>over to another pagetable). The reason being that Xen has a symbiotic relationship
> >>>with PV domains where pagetables are marked _RO so that any update to
> >>>it will go through Xen so it can validate that we aren't doing anything stupid.
> >>>
> >>>But accessing the page table should be OK, not sure why it crashed - we
> >>>aren't writting anything to it - just reading.
> >>>
> >>>Let me copy Ian on this - he might have better ideas.
> >>It's pretty hard to follow the quoted context above but it certainly
> >>seems plausible that set_memory_nx could inadvertently end up trying to
> >>make a page which Xen made RO into a RW again.
> >>
> >>For example the callchain appear to pass through static_protections()
> >>which explicitly makes .data and .bss writeable, I think these regions
> >>can potentially contain page table pages -- e.g. allocated from BRK
> >>perhaps?
> >
> >They definitly do - it has the level1_ident_pgt, which is definitly used
> >during bootup.
> >
> Ok that make sense
> >Perhaps the fix is when marking NX, just do NX, don't try to set RW if they
> >are RO.
> >
> What do you think of this patch ?
> 
> 
> Matthieu

> >From 928dabe66cc5992587eb70410208ca9885c64a5c Mon Sep 17 00:00:00 2001
> From: Matthieu CASTET <castet.matthieu@free.fr>
> Date: Thu, 20 Jan 2011 21:11:45 +0100
> Subject: [PATCH] NX protection for kernel data : support xen
> 
> Xen want page table pages read only.
> 
> But the initial page table (from head_*.S) live in .data or .bss.
> Don't make static_protections enforce rw for .data/.bss in xen case.
> 
> Signed-off-by: Matthieu CASTET <castet.matthieu@free.fr>
> ---
>  arch/x86/mm/pageattr.c |    5 ++++-
>  1 files changed, 4 insertions(+), 1 deletions(-)
> 
> diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
> index 8b830ca..8698521 100644
> --- a/arch/x86/mm/pageattr.c
> +++ b/arch/x86/mm/pageattr.c
> @@ -283,11 +283,14 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
>  		   __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
>  		pgprot_val(forbidden) |= _PAGE_RW;
>  	/*
> -	 * .data and .bss should always be writable.
> +	 * .data and .bss should always be writable, but xen won't like
> +	 * if we make page table rw (that live in .data or .bss)
>  	 */
> +#ifndef CONFIG_XEN
>  	if (within(address, (unsigned long)_sdata, (unsigned long)_edata) ||
>  	    within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop))
>  		pgprot_val(required) |= _PAGE_RW;
> +#endif

<shudders>There has to be a better way than this. Keep in mind that this
would mean that any kernel that runs with the pvops turned on (pretty much all distros)
will do this. You don't need anymore to build a kernel that is Xen specific - it is
one kernel that can run on baremetal, Xen, etc.

Is there no way to just say, pass in PAGE_NX and don't unset the other
permissions? Hmm, there is something right below what your patch does:

  if (kernel_set_to_readonly &&
            within(address, (unsigned long)_text,
                   (unsigned long)__end_rodata_hpage_align)) {
                unsigned int level;

...
                 * This also fixes the Linux Xen paravirt guest boot failure
                 * (because of unexpected read-only mappings for kernel identity
                 * mappings). In this paravirt guest case, the kernel text
...


Could we just expand the search criteria to be __end ? 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [tip:x86/security] x86: Add NX protection for kernel data
  2011-01-20 21:04                       ` Konrad Rzeszutek Wilk
@ 2011-01-20 21:19                         ` Konrad Rzeszutek Wilk
  2011-01-20 21:55                           ` Konrad Rzeszutek Wilk
  0 siblings, 1 reply; 21+ messages in thread
From: Konrad Rzeszutek Wilk @ 2011-01-20 21:19 UTC (permalink / raw)
  To: matthieu castet
  Cc: Ian Campbell, Kees Cook, Jeremy Fitzhardinge, keir.fraser, mingo,
	hpa, sliakh.lkml, jmorris, linux-kernel, rusty, torvalds, ak,
	davej, jiang, arjan, tglx, sfr, mingo, Stefan Bader

On Thu, Jan 20, 2011 at 04:04:36PM -0500, Konrad Rzeszutek Wilk wrote:
> On Thu, Jan 20, 2011 at 09:23:07PM +0100, matthieu castet wrote:
> > Konrad Rzeszutek Wilk a écrit :
> > >On Thu, Jan 20, 2011 at 03:37:36PM +0000, Ian Campbell wrote:
> > >>On Thu, 2011-01-20 at 15:06 +0000, Konrad Rzeszutek Wilk wrote:
> > >>>On Thu, Jan 20, 2011 at 12:18:26PM +0100, castet.matthieu@free.fr wrote:
> > >>>>Quoting Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>:
> > >>>>
> > >>>>>On Wed, Jan 19, 2011 at 11:59:57PM +0100, matthieu castet wrote:
> > >>>>>>Le Wed, 19 Jan 2011 16:14:32 -0500,
> > >>>>>>Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> a écrit :
> > >>>>>>>>>I was just shown this[1] on Xen from an Ubuntu bug report[2].
> > >>>>>>>>>
> > >>>>>>>>>[    1.230382] NX-protecting the kernel data: 3884k
> > >>>>>>>>>[    1.231002] BUG: unable to handle kernel paging request at
> > >>>>>>>>>c1782ae0 ...
> > >>>>>>>>>[    1.231145] Call Trace:
> > >>>>>>>>>[    1.231152]  [<c0138481>] ? __change_page_attr+0x2c1/0x370
> > >>>>>>>>>[    1.231161]  [<c02163a1>] ? __purge_vmap_area_lazy+0xc1/0x180
> > >>>>>>>>>[    1.231169]  [<c013857c>] ?
> > >>>>>>>>>__change_page_attr_set_clr+0x4c/0xb0 [    1.231176]
> > >>>>>>>>>[<c0138838>] ? change_page_attr_set_clr+0x128/0x300
> > >>>>>>>>>[    1.231183]  [<c010798e>] ?
> > >>>>>>>>>__raw_callee_save_xen_restore_fl+0x6/0x8 [    1.231192]
> > >>>>>>>>>[<c0159ca1>] ? vprintk+0x171/0x3f0 [    1.231198]  [<c0138bdf>] ?
> > >>>>>>>>>set_memory_nx+0x5f/0x70
> > >>>>>>>>If you run it with Xen debugging enabled:
> > >>>>>>>>
> > >>>>>>>>[    7.753329] NX-protecting the kernel data: 2400k
> > >>>>>>>>(XEN) mm.c:2389:d0 Bad type (saw 3c000003 != exp 70000000) for mfn
> > >>>>>>this happen if (x & (PGT_type_mask|PGT_pae_xen_l2)) != type)
> > >>>>>>
> > >>>>>>but
> > >>>>>>#define PGT_type_mask       (7U<<29) /* Bits 29-31. */
> > >>>>>>#define _PGT_pae_xen_l2     26
> > >>>>>>#define PGT_pae_xen_l2      (1U<<_PGT_pae_xen_l2)
> > >>>>>>
> > >>>>>>but (exp type = 0x70000000) & (PGT_type_mask|PGT_pae_xen_l2) =
> > >>>>>>0x60000000
> > >>>>>>
> > >>>>>>So the exp type look strange.
> > >>>>>>#define _PGT_pinned         28
> > >>>>>>#define PGT_pinned          (1U<<_PGT_pinned)
> > >>>>>>
> > >>>>>>>>1355a5 (pfn 15a5) (XEN) mm.c:889:d0 Error getting mfn 1355a5 (pfn
> > >>>>>>>>15a5) from L1 entry 80000001355a5063 for l1e_owner=0, pg_owner=0
> > >>>>>>>>(XEN) mm.c:4958:d0 ptwr_emulate: could not get_page_from_l1e()
> > >>>>>>>>[    7.759087] BUG: unable to handle kernel paging request at
> > >>>>>>>>c82a4d28 [    7.759087] IP: [<c100608c>]
> > >>>>>>>>xen_set_pte_atomic+0x21/0x2f [    7.759087] *pdpt =
> > >>>>>>>>0000000001663001 *pde = 00000000082db067 *pte = 80000000082a4061 ..
> > >>>>>>>>and same stack trace.
> > >>>>>>>>
> > >>>>>>>>>
> > >>>>>>>>>Does Xen have different size page table allocations or something
> > >>>>>>>>>weird?
> > >>>>>>>>The same page size. Not sure actually why it is being triggered.
> > >>>>>>>>Let me copy Keir on this. Keir, the region that is being marked as
> > >>>>>>>>_NX is .bss one and
> > >>>>>>>_past_ the __init_end it dies. Any ideas?
> > >>>>>>>
> > >>>>>>Does this happen if you add ". = ALIGN(HPAGE_SIZE);" before bss section
> > >>>>>>in arch/x86/kernel/vmlinux.lds.S ?
> > >>>>>Like this?
> > >>>>Yes
> > >>>>>yeeeey...That made it boot.
> > >>>>>
> > >>>>>>What's the output of kernel_page_tables debugfs ?
> > >>>>>Shees.. I get
> > >>>>>
> > >>>>>[   73.723105] BUG: unable to handle kernel paging request at 15555000
> > >>>>[...]
> > >>>>>with the patch and if I revert 5bd5a452662bc37c54fb6828db1a3faf87e6511c..
> > >>>>>
> > >>>>>That looks to be another bug to hunt down.
> > >>>>>
> > >>>>No that the same bug : that the root cause.
> > >>>>
> > >>>>For some reason with xen, accessing some page tables (bss and after) make the
> > >>>>system crash.
> > >>>I think I know the failure in the first case - the swapper_pg_dir is marked as _RO
> > >>>and you are not suppose to make it _RW (unless you first do a bit of dance and switch
> > >>>over to another pagetable). The reason being that Xen has a symbiotic relationship
> > >>>with PV domains where pagetables are marked _RO so that any update to
> > >>>it will go through Xen so it can validate that we aren't doing anything stupid.
> > >>>
> > >>>But accessing the page table should be OK, not sure why it crashed - we
> > >>>aren't writting anything to it - just reading.
> > >>>
> > >>>Let me copy Ian on this - he might have better ideas.
> > >>It's pretty hard to follow the quoted context above but it certainly
> > >>seems plausible that set_memory_nx could inadvertently end up trying to
> > >>make a page which Xen made RO into a RW again.
> > >>
> > >>For example the callchain appear to pass through static_protections()
> > >>which explicitly makes .data and .bss writeable, I think these regions
> > >>can potentially contain page table pages -- e.g. allocated from BRK
> > >>perhaps?
> > >
> > >They definitly do - it has the level1_ident_pgt, which is definitly used
> > >during bootup.
> > >
> > Ok that make sense
> > >Perhaps the fix is when marking NX, just do NX, don't try to set RW if they
> > >are RO.
> > >
> > What do you think of this patch ?
> > 
> > 
> > Matthieu
> 
> > >From 928dabe66cc5992587eb70410208ca9885c64a5c Mon Sep 17 00:00:00 2001
> > From: Matthieu CASTET <castet.matthieu@free.fr>
> > Date: Thu, 20 Jan 2011 21:11:45 +0100
> > Subject: [PATCH] NX protection for kernel data : support xen
> > 
> > Xen want page table pages read only.
> > 
> > But the initial page table (from head_*.S) live in .data or .bss.
> > Don't make static_protections enforce rw for .data/.bss in xen case.
> > 
> > Signed-off-by: Matthieu CASTET <castet.matthieu@free.fr>
> > ---
> >  arch/x86/mm/pageattr.c |    5 ++++-
> >  1 files changed, 4 insertions(+), 1 deletions(-)
> > 
> > diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
> > index 8b830ca..8698521 100644
> > --- a/arch/x86/mm/pageattr.c
> > +++ b/arch/x86/mm/pageattr.c
> > @@ -283,11 +283,14 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
> >  		   __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
> >  		pgprot_val(forbidden) |= _PAGE_RW;
> >  	/*
> > -	 * .data and .bss should always be writable.
> > +	 * .data and .bss should always be writable, but xen won't like
> > +	 * if we make page table rw (that live in .data or .bss)
> >  	 */
> > +#ifndef CONFIG_XEN
> >  	if (within(address, (unsigned long)_sdata, (unsigned long)_edata) ||
> >  	    within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop))
> >  		pgprot_val(required) |= _PAGE_RW;
> > +#endif
> 
> <shudders>There has to be a better way than this. Keep in mind that this
> would mean that any kernel that runs with the pvops turned on (pretty much all distros)
> will do this. You don't need anymore to build a kernel that is Xen specific - it is
> one kernel that can run on baremetal, Xen, etc.
> 
> Is there no way to just say, pass in PAGE_NX and don't unset the other
> permissions? Hmm, there is something right below what your patch does:
> 
>   if (kernel_set_to_readonly &&
>             within(address, (unsigned long)_text,
>                    (unsigned long)__end_rodata_hpage_align)) {
>                 unsigned int level;
> 
> ...
>                  * This also fixes the Linux Xen paravirt guest boot failure
>                  * (because of unexpected read-only mappings for kernel identity
>                  * mappings). In this paravirt guest case, the kernel text
> ...
> 
> 
> Could we just expand the search criteria to be __end ? 

This seems to work, which is just a copy-n-paste of what X86_64 does. Not sure
thought if it is correct. 

commit 9ea22c7cb9bf00617dc53f2fb3d3f55a1d55b0f8
Author: matthieu castet <castet.matthieu@free.fr>
Date:   Thu Jan 20 21:23:07 2011 +0100

    [PATCH] NX protection for kernel data : support xen
    
    Xen want page table pages read only.
    
    But the initial page table (from head_*.S) live in .data or .bss.
    Don't make static_protections enforce rw for .data/.bss in xen case.
    
    Signed-off-by: Matthieu CASTET <castet.matthieu@free.fr>

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 8b830ca..524dba2 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -283,11 +283,17 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 		   __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
 		pgprot_val(forbidden) |= _PAGE_RW;
 	/*
-	 * .data and .bss should always be writable.
+	 * .data and .bss should always be writable, but xen won't like
+	 * if we make page table rw (that live in .data or .bss)
 	 */
+#ifdef CONFIG_X86_32
 	if (within(address, (unsigned long)_sdata, (unsigned long)_edata) ||
-	    within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop))
-		pgprot_val(required) |= _PAGE_RW;
+	    within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop)) {
+		unsigned int level;
+		if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
+			pgprot_val(forbidden) |= _PAGE_RW;
+	}
+#endif
 
 #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)

fyi, it does make it boot.
 	/*
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [tip:x86/security] x86: Add NX protection for kernel data
  2011-01-20 21:19                         ` Konrad Rzeszutek Wilk
@ 2011-01-20 21:55                           ` Konrad Rzeszutek Wilk
  2011-01-21 21:41                             ` matthieu castet
  2011-01-21 23:20                             ` [tip:x86/security] x86: Add NX protection for kernel data Konrad Rzeszutek Wilk
  0 siblings, 2 replies; 21+ messages in thread
From: Konrad Rzeszutek Wilk @ 2011-01-20 21:55 UTC (permalink / raw)
  To: matthieu castet
  Cc: Ian Campbell, Kees Cook, Jeremy Fitzhardinge, keir.fraser, mingo,
	hpa, sliakh.lkml, jmorris, linux-kernel, rusty, torvalds, ak,
	davej, jiang, arjan, tglx, sfr, mingo, Stefan Bader

> -	 * .data and .bss should always be writable.
> +	 * .data and .bss should always be writable, but xen won't like
> +	 * if we make page table rw (that live in .data or .bss)
>  	 */
> +#ifdef CONFIG_X86_32
>  	if (within(address, (unsigned long)_sdata, (unsigned long)_edata) ||
> -	    within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop))
> -		pgprot_val(required) |= _PAGE_RW;
> +	    within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop)) {
> +		unsigned int level;
> +		if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
> +			pgprot_val(forbidden) |= _PAGE_RW;
> +	}
> +#endif
>  
>  #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
> 
> fyi, it does make it boot.

Hold it.. ccache is a wonderful tool but I think I've just "rebuilt" the
binaries with the .bss HPAGE_ALIGN aligment by mistake, so this path got never
taken.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [tip:x86/security] x86: Add NX protection for kernel data
  2011-01-20 21:55                           ` Konrad Rzeszutek Wilk
@ 2011-01-21 21:41                             ` matthieu castet
  2011-01-22  5:11                               ` Konrad Rzeszutek Wilk
  2011-01-21 23:20                             ` [tip:x86/security] x86: Add NX protection for kernel data Konrad Rzeszutek Wilk
  1 sibling, 1 reply; 21+ messages in thread
From: matthieu castet @ 2011-01-21 21:41 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk
  Cc: Ian Campbell, Kees Cook, Jeremy Fitzhardinge, keir.fraser, mingo,
	hpa, sliakh.lkml, jmorris, linux-kernel, rusty, torvalds, ak,
	davej, jiang, arjan, tglx, sfr, mingo, Stefan Bader

Konrad Rzeszutek Wilk a écrit :
>> -	 * .data and .bss should always be writable.
>> +	 * .data and .bss should always be writable, but xen won't like
>> +	 * if we make page table rw (that live in .data or .bss)
>>  	 */
>> +#ifdef CONFIG_X86_32
>>  	if (within(address, (unsigned long)_sdata, (unsigned long)_edata) ||
>> -	    within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop))
>> -		pgprot_val(required) |= _PAGE_RW;
>> +	    within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop)) {
>> +		unsigned int level;
>> +		if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
>> +			pgprot_val(forbidden) |= _PAGE_RW;
>> +	}
>> +#endif
>>  
>>  #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
>>
>> fyi, it does make it boot.
> 
> Hold it.. ccache is a wonderful tool but I think I've just "rebuilt" the
> binaries with the .bss HPAGE_ALIGN aligment by mistake, so this path got never
> taken.
> 
> 
Ok,

ATM I saw the following solution to solve the problem :
1) remove the data/bss check in static_protections, it was introduced by NX patches (64edc8ed). But I am not sure it
is really needed anymore.
2) add ". = ALIGN(HPAGE_SIZE)" somewhere after init section. But if we want not to be allocated in image we
should put it before bss. And if we want to be freed after init, we should put before .init.end.
This mean moving .smp_locks (and .data_nosave when x86 will be added) before init section. I have no idea of the impact.
3) add some logic in arch/x86/xen/mmu.c, that will ignore RW page setting for the page table marked RO.
4) make static_protections take and old_prot argument, and only apply RW .data/.bss requirement if page is already RW.

If possible I will go for 1).

Matthieu

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [tip:x86/security] x86: Add NX protection for kernel data
  2011-01-20 21:55                           ` Konrad Rzeszutek Wilk
  2011-01-21 21:41                             ` matthieu castet
@ 2011-01-21 23:20                             ` Konrad Rzeszutek Wilk
  1 sibling, 0 replies; 21+ messages in thread
From: Konrad Rzeszutek Wilk @ 2011-01-21 23:20 UTC (permalink / raw)
  To: matthieu castet
  Cc: Ian Campbell, Kees Cook, Jeremy Fitzhardinge, keir.fraser, mingo,
	hpa, sliakh.lkml, jmorris, linux-kernel, rusty, torvalds, ak,
	davej, jiang, arjan, tglx, sfr, mingo, Stefan Bader


So this patch fixes the regression and allows to boot Linux on Xen.
It does not affect negatively baremetal (tried x86_32 and x86_64).

The debugfs pagetable looks exactly as before this patch (from
__init_end and to further it expands as time goes on):
0xc14e7000-0xccc00000      187492K     RW             GLB NX pte.

I am concerned with 64edc8ed5ffae999d8d413ba006850e9e34166cb
(x86: Fix improper large page preservation) as it would make it possible
for the .bss sections to have RO pages in it - which would inhibit
the kernel from collapsing the small pages into a large one. However on
the machines I boot this patched kernel up it had no trouble
collapsing small pages in a large one (it also did not have
any pages in .bss section set to RO). Are there any known
(besides Xen) sections of code that sets pages in .bss and .data to RO?

commit 0390f0a87470fd2686c4eed6ace28443f8cc9b2f
Author: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date:   Fri Jan 21 11:23:39 2011 -0500

    x86: Don't force in .bss everything to RW.
    
    If there are pages there which are RO leave them be. Otherwise
    set them to RW.
    
    This fixes a bug where under Xen we would get:
    (XEN) mm.c:2389:d0 Bad type (saw 3c000003 != exp 70000000) for mfn 1335a1 (pfn 15a1)
    (XEN) mm.c:889:d0 Error getting mfn 1335a1 (pfn 15a1) from L1 entry 80000001335a1063 for l1e_owner=0, pg_owner=0
    (XEN) mm.c:4939:d0 ptwr_emulate: could not get_page_from_l1e()
    [    8.296405] BUG: unable to handle kernel paging request at cb159d08
    [    8.296405] IP: [<c1005ff4>] xen_set_pte_atomic+0x21/0x2f
    [    8.296405] *pdpt = 000000000165e001 *pde = 000000000b1a7067 *pte = 800000000b159061
    
    B/c we tried to set RW on the swapper_pg_dir, which had been
    set to RO by the Xen init code (xen_write_cr3_init) and MUST be
    RO (so that the Xen hypervisor can be assured that the guest is not
    messing with the pagetables on its own).
    
    Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 8b830ca..53b99e3 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -283,11 +283,18 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 		   __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
 		pgprot_val(forbidden) |= _PAGE_RW;
 	/*
-	 * .data and .bss should always be writable.
+	 * .data and .bss ought to be writable. But if there is a RO
+	 * in there, don't force RW on it.
 	 */
 	if (within(address, (unsigned long)_sdata, (unsigned long)_edata) ||
-	    within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop))
-		pgprot_val(required) |= _PAGE_RW;
+	    within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop)) {
+		unsigned int level;
+		pte_t *pte = lookup_address(address, &level);
+		pgprot_t prot = ((pte) ? pte_pgprot(*pte) : __pgprot(0));
+
+		if ((pgprot_val(prot) & _PAGE_RW))
+			pgprot_val(required) |= _PAGE_RW;
+	}
 
 #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
 	/*
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [tip:x86/security] x86: Add NX protection for kernel data
  2011-01-21 21:41                             ` matthieu castet
@ 2011-01-22  5:11                               ` Konrad Rzeszutek Wilk
  2011-01-23 14:27                                 ` matthieu castet
  0 siblings, 1 reply; 21+ messages in thread
From: Konrad Rzeszutek Wilk @ 2011-01-22  5:11 UTC (permalink / raw)
  To: matthieu castet
  Cc: Ian Campbell, Kees Cook, Jeremy Fitzhardinge, keir.fraser, mingo,
	hpa, sliakh.lkml, jmorris, linux-kernel, rusty, torvalds, ak,
	davej, jiang, arjan, tglx, sfr, mingo, Stefan Bader, konrad

On Fri, Jan 21, 2011 at 10:41:54PM +0100, matthieu castet wrote:
> Konrad Rzeszutek Wilk a écrit :
>>> -	 * .data and .bss should always be writable.
>>> +	 * .data and .bss should always be writable, but xen won't like
>>> +	 * if we make page table rw (that live in .data or .bss)
>>>  	 */
>>> +#ifdef CONFIG_X86_32
>>>  	if (within(address, (unsigned long)_sdata, (unsigned long)_edata) ||
>>> -	    within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop))
>>> -		pgprot_val(required) |= _PAGE_RW;
>>> +	    within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop)) {
>>> +		unsigned int level;
>>> +		if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
>>> +			pgprot_val(forbidden) |= _PAGE_RW;
>>> +	}
>>> +#endif
>>>   #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
>>>
>>> fyi, it does make it boot.
>>
>> Hold it.. ccache is a wonderful tool but I think I've just "rebuilt" the
>> binaries with the .bss HPAGE_ALIGN aligment by mistake, so this path got never
>> taken.
>>
>>
> Ok,
>
> ATM I saw the following solution to solve the problem :
> 1) remove the data/bss check in static_protections, it was introduced by NX patches (64edc8ed). But I am not sure it
> is really needed anymore.
> 2) add ". = ALIGN(HPAGE_SIZE)" somewhere after init section. But if we want not to be allocated in image we
> should put it before bss. And if we want to be freed after init, we should put before .init.end.
> This mean moving .smp_locks (and .data_nosave when x86 will be added) before init section. I have no idea of the impact.
> 3) add some logic in arch/x86/xen/mmu.c, that will ignore RW page setting for the page table marked RO.
> 4) make static_protections take and old_prot argument, and only apply RW .data/.bss requirement if page is already RW.
>
> If possible I will go for 1).

Sounds good. Just send me the patch and I will test it.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [tip:x86/security] x86: Add NX protection for kernel data
  2011-01-22  5:11                               ` Konrad Rzeszutek Wilk
@ 2011-01-23 14:27                                 ` matthieu castet
  2011-01-24 15:31                                   ` Konrad Rzeszutek Wilk
  0 siblings, 1 reply; 21+ messages in thread
From: matthieu castet @ 2011-01-23 14:27 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk
  Cc: Ian Campbell, Kees Cook, Jeremy Fitzhardinge, keir.fraser, mingo,
	hpa, sliakh.lkml, jmorris, linux-kernel, rusty, torvalds, ak,
	davej, jiang, arjan, tglx, sfr, mingo, Stefan Bader, konrad

[-- Attachment #1: Type: text/plain, Size: 2071 bytes --]

Konrad Rzeszutek Wilk a écrit :
> On Fri, Jan 21, 2011 at 10:41:54PM +0100, matthieu castet wrote:
>> Konrad Rzeszutek Wilk a écrit :
>>>> -	 * .data and .bss should always be writable.
>>>> +	 * .data and .bss should always be writable, but xen won't like
>>>> +	 * if we make page table rw (that live in .data or .bss)
>>>>  	 */
>>>> +#ifdef CONFIG_X86_32
>>>>  	if (within(address, (unsigned long)_sdata, (unsigned long)_edata) ||
>>>> -	    within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop))
>>>> -		pgprot_val(required) |= _PAGE_RW;
>>>> +	    within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop)) {
>>>> +		unsigned int level;
>>>> +		if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
>>>> +			pgprot_val(forbidden) |= _PAGE_RW;
>>>> +	}
>>>> +#endif
>>>>   #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
>>>>
>>>> fyi, it does make it boot.
>>> Hold it.. ccache is a wonderful tool but I think I've just "rebuilt" the
>>> binaries with the .bss HPAGE_ALIGN aligment by mistake, so this path got never
>>> taken.
>>>
>>>
>> Ok,
>>
>> ATM I saw the following solution to solve the problem :
>> 1) remove the data/bss check in static_protections, it was introduced by NX patches (64edc8ed). But I am not sure it
>> is really needed anymore.
>> 2) add ". = ALIGN(HPAGE_SIZE)" somewhere after init section. But if we want not to be allocated in image we
>> should put it before bss. And if we want to be freed after init, we should put before .init.end.
>> This mean moving .smp_locks (and .data_nosave when x86 will be added) before init section. I have no idea of the impact.
>> 3) add some logic in arch/x86/xen/mmu.c, that will ignore RW page setting for the page table marked RO.
>> 4) make static_protections take and old_prot argument, and only apply RW .data/.bss requirement if page is already RW.
>>
>> If possible I will go for 1).
> 
> Sounds good. Just send me the patch and I will test it.

Ok, what give you the attached patch.

I don't know if I should give the printk or not.


Matthieu

[-- Attachment #2: xen_nx.diff --]
[-- Type: text/x-diff, Size: 1418 bytes --]

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 8b830ca..eec93c5 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -256,7 +256,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 				   unsigned long pfn)
 {
 	pgprot_t forbidden = __pgprot(0);
-	pgprot_t required = __pgprot(0);
 
 	/*
 	 * The BIOS area between 640k and 1Mb needs to be executable for
@@ -283,11 +282,13 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 		   __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
 		pgprot_val(forbidden) |= _PAGE_RW;
 	/*
-	 * .data and .bss should always be writable.
+	 * .data and .bss should always be writable, but xen won't like
+	 * if we make page table rw (that live in .data or .bss)
 	 */
 	if (within(address, (unsigned long)_sdata, (unsigned long)_edata) ||
 	    within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop))
-		pgprot_val(required) |= _PAGE_RW;
+		if ((pgprot_val(prot) & _PAGE_RW) == 0)
+			printk(KERN_INFO "RO page for 0x%lx in bss/data.\n", address);
 
 #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
 	/*
@@ -327,7 +328,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 #endif
 
 	prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
-	prot = __pgprot(pgprot_val(prot) | pgprot_val(required));
 
 	return prot;
 }

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [tip:x86/security] x86: Add NX protection for kernel data
  2011-01-23 14:27                                 ` matthieu castet
@ 2011-01-24 15:31                                   ` Konrad Rzeszutek Wilk
  2011-01-27 16:30                                     ` Was: [tip:x86/security] x86: Add NX protection for kernel data. Is: don't set RW on RO regions in .bss Konrad Rzeszutek Wilk
  0 siblings, 1 reply; 21+ messages in thread
From: Konrad Rzeszutek Wilk @ 2011-01-24 15:31 UTC (permalink / raw)
  To: matthieu castet
  Cc: Ian Campbell, Kees Cook, Jeremy Fitzhardinge, keir.fraser, mingo,
	hpa, sliakh.lkml, jmorris, linux-kernel, rusty, torvalds, ak,
	davej, jiang, arjan, tglx, sfr, mingo, Stefan Bader, konrad

On Sun, Jan 23, 2011 at 03:27:39PM +0100, matthieu castet wrote:
> Konrad Rzeszutek Wilk a écrit :
> >On Fri, Jan 21, 2011 at 10:41:54PM +0100, matthieu castet wrote:
> >>Konrad Rzeszutek Wilk a écrit :
> >>>>-	 * .data and .bss should always be writable.
> >>>>+	 * .data and .bss should always be writable, but xen won't like
> >>>>+	 * if we make page table rw (that live in .data or .bss)
> >>>> 	 */
> >>>>+#ifdef CONFIG_X86_32
> >>>> 	if (within(address, (unsigned long)_sdata, (unsigned long)_edata) ||
> >>>>-	    within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop))
> >>>>-		pgprot_val(required) |= _PAGE_RW;
> >>>>+	    within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop)) {
> >>>>+		unsigned int level;
> >>>>+		if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
> >>>>+			pgprot_val(forbidden) |= _PAGE_RW;
> >>>>+	}
> >>>>+#endif
> >>>>  #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
> >>>>
> >>>>fyi, it does make it boot.
> >>>Hold it.. ccache is a wonderful tool but I think I've just "rebuilt" the
> >>>binaries with the .bss HPAGE_ALIGN aligment by mistake, so this path got never
> >>>taken.
> >>>
> >>>
> >>Ok,
> >>
> >>ATM I saw the following solution to solve the problem :
> >>1) remove the data/bss check in static_protections, it was introduced by NX patches (64edc8ed). But I am not sure it
> >>is really needed anymore.
> >>2) add ". = ALIGN(HPAGE_SIZE)" somewhere after init section. But if we want not to be allocated in image we
> >>should put it before bss. And if we want to be freed after init, we should put before .init.end.
> >>This mean moving .smp_locks (and .data_nosave when x86 will be added) before init section. I have no idea of the impact.
> >>3) add some logic in arch/x86/xen/mmu.c, that will ignore RW page setting for the page table marked RO.
> >>4) make static_protections take and old_prot argument, and only apply RW .data/.bss requirement if page is already RW.
> >>
> >>If possible I will go for 1).
> >
> >Sounds good. Just send me the patch and I will test it.
> 
> Ok, what give you the attached patch.
> 
> I don't know if I should give the printk or not. 

I would say get rid of the printk. It does not really help the users. Here is an excerpt of
2.6.38-rc2 with this patch:

   7.247448] NX-protecting the kernel data: 2412k
[    7.252489] RO page for 0xc15a0000 in bss/data.
[    7.253052] RO page for 0xc15a1000 in bss/data.
[    7.253052] RO page for 0xc15a3000 in bss/data.
[    7.365104] mv used greatest stack depth: 6616 bytes left

So Tested-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

(I tested on baremetal x86,x86_64 and Xen x86 and x86_64)
> 
> 
> Matthieu


> diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
> index 8b830ca..eec93c5 100644
> --- a/arch/x86/mm/pageattr.c
> +++ b/arch/x86/mm/pageattr.c
> @@ -256,7 +256,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
>  				   unsigned long pfn)
>  {
>  	pgprot_t forbidden = __pgprot(0);
> -	pgprot_t required = __pgprot(0);
>  
>  	/*
>  	 * The BIOS area between 640k and 1Mb needs to be executable for
> @@ -283,11 +282,13 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
>  		   __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
>  		pgprot_val(forbidden) |= _PAGE_RW;
>  	/*
> -	 * .data and .bss should always be writable.
> +	 * .data and .bss should always be writable, but xen won't like
> +	 * if we make page table rw (that live in .data or .bss)
>  	 */
>  	if (within(address, (unsigned long)_sdata, (unsigned long)_edata) ||
>  	    within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop))
> -		pgprot_val(required) |= _PAGE_RW;
> +		if ((pgprot_val(prot) & _PAGE_RW) == 0)
> +			printk(KERN_INFO "RO page for 0x%lx in bss/data.\n", address);
>  
>  #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
>  	/*
> @@ -327,7 +328,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
>  #endif
>  
>  	prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
> -	prot = __pgprot(pgprot_val(prot) | pgprot_val(required));
>  
>  	return prot;
>  }


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Was: [tip:x86/security] x86: Add NX protection for kernel data. Is: don't set RW on RO regions in .bss
  2011-01-24 15:31                                   ` Konrad Rzeszutek Wilk
@ 2011-01-27 16:30                                     ` Konrad Rzeszutek Wilk
  0 siblings, 0 replies; 21+ messages in thread
From: Konrad Rzeszutek Wilk @ 2011-01-27 16:30 UTC (permalink / raw)
  To: matthieu castet, hpa, tglx
  Cc: Ian Campbell, Kees Cook, Jeremy Fitzhardinge, sliakh.lkml,
	linux-kernel, jiang, Stefan Bader, konrad

> > Ok, what give you the attached patch.
> > 
> > I don't know if I should give the printk or not. 
> 
> I would say get rid of the printk. It does not really help the users. Here is an excerpt of
> 2.6.38-rc2 with this patch:
> 
>    7.247448] NX-protecting the kernel data: 2412k
> [    7.252489] RO page for 0xc15a0000 in bss/data.
> [    7.253052] RO page for 0xc15a1000 in bss/data.
> [    7.253052] RO page for 0xc15a3000 in bss/data.
> [    7.365104] mv used greatest stack depth: 6616 bytes left
> 
> So Tested-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
> 
> (I tested on baremetal x86,x86_64 and Xen x86 and x86_64)

Matthieu,

I can send a patch for rc3 that would get rid of the "if within(.._sdata"
(since if we get rid of the printk it does not do anything) - but I
would much prefer if you did it since you are much familiar with
this patch and know the ramifications.

This being in the x86 generic code it should go through the tip
probably, I think?

P.S.
[I trimmed down the CC and moved some folks to the To field]
> > 
> > 
> > Matthieu
> 
> 
> > diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
> > index 8b830ca..eec93c5 100644
> > --- a/arch/x86/mm/pageattr.c
> > +++ b/arch/x86/mm/pageattr.c
> > @@ -256,7 +256,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
> >  				   unsigned long pfn)
> >  {
> >  	pgprot_t forbidden = __pgprot(0);
> > -	pgprot_t required = __pgprot(0);
> >  
> >  	/*
> >  	 * The BIOS area between 640k and 1Mb needs to be executable for
> > @@ -283,11 +282,13 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
> >  		   __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
> >  		pgprot_val(forbidden) |= _PAGE_RW;
> >  	/*
> > -	 * .data and .bss should always be writable.
> > +	 * .data and .bss should always be writable, but xen won't like
> > +	 * if we make page table rw (that live in .data or .bss)
> >  	 */
> >  	if (within(address, (unsigned long)_sdata, (unsigned long)_edata) ||
> >  	    within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop))
> > -		pgprot_val(required) |= _PAGE_RW;
> > +		if ((pgprot_val(prot) & _PAGE_RW) == 0)
> > +			printk(KERN_INFO "RO page for 0x%lx in bss/data.\n", address);
> >  
> >  #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
> >  	/*
> > @@ -327,7 +328,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
> >  #endif
> >  
> >  	prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
> > -	prot = __pgprot(pgprot_val(prot) | pgprot_val(required));
> >  
> >  	return prot;
> >  }

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2011-01-27 16:31 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-11-16 21:31 [PATCH 2/3 V8] NX protection for kernel data matthieu castet
2010-11-18 14:08 ` [tip:x86/security] x86: Add " tip-bot for Matthieu Castet
2011-01-11 23:31   ` Kees Cook
2011-01-14 20:15     ` Konrad Rzeszutek Wilk
2011-01-19 21:14       ` Konrad Rzeszutek Wilk
2011-01-19 22:59         ` matthieu castet
2011-01-19 23:38           ` Konrad Rzeszutek Wilk
2011-01-20 11:18             ` castet.matthieu
2011-01-20 15:06               ` Konrad Rzeszutek Wilk
2011-01-20 15:37                 ` Ian Campbell
2011-01-20 19:05                   ` Konrad Rzeszutek Wilk
2011-01-20 20:23                     ` matthieu castet
2011-01-20 21:04                       ` Konrad Rzeszutek Wilk
2011-01-20 21:19                         ` Konrad Rzeszutek Wilk
2011-01-20 21:55                           ` Konrad Rzeszutek Wilk
2011-01-21 21:41                             ` matthieu castet
2011-01-22  5:11                               ` Konrad Rzeszutek Wilk
2011-01-23 14:27                                 ` matthieu castet
2011-01-24 15:31                                   ` Konrad Rzeszutek Wilk
2011-01-27 16:30                                     ` Was: [tip:x86/security] x86: Add NX protection for kernel data. Is: don't set RW on RO regions in .bss Konrad Rzeszutek Wilk
2011-01-21 23:20                             ` [tip:x86/security] x86: Add NX protection for kernel data Konrad Rzeszutek Wilk

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).