LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [RFC PATCH 1/3] Replace paravirt_probe with "platform type" boot header field
@ 2007-05-04 12:59 Rusty Russell
  2007-05-04 13:02 ` [RFC PATCH 2/3] lguest: Boot with virtual == physical to get closer to native Linux Rusty Russell
  2007-05-04 14:01 ` [RFC PATCH 1/3] Replace paravirt_probe with "platform type" boot header field Eric W. Biederman
  0 siblings, 2 replies; 37+ messages in thread
From: Rusty Russell @ 2007-05-04 12:59 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Andi Kleen, Chris Wright, Jeremy Fitzhardinge, Zachary Amsden,
	Andrew Morton, Linus Torvalds, H. Peter Anvin,
	lkml - Kernel Mailing List

[ This patch won't apply, even to -mm: it comes after the documentation
patch.  But it's hopefully readable ]

1) This code assumes that 0x23C is going to be the "platform type"
   field of the next version of the Linux boot header format.

2) This doesn't actually assume that P=V, but unlike the paravirt
   probe code doesn't assume that V=P+PAGE_OFFSET either.

The following patch actually fixes up lguest to boot with P=V.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/i386/kernel/head.S     |   31 +++++++------------------------
 drivers/lguest/lguest.c     |    6 +-----
 drivers/lguest/lguest_asm.S |   26 ++------------------------
 include/asm-i386/paravirt.h |    5 -----
 4 files changed, 10 insertions(+), 58 deletions(-)

diff -r 0e7e7991bd30 arch/i386/kernel/head.S
--- a/arch/i386/kernel/head.S	Fri May 04 16:59:07 2007 +1000
+++ b/arch/i386/kernel/head.S	Fri May 04 20:38:08 2007 +1000
@@ -503,35 +503,13 @@ ignore_int:
 .section .text
 #ifdef CONFIG_PARAVIRT
 startup_paravirt:
-	cld
- 	movl $(init_thread_union+THREAD_SIZE),%esp
-
-	/* We take pains to preserve all the regs. */
-	pushl	%edx
-	pushl	%ecx
-	pushl	%eax
-
-	pushl	$__start_paravirtprobe
-1:
-	movl	0(%esp), %eax
-	cmpl	$__stop_paravirtprobe, %eax
-	je	unhandled_paravirt
-	pushl	(%eax)
-	movl	8(%esp), %eax
-	call	*(%esp)
-	popl	%eax
-
-	movl	4(%esp), %eax
-	movl	8(%esp), %ecx
-	movl	12(%esp), %edx
-
-	addl	$4, (%esp)
-	jmp	1b
-
-unhandled_paravirt:
+#ifdef CONFIG_LGUEST_GUEST
+	cmpl	$1, 0x23c(%esi)
+	je	lguest_init
+#endif
 	/* Nothing wanted us: we're screwed. */
 	ud2
-#endif
+#endif /* CONFIG_PARAVIRT */
 
 /*
  * Real beginning of normal "text" segment
diff -r 0e7e7991bd30 arch/i386/kernel/vmlinux.lds.S
--- a/arch/i386/kernel/vmlinux.lds.S	Fri May 04 16:59:07 2007 +1000
+++ b/arch/i386/kernel/vmlinux.lds.S	Fri May 04 22:30:45 2007 +1000
@@ -80,12 +80,6 @@ SECTIONS
 	EXTRA_RWDATA
 	CONSTRUCTORS
 	} :data
-
-  .paravirtprobe : AT(ADDR(.paravirtprobe) - LOAD_OFFSET) {
-  	__start_paravirtprobe = .;
-	*(.paravirtprobe)
-  	__stop_paravirtprobe = .;
-  }
 
   . = ALIGN(4096);
   .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
diff -r 0e7e7991bd30 drivers/lguest/lguest.c
--- a/drivers/lguest/lguest.c	Fri May 04 16:59:07 2007 +1000
+++ b/drivers/lguest/lguest.c	Fri May 04 22:30:20 2007 +1000
@@ -799,10 +799,10 @@ static unsigned lguest_patch(u8 type, u1
 	return insn_len;
 }
 
-/*G:030 Once we get to lguest_init(), we know we're a Guest.  The paravirt_ops
- * structure in the kernel provides a single point for (almost) every routine
- * we have to override to avoid privileged instructions. */
-__init void lguest_init(void *boot)
+/*G:030 Once we get to do_lguest_init(), we know we're a Guest.  The
+ * paravirt_ops structure in the kernel provides a single point for (almost)
+ * every routine we have to override to avoid privileged instructions. */
+__init void do_lguest_init(void *boot)
 {
 	/* Copy boot parameters first: the Launcher put the physical location
 	 * in %esi, and head.S converted that to a virtual address and handed
@@ -868,7 +868,7 @@ __init void lguest_init(void *boot)
 	 * before returning to the rest of lguest_init(). :*/
 
 	/*G:070 Now we've seen all the paravirt_ops, we return to
-	 * lguest_init() where the rest of the fairly chaotic boot setup
+	 * do_lguest_init() where the rest of the fairly chaotic boot setup
 	 * occurs.
 	 *
 	 * The Host expects our first hypercall to tell it where our "struct
@@ -927,7 +927,3 @@ __init void lguest_init(void *boot)
  * It is now time for us to explore the nooks and crannies of the three Guest
  * devices and complete our understanding of the Guest in "make Drivers".
  */
-
-/*G:025 To complete the magic, lguest_maybe_init() from lguest_asm.S needs to
- * be registered as a paravirt probe function: */
-paravirt_probe(lguest_maybe_init);
diff -r 0e7e7991bd30 drivers/lguest/lguest_asm.S
--- a/drivers/lguest/lguest_asm.S	Fri May 04 16:59:07 2007 +1000
+++ b/drivers/lguest/lguest_asm.S	Fri May 04 22:30:20 2007 +1000
@@ -1,33 +1,26 @@
 #include <linux/linkage.h>
 #include <linux/lguest.h>
 #include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
 #include <asm/page.h>
 
 /* FIXME: Once asm/processor-flags.h goes in, include that */
 #define X86_EFLAGS_IF 0x00000200
 
 /*G:020 This is where we begin: head.S notes that we're not running at
- * privilege level 0 (as we would be in native boot) and calls the registered
- * paravirt_probe functions one at a time.
- *
- * This is expected to change to use the linux boot header in %esi and a
- * platform type in the future, so we test for that here (the final version
- * will be slightly different: we assume offset 0x23C and a value of "1" for
- * lguest).  We know that %esi with the current launcher should be zero: we
- * pass that in %eax to lguest_init().
+ * privilege level 0 (as we would be in native boot) and looks at the
+ * platform type field in the boot structure (address in %esi).  If 1 (lguest),
+ * it calls here.
  *
  * The .section line puts this code in .init.text so it will be discarded after
  * boot. */
 .section .init.text, "ax", @progbits
-ENTRY(lguest_maybe_init)
-	cmpl $0, %esi
-	jne out
+ENTRY(lguest_init)
+	/* Set up initial stack. */
+ 	movl $(init_thread_union+THREAD_SIZE),%esp
+	/* Hand boot information pointer to do_lguest_init() */
 	movl %esi, %eax
-	addl $__PAGE_OFFSET, %eax
-	cmpl $1, 0x23c(%eax)
-	je lguest_init
-out:
-	ret
+	jmp do_lguest_init
 
 /*G:055 We create a macro which puts the assembler code between lgstart_ and
  * lgend_ markers.  These templates end up in the .init.text section, so they
diff -r 0e7e7991bd30 drivers/lguest/lguest_user.c
--- a/drivers/lguest/lguest_user.c	Fri May 04 16:59:07 2007 +1000
+++ b/drivers/lguest/lguest_user.c	Fri May 04 22:30:20 2007 +1000
@@ -14,7 +14,9 @@
  *
  * Most of the Guest's registers are left alone: we used get_zeroed_page() to
  * allocate the structure, so they will be 0. */
-static void setup_regs(struct lguest_regs *regs, unsigned long start)
+static void setup_regs(struct lguest_regs *regs,
+		       unsigned long start,
+		       unsigned long page_offset)
 {
 	/* There are four "segment" registers which the Guest needs to boot:
 	 * The "code segment" register (cs) refers to the kernel code segment
@@ -36,8 +38,9 @@ static void setup_regs(struct lguest_reg
 	 * running. */
 	regs->eip = start;
 
-	/* %esi points to our boot information, at physical address 0, so don't
-	 * touch it. */
+	/* %esi points to the virtual address of the boot information (it's at
+	 * physical address 0 == virtual address page_offset). */
+	regs->esi = page_offset;
 }
 
 /*L:310 To send DMA into the Guest, the Launcher needs to be able to ask for a
@@ -184,7 +187,7 @@ static int initialize(struct file *file,
 
 	/* Now we initialize the Guest's registers, handing it the start
 	 * address. */
-	setup_regs(lg->regs, args[2]);
+	setup_regs(lg->regs, args[2], lg->page_offset);
 
 	/* There are a couple of GDT entries the Guest expects when first
 	 * booting. */
diff -r 0e7e7991bd30 include/asm-i386/paravirt.h
--- a/include/asm-i386/paravirt.h	Fri May 04 16:59:07 2007 +1000
+++ b/include/asm-i386/paravirt.h	Fri May 04 20:24:39 2007 +1000
@@ -221,11 +221,6 @@ struct paravirt_ops
 	void (*irq_enable_sysexit)(void);
 	void (*iret)(void);
 };
-
-/* Mark a paravirt probe function. */
-#define paravirt_probe(fn)						\
- static asmlinkage void (*__paravirtprobe_##fn)(void) __attribute_used__ \
-		__attribute__((__section__(".paravirtprobe"))) = fn
 
 extern struct paravirt_ops paravirt_ops;
 



^ permalink raw reply	[flat|nested] 37+ messages in thread

* [RFC PATCH 2/3] lguest: Boot with virtual == physical to get closer to native Linux.
  2007-05-04 12:59 [RFC PATCH 1/3] Replace paravirt_probe with "platform type" boot header field Rusty Russell
@ 2007-05-04 13:02 ` Rusty Russell
  2007-05-04 13:07   ` [RFC PATCH 3/3] boot bzImages under paravirt Rusty Russell
  2007-05-04 14:01 ` [RFC PATCH 1/3] Replace paravirt_probe with "platform type" boot header field Eric W. Biederman
  1 sibling, 1 reply; 37+ messages in thread
From: Rusty Russell @ 2007-05-04 13:02 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Andi Kleen, Chris Wright, Jeremy Fitzhardinge, Zachary Amsden,
	Andrew Morton, Linus Torvalds, H. Peter Anvin,
	lkml - Kernel Mailing List

(This won't be very interesting to the paravirt discussion, it only
changes lguest).

1) This allows us to get alot closer to booting bzImages.

2) It means we don't have to know page_offset.

3) The Guest needs to modify the boot pagetables to create the
   PAGE_OFFSET mapping before jumping to C code.

4) guest_pa() walks the page tables rather than using page_offset.

5) We don't use page_offset to figure out whether to emulate: it was
   always kinda questionable, and won't work for instructions done
   before remapping (bzImage unpacking in particular).

6) We still want the kernel address for tlb flushing: have the initial
   hypercall give us that, too.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

diff -r 04ffad46c687 Documentation/lguest/lguest.c
--- a/Documentation/lguest/lguest.c	Fri May 04 22:30:51 2007 +1000
+++ b/Documentation/lguest/lguest.c	Fri May 04 22:49:27 2007 +1000
@@ -143,15 +143,12 @@ static void *map_zeroed_pages(unsigned l
  * by all modern binaries on Linux including the kernel.
  *
  * The ELF headers give *two* addresses: a physical address, and a virtual
- * address.  The Guest kernel expects to be placed in memory at the physical
- * address, and the page tables set up so it will correspond to that virtual
- * address.  We return the difference between the virtual and physical
- * addresses in the "page_offset" pointer.
+ * address.  We use the physical address; the Guest will map itself to the
+ * virtual address.
  *
  * We also return the starting address (which will be the address of the
  * startup_32 routine): */
-static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
-			     unsigned long *page_offset)
+static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
 {
 	void *addr;
 	Elf32_Phdr phdr[ehdr->e_phnum];
@@ -175,9 +172,6 @@ static unsigned long map_elf(int elf_fd,
 	if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
 		err(1, "Reading program headers");
 
-	/* We don't know page_offset yet. */
-	*page_offset = 0;
-
 	/* Try all the headers: there are usually only three.  A read-only one,
 	 * a read-write one, and a "note" section which isn't loadable. */
 	for (i = 0; i < ehdr->e_phnum; i++) {
@@ -187,14 +181,6 @@ static unsigned long map_elf(int elf_fd,
 
 		verbose("Section %i: size %i addr %p\n",
 			i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
-
-		/* We expect a simple linear address space: every segment must
-		 * have the same difference between virtual (p_vaddr) and
-		 * physical (p_paddr) address. */
-		if (!*page_offset)
-			*page_offset = phdr[i].p_vaddr - phdr[i].p_paddr;
-		else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
-			errx(1, "Page offset of section %i different", i);
 
 		/* We map this section of the file at its physical address.  We
 		 * map it read & write even if the header says this segment is
@@ -215,53 +201,14 @@ static unsigned long map_elf(int elf_fd,
 			    i, addr, (void *)phdr[i].p_paddr);
 	}
 
-	/* The ELF header gives us the physical starting address.  We want the
-	 * virtual address in this case, and fortunately, we already figured
-	 * out the physical-virtual difference and put it in "page_offset". */
-	return ehdr->e_entry + *page_offset;
-}
-
-/*L:170 Prepare to be SHOCKED and AMAZED.  And possibly a trifle nauseated.
- *
- * We know that CONFIG_PAGE_OFFSET sets what virtual address the kernel expects
- * to be.  We don't know what that option was, but we can figure it out
- * approximately by looking at the addresses in the code.  I chose the common
- * case of reading a memory location into the %eax register:
- *
- *  movl <some-address>, %eax
- *
- * This gets encoded as five bytes: "0xA1 <4-byte-address>".  For example,
- * "0xA1 0x18 0x60 0x47 0xC0" reads the address 0xC0476018 into %eax.
- *
- * In this example can guess that the kernel was compiled with
- * CONFIG_PAGE_OFFSET set to 0xC0000000 (it's always a round number).  If the
- * kernel were larger than 16MB, we might see 0xC1 addresses show up, but our
- * kernel isn't that bloated yet.
- *
- * Unfortunately, x86 has variable-length instructions, so finding this
- * particular instruction properly involves writing a disassembler.  Instead,
- * we rely on statistics.  We look for "0xA1" and tally the different bytes
- * which occur 4 bytes later (the "0xC0" in our example above).  When one of
- * those bytes appears three times, we can be reasonably confident that it
- * forms the start of CONFIG_PAGE_OFFSET.
- *
- * This is amazingly reliable. */
-static unsigned long intuit_page_offset(unsigned char *img, unsigned long len)
-{
-	unsigned int i, possibilities[256] = { 0 };
-
-	for (i = 0; i + 4 < len; i++) {
-		/* mov 0xXXXXXXXX,%eax */
-		if (img[i] == 0xA1 && ++possibilities[img[i+4]] > 3)
-			return (unsigned long)img[i+4] << 24;
-	}
-	errx(1, "could not determine page offset");
+	/* The ELF header gives us the starting address. */
+	return ehdr->e_entry;
 }
 
 /*L:160 Unfortunately the entire ELF image isn't compressed: the segments
  * which need loading are extracted and compressed raw.  This denies us the
  * information we need to make a fully-general loader. */
-static unsigned long unpack_bzimage(int fd, unsigned long *page_offset)
+static unsigned long unpack_bzimage(int fd)
 {
 	gzFile f;
 	int ret, len = 0;
@@ -282,13 +229,8 @@ static unsigned long unpack_bzimage(int 
 
 	verbose("Unpacked size %i addr %p\n", len, img);
 
-	/* Without the ELF header, we can't tell virtual-physical gap.  This is
-	 * CONFIG_PAGE_OFFSET, and people do actually change it.  Fortunately,
-	 * I have a clever way of figuring it out from the code itself.  */
-	*page_offset = intuit_page_offset(img, len);
-
 	/* Entry is physical address: convert to virtual */
-	return (unsigned long)img + *page_offset;
+	return (unsigned long)img;
 }
 
 /*L:150 A bzImage, unlike an ELF file, is not meant to be loaded.  You're
@@ -299,7 +241,7 @@ static unsigned long unpack_bzimage(int 
  * The bzImage is formed by putting the decompressing code in front of the
  * compressed kernel code.  So we can simple scan through it looking for the
  * first "gzip" header, and start decompressing from there. */
-static unsigned long load_bzimage(int fd, unsigned long *page_offset)
+static unsigned long load_bzimage(int fd)
 {
 	unsigned char c;
 	int state = 0;
@@ -327,7 +269,7 @@ static unsigned long load_bzimage(int fd
 			if (c != 0x03)
 				state = -1;
 			else
-				return unpack_bzimage(fd, page_offset);
+				return unpack_bzimage(fd);
 		}
 	}
 	errx(1, "Could not find kernel in bzImage");
@@ -336,7 +278,7 @@ static unsigned long load_bzimage(int fd
 /*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels
  * come wrapped up in the self-decompressing "bzImage" format.  With some funky
  * coding, we can load those, too. */
-static unsigned long load_kernel(int fd, unsigned long *page_offset)
+static unsigned long load_kernel(int fd)
 {
 	Elf32_Ehdr hdr;
 
@@ -346,10 +288,10 @@ static unsigned long load_kernel(int fd,
 
 	/* If it's an ELF file, it starts with "\177ELF" */
 	if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
-		return map_elf(fd, &hdr, page_offset);
+		return map_elf(fd, &hdr);
 
 	/* Otherwise we assume it's a bzImage, and try to unpack it */
-	return load_bzimage(fd, page_offset);
+	return load_bzimage(fd);
 }
 
 /* This is a trivial little helper to align pages.  Andi Kleen hated it because
@@ -401,27 +343,20 @@ static unsigned long load_initrd(const c
 	return len;
 }
 
-/* Once we know how much memory we have, and the address the Guest kernel
- * expects, we can construct simple linear page tables which will get the Guest
- * far enough into the boot to create its own.
+/* Once we know how much memory we have, we can construct simple linear page
+ * tables which set virtual == physical which will get the Guest far enough
+ * into the boot to create its own.
  *
  * We lay them out of the way, just below the initrd (which is why we need to
  * know its size). */
 static unsigned long setup_pagetables(unsigned long mem,
-				      unsigned long initrd_size,
-				      unsigned long page_offset)
+				      unsigned long initrd_size)
 {
 	u32 *pgdir, *linear;
 	unsigned int mapped_pages, i, linear_pages;
 	unsigned int ptes_per_page = getpagesize()/sizeof(u32);
 
-	/* Ideally we map all physical memory starting at page_offset.
-	 * However, if page_offset is 0xC0000000 we can only map 1G of physical
-	 * (0xC0000000 + 1G overflows). */
-	if (mem > -page_offset)
-		mapped_pages = mem/getpagesize();
-	else
-		mapped_pages = -page_offset/getpagesize();
+	mapped_pages = mem/getpagesize();
 
 	/* Each PTE page can map ptes_per_page pages: how many do we need? */
 	linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page;
@@ -438,11 +373,9 @@ static unsigned long setup_pagetables(un
 	for (i = 0; i < mapped_pages; i++)
 		linear[i] = ((i * getpagesize()) | PAGE_PRESENT);
 
-	/* The top level points to the linear page table pages above.  The
-	 * entry representing page_offset points to the first one, and they
-	 * continue from there. */
+	/* The top level points to the linear page table pages above. */
 	for (i = 0; i < mapped_pages; i += ptes_per_page) {
-		pgdir[(i + page_offset/getpagesize())/ptes_per_page]
+		pgdir[i/ptes_per_page]
 			= (((u32)linear + i*sizeof(u32)) | PAGE_PRESENT);
 	}
 
@@ -471,13 +404,13 @@ static void concat(char *dst, char *args
 
 /* This is where we actually tell the kernel to initialize the Guest.  We saw
  * the arguments it expects when we looked at initialize() in lguest_user.c:
- * the top physical page to allow, the top level pagetable, the entry point and
- * the page_offset constant for the Guest. */
-static int tell_kernel(u32 pgdir, u32 start, u32 page_offset)
+ * the top physical page to allow, the top level pagetable, and the entry point
+ * into the Guest. */
+static int tell_kernel(u32 pgdir, u32 start)
 {
 	u32 args[] = { LHREQ_INITIALIZE,
 		       LGUEST_GUEST_TOP/getpagesize(), /* Just below us */
-		       pgdir, start, page_offset };
+		       pgdir, start };
 	int fd;
 
 	fd = open_or_die("/dev/lguest", O_RDWR);
@@ -1442,9 +1375,9 @@ static void usage(void)
  */
 int main(int argc, char *argv[])
 {
-	/* Memory, top-level pagetable, code startpoint, PAGE_OFFSET and size
-	 * of the (optional) initrd. */
-	unsigned long mem, pgdir, start, page_offset, initrd_size = 0;
+	/* Memory, top-level pagetable, code startpoint and size of the
+	 * (optional) initrd. */
+	unsigned long mem, pgdir, start, initrd_size = 0;
 	/* A temporary, the /dev/lguest file descriptor, and a waker fd. */
 	int c, lguest_fd, waker_fd;
 	/* The list of Guest devices, based on command line arguments. */
@@ -1504,8 +1437,7 @@ int main(int argc, char *argv[])
 	map_zeroed_pages(0, mem / getpagesize());
 
 	/* Now we load the kernel */
-	start = load_kernel(open_or_die(argv[optind+1], O_RDONLY),
-			    &page_offset);
+	start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));
 
 	/* Write the device descriptors into memory for the Guest to know what
 	 * devices it has.  The descriptors are mapped just above normal
@@ -1526,7 +1458,7 @@ int main(int argc, char *argv[])
 	}
 
 	/* Set up the initial linear pagetables, starting below the initrd. */
-	pgdir = setup_pagetables(mem, initrd_size, page_offset);
+	pgdir = setup_pagetables(mem, initrd_size);
 
 	/* The Linux boot header contains an "E820" memory map: ours is a
 	 * simple, single region. */
@@ -1544,7 +1476,7 @@ int main(int argc, char *argv[])
 
 	/* We tell the kernel to initialize the Guest: this returns the open
 	 * /dev/lguest file descriptor. */
-	lguest_fd = tell_kernel(pgdir, start, page_offset);
+	lguest_fd = tell_kernel(pgdir, start);
 
 	/* We fork off a child process, which wakes the Launcher whenever one
 	 * of the input file descriptors needs attention.  Otherwise we would
diff -r 04ffad46c687 arch/i386/kernel/asm-offsets.c
--- a/arch/i386/kernel/asm-offsets.c	Fri May 04 22:30:51 2007 +1000
+++ b/arch/i386/kernel/asm-offsets.c	Fri May 04 22:37:09 2007 +1000
@@ -124,6 +124,7 @@ void foo(void)
 #ifdef CONFIG_LGUEST_GUEST
 	BLANK();
 	OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
+	OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
 	OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
 	OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
 	OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
diff -r 04ffad46c687 drivers/lguest/Makefile
--- a/drivers/lguest/Makefile	Fri May 04 22:30:51 2007 +1000
+++ b/drivers/lguest/Makefile	Fri May 04 22:37:09 2007 +1000
@@ -5,6 +5,8 @@ obj-$(CONFIG_LGUEST)	+= lg.o
 obj-$(CONFIG_LGUEST)	+= lg.o
 lg-objs := core.o hypercalls.o page_tables.o interrupts_and_traps.o \
 	segments.o io.o lguest_user.o switcher.o
+
+CFLAGS_interrupts_and_traps.o := -O0
 
 Preparation Preparation!: PREFIX=P
 Guest: PREFIX=G
diff -r 04ffad46c687 drivers/lguest/core.c
--- a/drivers/lguest/core.c	Fri May 04 22:30:51 2007 +1000
+++ b/drivers/lguest/core.c	Fri May 04 22:37:42 2007 +1000
@@ -262,12 +262,13 @@ static int emulate_insn(struct lguest *l
 	u8 insn;
 	unsigned int insnlen = 0, in = 0, shift = 0;
 	/* The eip contains the *virtual* address of the Guest's instruction:
-	 * guest_pa just subtracts the Guest's page_offset. */
+	 * guest_pa maps that to the guest-physical address. */
 	unsigned long physaddr = guest_pa(lg, lg->regs->eip);
 
-	/* The guest_pa() function only works for Guest kernel addresses, but
-	 * that's all we're trying to do anyway. */
-	if (lg->regs->eip < lg->page_offset)
+	/* This must be the Guest kernel trying to do something, not userspace!
+	 * The bottom two bits of the "cs" segment register are the privilege
+	 * level. */
+	if ((lg->regs->cs & 3) != GUEST_PL)
 		return 0;
 
 	/* Decoding x86 instructions is icky. */
diff -r 04ffad46c687 drivers/lguest/hypercalls.c
--- a/drivers/lguest/hypercalls.c	Fri May 04 22:30:51 2007 +1000
+++ b/drivers/lguest/hypercalls.c	Fri May 04 22:37:09 2007 +1000
@@ -221,12 +221,12 @@ static void initialize(struct lguest *lg
 	 * the range of addresses into "struct lguest_data". */
 	if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start)
 	    || get_user(lg->noirq_end, &lg->lguest_data->noirq_end)
-	    /* We tell the Guest that it can't use the top 4MB of virtual
-	     * addresses used by the Switcher. */
-	    || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem)
 	    /* We also give the Guest a unique id, as used in lguest_net.c. */
 	    || put_user(lg->guestid, &lg->lguest_data->guestid))
 		kill_guest(lg, "bad guest page %p", lg->lguest_data);
+
+	/* page_tables.c will also do some setup. */
+	page_table_guest_data_init(lg);
 
 	/* This is the one case where the above accesses might have been the
 	 * first write to a Guest page.  This may have caused a copy-on-write
diff -r 04ffad46c687 drivers/lguest/interrupts_and_traps.c
--- a/drivers/lguest/interrupts_and_traps.c	Fri May 04 22:30:51 2007 +1000
+++ b/drivers/lguest/interrupts_and_traps.c	Fri May 04 22:37:09 2007 +1000
@@ -54,8 +54,9 @@ static void push_guest_stack(struct lgue
  * it). */
 static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
 {
-	u32 __user *gstack;
+	u32 __user *gstack, *origstack;
 	u32 eflags, ss, irq_enable;
+	unsigned long virtstack;
 
 	/* There are two cases for interrupts: one where the Guest is already
 	 * in the kernel, and a more complex one where the Guest is in
@@ -63,8 +64,10 @@ static void set_guest_interrupt(struct l
 	if ((lg->regs->ss&0x3) != GUEST_PL) {
 		/* The Guest told us their kernel stack with the SET_STACK
 		 * hypercall: both the virtual address and the segment */
-		gstack = (u32 __user *)guest_pa(lg, lg->esp1);
+		virtstack = lg->esp1;
 		ss = lg->ss1;
+
+		origstack = gstack = (u32 __user *)guest_pa(lg, virtstack);
 		/* We push the old stack segment and pointer onto the new
 		 * stack: when the Guest does an "iret" back from the interrupt
 		 * handler the CPU will notice they're dropping privilege
@@ -73,8 +76,10 @@ static void set_guest_interrupt(struct l
 		push_guest_stack(lg, &gstack, lg->regs->esp);
 	} else {
 		/* We're staying on the same Guest (kernel) stack. */
-		gstack = (u32 __user *)guest_pa(lg, lg->regs->esp);
+		virtstack = lg->regs->esp;
 		ss = lg->regs->ss;
+
+		origstack = gstack = (u32 __user *)guest_pa(lg, virtstack);
 	}
 
 	/* Remember that we never let the Guest actually disable interrupts, so
@@ -100,7 +105,7 @@ static void set_guest_interrupt(struct l
 	/* Now we've pushed all the old state, we change the stack, the code
 	 * segment and the address to execute. */
 	lg->regs->ss = ss;
-	lg->regs->esp = (u32)gstack + lg->page_offset;
+	lg->regs->esp = virtstack + (gstack - origstack)*sizeof(unsigned long);
 	lg->regs->cs = (__KERNEL_CS|GUEST_PL);
 	lg->regs->eip = idt_address(lo, hi);
 
diff -r 04ffad46c687 drivers/lguest/lg.h
--- a/drivers/lguest/lg.h	Fri May 04 22:30:51 2007 +1000
+++ b/drivers/lguest/lg.h	Fri May 04 22:37:09 2007 +1000
@@ -142,7 +142,7 @@ struct lguest
 	struct mm_struct *mm; 	/* == tsk->mm, but that becomes NULL on exit */
 	u16 guestid;
 	u32 pfn_limit;
-	u32 page_offset;
+	unsigned long kernel_address;
 	u32 cr2;
 	int halted;
 	int ts;
@@ -231,6 +231,8 @@ void map_switcher_in_guest(struct lguest
 void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages);
 int demand_page(struct lguest *info, unsigned long cr2, int errcode);
 void pin_page(struct lguest *lg, unsigned long vaddr);
+unsigned long guest_pa(struct lguest *lg, unsigned long vaddr);
+void page_table_guest_data_init(struct lguest *lg);
 
 /* lguest_user.c: */
 int lguest_device_init(void);
@@ -283,9 +285,5 @@ do {								\
 } while(0)
 /* (End of aside) :*/
 
-static inline unsigned long guest_pa(struct lguest *lg, unsigned long vaddr)
-{
-	return vaddr - lg->page_offset;
-}
 #endif	/* __ASSEMBLY__ */
 #endif	/* _LGUEST_H */
diff -r 04ffad46c687 drivers/lguest/lguest.c
--- a/drivers/lguest/lguest.c	Fri May 04 22:30:51 2007 +1000
+++ b/drivers/lguest/lguest.c	Fri May 04 22:40:31 2007 +1000
@@ -83,6 +83,7 @@ struct lguest_data lguest_data = {
 	.hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
 	.noirq_start = (u32)lguest_noirq_start,
 	.noirq_end = (u32)lguest_noirq_end,
+	.kernel_address = PAGE_OFFSET,
 	.blocked_interrupts = { 1 }, /* Block timer interrupts */
 };
 struct lguest_device_desc *lguest_devices;
@@ -869,11 +870,7 @@ __init void do_lguest_init(void *boot)
 
 	/*G:070 Now we've seen all the paravirt_ops, we return to
 	 * do_lguest_init() where the rest of the fairly chaotic boot setup
-	 * occurs.
-	 *
-	 * The Host expects our first hypercall to tell it where our "struct
-	 * lguest_data" is, so we do that first. */
-	hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0);
+	 * occurs. */
 
 	/* The native boot code sets up initial page tables immediately after
 	 * the kernel itself, and sets init_pg_tables_end so they're not
diff -r 04ffad46c687 drivers/lguest/lguest_asm.S
--- a/drivers/lguest/lguest_asm.S	Fri May 04 22:30:51 2007 +1000
+++ b/drivers/lguest/lguest_asm.S	Fri May 04 22:37:09 2007 +1000
@@ -8,19 +8,40 @@
 #define X86_EFLAGS_IF 0x00000200
 
 /*G:020 This is where we begin: head.S notes that we're not running at
- * privilege level 0 (as we would be in native boot) and looks at the
- * platform type field in the boot structure (address in %esi).  If 1 (lguest),
- * it calls here.
+ * privilege level 0 (as we would be in native boot) and looks at the platform
+ * type field in the boot structure (address in %esi).  If 1 (lguest), it calls
+ * here.  We have to be careful though: we're running at low addresses, not
+ * above PAGE_OFFSET as expected (eg. 0xC0000000).
  *
  * The .section line puts this code in .init.text so it will be discarded after
  * boot. */
 .section .init.text, "ax", @progbits
 ENTRY(lguest_init)
+	/* Make initial hypercall now, so we can set up the pagetables. */
+	movl $LHCALL_LGUEST_INIT, %eax
+	movl $lguest_data - __PAGE_OFFSET, %edx
+	int $LGUEST_TRAP_ENTRY
+
+	/* Set up boot information pointer to hand to do_lguest_init() */
+	movl %esi, %eax
+	addl $__PAGE_OFFSET, %eax
+
+	/* Current toplevel is now in lguest_data.pgdir. */
+	movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi
+
+	/* Copy first 32 entries of page directory to __PAGE_OFFSET entries. */
+	movl $32, %ecx
+	movl %esi, %edi
+	addl $((__PAGE_OFFSET >> 22) * 4), %edi
+	rep
+	movsl
+
 	/* Set up initial stack. */
  	movl $(init_thread_union+THREAD_SIZE),%esp
-	/* Hand boot information pointer to do_lguest_init() */
-	movl %esi, %eax
-	jmp do_lguest_init
+
+	/* Jumps are relative, and we're running __PAGE_OFFSET too low at the
+	 * moment. */
+	jmp do_lguest_init+__PAGE_OFFSET
 
 /*G:055 We create a macro which puts the assembler code between lgstart_ and
  * lgend_ markers.  These templates end up in the .init.text section, so they
diff -r 04ffad46c687 drivers/lguest/lguest_user.c
--- a/drivers/lguest/lguest_user.c	Fri May 04 22:30:51 2007 +1000
+++ b/drivers/lguest/lguest_user.c	Fri May 04 22:37:09 2007 +1000
@@ -14,9 +14,7 @@
  *
  * Most of the Guest's registers are left alone: we used get_zeroed_page() to
  * allocate the structure, so they will be 0. */
-static void setup_regs(struct lguest_regs *regs,
-		       unsigned long start,
-		       unsigned long page_offset)
+static void setup_regs(struct lguest_regs *regs, unsigned long start)
 {
 	/* There are four "segment" registers which the Guest needs to boot:
 	 * The "code segment" register (cs) refers to the kernel code segment
@@ -38,9 +36,8 @@ static void setup_regs(struct lguest_reg
 	 * running. */
 	regs->eip = start;
 
-	/* %esi points to the virtual address of the boot information (it's at
-	 * physical address 0 == virtual address page_offset). */
-	regs->esi = page_offset;
+	/* %esi must point to the address of the boot information (at 0), and
+	 * it already does, so we don't touch it here. */
 }
 
 /*L:310 To send DMA into the Guest, the Launcher needs to be able to ask for a
@@ -117,7 +114,7 @@ static ssize_t read(struct file *file, c
 	return run_guest(lg, user);
 }
 
-/*L:020 The initialization write supplies 4 32-bit values (in addition to the
+/*L:020 The initialization write supplies 3 32-bit values (in addition to the
  * 32-bit LHREQ_INITIALIZE value).  These are:
  *
  * pfnlimit: The highest (Guest-physical) page number the Guest should be
@@ -128,12 +125,6 @@ static ssize_t read(struct file *file, c
  * pagetables (which are set up by the Launcher).
  *
  * start: The first instruction to execute ("eip" in x86-speak).
- *
- * page_offset: The PAGE_OFFSET constant in the Guest kernel.  We should
- * probably wean the code off this, but it's a very useful constant!  Any
- * address above this is within the Guest kernel, and any kernel address can
- * quickly converted from physical to virtual by adding PAGE_OFFSET.  It's
- * 0xC0000000 (3G) by default, but it's configurable at kernel build time.
  */
 static int initialize(struct file *file, const u32 __user *input)
 {
@@ -141,7 +132,7 @@ static int initialize(struct file *file,
 	 * Guest. */
 	struct lguest *lg;
 	int err, i;
-	u32 args[4];
+	u32 args[3];
 
 	/* You can't initialize twice!  Close the device and start again... */
 	if (file->private_data)
@@ -166,7 +157,6 @@ static int initialize(struct file *file,
 	/* Populate the easy fields of our "struct lguest" */
 	lg->guestid = i;
 	lg->pfn_limit = args[0];
-	lg->page_offset = args[3];
 
 	/* We need a complete page for the Guest registers: they are accessible
 	 * to the Guest and we can only grant it access to whole pages. */
@@ -187,7 +177,7 @@ static int initialize(struct file *file,
 
 	/* Now we initialize the Guest's registers, handing it the start
 	 * address. */
-	setup_regs(lg->regs, args[2], lg->page_offset);
+	setup_regs(lg->regs, args[2]);
 
 	/* There are a couple of GDT entries the Guest expects when first
 	 * booting. */
diff -r 04ffad46c687 drivers/lguest/page_tables.c
--- a/drivers/lguest/page_tables.c	Fri May 04 22:30:51 2007 +1000
+++ b/drivers/lguest/page_tables.c	Fri May 04 22:37:09 2007 +1000
@@ -13,6 +13,7 @@
 #include <linux/random.h>
 #include <linux/percpu.h>
 #include <asm/tlbflush.h>
+#include <asm/uaccess.h>
 #include "lg.h"
 
 /*M:008 We hold reference to pages, which prevents them from being swapped.
@@ -356,7 +357,7 @@ static void flush_user_mappings(struct l
 {
 	unsigned int i;
 	/* Release every pgd entry up to the kernel's address. */
-	for (i = 0; i < vaddr_to_pgd_index(lg->page_offset); i++)
+	for (i = 0; i < vaddr_to_pgd_index(lg->kernel_address); i++)
 		release_pgd(lg, lg->pgdirs[idx].pgdir + i);
 }
 
@@ -368,6 +369,25 @@ void guest_pagetable_flush_user(struct l
 	flush_user_mappings(lg, lg->pgdidx);
 }
 /*:*/
+
+/* We walk down the guest page tables to get a guest-physical address */
+unsigned long guest_pa(struct lguest *lg, unsigned long vaddr)
+{
+	gpgd_t gpgd;
+	gpte_t gpte;
+
+	/* First step: get the top-level Guest page table entry. */
+	gpgd = mkgpgd(lgread_u32(lg, gpgd_addr(lg, vaddr)));
+	/* Toplevel not present?  We can't map it in. */
+	if (!(gpgd.flags & _PAGE_PRESENT))
+		kill_guest(lg, "Bad address %#lx", vaddr);
+
+	gpte = mkgpte(lgread_u32(lg, gpte_addr(lg, gpgd, vaddr)));
+	if (!(gpgd.flags & _PAGE_PRESENT))
+		kill_guest(lg, "Bad address %#lx", vaddr);
+
+	return gpte.pfn * PAGE_SIZE | (vaddr & ~PAGE_MASK);
+}
 
 /* We keep several page tables.  This is a simple routine to find the page
  * table (if any) corresponding to this top-level address the Guest has given
@@ -510,7 +530,7 @@ void guest_set_pte(struct lguest *lg,
 {
 	/* Kernel mappings must be changed on all top levels.  Slow, but
 	 * doesn't happen often. */
-	if (vaddr >= lg->page_offset) {
+	if (vaddr >= lg->kernel_address) {
 		unsigned int i;
 		for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
 			if (lg->pgdirs[i].pgdir)
@@ -560,11 +580,6 @@ void guest_set_pmd(struct lguest *lg, un
  * its first page table is.  We set some things up here: */
 int init_guest_pagetable(struct lguest *lg, unsigned long pgtable)
 {
-	/* In flush_user_mappings() we loop from 0 to
-	 * "vaddr_to_pgd_index(lg->page_offset)".  This assumes it won't hit
-	 * the Switcher mappings, so check that now. */
-	if (vaddr_to_pgd_index(lg->page_offset) >= SWITCHER_PGD_INDEX)
-		return -EINVAL;
 	/* We start on the first shadow page table, and give it a blank PGD
 	 * page. */
 	lg->pgdidx = 0;
@@ -573,6 +588,24 @@ int init_guest_pagetable(struct lguest *
 	if (!lg->pgdirs[lg->pgdidx].pgdir)
 		return -ENOMEM;
 	return 0;
+}
+
+/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
+void page_table_guest_data_init(struct lguest *lg)
+{
+	/* We get the kernel address: above this is all kernel memory. */
+	if (get_user(lg->kernel_address, &lg->lguest_data->kernel_address)
+	    /* We tell the Guest that it can't use the top 4MB of virtual
+	     * addresses used by the Switcher. */
+	    || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem)
+	    || put_user(lg->pgdirs[lg->pgdidx].cr3, &lg->lguest_data->pgdir))
+		kill_guest(lg, "bad guest page %p", lg->lguest_data);
+
+	/* In flush_user_mappings() we loop from 0 to
+	 * "vaddr_to_pgd_index(lg->kernel_address)".  This assumes it won't hit
+	 * the Switcher mappings, so check that now. */
+	if (vaddr_to_pgd_index(lg->kernel_address) >= SWITCHER_PGD_INDEX)
+		kill_guest(lg, "bad kernel address %#lx", lg->kernel_address);
 }
 
 /* When a Guest dies, our cleanup is fairly simple. */
diff -r 04ffad46c687 include/linux/lguest.h
--- a/include/linux/lguest.h	Fri May 04 22:30:51 2007 +1000
+++ b/include/linux/lguest.h	Fri May 04 22:37:09 2007 +1000
@@ -2,9 +2,6 @@
  * this is subject to wild and random change between versions. */
 #ifndef _ASM_LGUEST_H
 #define _ASM_LGUEST_H
-
-#ifndef __ASSEMBLY__
-#include <asm/irq.h>
 
 #define LHCALL_FLUSH_ASYNC	0
 #define LHCALL_LGUEST_INIT	1
@@ -24,6 +21,11 @@
 #define LHCALL_SET_PMD		15
 #define LHCALL_LOAD_TLS		16
 
+#define LGUEST_TRAP_ENTRY 0x1F
+
+#ifndef __ASSEMBLY__
+#include <asm/irq.h>
+
 /*G:031 First, how does our Guest contact the Host to ask for privileged
  * operations?  There are two ways: the direct way is to make a "hypercall",
  * to make requests of the Host Itself.
@@ -37,8 +39,6 @@
  * Grossly invalid calls result in Sudden Death at the hands of the vengeful
  * Host, rather than returning failure.  This reflects Winston Churchill's
  * definition of a gentleman: "someone who is only rude intentionally". */
-#define LGUEST_TRAP_ENTRY 0x1F
-
 static inline unsigned long
 hcall(unsigned long call,
       unsigned long arg1, unsigned long arg2, unsigned long arg3)
@@ -99,10 +99,14 @@ struct lguest_data
 	unsigned long reserve_mem;
 	/* ID of this Guest (used by network driver to set ethernet address) */
 	u16 guestid;
+	/* Page where the top-level pagetable is */
+	unsigned long pgdir;
 
 /* Fields initialized by the Guest at boot: */
 	/* Instruction range to suppress interrupts even if enabled */
 	unsigned long noirq_start, noirq_end;
+	/* Address above which we can assume page tables don't often change. */
+	unsigned long kernel_address;
 };
 extern struct lguest_data lguest_data;
 #endif /* __ASSEMBLY__ */



^ permalink raw reply	[flat|nested] 37+ messages in thread

* [RFC PATCH 3/3] boot bzImages under paravirt
  2007-05-04 13:02 ` [RFC PATCH 2/3] lguest: Boot with virtual == physical to get closer to native Linux Rusty Russell
@ 2007-05-04 13:07   ` Rusty Russell
  2007-05-04 14:38     ` Eric W. Biederman
  0 siblings, 1 reply; 37+ messages in thread
From: Rusty Russell @ 2007-05-04 13:07 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Andi Kleen, Chris Wright, Jeremy Fitzhardinge, Zachary Amsden,
	Andrew Morton, Linus Torvalds, H. Peter Anvin,
	lkml - Kernel Mailing List

(This is kind of a bonus, but it shows how minor the change is to boot
bzImages)

Skipping over the "cli" and segment loading is enough to allow lguest
to boot bzImages.  There are some "out" insns in the unpacking code,
but lguest already has to emulate/skip-over them because of random x86
probes during boot.

We can no longer assume the launcher has set the bss to zero: we now
need to zero it ourselves.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/lguest/lguest.c    |  134 +++++++-------------------------------
 arch/i386/boot/compressed/head.S |    6 +
 drivers/lguest/core.c            |    8 +-
 drivers/lguest/lguest.c          |    2
 4 files changed, 41 insertions(+), 109 deletions(-)

diff -r 73d71b701360 arch/i386/boot/compressed/head.S
--- a/arch/i386/boot/compressed/head.S	Fri May 04 22:49:34 2007 +1000
+++ b/arch/i386/boot/compressed/head.S	Fri May 04 22:51:49 2007 +1000
@@ -32,6 +32,11 @@
 	.globl startup_32
 
 startup_32:
+#ifdef CONFIG_PARAVIRT
+        movl %cs,%eax
+        andl $0x3,%eax
+        jnz calc_delta
+#endif
 	cld
 	cli
 	movl $(__BOOT_DS),%eax
@@ -48,6 +53,7 @@ startup_32:
  * data at 0x34-0x3f are used as the stack for this calculation.
  * Only 4 bytes are needed.
  */
+calc_delta:
 	leal 0x40(%esi), %esp
 	call 1f
 1:	popl %ebp
diff -r 73d71b701360 drivers/lguest/lguest.c
--- a/drivers/lguest/lguest.c	Fri May 04 22:49:34 2007 +1000
+++ b/drivers/lguest/lguest.c	Fri May 04 22:51:49 2007 +1000
@@ -805,6 +805,8 @@ static unsigned lguest_patch(u8 type, u1
  * every routine we have to override to avoid privileged instructions. */
 __init void do_lguest_init(void *boot)
 {
+	memset(__bss_start, 0, __bss_stop - __bss_start);
+
 	/* Copy boot parameters first: the Launcher put the physical location
 	 * in %esi, and head.S converted that to a virtual address and handed
 	 * it to us. */
diff -r 73d71b701360 Documentation/lguest/lguest.c
--- a/Documentation/lguest/lguest.c	Fri May 04 22:49:34 2007 +1000
+++ b/Documentation/lguest/lguest.c	Fri May 04 22:53:31 2007 +1000
@@ -205,74 +205,30 @@ static unsigned long map_elf(int elf_fd,
 	return ehdr->e_entry;
 }
 
-/*L:160 Unfortunately the entire ELF image isn't compressed: the segments
- * which need loading are extracted and compressed raw.  This denies us the
- * information we need to make a fully-general loader. */
-static unsigned long unpack_bzimage(int fd)
-{
-	gzFile f;
-	int ret, len = 0;
-	/* A bzImage always gets loaded at physical address 1M.  This is
-	 * actually configurable as CONFIG_PHYSICAL_START, but as the comment
-	 * there says, "Don't change this unless you know what you are doing".
-	 * Indeed. */
-	void *img = (void *)0x100000;
-
-	/* gzdopen takes our file descriptor (carefully placed at the start of
-	 * the GZIP header we found) and returns a gzFile. */
-	f = gzdopen(fd, "rb");
-	/* We read it into memory in 64k chunks until we hit the end. */
-	while ((ret = gzread(f, img + len, 65536)) > 0)
-		len += ret;
-	if (ret < 0)
-		err(1, "reading image from bzImage");
-
-	verbose("Unpacked size %i addr %p\n", len, img);
-
-	/* Entry is physical address: convert to virtual */
-	return (unsigned long)img;
-}
-
-/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded.  You're
- * supposed to jump into it and it will unpack itself.  We can't do that
- * because the Guest can't run the unpacking code, and adding features to
- * lguest kills puppies, so we don't want to.
- *
- * The bzImage is formed by putting the decompressing code in front of the
- * compressed kernel code.  So we can simple scan through it looking for the
- * first "gzip" header, and start decompressing from there. */
+/*L:150 A bzImage, unlike an ELF file, is not meant to be mapped into memory.
+ * You're supposed to jump into it and it will unpack itself.  We used to have
+ * to perform some hairy magic becuase we couldn't run the unpacking code, and
+ * adding features to lguest kills puppies, so we didn't want to.
+ *
+ * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard to fix, so
+ * now we just read the funky header so we know where in the file to load, and
+ * away we go. */
 static unsigned long load_bzimage(int fd)
 {
-	unsigned char c;
-	int state = 0;
-
-	/* GZIP header is 0x1F 0x8B <method> <flags>... <compressed-by>. */
-	while (read(fd, &c, 1) == 1) {
-		switch (state) {
-		case 0:
-			if (c == 0x1F)
-				state++;
-			break;
-		case 1:
-			if (c == 0x8B)
-				state++;
-			else
-				state = 0;
-			break;
-		case 2 ... 8:
-			state++;
-			break;
-		case 9:
-			/* Seek back to the start of the gzip header. */
-			lseek(fd, -10, SEEK_CUR);
-			/* One final check: "compressed under UNIX". */
-			if (c != 0x03)
-				state = -1;
-			else
-				return unpack_bzimage(fd);
-		}
-	}
-	errx(1, "Could not find kernel in bzImage");
+#warning document this with reference to Documentation/i386/boot.txt
+	u8 hdr[0x300];
+	int r;
+	void *p = (void *)0x100000;
+
+	lseek(fd, 0, SEEK_SET);
+	read(fd, hdr, sizeof(hdr));
+
+	lseek(fd, (unsigned long)(hdr[0x1F1]+1) * 512, SEEK_SET);
+
+	while ((r = read(fd, p, 65535)) > 0)
+		p += r;
+
+	return *(unsigned long *)&hdr[0x214];
 }
 
 /*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels



^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 1/3] Replace paravirt_probe with "platform type" boot header field
  2007-05-04 12:59 [RFC PATCH 1/3] Replace paravirt_probe with "platform type" boot header field Rusty Russell
  2007-05-04 13:02 ` [RFC PATCH 2/3] lguest: Boot with virtual == physical to get closer to native Linux Rusty Russell
@ 2007-05-04 14:01 ` Eric W. Biederman
  2007-05-04 14:18   ` Rusty Russell
  1 sibling, 1 reply; 37+ messages in thread
From: Eric W. Biederman @ 2007-05-04 14:01 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Andi Kleen, Chris Wright, Jeremy Fitzhardinge, Zachary Amsden,
	Andrew Morton, Linus Torvalds, H. Peter Anvin,
	lkml - Kernel Mailing List

Rusty Russell <rusty@rustcorp.com.au> writes:

> [ This patch won't apply, even to -mm: it comes after the documentation
> patch.  But it's hopefully readable ]
>
> 1) This code assumes that 0x23C is going to be the "platform type"
>    field of the next version of the Linux boot header format.
>
> 2) This doesn't actually assume that P=V, but unlike the paravirt
>    probe code doesn't assume that V=P+PAGE_OFFSET either.
>
> The following patch actually fixes up lguest to boot with P=V.

We should place that initial branch right after startup_32,
we don't need the startup_paravirt at all now.

ENTRY(startup_32)
#ifdef CONFIG_LGUEST_GUEST
        cmpl	$1, 0x23c(%esi)
	je      lguest_init
#endif

...

Although it might make sense to do:
ENTRY(startup_32)
	cmpl	$0, 0x23c(%esi)
        jnz	startup_subarch

To get the branches out of the main flow of execution but
that is minor.  This isn't a performance critical path.

Eric

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 1/3] Replace paravirt_probe with "platform type" boot header field
  2007-05-04 14:01 ` [RFC PATCH 1/3] Replace paravirt_probe with "platform type" boot header field Eric W. Biederman
@ 2007-05-04 14:18   ` Rusty Russell
  2007-05-04 14:23     ` Eric W. Biederman
                       ` (2 more replies)
  0 siblings, 3 replies; 37+ messages in thread
From: Rusty Russell @ 2007-05-04 14:18 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Andi Kleen, Chris Wright, Jeremy Fitzhardinge, Zachary Amsden,
	Andrew Morton, Linus Torvalds, H. Peter Anvin,
	lkml - Kernel Mailing List

On Fri, 2007-05-04 at 08:01 -0600, Eric W. Biederman wrote:
> We should place that initial branch right after startup_32,
> we don't need the startup_paravirt at all now.
> 
> ENTRY(startup_32)
> #ifdef CONFIG_LGUEST_GUEST
>         cmpl	$1, 0x23c(%esi)
> 	je      lguest_init
> #endif

Hi Eric!

	Makes sense to me, but I wasn't sure... do we need to check for old
bootloader versions and such?

Thanks,
Rusty.


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 1/3] Replace paravirt_probe with "platform type" boot header field
  2007-05-04 14:18   ` Rusty Russell
@ 2007-05-04 14:23     ` Eric W. Biederman
  2007-05-04 15:52       ` H. Peter Anvin
  2007-05-04 15:10     ` Eric W. Biederman
  2007-05-04 15:53     ` H. Peter Anvin
  2 siblings, 1 reply; 37+ messages in thread
From: Eric W. Biederman @ 2007-05-04 14:23 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Andi Kleen, Chris Wright, Jeremy Fitzhardinge, Zachary Amsden,
	Andrew Morton, Linus Torvalds, H. Peter Anvin,
	lkml - Kernel Mailing List

Rusty Russell <rusty@rustcorp.com.au> writes:

> On Fri, 2007-05-04 at 08:01 -0600, Eric W. Biederman wrote:
>> We should place that initial branch right after startup_32,
>> we don't need the startup_paravirt at all now.
>> 
>> ENTRY(startup_32)
>> #ifdef CONFIG_LGUEST_GUEST
>>         cmpl	$1, 0x23c(%esi)
>> 	je      lguest_init
>> #endif
>
> Hi Eric!
>
> 	Makes sense to me, but I wasn't sure... do we need to check for old
> bootloader versions and such?

Unlikely.  Unless we expect that this offset will come in non-zero.

Eric

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 3/3] boot bzImages under paravirt
  2007-05-04 13:07   ` [RFC PATCH 3/3] boot bzImages under paravirt Rusty Russell
@ 2007-05-04 14:38     ` Eric W. Biederman
  2007-05-04 14:55       ` Rusty Russell
  2007-05-04 15:15       ` Jeremy Fitzhardinge
  0 siblings, 2 replies; 37+ messages in thread
From: Eric W. Biederman @ 2007-05-04 14:38 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Andi Kleen, Chris Wright, Jeremy Fitzhardinge, Zachary Amsden,
	Andrew Morton, Linus Torvalds, H. Peter Anvin,
	lkml - Kernel Mailing List

Rusty Russell <rusty@rustcorp.com.au> writes:

> (This is kind of a bonus, but it shows how minor the change is to boot
> bzImages)
>
> Skipping over the "cli" and segment loading is enough to allow lguest
> to boot bzImages.  There are some "out" insns in the unpacking code,
> but lguest already has to emulate/skip-over them because of random x86
> probes during boot.
>
> We can no longer assume the launcher has set the bss to zero: we now
> need to zero it ourselves.

Ok.  Although we can hoist the bss zeroing, if everything needs it.

Are you booting with P=V now?  If not I expect you will run
into trouble if you set CONFIG_RELOCATABLE.

> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
> ---
>  Documentation/lguest/lguest.c    |  134 +++++++-------------------------------
>  arch/i386/boot/compressed/head.S |    6 +
>  drivers/lguest/core.c            |    8 +-
>  drivers/lguest/lguest.c          |    2
>  4 files changed, 41 insertions(+), 109 deletions(-)
>
> diff -r 73d71b701360 arch/i386/boot/compressed/head.S
> --- a/arch/i386/boot/compressed/head.S	Fri May 04 22:49:34 2007 +1000
> +++ b/arch/i386/boot/compressed/head.S	Fri May 04 22:51:49 2007 +1000
> @@ -32,6 +32,11 @@
>  	.globl startup_32
>  
>  startup_32:
> +#ifdef CONFIG_PARAVIRT
> +        movl %cs,%eax
> +        andl $0x3,%eax
> +        jnz calc_delta
> +#endif
>  	cld
>  	cli
>  	movl $(__BOOT_DS),%eax
> @@ -48,6 +53,7 @@ startup_32:
>   * data at 0x34-0x3f are used as the stack for this calculation.
>   * Only 4 bytes are needed.
>   */
> +calc_delta:
>  	leal 0x40(%esi), %esp
>  	call 1f
>  1:	popl %ebp

In essence this is ok.  I would like to do the research to see if we
can perhaps just remove the segment reload.  Short of that we should
at least be able to remove the unnecessary preprocessor test for
CONFIG_PARAVIRT.

Hmm.  I'm wondering about the segment reload and how much of a problem
that is.  My memory says that segment reloads are not actually a
privileged operation, so we may be able to support this even in
paravirt mode.  How hard would that be to support?  The segment
we reload is a fixed part of our boot protocol.

Anyway something to research.

> +#warning document this with reference to Documentation/i386/boot.txt
> +	u8 hdr[0x300];
> +	int r;
> +	void *p = (void *)0x100000;
> +
> +	lseek(fd, 0, SEEK_SET);
> +	read(fd, hdr, sizeof(hdr));
> +
> +	lseek(fd, (unsigned long)(hdr[0x1F1]+1) * 512, SEEK_SET);

I guess this works.  You don't handle the old case of set_sectors == 0
but I guess you don't support any kernels where that is the case.

> +
> +	while ((r = read(fd, p, 65535)) > 0)
> +		p += r;
> +
> +	return *(unsigned long *)&hdr[0x214];
>  }
>  
>  /*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels

Eric

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 3/3] boot bzImages under paravirt
  2007-05-04 14:38     ` Eric W. Biederman
@ 2007-05-04 14:55       ` Rusty Russell
  2007-05-04 15:49         ` H. Peter Anvin
  2007-05-04 15:15       ` Jeremy Fitzhardinge
  1 sibling, 1 reply; 37+ messages in thread
From: Rusty Russell @ 2007-05-04 14:55 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Andi Kleen, Chris Wright, Jeremy Fitzhardinge, Zachary Amsden,
	Andrew Morton, Linus Torvalds, H. Peter Anvin,
	lkml - Kernel Mailing List

On Fri, 2007-05-04 at 08:38 -0600, Eric W. Biederman wrote:
> Rusty Russell <rusty@rustcorp.com.au> writes:
> > We can no longer assume the launcher has set the bss to zero: we now
> > need to zero it ourselves.
> 
> Ok.  Although we can hoist the bss zeroing, if everything needs it.

Hi Eric!

	Yeah, Jeremy said he wanted it for Xen.  This is the advantage of
having code in front of us tho.

> Are you booting with P=V now?  If not I expect you will run
> into trouble if you set CONFIG_RELOCATABLE.

	Yep... haven't tried CONFIG_RELOCATABLE tho.  It's kinda exotic 8)

> Hmm.  I'm wondering about the segment reload and how much of a problem
> that is.  My memory says that segment reloads are not actually a
> privileged operation, so we may be able to support this even in
> paravirt mode.  How hard would that be to support?  The segment
> we reload is a fixed part of our boot protocol.

It was a PITA because we can't just load $0x18 into %ss: it has to be
$0x19.  I started doing it (read %cs, mask off bottom 2 bits, orl
__BOOT_DS), but between that extra code and the extra code to then
change segments back...  well, we have to skip the cli anyway so it was
easy to skip them all.

I think Jeremy said Xen doesn't start on the boot descriptors either.  I
have to wonder why we don't just run on the boot descriptors forever
(#define __KERNEL_DS __BOOT_DS etc)?

> I guess this works.  You don't handle the old case of set_sectors == 0
> but I guess you don't support any kernels where that is the case.

Yes, I want to clean that up with proper checks for signatures
(currently, if it's not ELF, it's assumed to be a bzImage).  Might as
well at least error out if there's some surprise.

Thanks for the feedback!
Rusty.
Eric


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 1/3] Replace paravirt_probe with "platform type" boot header field
  2007-05-04 14:18   ` Rusty Russell
  2007-05-04 14:23     ` Eric W. Biederman
@ 2007-05-04 15:10     ` Eric W. Biederman
  2007-05-04 15:53     ` H. Peter Anvin
  2 siblings, 0 replies; 37+ messages in thread
From: Eric W. Biederman @ 2007-05-04 15:10 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Andi Kleen, Chris Wright, Jeremy Fitzhardinge, Zachary Amsden,
	Andrew Morton, Linus Torvalds, H. Peter Anvin,
	lkml - Kernel Mailing List

Rusty Russell <rusty@rustcorp.com.au> writes:

> On Fri, 2007-05-04 at 08:01 -0600, Eric W. Biederman wrote:
>> We should place that initial branch right after startup_32,
>> we don't need the startup_paravirt at all now.
>> 
>> ENTRY(startup_32)
>> #ifdef CONFIG_LGUEST_GUEST
>>         cmpl	$1, 0x23c(%esi)
>> 	je      lguest_init
>> #endif
>
> Hi Eric!
>
> 	Makes sense to me, but I wasn't sure... do we need to check for old
> bootloader versions and such?

Thinking about this a little more.  It probably makes sense to move the
test for lguest down to just before we initialize the page tables.  That
way we can share common bss clearing code and parameter parsing setup
code.

Eric


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 3/3] boot bzImages under paravirt
  2007-05-04 14:38     ` Eric W. Biederman
  2007-05-04 14:55       ` Rusty Russell
@ 2007-05-04 15:15       ` Jeremy Fitzhardinge
  2007-05-04 15:45         ` H. Peter Anvin
  2007-05-04 16:46         ` Eric W. Biederman
  1 sibling, 2 replies; 37+ messages in thread
From: Jeremy Fitzhardinge @ 2007-05-04 15:15 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Rusty Russell, Andi Kleen, Chris Wright, Zachary Amsden,
	Andrew Morton, Linus Torvalds, H. Peter Anvin,
	lkml - Kernel Mailing List

Eric W. Biederman wrote:
> Ok.  Although we can hoist the bss zeroing, if everything needs it.
>   

It will if we're booting out of bzImage; the bss won't be clear in that
case.

> Hmm.  I'm wondering about the segment reload and how much of a problem
> that is.  My memory says that segment reloads are not actually a
> privileged operation, so we may be able to support this even in
> paravirt mode.  How hard would that be to support?  The segment
> we reload is a fixed part of our boot protocol.
>   

The problem is not the reloads themselves, but what you're reloading
them with.  If we come up under Xen, then it will provide a default GDT
and pre-load the segments with flat 4G(-ish) selectors - but the
selectors won't be the normal Linux ones.

So if we reload using a constant selector, then that will break under
Xen. But if we do a:

    mov %cs, %eax
    mov %eax, %ds
    // etc

sequence then it should be fine.  This will work even for loading %ss,
since the %cs CPL will equal the RPL needed for %ss.

    J

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 3/3] boot bzImages under paravirt
  2007-05-04 15:15       ` Jeremy Fitzhardinge
@ 2007-05-04 15:45         ` H. Peter Anvin
  2007-05-04 16:13           ` Jeremy Fitzhardinge
  2007-05-04 16:46         ` Eric W. Biederman
  1 sibling, 1 reply; 37+ messages in thread
From: H. Peter Anvin @ 2007-05-04 15:45 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Eric W. Biederman, Rusty Russell, Andi Kleen, Chris Wright,
	Zachary Amsden, Andrew Morton, Linus Torvalds,
	lkml - Kernel Mailing List

Jeremy Fitzhardinge wrote:
> 
> So if we reload using a constant selector, then that will break under
> Xen. But if we do a:
> 
>     mov %cs, %eax
>     mov %eax, %ds
>     // etc
> 
> sequence then it should be fine.  This will work even for loading %ss,
> since the %cs CPL will equal the RPL needed for %ss.

In 32-bit mode?  Surely you're joking, Mr. Feynman!

In protected mode, you *must* have different descriptor values for the
code segment as opposed to the data segments, at least if you ever want
to write anything to the data segments!

What's worse, reloading segments here might be highly unsafe, if the
memory previously occupied by the GDT has been overwritten.  Keep in
mind the GDT is touched on a segment *load*, not on a segment *access*;
in areas such as booting that can be a huge difference.

	-hpa

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 3/3] boot bzImages under paravirt
  2007-05-04 14:55       ` Rusty Russell
@ 2007-05-04 15:49         ` H. Peter Anvin
  0 siblings, 0 replies; 37+ messages in thread
From: H. Peter Anvin @ 2007-05-04 15:49 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Eric W. Biederman, Andi Kleen, Chris Wright, Jeremy Fitzhardinge,
	Zachary Amsden, Andrew Morton, Linus Torvalds,
	lkml - Kernel Mailing List

Rusty Russell wrote:
> 
> It was a PITA because we can't just load $0x18 into %ss: it has to be
> $0x19.  I started doing it (read %cs, mask off bottom 2 bits, orl
> __BOOT_DS), but between that extra code and the extra code to then
> change segments back...  well, we have to skip the cli anyway so it was
> easy to skip them all.
> 
> I think Jeremy said Xen doesn't start on the boot descriptors either.  I
> have to wonder why we don't just run on the boot descriptors forever
> (#define __KERNEL_DS __BOOT_DS etc)?
> 

You can't do that once you define your own layout, or you introduce a
nasty intermodule dependency which shouldn't exist.

The rule should be:

- You MUST NOT reload segments before LGDT.
  (One can possibly get away with declaring that DS = CS+8 by
  protocol and load dynamic values, but that's not a good thing.)
- After LGDT, you MUST load your own segment values.

Anything else is just broken.

	-hpa

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 1/3] Replace paravirt_probe with "platform type" boot header field
  2007-05-04 14:23     ` Eric W. Biederman
@ 2007-05-04 15:52       ` H. Peter Anvin
  2007-05-04 16:48         ` Eric W. Biederman
  0 siblings, 1 reply; 37+ messages in thread
From: H. Peter Anvin @ 2007-05-04 15:52 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Rusty Russell, Andi Kleen, Chris Wright, Jeremy Fitzhardinge,
	Zachary Amsden, Andrew Morton, Linus Torvalds,
	lkml - Kernel Mailing List

Eric W. Biederman wrote:
> 
> Unlikely.  Unless we expect that this offset will come in non-zero.
> 

You might have to worry about that.  Historically, the "zero-page" was
really just the setup code overwritten, and it's still true for a big
chunk of it.

One of the major changes in my setup code rewrite is to start out with
an all-zero chunk of memory for this.

	-hpa

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 1/3] Replace paravirt_probe with "platform type" boot header field
  2007-05-04 14:18   ` Rusty Russell
  2007-05-04 14:23     ` Eric W. Biederman
  2007-05-04 15:10     ` Eric W. Biederman
@ 2007-05-04 15:53     ` H. Peter Anvin
  2 siblings, 0 replies; 37+ messages in thread
From: H. Peter Anvin @ 2007-05-04 15:53 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Eric W. Biederman, Andi Kleen, Chris Wright, Jeremy Fitzhardinge,
	Zachary Amsden, Andrew Morton, Linus Torvalds,
	lkml - Kernel Mailing List

Rusty Russell wrote:
>>
>> ENTRY(startup_32)
>> #ifdef CONFIG_LGUEST_GUEST
>>         cmpl	$1, 0x23c(%esi)
>> 	je      lguest_init
>> #endif
> 
> Hi Eric!
> 
> 	Makes sense to me, but I wasn't sure... do we need to check for old
> bootloader versions and such?

So, in summary, yes, you have to check the version number before
assigning any meaning to this field.

	-hpa

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 3/3] boot bzImages under paravirt
  2007-05-04 15:45         ` H. Peter Anvin
@ 2007-05-04 16:13           ` Jeremy Fitzhardinge
  2007-05-04 16:43             ` H. Peter Anvin
  2007-05-04 16:57             ` Eric W. Biederman
  0 siblings, 2 replies; 37+ messages in thread
From: Jeremy Fitzhardinge @ 2007-05-04 16:13 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Eric W. Biederman, Rusty Russell, Andi Kleen, Chris Wright,
	Zachary Amsden, Andrew Morton, Linus Torvalds,
	lkml - Kernel Mailing List

H. Peter Anvin wrote:
> In 32-bit mode?  Surely you're joking, Mr. Feynman!
>   

Right, yes.

> What's worse, reloading segments here might be highly unsafe, if the
> memory previously occupied by the GDT has been overwritten.  Keep in
> mind the GDT is touched on a segment *load*, not on a segment *access*;
> in areas such as booting that can be a huge difference.
>   

Yep, suits me.  I'm happy for the code to assume that at least %cs and
%ds are sane; I guess %ss too.  We could copy %ds into %[efg]s if we
want to be sure (since I could imagine a bootloader leaving them in a
less defined state).

But if the gdt could be missing altogether, then, yes, we should not
touch them at all.

    J

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 3/3] boot bzImages under paravirt
  2007-05-04 16:13           ` Jeremy Fitzhardinge
@ 2007-05-04 16:43             ` H. Peter Anvin
  2007-05-04 16:57             ` Eric W. Biederman
  1 sibling, 0 replies; 37+ messages in thread
From: H. Peter Anvin @ 2007-05-04 16:43 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Eric W. Biederman, Rusty Russell, Andi Kleen, Chris Wright,
	Zachary Amsden, Andrew Morton, Linus Torvalds,
	lkml - Kernel Mailing List

Jeremy Fitzhardinge wrote:
> H. Peter Anvin wrote:
>> In 32-bit mode?  Surely you're joking, Mr. Feynman!
>>   
> 
> Right, yes.
> 
>> What's worse, reloading segments here might be highly unsafe, if the
>> memory previously occupied by the GDT has been overwritten.  Keep in
>> mind the GDT is touched on a segment *load*, not on a segment *access*;
>> in areas such as booting that can be a huge difference.
>>   
> 
> Yep, suits me.  I'm happy for the code to assume that at least %cs and
> %ds are sane; I guess %ss too.  We could copy %ds into %[efg]s if we
> want to be sure (since I could imagine a bootloader leaving them in a
> less defined state).

No, we shouldn't.  %es should be assumed set up (this is 32-bit code,
after all!), and %fs and %gs should not be used.

> But if the gdt could be missing altogether, then, yes, we should not
> touch them at all.

Exactly.  Not relying on a set-up GDT is the safest option, IMNSHO.

	-hpa

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 3/3] boot bzImages under paravirt
  2007-05-04 15:15       ` Jeremy Fitzhardinge
  2007-05-04 15:45         ` H. Peter Anvin
@ 2007-05-04 16:46         ` Eric W. Biederman
  2007-05-04 17:25           ` Jeremy Fitzhardinge
  1 sibling, 1 reply; 37+ messages in thread
From: Eric W. Biederman @ 2007-05-04 16:46 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Rusty Russell, Andi Kleen, Chris Wright, Zachary Amsden,
	Andrew Morton, Linus Torvalds, H. Peter Anvin,
	lkml - Kernel Mailing List

Jeremy Fitzhardinge <jeremy@goop.org> writes:

> Eric W. Biederman wrote:
>> Ok.  Although we can hoist the bss zeroing, if everything needs it.
>>   
>
> It will if we're booting out of bzImage; the bss won't be clear in that
> case.
>
>> Hmm.  I'm wondering about the segment reload and how much of a problem
>> that is.  My memory says that segment reloads are not actually a
>> privileged operation, so we may be able to support this even in
>> paravirt mode.  How hard would that be to support?  The segment
>> we reload is a fixed part of our boot protocol.
>>   
>
> The problem is not the reloads themselves, but what you're reloading
> them with.  If we come up under Xen, then it will provide a default GDT
> and pre-load the segments with flat 4G(-ish) selectors - but the
> selectors won't be the normal Linux ones.
>
> So if we reload using a constant selector, then that will break under
> Xen. But if we do a:
>
>     mov %cs, %eax
>     mov %eax, %ds
>     // etc

Hmm. If we made that:
	mov	%cs, %eax
	add	$0x10, %eax
	mov	%eax, %ds

That is likely even backwards compatible.  If you don't mind having a fixed
offset between the code and the data segments.  As I recall code and data
are not interchangeable.

I'm trying to remember the reason for the reloads.

As I recall loadlin intercepts code32_start from head.S and so it
can do things just after we have switched to protected mode.  Because
historically we didn't load the segments before this jump loadlin had
to do it.  The code of loadlin appears to reload all of the segments
just like head.S does and then not touch them.

My two bootloaders that enter the kernel at the 32bit entry point already
load the segments as well.

Gujin looks like it loads just %es and %ds.

It is hard to tell with elilo what it sets up, it preserves the
descriptors from EFI, but sets up a linux boot protocol gdt.

Since setup.S finally does the right thing in loading segment
registers.  It looks to me like we need to sit down and document
the 32bit kernel interface, as it is today, and then extend
things to just replicate %ds into the other segments, and kill
any lss instructions.

That is trivially compatible with our expectations of vmlinux.  For
the bzImage 32bit entry point it is a bit of an extension, but except
for para-virtualization solutions I don't know of anything that
doesn't support bzImage and vmlinux with the same code.

I like it a lot better then trying to derive %ds from %cs.

At the same time we are doing this it would be good to drop our
boot protocol version into an ELF note so people booting vmlinux
can discover when we have relaxed various restrictions and which
fields in the real mode data we support.

Eric

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 1/3] Replace paravirt_probe with "platform type" boot header field
  2007-05-04 15:52       ` H. Peter Anvin
@ 2007-05-04 16:48         ` Eric W. Biederman
  2007-05-04 17:13           ` H. Peter Anvin
  0 siblings, 1 reply; 37+ messages in thread
From: Eric W. Biederman @ 2007-05-04 16:48 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Rusty Russell, Andi Kleen, Chris Wright, Jeremy Fitzhardinge,
	Zachary Amsden, Andrew Morton, Linus Torvalds,
	lkml - Kernel Mailing List

"H. Peter Anvin" <hpa@zytor.com> writes:

> Eric W. Biederman wrote:
>> 
>> Unlikely.  Unless we expect that this offset will come in non-zero.
>> 
>
> You might have to worry about that.  Historically, the "zero-page" was
> really just the setup code overwritten, and it's still true for a big
> chunk of it.
>
> One of the major changes in my setup code rewrite is to start out with
> an all-zero chunk of memory for this.

Well as long as we are in sync with setup.S we are fine.  The issue
is people generating the real-mode data from scratch, when using the
32bit entry point.

Eric

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 3/3] boot bzImages under paravirt
  2007-05-04 16:13           ` Jeremy Fitzhardinge
  2007-05-04 16:43             ` H. Peter Anvin
@ 2007-05-04 16:57             ` Eric W. Biederman
  2007-05-04 17:07               ` H. Peter Anvin
  1 sibling, 1 reply; 37+ messages in thread
From: Eric W. Biederman @ 2007-05-04 16:57 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: H. Peter Anvin, Rusty Russell, Andi Kleen, Chris Wright,
	Zachary Amsden, Andrew Morton, Linus Torvalds,
	lkml - Kernel Mailing List

Jeremy Fitzhardinge <jeremy@goop.org> writes:

> H. Peter Anvin wrote:
>> In 32-bit mode?  Surely you're joking, Mr. Feynman!
>>   
>
> Right, yes.
>
>> What's worse, reloading segments here might be highly unsafe, if the
>> memory previously occupied by the GDT has been overwritten.  Keep in
>> mind the GDT is touched on a segment *load*, not on a segment *access*;
>> in areas such as booting that can be a huge difference.
>>   
>
> Yep, suits me.  I'm happy for the code to assume that at least %cs and
> %ds are sane; I guess %ss too.  We could copy %ds into %[efg]s if we
> want to be sure (since I could imagine a bootloader leaving them in a
> less defined state).
>
> But if the gdt could be missing altogether, then, yes, we should not
> touch them at all.

Peter has a good point here.  We have been reloading segments historically
so the existing boot loaders won't fall over.  If we could load the gdt
in the paravirt solutions all would be simple.

We have real mystery historical cases in Gujin and ELILO.  So it
probably makes sense at this point to force a gdt reload if we can and
otherwise require all of the segments %ds, %es, %fs, %gs to be loaded
with a valid segment, that we can reach everything we need to touch
from.

Eric

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 3/3] boot bzImages under paravirt
  2007-05-04 16:57             ` Eric W. Biederman
@ 2007-05-04 17:07               ` H. Peter Anvin
  2007-05-04 17:30                 ` Eric W. Biederman
  0 siblings, 1 reply; 37+ messages in thread
From: H. Peter Anvin @ 2007-05-04 17:07 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Jeremy Fitzhardinge, Rusty Russell, Andi Kleen, Chris Wright,
	Zachary Amsden, Andrew Morton, Linus Torvalds,
	lkml - Kernel Mailing List

Eric W. Biederman wrote:
> 
> We have real mystery historical cases in Gujin and ELILO.  So it
> probably makes sense at this point to force a gdt reload if we can and
> otherwise require all of the segments %ds, %es, %fs, %gs to be loaded
> with a valid segment, that we can reach everything we need to touch
> from.
> 

I think we should avoid using %fs and %gs (no use for them, anyway); but
 I think it's a decent expectation to have %cs, %ds, %es, and %ss set up.

Gujin seems to have a near-zero user community, so if they have to rev
their code it wouldn't be a big deal (the author keeps trying to push
some crack-smoking "Gujin native" patches into the kernel, too), but
breaking ELILO would hurt anyone using Intel Macs.

	-hpa

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 1/3] Replace paravirt_probe with "platform type" boot header field
  2007-05-04 16:48         ` Eric W. Biederman
@ 2007-05-04 17:13           ` H. Peter Anvin
  2007-05-04 18:30             ` Eric W. Biederman
  0 siblings, 1 reply; 37+ messages in thread
From: H. Peter Anvin @ 2007-05-04 17:13 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Rusty Russell, Andi Kleen, Chris Wright, Jeremy Fitzhardinge,
	Zachary Amsden, Andrew Morton, Linus Torvalds,
	lkml - Kernel Mailing List

Eric W. Biederman wrote:
> "H. Peter Anvin" <hpa@zytor.com> writes:
> 
>> Eric W. Biederman wrote:
>>> Unlikely.  Unless we expect that this offset will come in non-zero.
>>>
>> You might have to worry about that.  Historically, the "zero-page" was
>> really just the setup code overwritten, and it's still true for a big
>> chunk of it.
>>
>> One of the major changes in my setup code rewrite is to start out with
>> an all-zero chunk of memory for this.
> 
> Well as long as we are in sync with setup.S we are fine.  The issue
> is people generating the real-mode data from scratch, when using the
> 32bit entry point.

Point.  I'd like that interface to specify that any undefined fields
should be zero, or we have a hopeless situation on our hands.

By the way, see the following for the structure definition; prepare to barf:

http://git.kernel.org/?p=linux/kernel/git/hpa/linux-2.6-newsetup.git;a=blob;f=arch/i386/boot/boot.h;h=41a16f96ac3476cbd969aabe5e6a792ffe8c64a0;hb=HEAD

[I intend to move this into include/asm-i386/boot.h, but haven't gotten
that far yet.  I just yesterday got the code booting on both i386 and
x86-64, but haven't committed all the include/asm-* mods that went a
long with that yet.  I'm also waiting for a Kbuild fix so that the boot
directory can be shared between i386 and x86-64 without requiring a
symlink in the source tarball.]

	-hpa

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 3/3] boot bzImages under paravirt
  2007-05-04 16:46         ` Eric W. Biederman
@ 2007-05-04 17:25           ` Jeremy Fitzhardinge
  2007-05-04 17:27             ` H. Peter Anvin
  2007-05-04 17:36             ` Eric W. Biederman
  0 siblings, 2 replies; 37+ messages in thread
From: Jeremy Fitzhardinge @ 2007-05-04 17:25 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Rusty Russell, Andi Kleen, Chris Wright, Zachary Amsden,
	Andrew Morton, Linus Torvalds, H. Peter Anvin,
	lkml - Kernel Mailing List

Eric W. Biederman wrote:
> Hmm. If we made that:
> 	mov	%cs, %eax
> 	add	$0x10, %eax
> 	mov	%eax, %ds
>
> That is likely even backwards compatible.  If you don't mind having a fixed
> offset between the code and the data segments.  As I recall code and data
> are not interchangeable.
>   

Yes, that's just bogus thinko on my part.

> I'm trying to remember the reason for the reloads.
>
> As I recall loadlin intercepts code32_start from head.S and so it
> can do things just after we have switched to protected mode.  Because
> historically we didn't load the segments before this jump loadlin had
> to do it.  The code of loadlin appears to reload all of the segments
> just like head.S does and then not touch them.
>
> My two bootloaders that enter the kernel at the 32bit entry point already
> load the segments as well.
>
> Gujin looks like it loads just %es and %ds.
>   

We should be able to make do with that until we've got our own gdt.

> It is hard to tell with elilo what it sets up, it preserves the
> descriptors from EFI, but sets up a linux boot protocol gdt.
>   

? But the cached descriptors are still the EFI ones?

> Since setup.S finally does the right thing in loading segment
> registers.  It looks to me like we need to sit down and document
> the 32bit kernel interface, as it is today, and then extend
> things to just replicate %ds into the other segments, and kill
> any lss instructions.
>   

Yes.

> At the same time we are doing this it would be good to drop our
> boot protocol version into an ELF note so people booting vmlinux
> can discover when we have relaxed various restrictions and which
> fields in the real mode data we support.
>   

Do you mean the have the kernel expose its max supported version for
bootloaders to inspect?

    J

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 3/3] boot bzImages under paravirt
  2007-05-04 17:25           ` Jeremy Fitzhardinge
@ 2007-05-04 17:27             ` H. Peter Anvin
  2007-05-04 17:36             ` Eric W. Biederman
  1 sibling, 0 replies; 37+ messages in thread
From: H. Peter Anvin @ 2007-05-04 17:27 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Eric W. Biederman, Rusty Russell, Andi Kleen, Chris Wright,
	Zachary Amsden, Andrew Morton, Linus Torvalds,
	lkml - Kernel Mailing List

Jeremy Fitzhardinge wrote:
> 
> We should be able to make do with that until we've got our own gdt.
> 

It's harsh being without %ss and %es.

	-hpa

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 3/3] boot bzImages under paravirt
  2007-05-04 17:07               ` H. Peter Anvin
@ 2007-05-04 17:30                 ` Eric W. Biederman
  2007-05-04 18:22                   ` Jeremy Fitzhardinge
  0 siblings, 1 reply; 37+ messages in thread
From: Eric W. Biederman @ 2007-05-04 17:30 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Jeremy Fitzhardinge, Rusty Russell, Andi Kleen, Chris Wright,
	Zachary Amsden, Andrew Morton, Linus Torvalds,
	lkml - Kernel Mailing List

"H. Peter Anvin" <hpa@zytor.com> writes:

> Eric W. Biederman wrote:
>> 
>> We have real mystery historical cases in Gujin and ELILO.  So it
>> probably makes sense at this point to force a gdt reload if we can and
>> otherwise require all of the segments %ds, %es, %fs, %gs to be loaded
>> with a valid segment, that we can reach everything we need to touch
>> from.
>> 
>
> I think we should avoid using %fs and %gs (no use for them, anyway); but
> I think it's a decent expectation to have %cs, %ds, %es, and %ss set up.

Yes we have no need for %fs and %gs.  Lumping them in with the rest is
just extra helpfulness.

Basically be conservative in what you send (all segments).
Be liberal in what you accept (%cs,%ds,%es,%ss) and less if
we can get away with it.

> Gujin seems to have a near-zero user community, so if they have to rev
> their code it wouldn't be a big deal (the author keeps trying to push
> some crack-smoking "Gujin native" patches into the kernel, too), 
> breaking ELILO would hurt anyone using Intel Macs.

I'm thinking we just make the code start.
startup_32:
	movl	%cs, %eax
        testl   $3, %eax
        jnz     1f

	lgdt boot_gdt_descr - __PAGE_OFFSET
	movl $(__BOOT_DS),%eax
	movl %eax,%ds
	movl %eax,%es
	movl %eax,%fs
	movl %eax,%gs
        movl %eax,%ss
1:

But that won't work if we want to support relocatability.
Because we can't load a gdt if we don't know where we are.

To find out where we are we need %ss and %ds, at which point
we might as well assume we have %es to.

I think that will work in the elilo case but we can't reload them.
As silly elilo loaded a new gdt but not it's segments...

So be it then.  The next rev of the boot protocol gets to be partially
incompatible, and we just assume that %cs, %ds, %es, %ss meet our
basic requirements.  I'm pretty certain from what I saw only Gujin
is going to suffer  :(

Yep it is long past time that we document what needs to happen for
the 32bit entry point of the linux kernel.

Eric

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 3/3] boot bzImages under paravirt
  2007-05-04 17:25           ` Jeremy Fitzhardinge
  2007-05-04 17:27             ` H. Peter Anvin
@ 2007-05-04 17:36             ` Eric W. Biederman
  2007-05-04 17:44               ` H. Peter Anvin
  2007-05-04 18:25               ` Jeremy Fitzhardinge
  1 sibling, 2 replies; 37+ messages in thread
From: Eric W. Biederman @ 2007-05-04 17:36 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Rusty Russell, Andi Kleen, Chris Wright, Zachary Amsden,
	Andrew Morton, Linus Torvalds, H. Peter Anvin,
	lkml - Kernel Mailing List

Jeremy Fitzhardinge <jeremy@goop.org> writes:

> Eric W. Biederman wrote:
>> Hmm. If we made that:
>> 	mov	%cs, %eax
>> 	add	$0x10, %eax
>> 	mov	%eax, %ds
>>
>> That is likely even backwards compatible.  If you don't mind having a fixed
>> offset between the code and the data segments.  As I recall code and data
>> are not interchangeable.
>>   
>
> Yes, that's just bogus thinko on my part.
>
>> I'm trying to remember the reason for the reloads.
>>
>> As I recall loadlin intercepts code32_start from head.S and so it
>> can do things just after we have switched to protected mode.  Because
>> historically we didn't load the segments before this jump loadlin had
>> to do it.  The code of loadlin appears to reload all of the segments
>> just like head.S does and then not touch them.
>>
>> My two bootloaders that enter the kernel at the 32bit entry point already
>> load the segments as well.
>>
>> Gujin looks like it loads just %es and %ds.
>>   
>
> We should be able to make do with that until we've got our own gdt.

Right after the segments loads we have:
	leal 0x40(%esi), %esp
	call 1f
1:	popl %ebp
	subl $1b, %ebp

That uses %ss and %ds.  Can we use a segment override on a call
instruction?

>> It is hard to tell with elilo what it sets up, it preserves the
>> descriptors from EFI, but sets up a linux boot protocol gdt.
>>   
>
> ? But the cached descriptors are still the EFI ones?

Yep.  Which means we can reload with our know good linux descriptors
but we can't reload the descriptors we are currently running with. 

>> At the same time we are doing this it would be good to drop our
>> boot protocol version into an ELF note so people booting vmlinux
>> can discover when we have relaxed various restrictions and which
>> fields in the real mode data we support.
>>   
>
> Do you mean the have the kernel expose its max supported version for
> bootloaders to inspect?

Yes.  Exactly like we do for real mode code, and with exactly the same
values.  Just in a little different format.

Eric

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 3/3] boot bzImages under paravirt
  2007-05-04 17:36             ` Eric W. Biederman
@ 2007-05-04 17:44               ` H. Peter Anvin
  2007-05-04 18:25               ` Jeremy Fitzhardinge
  1 sibling, 0 replies; 37+ messages in thread
From: H. Peter Anvin @ 2007-05-04 17:44 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Jeremy Fitzhardinge, Rusty Russell, Andi Kleen, Chris Wright,
	Zachary Amsden, Andrew Morton, Linus Torvalds,
	lkml - Kernel Mailing List

Eric W. Biederman wrote:
> 
> Right after the segments loads we have:
> 	leal 0x40(%esi), %esp
> 	call 1f
> 1:	popl %ebp
> 	subl $1b, %ebp
> 
> That uses %ss and %ds.  Can we use a segment override on a call
> instruction?
> 

Nope.  As I said, not having %ss and %es set up is not realistic for
anything other than switching to our own GDT.

	-hpa

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 3/3] boot bzImages under paravirt
  2007-05-04 17:30                 ` Eric W. Biederman
@ 2007-05-04 18:22                   ` Jeremy Fitzhardinge
  2007-05-04 18:48                     ` Eric W. Biederman
  0 siblings, 1 reply; 37+ messages in thread
From: Jeremy Fitzhardinge @ 2007-05-04 18:22 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: H. Peter Anvin, Rusty Russell, Andi Kleen, Chris Wright,
	Zachary Amsden, Andrew Morton, Linus Torvalds,
	lkml - Kernel Mailing List

Eric W. Biederman wrote:
>> Gujin seems to have a near-zero user community, so if they have to rev
>> their code it wouldn't be a big deal (the author keeps trying to push
>> some crack-smoking "Gujin native" patches into the kernel, too), 
>> breaking ELILO would hurt anyone using Intel Macs.
>>     
>
> I'm thinking we just make the code start.
> startup_32:
> 	movl	%cs, %eax
>         testl   $3, %eax
>         jnz     1f
>   

I'm not really happy about using this as a way to distinguish paravirt
from non-paravirt in general.  At some point we're going to be running
paravirt kernels in ring0 within a VT/SVM container - but they'll still
be completely paravirtualized kernels.

I think a better approach is to just do it purely based on the boot
params platform field.  Ie, something along the lines of:

	if (boot_params.version < new_enough)
		goto native_boot;
	else {
		for (int i = 0; i < nplatforms; i++)
			if (boot_params.platform == platforms[i].id)
				goto *platforms[i].startup
		panic();
	}


> But that won't work if we want to support relocatability.
> Because we can't load a gdt if we don't know where we are.
>
> To find out where we are we need %ss and %ds, at which point
> we might as well assume we have %es to.
>   

Yes, we won't make it far without ss and ds, and while we could avoid
string instructions, you'd have to be a pretty short-sighted bootloader
author to set ss and ds without also setting es.

> So be it then.  The next rev of the boot protocol gets to be partially
> incompatible, and we just assume that %cs, %ds, %es, %ss meet our
> basic requirements.  I'm pretty certain from what I saw only Gujin
> is going to suffer  :(
>   

I missed what Gujin does wrong here?


    J

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 3/3] boot bzImages under paravirt
  2007-05-04 17:36             ` Eric W. Biederman
  2007-05-04 17:44               ` H. Peter Anvin
@ 2007-05-04 18:25               ` Jeremy Fitzhardinge
  1 sibling, 0 replies; 37+ messages in thread
From: Jeremy Fitzhardinge @ 2007-05-04 18:25 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Rusty Russell, Andi Kleen, Chris Wright, Zachary Amsden,
	Andrew Morton, Linus Torvalds, H. Peter Anvin,
	lkml - Kernel Mailing List

Eric W. Biederman wrote:
>>> load the segments as well.
>>>
>>> Gujin looks like it loads just %es and %ds.
>>>   
>>>       
>> We should be able to make do with that until we've got our own gdt.
>>     
>
> Right after the segments loads we have:
> 	leal 0x40(%esi), %esp
> 	call 1f
> 1:	popl %ebp
> 	subl $1b, %ebp
>
> That uses %ss and %ds.  Can we use a segment override on a call
> instruction?
>   

I overlooked Gujin missing %ss.  That's harsh.  Easy for them to fix,
presumably.

> Yes.  Exactly like we do for real mode code, and with exactly the same
> values.  Just in a little different format.

Well, fits in well with Xen's requirements, since it wants ELF notes
there for exactly the same reason.

    J

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 1/3] Replace paravirt_probe with "platform type" boot header field
  2007-05-04 17:13           ` H. Peter Anvin
@ 2007-05-04 18:30             ` Eric W. Biederman
  2007-05-04 18:55               ` H. Peter Anvin
  0 siblings, 1 reply; 37+ messages in thread
From: Eric W. Biederman @ 2007-05-04 18:30 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Rusty Russell, Andi Kleen, Chris Wright, Jeremy Fitzhardinge,
	Zachary Amsden, Andrew Morton, Linus Torvalds,
	lkml - Kernel Mailing List

"H. Peter Anvin" <hpa@zytor.com> writes:

> Eric W. Biederman wrote:
>> "H. Peter Anvin" <hpa@zytor.com> writes:
>> 
>>> Eric W. Biederman wrote:
>>>> Unlikely.  Unless we expect that this offset will come in non-zero.
>>>>
>>> You might have to worry about that.  Historically, the "zero-page" was
>>> really just the setup code overwritten, and it's still true for a big
>>> chunk of it.
>>>
>>> One of the major changes in my setup code rewrite is to start out with
>>> an all-zero chunk of memory for this.
>> 
>> Well as long as we are in sync with setup.S we are fine.  The issue
>> is people generating the real-mode data from scratch, when using the
>> 32bit entry point.
>
> Point.  I'd like that interface to specify that any undefined fields
> should be zero, or we have a hopeless situation on our hands.
>
> By the way, see the following for the structure definition; prepare to barf:
>
> http://git.kernel.org/?p=linux/kernel/git/hpa/linux-2.6-newsetup.git;a=blob;f=arch/i386/boot/boot.h;h=41a16f96ac3476cbd969aabe5e6a792ffe8c64a0;hb=HEAD
>
> [I intend to move this into include/asm-i386/boot.h, but haven't gotten
> that far yet.  I just yesterday got the code booting on both i386 and
> x86-64, but haven't committed all the include/asm-* mods that went a
> long with that yet.  I'm also waiting for a Kbuild fix so that the boot
> directory can be shared between i386 and x86-64 without requiring a
> symlink in the source tarball.]

You should be able to just include linux/screen_info.h instead of duplicating
it inline.

I like the use of struct header in the middle of boot_params that
seems like a nice maintenance device, although I'm not quite certain about

However you haven't documented the old swap_dev field in struct header.
At least rdev still knows about it, so it is probably inappropriate to
merge it with syssize.  Not that syssize is actually useful for anything
in a modern system.

So I just looked at what /sbin/kexec does so we know what to expect.
If I have a bzImage I just grab the first setup_sects (i.e. setup.S) and make
it the initial linux boot parameters, placing the command line immediately
afterwards.

If I just have a vmlinux so I have to fake it I memset x86_linux_faked_param_header
to zero, before placing in the values I care about.  And the size.  4K aka 1 page.
Although I due put the command line at 2K, I think that is actually the historical
kernel usage of the zero page.

elilo does something similar but starts with a 16K pages and then backs up
2K for the command line.

Gujin does something similar but also seems to place a command line at 2K.

So short of the first 2K we can reasonably expect new parameters to be zero
initialized.  Past that we need to be a little more careful.

And 4K seems to be our maximum size for backwards compatibility.  Although
we use it in a fairly sparse way, so we should be ok.

Eric

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 3/3] boot bzImages under paravirt
  2007-05-04 18:22                   ` Jeremy Fitzhardinge
@ 2007-05-04 18:48                     ` Eric W. Biederman
  2007-05-04 18:55                       ` Jeremy Fitzhardinge
  0 siblings, 1 reply; 37+ messages in thread
From: Eric W. Biederman @ 2007-05-04 18:48 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: H. Peter Anvin, Rusty Russell, Andi Kleen, Chris Wright,
	Zachary Amsden, Andrew Morton, Linus Torvalds,
	lkml - Kernel Mailing List

Jeremy Fitzhardinge <jeremy@goop.org> writes:

> Eric W. Biederman wrote:
>>> Gujin seems to have a near-zero user community, so if they have to rev
>>> their code it wouldn't be a big deal (the author keeps trying to push
>>> some crack-smoking "Gujin native" patches into the kernel, too), 
>>> breaking ELILO would hurt anyone using Intel Macs.
>>>     
>>
>> I'm thinking we just make the code start.
>> startup_32:
>> 	movl	%cs, %eax
>>         testl   $3, %eax
>>         jnz     1f
>>   
>
> I'm not really happy about using this as a way to distinguish paravirt
> from non-paravirt in general.  At some point we're going to be running
> paravirt kernels in ring0 within a VT/SVM container - but they'll still
> be completely paravirtualized kernels.

That wasn't paravirt detection that was do I have permissions to load
the gdt.

Basically we have two choices.   Either unconditionally demand
that %cs %ds %es and %ss are loaded, and so we remove all descriptor
loading.

Or we can do:
startup_32:
	movl	%cs, %eax
	testl	$3, %eax
        jnz     1f

	movl $(__BOOT_DS),%eax
	movl %eax,%ds
	movl %eax,%es
	movl %eax,%fs
	movl %eax,%gs
	movl %eax,%ss

1:

Which only demands that the segments be loaded when we aren't in ring0.
At least historically that is safe for the linux kernel but really weird.

Since there is only one fringe bootloader that cares I'm thinking just
removing all segment loading and assume we have the equivalent of
BOOT_DS in the segments probably makes the most sense.

I'm thinking that our next protocol version probably needs to be 3.0
given the significance of the change, and the fact we are breaking
in a very small ways backwards compatibility.

> I think a better approach is to just do it purely based on the boot
> params platform field.  Ie, something along the lines of:
>
> 	if (boot_params.version < new_enough)
> 		goto native_boot;
> 	else {
> 		for (int i = 0; i < nplatforms; i++)
> 			if (boot_params.platform == platforms[i].id)
> 				goto *platforms[i].startup
> 		panic();
> 	}

I think it is much better to test the boot_params.platform field where
we care.  If the platform is a native x86 subarch we don't need
a magic startup function.  If the platform is Xen or lguest
we can at best copy our boot parameters and jump to their custom
startup routines.

Eric

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 3/3] boot bzImages under paravirt
  2007-05-04 18:48                     ` Eric W. Biederman
@ 2007-05-04 18:55                       ` Jeremy Fitzhardinge
  2007-05-04 19:21                         ` Eric W. Biederman
  0 siblings, 1 reply; 37+ messages in thread
From: Jeremy Fitzhardinge @ 2007-05-04 18:55 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: H. Peter Anvin, Rusty Russell, Andi Kleen, Chris Wright,
	Zachary Amsden, Andrew Morton, Linus Torvalds,
	lkml - Kernel Mailing List

Eric W. Biederman wrote:
>> I'm not really happy about using this as a way to distinguish paravirt
>> from non-paravirt in general.  At some point we're going to be running
>> paravirt kernels in ring0 within a VT/SVM container - but they'll still
>> be completely paravirtualized kernels.
>>     
>
> That wasn't paravirt detection that was do I have permissions to load
> the gdt.
>   

Well, a paravirtualized ring0 kernel may still have special constraints
on how the gdt can be set up (page-aligned, read-only, etc).

> Basically we have two choices.   Either unconditionally demand
> that %cs %ds %es and %ss are loaded, and so we remove all descriptor
> loading.
>   

Yep.  At least if the boot-version is new enough to boot this way.  The
old native-boot path can reload as much as it likes.

>> I think a better approach is to just do it purely based on the boot
>> params platform field.  Ie, something along the lines of:
>>
>> 	if (boot_params.version < new_enough)
>> 		goto native_boot;
>> 	else {
>> 		for (int i = 0; i < nplatforms; i++)
>> 			if (boot_params.platform == platforms[i].id)
>> 				goto *platforms[i].startup
>> 		panic();
>> 	}
>>     
>
> I think it is much better to test the boot_params.platform field where
> we care.  If the platform is a native x86 subarch we don't need
> a magic startup function.  If the platform is Xen or lguest
> we can at best copy our boot parameters and jump to their custom
> startup routines.
>   

Why not just treat them all in the same way?  Especially if we start
sweeping other non-virtual architectures like voyager/visws/etc into the
same mechanism.

My idea was that "goto native_boot" would jump to code which assumes
it's running on real hardware, where there's no problem reloading
gdt/segment registers, etc.

    J

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 1/3] Replace paravirt_probe with "platform type" boot header field
  2007-05-04 18:30             ` Eric W. Biederman
@ 2007-05-04 18:55               ` H. Peter Anvin
  2007-05-04 19:10                 ` Eric W. Biederman
  0 siblings, 1 reply; 37+ messages in thread
From: H. Peter Anvin @ 2007-05-04 18:55 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Rusty Russell, Andi Kleen, Chris Wright, Jeremy Fitzhardinge,
	Zachary Amsden, Andrew Morton, Linus Torvalds,
	lkml - Kernel Mailing List

Eric W. Biederman wrote:
> 
> You should be able to just include linux/screen_info.h instead of duplicating
> it inline.
> 

I'm working on it!!!!!

> I like the use of struct header in the middle of boot_params that
> seems like a nice maintenance device, although I'm not quite certain about
> 
> However you haven't documented the old swap_dev field in struct header.
> At least rdev still knows about it, so it is probably inappropriate to
> merge it with syssize.  Not that syssize is actually useful for anything
> in a modern system.

Actually, ROM bootloaders care about it, which is why it was expanded
out in boot loader protocol 2.04; see the documentation.

> So I just looked at what /sbin/kexec does so we know what to expect.
> If I have a bzImage I just grab the first setup_sects (i.e. setup.S) and make
> it the initial linux boot parameters, placing the command line immediately
> afterwards.
> 
> If I just have a vmlinux so I have to fake it I memset x86_linux_faked_param_header
> to zero, before placing in the values I care about.  And the size.  4K aka 1 page.
> Although I due put the command line at 2K, I think that is actually the historical
> kernel usage of the zero page.

Oh flippin' hell.  *STABBITY STAB STAB STAB*.

All of which is just evil.  So much for "oh, the definition of the
zeropage never changes, so it doesn't matter."

> elilo does something similar but starts with a 16K pages and then backs up
> 2K for the command line.
> 
> Gujin does something similar but also seems to place a command line at 2K.
> 
> So short of the first 2K we can reasonably expect new parameters to be zero
> initialized.  Past that we need to be a little more careful.

Oh flippin' hell.

> And 4K seems to be our maximum size for backwards compatibility.  Although
> we use it in a fairly sparse way, so we should be ok.

Sort of.  It's pretty full.

	-hpa

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 1/3] Replace paravirt_probe with "platform type" boot header field
  2007-05-04 18:55               ` H. Peter Anvin
@ 2007-05-04 19:10                 ` Eric W. Biederman
  2007-05-04 19:14                   ` H. Peter Anvin
  2007-05-04 19:19                   ` H. Peter Anvin
  0 siblings, 2 replies; 37+ messages in thread
From: Eric W. Biederman @ 2007-05-04 19:10 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Rusty Russell, Andi Kleen, Chris Wright, Jeremy Fitzhardinge,
	Zachary Amsden, Andrew Morton, Linus Torvalds,
	lkml - Kernel Mailing List

"H. Peter Anvin" <hpa@zytor.com> writes:

> Eric W. Biederman wrote:
>> 
>> You should be able to just include linux/screen_info.h instead of duplicating
>> it inline.
>> 
>
> I'm working on it!!!!!

Sure. I meant that linux/screen_info.h should already be safe.

>> I like the use of struct header in the middle of boot_params that
>> seems like a nice maintenance device, although I'm not quite certain about
>> 
>> However you haven't documented the old swap_dev field in struct header.
>> At least rdev still knows about it, so it is probably inappropriate to
>> merge it with syssize.  Not that syssize is actually useful for anything
>> in a modern system.
>
> Actually, ROM bootloaders care about it, which is why it was expanded
> out in boot loader protocol 2.04; see the documentation.

Interesting. I missed that one.

>> So I just looked at what /sbin/kexec does so we know what to expect.
>> If I have a bzImage I just grab the first setup_sects (i.e. setup.S) and make
>> it the initial linux boot parameters, placing the command line immediately
>> afterwards.
>> 
>> If I just have a vmlinux so I have to fake it I memset
> x86_linux_faked_param_header
>> to zero, before placing in the values I care about.  And the size.  4K aka 1
> page.
>> Although I due put the command line at 2K, I think that is actually the
> historical
>> kernel usage of the zero page.
>
> Oh flippin' hell.  *STABBITY STAB STAB STAB*.

Yep, a real pain.

At this point it looks like a bug in kexec to me, and the bzImage loader
which is the primary way to boot linux doesn't have this problem.  Yea.

> All of which is just evil.  So much for "oh, the definition of the
> zeropage never changes, so it doesn't matter."

>> elilo does something similar but starts with a 16K pages and then backs up
>> 2K for the command line.
>> 
>> Gujin does something similar but also seems to place a command line at 2K.
>> 
>> So short of the first 2K we can reasonably expect new parameters to be zero
>> initialized.  Past that we need to be a little more careful.
>
> Oh flippin' hell.

And now you understand why I am surveying these things and want to get
the 32bit entry point well documented.  So the situation doesn't get worse.

Frankly while I consider what we are doing pretty sane I have always considered
the 32bit entry point at least partly experimental.  But we have enough users
of it now and enough reasons to have users of it, that it looks like we need to
do things a little more methodically.

>> And 4K seems to be our maximum size for backwards compatibility.  Although
>> we use it in a fairly sparse way, so we should be ok.
>
> Sort of.  It's pretty full.

True.  For small little extensions we have room.  For big things probably
not.

Eric

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 1/3] Replace paravirt_probe with "platform type" boot header field
  2007-05-04 19:10                 ` Eric W. Biederman
@ 2007-05-04 19:14                   ` H. Peter Anvin
  2007-05-04 19:31                     ` Eric W. Biederman
  2007-05-04 19:19                   ` H. Peter Anvin
  1 sibling, 1 reply; 37+ messages in thread
From: H. Peter Anvin @ 2007-05-04 19:14 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Rusty Russell, Andi Kleen, Chris Wright, Jeremy Fitzhardinge,
	Zachary Amsden, Andrew Morton, Linus Torvalds,
	lkml - Kernel Mailing List

Eric W. Biederman wrote:
> 
> And now you understand why I am surveying these things and want to get
> the 32bit entry point well documented.  So the situation doesn't get worse.
> 
> Frankly while I consider what we are doing pretty sane I have always considered
> the 32bit entry point at least partly experimental.  But we have enough users
> of it now and enough reasons to have users of it, that it looks like we need to
> do things a little more methodically.
> 

Indeed.  I think, yes, what has been there up to now has pretty much
been at least in part experimental, and I fear there will be unavoidable
breakage as part of sanitizing it.  C'est la vie, I guess.

>>> And 4K seems to be our maximum size for backwards compatibility.  Although
>>> we use it in a fairly sparse way, so we should be ok.
>> Sort of.  It's pretty full.
> 
> True.  For small little extensions we have room.  For big things probably
> not.

For big extensions we'll probably have to go the pointer route already
done with the command line.

	-hpa

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 1/3] Replace paravirt_probe with "platform type" boot header field
  2007-05-04 19:10                 ` Eric W. Biederman
  2007-05-04 19:14                   ` H. Peter Anvin
@ 2007-05-04 19:19                   ` H. Peter Anvin
  1 sibling, 0 replies; 37+ messages in thread
From: H. Peter Anvin @ 2007-05-04 19:19 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Rusty Russell, Andi Kleen, Chris Wright, Jeremy Fitzhardinge,
	Zachary Amsden, Andrew Morton, Linus Torvalds,
	lkml - Kernel Mailing List

Eric W. Biederman wrote:
>> I'm working on it!!!!!
> 
> Sure. I meant that linux/screen_info.h should already be safe.

Just pushed out a change which removes pretty much all the header
duplication into the git tree.

	-hpa

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 3/3] boot bzImages under paravirt
  2007-05-04 18:55                       ` Jeremy Fitzhardinge
@ 2007-05-04 19:21                         ` Eric W. Biederman
  0 siblings, 0 replies; 37+ messages in thread
From: Eric W. Biederman @ 2007-05-04 19:21 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: H. Peter Anvin, Rusty Russell, Andi Kleen, Chris Wright,
	Zachary Amsden, Andrew Morton, Linus Torvalds,
	lkml - Kernel Mailing List

Jeremy Fitzhardinge <jeremy@goop.org> writes:

> Well, a paravirtualized ring0 kernel may still have special constraints
> on how the gdt can be set up (page-aligned, read-only, etc).

Maybe but that is hardly ring0.  I would have expected a
para-virtualized ring0 to behave like vmi.  Where everything
works like native hardware until you enable paravirtualization.

> Why not just treat them all in the same way?  Especially if we start
> sweeping other non-virtual architectures like voyager/visws/etc into the
> same mechanism.

That is my intention.  But there are different places we care for
different subarchitectures.  So I'm not convinced an early lookup
table is actually helpful.

The issues are that the kernel provides the version to the boot
loader not the other way around.

The code is simple enough in assembly we don't need a table a table
lookup just:

cmp $MY_PLATFORM, BOOT_PARAMS_PLATFORM(%esi)
jz  my_init.

Roughly this is what rusty has been prototyping in his lguest patches.

> My idea was that "goto native_boot" would jump to code which assumes
> it's running on real hardware, where there's no problem reloading
> gdt/segment registers, etc.

That may make sense to.  It is a question of can we place any code
before the test like clearing the bss?

If we declare the segments are properly initialized it doesn't matter.

Eric

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC PATCH 1/3] Replace paravirt_probe with "platform type" boot header field
  2007-05-04 19:14                   ` H. Peter Anvin
@ 2007-05-04 19:31                     ` Eric W. Biederman
  0 siblings, 0 replies; 37+ messages in thread
From: Eric W. Biederman @ 2007-05-04 19:31 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Rusty Russell, Andi Kleen, Chris Wright, Jeremy Fitzhardinge,
	Zachary Amsden, Andrew Morton, Linus Torvalds,
	lkml - Kernel Mailing List

"H. Peter Anvin" <hpa@zytor.com> writes:

> Indeed.  I think, yes, what has been there up to now has pretty much
> been at least in part experimental, and I fear there will be unavoidable
> breakage as part of sanitizing it.  C'est la vie, I guess.

The one significant one I left out I think is the VISWS.  I'm not certain
what we do there, but I know it never went through setup.S

Yes.  At the same time we have been sufficiently disciplined (baring
paravirt) that the changes should be quite small, and we have a big
enough sample size now that we can pretty clearly see ways in which
the code will vary.

>>>> And 4K seems to be our maximum size for backwards compatibility.  Although
>>>> we use it in a fairly sparse way, so we should be ok.
>>> Sort of.  It's pretty full.
>> 
>> True.  For small little extensions we have room.  For big things probably
>> not.
>
> For big extensions we'll probably have to go the pointer route already
> done with the command line.

Likely.  It is tricky because if we actually have to do a normal BIOS
query to get it things a little sticky, because we can't allocate
memory.  Hmm.  It looks like we need a way to export the size of
our parameter area to the bootloader.  We have setup_sects for 16bit
real mode bootloaders and that should be good enough, but we need
something equivalent for the 32bit entry point.

Requirements analysis here we come.

Eric

^ permalink raw reply	[flat|nested] 37+ messages in thread

end of thread, other threads:[~2007-05-04 19:33 UTC | newest]

Thread overview: 37+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-05-04 12:59 [RFC PATCH 1/3] Replace paravirt_probe with "platform type" boot header field Rusty Russell
2007-05-04 13:02 ` [RFC PATCH 2/3] lguest: Boot with virtual == physical to get closer to native Linux Rusty Russell
2007-05-04 13:07   ` [RFC PATCH 3/3] boot bzImages under paravirt Rusty Russell
2007-05-04 14:38     ` Eric W. Biederman
2007-05-04 14:55       ` Rusty Russell
2007-05-04 15:49         ` H. Peter Anvin
2007-05-04 15:15       ` Jeremy Fitzhardinge
2007-05-04 15:45         ` H. Peter Anvin
2007-05-04 16:13           ` Jeremy Fitzhardinge
2007-05-04 16:43             ` H. Peter Anvin
2007-05-04 16:57             ` Eric W. Biederman
2007-05-04 17:07               ` H. Peter Anvin
2007-05-04 17:30                 ` Eric W. Biederman
2007-05-04 18:22                   ` Jeremy Fitzhardinge
2007-05-04 18:48                     ` Eric W. Biederman
2007-05-04 18:55                       ` Jeremy Fitzhardinge
2007-05-04 19:21                         ` Eric W. Biederman
2007-05-04 16:46         ` Eric W. Biederman
2007-05-04 17:25           ` Jeremy Fitzhardinge
2007-05-04 17:27             ` H. Peter Anvin
2007-05-04 17:36             ` Eric W. Biederman
2007-05-04 17:44               ` H. Peter Anvin
2007-05-04 18:25               ` Jeremy Fitzhardinge
2007-05-04 14:01 ` [RFC PATCH 1/3] Replace paravirt_probe with "platform type" boot header field Eric W. Biederman
2007-05-04 14:18   ` Rusty Russell
2007-05-04 14:23     ` Eric W. Biederman
2007-05-04 15:52       ` H. Peter Anvin
2007-05-04 16:48         ` Eric W. Biederman
2007-05-04 17:13           ` H. Peter Anvin
2007-05-04 18:30             ` Eric W. Biederman
2007-05-04 18:55               ` H. Peter Anvin
2007-05-04 19:10                 ` Eric W. Biederman
2007-05-04 19:14                   ` H. Peter Anvin
2007-05-04 19:31                     ` Eric W. Biederman
2007-05-04 19:19                   ` H. Peter Anvin
2007-05-04 15:10     ` Eric W. Biederman
2007-05-04 15:53     ` H. Peter Anvin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).