LKML Archive on
help / color / mirror / Atom feed
From: Avi Kivity <>
To: kvm-devel <>
Cc: linux-kernel <>,
	Andrew Morton <>
Subject: [RFC] Stable kvm userspace interface
Date: Tue, 09 Jan 2007 15:37:27 +0200	[thread overview]
Message-ID: <> (raw)

I had originally hoped to get this in for 2.6.20.  It now looks like .20 
will have a shorter cycle than usual, and the mmu took a bit longer than 
expected, so it's more realistic to aim for 2.6.21.

The current kvm userspace interface has several deficiencies:

- open("/dev/kvm") returns a different object (a new vm) per invocation; 
this is "unusual" by Linux standards
- all vcpus share the same inode and struct file, which can cause 
scalability problems on very large smps.  This isn't a problem for 
current hardware, which has moderate core counts and huge vmexit 
latencies, not to mention a limit of one vcpu per vm, but I'd like to 
future-proof the interface.
- the KVM_VCPU_RUN ioctl() copies a needless chunk of data back and forth
- the PIO handlers communicate by means of registers (for single I/O) or 
virtual addresses (for string I/O).  Instead the values should be 
explicit fields in some structure, and physical addresses should be used 
to remove the need to translate addresses in userspace.
- the interrupt code still needs work to properly support the local apic 
with Windows guests.
- userspace must rely on delivered signals, which are slow, and cannot 
use queued signals (a la pselect()/ppoll()).

I propose the following as the new, stable, kvm api:

// open a handle to the kvm interface.  does not create a vm.
int kvm_fd = open("/dev/kvm", O_RDWR);

// the kvm interface supports just three ioctls:
ioctl(kvm_fd, KVM_GET_API_VERSION, 0);
ioctl(kvm_fd, KVM_GET_MSR_LIST, &msr_list);
int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);

// vm ioctls:
ioctl(vm_fd, KVM_VM_CREATE_MEMORY_REGION, &slot);
ioctl(vm_fd, KVM_VM_GET_DIRTY_LOG, &dirty_log);
int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, vcpu_slot_number);

// each vcpu is a separate fd/inode.  this ensures no cacheline bouncing
// when the kernel refcounts the inodes on syscalls.

// kvm_vcpu_area contains the exit reasons and associated data, and
// results returned by userspace to resolve the exit reasons.
struct kvm_vcpu_area *vcpu_area = mmap(NULL, PAGE_SIZE, ..., vcpu_fd, 0);

struct kvm_vcpu_area {
    u32 vcpu_area_size;
    u32 exit_reason;

    sigset_t sigmask;  // for use during vcpu execution

    union {
	struct kvm_pio pio;
	struct kvm_mmio mmio;
	struct kvm_cpuid cpuid;
	// etc.
	char padding[...];
    };

    struct kvm_irq irq; // acks from vm; injection from userspace
};

// vcpu ioctls

ioctl(vcpu_fd, KVM_VCPU_RUN, 0); // all comms through mmap()ed  vcpu_area
ioctl(vcpu_fd, KVM_VCPU_GET_REGS, &regs);
ioctl(vcpu_fd, KVM_VCPU_SET_REGS, &regs);
ioctl(vcpu_fd, KVM_VCPU_GET_SREGS, &sregs);
ioctl(vcpu_fd, KVM_VCPU_SET_SREGS, &sregs);
ioctl(vcpu_fd, KVM_VCPU_GET_MSRS, &msrs);
ioctl(vcpu_fd, KVM_VCPU_SET_MSRS, &msrs);
ioctl(vcpu_fd, KVM_VCPU_DEBUG_GUEST, &debug);

struct kvm_memory_region {
	__u32 slot;
	__u32 flags;
	__u64 guest_phys_addr;
	__u64 memory_size; /* bytes */
};

/* for kvm_memory_region::flags */

#define KVM_EXIT_TYPE_VM_EXIT    2

enum kvm_exit_reason {
	KVM_EXIT_UNKNOWN          = 0,
	KVM_EXIT_IO               = 2,
	KVM_EXIT_CPUID            = 3,
	KVM_EXIT_DEBUG            = 4,
	KVM_EXIT_HLT              = 5,
	KVM_EXIT_MMIO             = 6,
};

struct kvm_regs {
        // note: no vcpu!

	/* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
	__u64 rax, rbx, rcx, rdx;
	__u64 rsi, rdi, rsp, rbp;
	__u64 r8,  r9,  r10, r11;
	__u64 r12, r13, r14, r15;
	__u64 rip, rflags;
};

struct kvm_segment {
	__u64 base;
	__u32 limit;
	__u16 selector;
	__u8  type;
	__u8  present, dpl, db, s, l, g, avl;
	__u8  unusable;
	__u8  padding;
};

struct kvm_dtable {
	__u64 base;
	__u16 limit;
	__u16 padding[3];
};

struct kvm_sregs {
	/* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
	struct kvm_segment cs, ds, es, fs, gs, ss;
	struct kvm_segment tr, ldt;
	struct kvm_dtable gdt, idt;
	__u64 cr0, cr2, cr3, cr4, cr8;
};

struct kvm_msr_entry {
	__u32 index;
	__u32 reserved;
	__u64 data;
};

struct kvm_msrs {
	__u32 nmsrs; /* number of msrs in entries */
	__u32 padding;

	struct kvm_msr_entry entries[0];
};

struct kvm_msr_list {
	__u32 nmsrs; /* number of msrs in entries */
	__u32 indices[0];
};

struct kvm_breakpoint {
	__u32 enabled;
	__u32 padding;
	__u64 address;
};

struct kvm_debug_guest {
	__u32 enabled;
	__u32 singlestep;
	struct kvm_breakpoint breakpoints[4];
};

struct kvm_dirty_log {
	__u32 slot;
	__u32 padding;
	union {
		void __user *dirty_bitmap; /* one bit per page */
		__u64 padding;
	};
};

Comments and questions are welcome.

Thanks to Arnd Bergmann for his contributions and advice on this issue.

error compiling committee.c: too many arguments to function

             reply	other threads:[~2007-01-09 13:37 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-01-09 13:37 Avi Kivity [this message]
2007-01-09 13:47 ` Jeff Garzik
2007-01-09 14:02   ` [kvm-devel] " James Morris
2007-01-09 14:11   ` Avi Kivity
2007-01-11  7:34   ` [kvm-devel] " Arnd Bergmann
2007-01-11  8:03     ` Avi Kivity
2007-01-11  8:26     ` Jeff Garzik
2007-01-11  8:32       ` Avi Kivity
2007-01-12 11:19       ` Pavel Machek
2007-01-11 17:40     ` David Lang
2007-01-11  7:26 ` Arnd Bergmann
2007-01-11  8:02   ` Avi Kivity

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \ \ \ \ \ \
    --subject='Re: [RFC] Stable kvm userspace interface' \

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).