LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [KVM TSC trapping / migration 1/2] Add TSC trapping for SVM and VMX
       [not found] <1294308645-31113-1-git-send-email-zamsden@redhat.com>
@ 2011-01-06 10:10 ` Zachary Amsden
  2011-01-06 10:41   ` Alexander Graf
                     ` (3 more replies)
  2011-01-06 10:10 ` [KVM TSC trapping / migration 2/2] Add TSC KHZ MSR Zachary Amsden
  1 sibling, 4 replies; 22+ messages in thread
From: Zachary Amsden @ 2011-01-06 10:10 UTC (permalink / raw)
  To: kvm, Avi Kivity, Marcelo Tosatti, Glauber Costa
  Cc: Zachary Amsden, linux-kernel

Reasons to trap the TSC are numerous, but we want to avoid it as much
as possible for performance reasons.

We provide two conservative modes via modules parameters and userspace
hinting.  First, the module can be loaded with "tsc_auto=1" as a module
parameter, which turns on conservative TSC trapping only when it is
required (when unstable TSC or faster KHZ CPU is detected).

For userspace hinting, we enable trapping only if necessary.  Userspace
can hint that a VM needs a fixed frequency TSC, and also that SMP
stability will be required.  In that case, we conservatively turn on
trapping when it is needed.  In addition, users may now specify the
desired TSC rate at which to run.  If this rate differs significantly
from the host rate, trapping will be enabled.

There is also an override control to allow TSC trapping to be turned on
or off unconditionally for testing.

We indicate to pvclock users that the TSC is being trapped, to allow
avoiding overhead and directly using RDTSCP (only for SVM).  This
optimization is not yet implemented.

Signed-off-by: Zachary Amsden <zamsden@redhat.com>
---
 arch/x86/include/asm/kvm_host.h    |    6 +-
 arch/x86/include/asm/pvclock-abi.h |    1 +
 arch/x86/kvm/svm.c                 |   20 ++++++
 arch/x86/kvm/vmx.c                 |   21 +++++++
 arch/x86/kvm/x86.c                 |  113 +++++++++++++++++++++++++++++++++---
 arch/x86/kvm/x86.h                 |    2 +
 include/linux/kvm.h                |   15 +++++
 7 files changed, 168 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ff651b7..6cce67a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -452,6 +452,8 @@ struct kvm_arch {
 	u32 virtual_tsc_khz;
 	u32 virtual_tsc_mult;
 	s8 virtual_tsc_shift;
+	bool tsc_trapping;
+	u32 tsc_flags;
 
 	struct kvm_xen_hvm_config xen_hvm_config;
 
@@ -575,6 +577,8 @@ struct kvm_x86_ops {
 	int (*get_lpage_level)(void);
 	bool (*rdtscp_supported)(void);
 	void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment);
+	void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
+	void (*set_tsc_trapping)(struct kvm_vcpu *vcpu, bool trap);
 
 	void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
 
@@ -582,8 +586,6 @@ struct kvm_x86_ops {
 
 	bool (*has_wbinvd_exit)(void);
 
-	void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
-
 	const struct trace_print_flags *exit_reasons_str;
 };
 
diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h
index 35f2d19..315ead5 100644
--- a/arch/x86/include/asm/pvclock-abi.h
+++ b/arch/x86/include/asm/pvclock-abi.h
@@ -40,5 +40,6 @@ struct pvclock_wall_clock {
 } __attribute__((__packed__));
 
 #define PVCLOCK_TSC_STABLE_BIT	(1 << 0)
+#define PVCLOCK_TSC_TRAPPED_BIT (1 << 1)
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_X86_PVCLOCK_ABI_H */
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c929d00..af48be9 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -806,6 +806,8 @@ static void init_vmcb(struct vcpu_svm *svm)
 				(1ULL << INTERCEPT_MONITOR) |
 				(1ULL << INTERCEPT_MWAIT);
 
+	kvm_setup_tsc_trapping(&svm->vcpu);
+
 	control->iopm_base_pa = iopm_base;
 	control->msrpm_base_pa = __pa(svm->msrpm);
 	control->int_ctl = V_INTR_MASKING_MASK;
@@ -1038,6 +1040,15 @@ static void svm_clear_vintr(struct vcpu_svm *svm)
 	svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
 }
 
+static void svm_set_tsc_trapping(struct kvm_vcpu *vcpu, bool trap)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	if (trap)
+		svm->vmcb->control.intercept |= 1ULL << INTERCEPT_RDTSC;
+	else
+		svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_RDTSC);
+}
+
 static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
 {
 	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
@@ -2497,6 +2508,13 @@ static int task_switch_interception(struct vcpu_svm *svm)
 	return 1;
 }
 
+static int rdtsc_interception(struct vcpu_svm *svm)
+{
+	svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
+	kvm_read_tsc(&svm->vcpu);
+	return 1;
+}
+
 static int cpuid_interception(struct vcpu_svm *svm)
 {
 	svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
@@ -2833,6 +2851,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_SMI]				= nop_on_interception,
 	[SVM_EXIT_INIT]				= nop_on_interception,
 	[SVM_EXIT_VINTR]			= interrupt_window_interception,
+	[SVM_EXIT_RDTSC]			= rdtsc_interception,
 	[SVM_EXIT_CPUID]			= cpuid_interception,
 	[SVM_EXIT_IRET]                         = iret_interception,
 	[SVM_EXIT_INVD]                         = emulate_on_interception,
@@ -3676,6 +3695,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 
 	.write_tsc_offset = svm_write_tsc_offset,
 	.adjust_tsc_offset = svm_adjust_tsc_offset,
+	.set_tsc_trapping = svm_set_tsc_trapping,
 
 	.set_tdp_cr3 = set_tdp_cr3,
 };
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 28c72da..3516d18 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2631,6 +2631,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
 
 	kvm_write_tsc(&vmx->vcpu, 0);
+	kvm_setup_tsc_trapping(&vmx->vcpu);
 
 	return 0;
 }
@@ -2770,6 +2771,18 @@ out:
 	return ret;
 }
 
+static void vmx_set_tsc_trapping(struct kvm_vcpu *vcpu, bool trap)
+{
+	u32 cpu_based_vm_exec_control;
+
+	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+	if (trap)
+		cpu_based_vm_exec_control |= CPU_BASED_RDTSC_EXITING;
+	else
+		cpu_based_vm_exec_control &= ~CPU_BASED_RDTSC_EXITING;
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
+
 static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
 	u32 cpu_based_vm_exec_control;
@@ -3359,6 +3372,12 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static int handle_rdtsc(struct kvm_vcpu *vcpu)
+{
+	kvm_read_tsc(vcpu);
+	return 1;
+}
+
 static int handle_wbinvd(struct kvm_vcpu *vcpu)
 {
 	skip_emulated_instruction(vcpu);
@@ -3651,6 +3670,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
 	[EXIT_REASON_HLT]                     = handle_halt,
 	[EXIT_REASON_INVLPG]		      = handle_invlpg,
+	[EXIT_REASON_RDTSC]		      = handle_rdtsc,
 	[EXIT_REASON_VMCALL]                  = handle_vmcall,
 	[EXIT_REASON_VMCLEAR]	              = handle_vmx_insn,
 	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
@@ -4339,6 +4359,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 
 	.write_tsc_offset = vmx_write_tsc_offset,
 	.adjust_tsc_offset = vmx_adjust_tsc_offset,
+	.set_tsc_trapping = vmx_set_tsc_trapping,
 
 	.set_tdp_cr3 = vmx_set_cr3,
 };
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a339e50..bbcd582 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -95,6 +95,12 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
 struct kvm_x86_ops *kvm_x86_ops;
 EXPORT_SYMBOL_GPL(kvm_x86_ops);
 
+static int __read_mostly tsc_trap = 1;
+module_param(tsc_trap, int, S_IRUGO);
+
+static bool __read_mostly tsc_auto = 1;
+module_param(tsc_auto, bool, S_IRUGO);
+
 int ignore_msrs = 0;
 module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
 
@@ -1058,6 +1064,8 @@ static void update_pvclock(struct kvm_vcpu *v,
 	pvclock->tsc_timestamp = tsc_timestamp;
 	pvclock->system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
 	pvclock->flags = 0;
+	if (v->kvm->arch.tsc_trapping)
+		pvclock->flags |= PVCLOCK_TSC_TRAPPED_BIT;
 }
 
 static void update_user_kvmclock(struct kvm_vcpu *v,
@@ -1072,6 +1080,18 @@ static void update_user_kvmclock(struct kvm_vcpu *v,
 	mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
 }
 
+void kvm_read_tsc(struct kvm_vcpu *vcpu)
+{
+	u64 tsc;
+	s64 kernel_ns = get_kernel_ns();
+
+	tsc = compute_guest_tsc(vcpu, kernel_ns);
+	kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)tsc);
+	kvm_register_write(vcpu, VCPU_REGS_RDX, tsc >> 32);
+	kvm_x86_ops->skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_read_tsc);
+
 static int kvm_guest_time_update(struct kvm_vcpu *v)
 {
 	unsigned long flags;
@@ -1198,6 +1218,55 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	return 0;
 }
 
+void kvm_setup_tsc_trapping(struct kvm_vcpu *vcpu)
+{
+	struct kvm_arch *arch = &vcpu->kvm->arch;
+	int trap;
+	bool tsc_underrun, tsc_overrun;
+
+	/*
+	 * First, establish rate differences outside NTP correction boundary.
+	 * N.B. - virtual_tsc_khz may not yet be known, in which case it is
+	 * assumed the host rate will be used; guard against this in overrun.
+	 */
+	u64 max_tsc_ull = max_tsc_khz * 1000000ULL;
+	tsc_overrun = (arch->virtual_tsc_khz &&
+		       arch->virtual_tsc_khz * 1000500ULL < max_tsc_ull);
+	tsc_underrun = (arch->virtual_tsc_khz * 999500ULL > max_tsc_ull);
+
+	/*
+	 * We must trap if we have unstable TSC and a hint from userspace that
+	 * SMP is required; also, if we want a fixed rate and the max TSC rate
+	 * exceeds the VM rate by over 500 ppm (the maximum NTP slew rate).
+	 */
+	trap =
+	  (check_tsc_unstable() &&
+	      (arch->tsc_flags & KVM_TSC_FLAG_SMP_COHERENCY)) ||
+	  ((arch->tsc_flags & KVM_TSC_FLAG_FIXED_RATE) &&
+	      (tsc_overrun || tsc_underrun));
+
+	/*
+	 * Auto-selection: if we have no guidance from userspace, we can't
+	 * know if VCPUs will be added, so assume SMP, as it is difficult to
+	 * switch other CPUs into trapping mode after they have started
+	 */
+	if (tsc_auto)
+		trap |= (tsc_overrun || check_tsc_unstable());
+
+	/* tsc_trap (module parameter) overrides explicit choice */
+	if (tsc_trap != 0)
+		trap = (tsc_trap > 0);
+
+	/* Correct untrapped underrun with catchup */
+	if (!trap && tsc_underrun)
+		vcpu->arch.tsc_catchup = 1;
+
+	vcpu->kvm->arch.tsc_trapping = trap;
+	kvm_x86_ops->set_tsc_trapping(vcpu, trap);
+	pr_debug("kvm: set trap mode %d on vcpu %d\n", trap, vcpu->vcpu_id);
+}
+EXPORT_SYMBOL_GPL(kvm_setup_tsc_trapping);
+
 static bool msr_mtrr_valid(unsigned msr)
 {
 	switch (msr) {
@@ -1962,6 +2031,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_DEBUGREGS:
 	case KVM_CAP_X86_ROBUST_SINGLESTEP:
 	case KVM_CAP_XSAVE:
+	case KVM_CAP_TSC_CONTROL:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -3535,7 +3605,30 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = 0;
 		break;
 	}
+	case KVM_TSC_CONTROL: {
+		struct kvm_tsc_control user_tsc;
+
+		r = -EFAULT;
+		if (copy_from_user(&user_tsc, argp, sizeof(user_tsc)))
+			goto out;
+
+		r = -EINVAL;
+		if (user_tsc.flags &
+		    ~(KVM_TSC_FLAG_FIXED_RATE |
+		      KVM_TSC_FLAG_SMP_COHERENCY))
+			goto out;
 
+		if (user_tsc.tsc_khz &&
+			(user_tsc.tsc_khz > KVM_TSC_MAX_KHZ ||
+			 user_tsc.tsc_khz < KVM_TSC_MIN_KHZ))
+			goto out;
+
+		if (user_tsc.tsc_khz)
+			kvm_arch_set_tsc_khz(kvm, user_tsc.tsc_khz);
+
+		r = 0;
+		break;
+	}
 	default:
 		;
 	}
@@ -5222,7 +5315,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (hw_breakpoint_active())
 		hw_breakpoint_restore();
 
-	kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
+	/*
+	 * We only need to record this for unstable, passthrough TSC.
+	 * Since the host clocksource will not be TSC in that case, we
+	 * risk going backwards during recalibration of kvmclock due to
+	 * differing clock resolution.
+	 */
+	if (!vcpu->kvm->arch.tsc_trapping && check_tsc_unstable())
+		kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
 
 	atomic_set(&vcpu->guest_mode, 0);
 	smp_wmb();
@@ -5777,14 +5877,11 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->vcpu_free(vcpu);
 }
 
-struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
-						unsigned int id)
+struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 {
-	if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
-		printk_once(KERN_WARNING
-		"kvm: SMP vm created on host with unstable TSC; "
-		"guest TSC will not be reliable\n");
-	return kvm_x86_ops->vcpu_create(kvm, id);
+	struct kvm_vcpu *vcpu;
+	vcpu = kvm_x86_ops->vcpu_create(kvm, id);
+	return vcpu;
 }
 
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 2cea414..6afa64f 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -75,5 +75,7 @@ void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
 int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq);
 
 void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
+void kvm_read_tsc(struct kvm_vcpu *vcpu);
+void kvm_setup_tsc_trapping(struct kvm_vcpu *vcpu);
 
 #endif
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 919ae53..cb97e53 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -540,6 +540,8 @@ struct kvm_ppc_pvinfo {
 #endif
 #define KVM_CAP_PPC_GET_PVINFO 57
 #define KVM_CAP_PPC_IRQ_LEVEL 58
+#define KVM_CAP_TSC_CONTROL 59
+
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -619,6 +621,17 @@ struct kvm_clock_data {
 	__u32 pad[9];
 };
 
+struct kvm_tsc_control {
+	__u32 flags;
+	__u32 tsc_khz;
+};
+
+#define KVM_TSC_FLAG_FIXED_RATE		(1 << 0)
+#define KVM_TSC_FLAG_SMP_COHERENCY	(1 << 1)
+
+#define KVM_TSC_MIN_KHZ 16000		/* 16 MHz, slower than first Pentium */
+#define KVM_TSC_MAX_KHZ 100000000	/* 100 GHz, good for a few years */
+
 /*
  * ioctls for VM fds
  */
@@ -676,6 +689,8 @@ struct kvm_clock_data {
 #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
 /* Available with KVM_CAP_PPC_GET_PVINFO */
 #define KVM_PPC_GET_PVINFO	  _IOW(KVMIO,  0xa1, struct kvm_ppc_pvinfo)
+/* Available with KVM_CAP_TSC_CONTROL */
+#define KVM_TSC_CONTROL		  _IOW(KVMIO,  0xa2, struct kvm_tsc_control)
 
 /*
  * ioctls for vcpu fds
-- 
1.7.1


^ permalink raw reply	[flat|nested] 22+ messages in thread

* [KVM TSC trapping / migration 2/2] Add TSC KHZ MSR
       [not found] <1294308645-31113-1-git-send-email-zamsden@redhat.com>
  2011-01-06 10:10 ` [KVM TSC trapping / migration 1/2] Add TSC trapping for SVM and VMX Zachary Amsden
@ 2011-01-06 10:10 ` Zachary Amsden
  2011-01-06 10:34   ` Alexander Graf
  2011-01-07 10:48   ` Marcelo Tosatti
  1 sibling, 2 replies; 22+ messages in thread
From: Zachary Amsden @ 2011-01-06 10:10 UTC (permalink / raw)
  To: kvm, Avi Kivity, Marcelo Tosatti, Glauber Costa
  Cc: Zachary Amsden, linux-kernel

Use an MSR to allow "soft" migration to hosts which do not support
TSC trapping.  Rather than make this a required element of any
migration protocol, we allow the TSC rate to be exported as a data
field (useful in its own right), but we also allow a one time write
of the MSR during VM creation.  The result is that for the common
use case, no protocol change is required to communicate TSC rate
to the receiving host.

This allows administrative tools to configure migration policy
as they see appropriate.  Rather than dictate this policy with the
KVM implementation, we properly allow migration to hosts which both
do and do not support setting of the TSC rate on the receiving end.
If it is wished to not support migration to a host which lacks
support for the TSC rate feature, that can be coordinated externally.

Signed-off-by: Zachary Amsden <zamsden@redhat.com>
---
 arch/x86/include/asm/kvm_para.h |    1 +
 arch/x86/kvm/x86.c              |   28 ++++++++++++++++++++++++++--
 include/linux/kvm.h             |    1 +
 3 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 7b562b6..723e60d 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -32,6 +32,7 @@
 /* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
 #define MSR_KVM_WALL_CLOCK_NEW  0x4b564d00
 #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
+#define MSR_KVM_TSC_KHZ		0x4b564d02
 
 #define KVM_MAX_MMU_OP_BATCH           32
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bbcd582..ff5addc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -786,10 +786,11 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
  * kvm-specific. Those are put in the beginning of the list.
  */
 
-#define KVM_SAVE_MSRS_BEGIN	7
+#define KVM_SAVE_MSRS_BEGIN	8
 static u32 msrs_to_save[] = {
 	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
 	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
+	MSR_KVM_TSC_KHZ,
 	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
 	HV_X64_MSR_APIC_ASSIST_PAGE,
 	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
@@ -1608,6 +1609,25 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		}
 		break;
 	}
+	case MSR_KVM_TSC_KHZ: {
+		struct kvm_arch *arch = &vcpu->kvm->arch;
+		/*
+		 * If userspace indicates a fixed rate TSC, accept attempts
+		 * to set the TSC rate to support incoming migrations.
+		 * This is a write once MSR, to prevent guest tampering.
+		 */
+		if (arch->tsc_flags & KVM_TSC_FLAG_KHZ_MSR_WRITABLE) {
+			if (data > KVM_TSC_MAX_KHZ || data < KVM_TSC_MIN_KHZ)
+				return 1;
+
+			spin_lock(&arch->clock_lock);
+			kvm_arch_set_tsc_khz(vcpu->kvm, data);
+			kvm_setup_tsc_trapping(vcpu);
+			arch->tsc_flags &= ~KVM_TSC_FLAG_KHZ_MSR_WRITABLE;
+			spin_unlock(&arch->clock_lock);
+		}
+		break;
+	}
 	case MSR_IA32_MCG_CTL:
 	case MSR_IA32_MCG_STATUS:
 	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
@@ -1884,6 +1904,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_KVM_SYSTEM_TIME_NEW:
 		data = vcpu->arch.time;
 		break;
+	case MSR_KVM_TSC_KHZ:
+		data = vcpu->kvm->arch.virtual_tsc_khz;
+		break;
 	case MSR_IA32_P5_MC_ADDR:
 	case MSR_IA32_P5_MC_TYPE:
 	case MSR_IA32_MCG_CAP:
@@ -3615,7 +3638,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = -EINVAL;
 		if (user_tsc.flags &
 		    ~(KVM_TSC_FLAG_FIXED_RATE |
-		      KVM_TSC_FLAG_SMP_COHERENCY))
+		      KVM_TSC_FLAG_SMP_COHERENCY |
+		      KVM_TSC_FLAG_KHZ_MSR_WRITABLE))
 			goto out;
 
 		if (user_tsc.tsc_khz &&
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index cb97e53..0e316d8 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -628,6 +628,7 @@ struct kvm_tsc_control {
 
 #define KVM_TSC_FLAG_FIXED_RATE		(1 << 0)
 #define KVM_TSC_FLAG_SMP_COHERENCY	(1 << 1)
+#define KVM_TSC_FLAG_KHZ_MSR_WRITABLE	(1 << 2)
 
 #define KVM_TSC_MIN_KHZ 16000		/* 16 MHz, slower than first Pentium */
 #define KVM_TSC_MAX_KHZ 100000000	/* 100 GHz, good for a few years */
-- 
1.7.1


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [KVM TSC trapping / migration 2/2] Add TSC KHZ MSR
  2011-01-06 10:10 ` [KVM TSC trapping / migration 2/2] Add TSC KHZ MSR Zachary Amsden
@ 2011-01-06 10:34   ` Alexander Graf
  2011-01-06 11:27     ` Zachary Amsden
  2011-01-07 10:48   ` Marcelo Tosatti
  1 sibling, 1 reply; 22+ messages in thread
From: Alexander Graf @ 2011-01-06 10:34 UTC (permalink / raw)
  To: Zachary Amsden; +Cc: kvm, Zachary Amsden, linux-kernel


Am 06.01.2011 um 11:10 schrieb Zachary Amsden <zamsden@redhat.com>:

> Use an MSR to allow "soft" migration to hosts which do not support
> TSC trapping.  Rather than make this a required element of any
> migration protocol, we allow the TSC rate to be exported as a data
> field (useful in its own right), but we also allow a one time write
> of the MSR during VM creation.  The result is that for the common
> use case, no protocol change is required to communicate TSC rate
> to the receiving host.
> 
> This allows administrative tools to configure migration policy
> as they see appropriate.  Rather than dictate this policy with the
> KVM implementation, we properly allow migration to hosts which both
> do and do not support setting of the TSC rate on the receiving end.
> If it is wished to not support migration to a host which lacks
> support for the TSC rate feature, that can be coordinated externally.

Isn't there a real hw equivalent of such a register? It might make more sense to just implement that then.


Alex

> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [KVM TSC trapping / migration 1/2] Add TSC trapping for SVM and VMX
  2011-01-06 10:10 ` [KVM TSC trapping / migration 1/2] Add TSC trapping for SVM and VMX Zachary Amsden
@ 2011-01-06 10:41   ` Alexander Graf
  2011-01-06 11:30     ` Zachary Amsden
  2011-01-06 11:32   ` Avi Kivity
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 22+ messages in thread
From: Alexander Graf @ 2011-01-06 10:41 UTC (permalink / raw)
  To: Zachary Amsden; +Cc: kvm, Zachary Amsden, linux-kernel


Am 06.01.2011 um 11:10 schrieb Zachary Amsden <zamsden@redhat.com>:

> Reasons to trap the TSC are numerous, but we want to avoid it as much
> as possible for performance reasons.
> 
> We provide two conservative modes via modules parameters and userspace
> hinting.  First, the module can be loaded with "tsc_auto=1" as a module
> parameter, which turns on conservative TSC trapping only when it is
> required (when unstable TSC or faster KHZ CPU is detected).
> 
> For userspace hinting, we enable trapping only if necessary.  Userspace
> can hint that a VM needs a fixed frequency TSC, and also that SMP
> stability will be required.  In that case, we conservatively turn on
> trapping when it is needed.  In addition, users may now specify the
> desired TSC rate at which to run.  If this rate differs significantly
> from the host rate, trapping will be enabled.
> 
> There is also an override control to allow TSC trapping to be turned on
> or off unconditionally for testing.
> 
> We indicate to pvclock users that the TSC is being trapped, to allow
> avoiding overhead and directly using RDTSCP (only for SVM).  This
> optimization is not yet implemented.

When migrating, the implementation could switch from non-trapped to trapped, making it less attractive. The guest however does not get notified about this change. Same for the other way around.

Would it make sense to add a kvmclock interrupt to notify the guest of such a change?


Alex

> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [KVM TSC trapping / migration 2/2] Add TSC KHZ MSR
  2011-01-06 10:34   ` Alexander Graf
@ 2011-01-06 11:27     ` Zachary Amsden
  2011-01-06 11:40       ` Alexander Graf
  0 siblings, 1 reply; 22+ messages in thread
From: Zachary Amsden @ 2011-01-06 11:27 UTC (permalink / raw)
  To: Alexander Graf; +Cc: kvm, linux-kernel

On 01/06/2011 12:34 AM, Alexander Graf wrote:
> Am 06.01.2011 um 11:10 schrieb Zachary Amsden<zamsden@redhat.com>:
>
>    
>> Use an MSR to allow "soft" migration to hosts which do not support
>> TSC trapping.  Rather than make this a required element of any
>> migration protocol, we allow the TSC rate to be exported as a data
>> field (useful in its own right), but we also allow a one time write
>> of the MSR during VM creation.  The result is that for the common
>> use case, no protocol change is required to communicate TSC rate
>> to the receiving host.
>>
>> This allows administrative tools to configure migration policy
>> as they see appropriate.  Rather than dictate this policy with the
>> KVM implementation, we properly allow migration to hosts which both
>> do and do not support setting of the TSC rate on the receiving end.
>> If it is wished to not support migration to a host which lacks
>> support for the TSC rate feature, that can be coordinated externally.
>>      
> Isn't there a real hw equivalent of such a register? It might make more sense to just implement that then.
>
>    

Unfortunately, no.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [KVM TSC trapping / migration 1/2] Add TSC trapping for SVM and VMX
  2011-01-06 10:41   ` Alexander Graf
@ 2011-01-06 11:30     ` Zachary Amsden
  2011-01-06 11:38       ` Alexander Graf
  0 siblings, 1 reply; 22+ messages in thread
From: Zachary Amsden @ 2011-01-06 11:30 UTC (permalink / raw)
  To: Alexander Graf; +Cc: kvm, linux-kernel

On 01/06/2011 12:41 AM, Alexander Graf wrote:
> Am 06.01.2011 um 11:10 schrieb Zachary Amsden<zamsden@redhat.com>:
>
>    
>> Reasons to trap the TSC are numerous, but we want to avoid it as much
>> as possible for performance reasons.
>>
>> We provide two conservative modes via modules parameters and userspace
>> hinting.  First, the module can be loaded with "tsc_auto=1" as a module
>> parameter, which turns on conservative TSC trapping only when it is
>> required (when unstable TSC or faster KHZ CPU is detected).
>>
>> For userspace hinting, we enable trapping only if necessary.  Userspace
>> can hint that a VM needs a fixed frequency TSC, and also that SMP
>> stability will be required.  In that case, we conservatively turn on
>> trapping when it is needed.  In addition, users may now specify the
>> desired TSC rate at which to run.  If this rate differs significantly
>> from the host rate, trapping will be enabled.
>>
>> There is also an override control to allow TSC trapping to be turned on
>> or off unconditionally for testing.
>>
>> We indicate to pvclock users that the TSC is being trapped, to allow
>> avoiding overhead and directly using RDTSCP (only for SVM).  This
>> optimization is not yet implemented.
>>      
> When migrating, the implementation could switch from non-trapped to trapped, making it less attractive. The guest however does not get notified about this change. Same for the other way around.
>    

That's a policy decision to be made by the userspace agent.  It's better 
than the current situation, where there is no control at all of TSC 
rate.  Here, we're flexible either way.

Also note, moving to a faster processor, trapping kicks in... but the 
processor is faster, so no actual loss is noticed, and the problem 
corrects when the VM is power cycled.

> Would it make sense to add a kvmclock interrupt to notify the guest of such a change?

kvmclock is immune to frequency changes, so it needs no interrupt, it 
just has a version controlled shared area, which is reset.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [KVM TSC trapping / migration 1/2] Add TSC trapping for SVM and VMX
  2011-01-06 10:10 ` [KVM TSC trapping / migration 1/2] Add TSC trapping for SVM and VMX Zachary Amsden
  2011-01-06 10:41   ` Alexander Graf
@ 2011-01-06 11:32   ` Avi Kivity
  2011-01-06 20:03     ` Zachary Amsden
  2011-01-07 11:23   ` Marcelo Tosatti
  2011-01-10 11:52   ` Joerg Roedel
  3 siblings, 1 reply; 22+ messages in thread
From: Avi Kivity @ 2011-01-06 11:32 UTC (permalink / raw)
  To: Zachary Amsden; +Cc: kvm, Marcelo Tosatti, Glauber Costa, linux-kernel

On 01/06/2011 12:10 PM, Zachary Amsden wrote:
> Reasons to trap the TSC are numerous, but we want to avoid it as much
> as possible for performance reasons.
>
> We provide two conservative modes via modules parameters and userspace
> hinting.  First, the module can be loaded with "tsc_auto=1" as a module
> parameter, which turns on conservative TSC trapping only when it is
> required (when unstable TSC or faster KHZ CPU is detected).
>
> For userspace hinting, we enable trapping only if necessary.  Userspace
> can hint that a VM needs a fixed frequency TSC, and also that SMP
> stability will be required.  In that case, we conservatively turn on
> trapping when it is needed.  In addition, users may now specify the
> desired TSC rate at which to run.  If this rate differs significantly
> from the host rate, trapping will be enabled.
>
> There is also an override control to allow TSC trapping to be turned on
> or off unconditionally for testing.
>
> We indicate to pvclock users that the TSC is being trapped, to allow
> avoiding overhead and directly using RDTSCP (only for SVM).  This
> optimization is not yet implemented.
>
> Signed-off-by: Zachary Amsden<zamsden@redhat.com>
> ---
>   arch/x86/include/asm/kvm_host.h    |    6 +-
>   arch/x86/include/asm/pvclock-abi.h |    1 +
>   arch/x86/kvm/svm.c                 |   20 ++++++
>   arch/x86/kvm/vmx.c                 |   21 +++++++
>   arch/x86/kvm/x86.c                 |  113 +++++++++++++++++++++++++++++++++---
>   arch/x86/kvm/x86.h                 |    2 +
>   include/linux/kvm.h                |   15 +++++
>   7 files changed, 168 insertions(+), 10 deletions(-)


Haven't reviewed yet, but Documentation/kvm/api.txt is missing here.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [KVM TSC trapping / migration 1/2] Add TSC trapping for SVM and VMX
  2011-01-06 11:30     ` Zachary Amsden
@ 2011-01-06 11:38       ` Alexander Graf
  2011-01-06 20:24         ` Zachary Amsden
  0 siblings, 1 reply; 22+ messages in thread
From: Alexander Graf @ 2011-01-06 11:38 UTC (permalink / raw)
  To: Zachary Amsden; +Cc: kvm, linux-kernel


On 06.01.2011, at 12:30, Zachary Amsden wrote:

> On 01/06/2011 12:41 AM, Alexander Graf wrote:
>> Am 06.01.2011 um 11:10 schrieb Zachary Amsden<zamsden@redhat.com>:
>> 
>>   
>>> Reasons to trap the TSC are numerous, but we want to avoid it as much
>>> as possible for performance reasons.
>>> 
>>> We provide two conservative modes via modules parameters and userspace
>>> hinting.  First, the module can be loaded with "tsc_auto=1" as a module
>>> parameter, which turns on conservative TSC trapping only when it is
>>> required (when unstable TSC or faster KHZ CPU is detected).
>>> 
>>> For userspace hinting, we enable trapping only if necessary.  Userspace
>>> can hint that a VM needs a fixed frequency TSC, and also that SMP
>>> stability will be required.  In that case, we conservatively turn on
>>> trapping when it is needed.  In addition, users may now specify the
>>> desired TSC rate at which to run.  If this rate differs significantly
>>> from the host rate, trapping will be enabled.
>>> 
>>> There is also an override control to allow TSC trapping to be turned on
>>> or off unconditionally for testing.
>>> 
>>> We indicate to pvclock users that the TSC is being trapped, to allow
>>> avoiding overhead and directly using RDTSCP (only for SVM).  This
>>> optimization is not yet implemented.
>>>     
>> When migrating, the implementation could switch from non-trapped to trapped, making it less attractive. The guest however does not get notified about this change. Same for the other way around.
>>   
> 
> That's a policy decision to be made by the userspace agent.  It's better than the current situation, where there is no control at all of TSC rate.  Here, we're flexible either way.
> 
> Also note, moving to a faster processor, trapping kicks in... but the processor is faster, so no actual loss is noticed, and the problem corrects when the VM is power cycled.

Hrm. But even then the guest should be notified to enable it to act accordingly and just recalibrate instead of reboot, no? I'm not saying this is particularly interesting for kvmclock enabled guests, but think of all the < 2.6.2x Linux, *BSD, Solaris, Windows etc. VMs out there that might have an easy means of triggering recalibration (or at least could introduce it), but writing a new clock source is a lot of work.

Of course, sending the notification through a userspace agent would also work. That one would have to be notified about the change too though.

>> Would it make sense to add a kvmclock interrupt to notify the guest of such a change?
> 
> kvmclock is immune to frequency changes, so it needs no interrupt, it just has a version controlled shared area, which is reset.


>>> We indicate to pvclock users that the TSC is being trapped, to allow
>>> avoiding overhead and directly using RDTSCP (only for SVM).  This
>>> optimization is not yet implemented.
>> 

That doesn't sound to me like they're unaffected?


Alex

>> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [KVM TSC trapping / migration 2/2] Add TSC KHZ MSR
  2011-01-06 11:27     ` Zachary Amsden
@ 2011-01-06 11:40       ` Alexander Graf
  2011-01-06 20:34         ` Zachary Amsden
  0 siblings, 1 reply; 22+ messages in thread
From: Alexander Graf @ 2011-01-06 11:40 UTC (permalink / raw)
  To: Zachary Amsden; +Cc: kvm, linux-kernel


On 06.01.2011, at 12:27, Zachary Amsden wrote:

> On 01/06/2011 12:34 AM, Alexander Graf wrote:
>> Am 06.01.2011 um 11:10 schrieb Zachary Amsden<zamsden@redhat.com>:
>> 
>>   
>>> Use an MSR to allow "soft" migration to hosts which do not support
>>> TSC trapping.  Rather than make this a required element of any
>>> migration protocol, we allow the TSC rate to be exported as a data
>>> field (useful in its own right), but we also allow a one time write
>>> of the MSR during VM creation.  The result is that for the common
>>> use case, no protocol change is required to communicate TSC rate
>>> to the receiving host.
>>> 
>>> This allows administrative tools to configure migration policy
>>> as they see appropriate.  Rather than dictate this policy with the
>>> KVM implementation, we properly allow migration to hosts which both
>>> do and do not support setting of the TSC rate on the receiving end.
>>> If it is wished to not support migration to a host which lacks
>>> support for the TSC rate feature, that can be coordinated externally.
>>>     
>> Isn't there a real hw equivalent of such a register? It might make more sense to just implement that then.
>> 
>>   
> 
> Unfortunately, no.

Bleks. I couldn't find anything in AMD documentation either. Intel documentation is usually hard to find and incomplete anyways, so maybe something's hiding there - but if it's hidden so well it's no use to implement either.


Alex


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [KVM TSC trapping / migration 1/2] Add TSC trapping for SVM and VMX
  2011-01-06 11:32   ` Avi Kivity
@ 2011-01-06 20:03     ` Zachary Amsden
  0 siblings, 0 replies; 22+ messages in thread
From: Zachary Amsden @ 2011-01-06 20:03 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm, Marcelo Tosatti, Glauber Costa, linux-kernel

On 01/06/2011 01:32 AM, Avi Kivity wrote:
> On 01/06/2011 12:10 PM, Zachary Amsden wrote:
>> Reasons to trap the TSC are numerous, but we want to avoid it as much
>> as possible for performance reasons.
>>
>> We provide two conservative modes via modules parameters and userspace
>> hinting.  First, the module can be loaded with "tsc_auto=1" as a module
>> parameter, which turns on conservative TSC trapping only when it is
>> required (when unstable TSC or faster KHZ CPU is detected).
>>
>> For userspace hinting, we enable trapping only if necessary.  Userspace
>> can hint that a VM needs a fixed frequency TSC, and also that SMP
>> stability will be required.  In that case, we conservatively turn on
>> trapping when it is needed.  In addition, users may now specify the
>> desired TSC rate at which to run.  If this rate differs significantly
>> from the host rate, trapping will be enabled.
>>
>> There is also an override control to allow TSC trapping to be turned on
>> or off unconditionally for testing.
>>
>> We indicate to pvclock users that the TSC is being trapped, to allow
>> avoiding overhead and directly using RDTSCP (only for SVM).  This
>> optimization is not yet implemented.
>>
>> Signed-off-by: Zachary Amsden<zamsden@redhat.com>
>> ---
>>   arch/x86/include/asm/kvm_host.h    |    6 +-
>>   arch/x86/include/asm/pvclock-abi.h |    1 +
>>   arch/x86/kvm/svm.c                 |   20 ++++++
>>   arch/x86/kvm/vmx.c                 |   21 +++++++
>>   arch/x86/kvm/x86.c                 |  113 
>> +++++++++++++++++++++++++++++++++---
>>   arch/x86/kvm/x86.h                 |    2 +
>>   include/linux/kvm.h                |   15 +++++
>>   7 files changed, 168 insertions(+), 10 deletions(-)
>
>
> Haven't reviewed yet, but Documentation/kvm/api.txt is missing here.
>

That will be included when I port to upstream head.  When dealing with 
software documentation, too much is never enough.

Zach

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [KVM TSC trapping / migration 1/2] Add TSC trapping for SVM and VMX
  2011-01-06 11:38       ` Alexander Graf
@ 2011-01-06 20:24         ` Zachary Amsden
  2011-01-06 22:38           ` Alexander Graf
  0 siblings, 1 reply; 22+ messages in thread
From: Zachary Amsden @ 2011-01-06 20:24 UTC (permalink / raw)
  To: Alexander Graf; +Cc: kvm, linux-kernel

On 01/06/2011 01:38 AM, Alexander Graf wrote:
> On 06.01.2011, at 12:30, Zachary Amsden wrote:
>
>    
>> On 01/06/2011 12:41 AM, Alexander Graf wrote:
>>      
>>> Am 06.01.2011 um 11:10 schrieb Zachary Amsden<zamsden@redhat.com>:
>>>
>>>
>>>        
>>>> Reasons to trap the TSC are numerous, but we want to avoid it as much
>>>> as possible for performance reasons.
>>>>
>>>> We provide two conservative modes via modules parameters and userspace
>>>> hinting.  First, the module can be loaded with "tsc_auto=1" as a module
>>>> parameter, which turns on conservative TSC trapping only when it is
>>>> required (when unstable TSC or faster KHZ CPU is detected).
>>>>
>>>> For userspace hinting, we enable trapping only if necessary.  Userspace
>>>> can hint that a VM needs a fixed frequency TSC, and also that SMP
>>>> stability will be required.  In that case, we conservatively turn on
>>>> trapping when it is needed.  In addition, users may now specify the
>>>> desired TSC rate at which to run.  If this rate differs significantly
>>>> from the host rate, trapping will be enabled.
>>>>
>>>> There is also an override control to allow TSC trapping to be turned on
>>>> or off unconditionally for testing.
>>>>
>>>> We indicate to pvclock users that the TSC is being trapped, to allow
>>>> avoiding overhead and directly using RDTSCP (only for SVM).  This
>>>> optimization is not yet implemented.
>>>>
>>>>          
>>> When migrating, the implementation could switch from non-trapped to trapped, making it less attractive. The guest however does not get notified about this change. Same for the other way around.
>>>
>>>        
>> That's a policy decision to be made by the userspace agent.  It's better than the current situation, where there is no control at all of TSC rate.  Here, we're flexible either way.
>>
>> Also note, moving to a faster processor, trapping kicks in... but the processor is faster, so no actual loss is noticed, and the problem corrects when the VM is power cycled.
>>      
> Hrm. But even then the guest should be notified to enable it to act accordingly and just recalibrate instead of reboot, no? I'm not saying this is particularly interesting for kvmclock enabled guests, but think of all the<  2.6.2x Linux, *BSD, Solaris, Windows etc. VMs out there that might have an easy means of triggering recalibration (or at least could introduce it), but writing a new clock source is a lot of work.
>    

That's why I implemented trapping.  So they can migrate and we don't 
need to change the OS.

> Of course, sending the notification through a userspace agent would also work. That one would have to be notified about the change too though.
>    

It's far too complex and far too small of a use case to be worth the 
effort.  Windows doesn't particularly care, and most HALs can be 
switched into a mode where TSC is not used.

Linux actually does support CPU frequency recalibration, but it is 
triggered differently based on the particular form of CPU frequency 
switching supported by the platform / chipset.  Since that isn't 
universal, and we pass through many features of the hardware (CPUID and 
such), there is no reliable way I know of to emulate CPU frequency 
switching for the guest without kernel modifications.  The best bet 
there would be a kernel module providing a KVM cpufreq driver, which 
could be ported to the relevant non-clocksource kernels.

This amount of effort, however, begs the question - if you are going to 
all this trouble, why not port kvmclock support to those kernel?

Solaris 10 and later do have some better virtualization friendly clock 
support.  BSD - we'd probably have to trap.

Again, if the overhead is significant, blah.  Today you have no choice 
but to accept sloppy timekeeping.  You lose nothing with this patch, but 
do gain the flexibility to choose either correct TSC timekeeping or 
native speed TSC.  There are scenarios where both of those can be met 
(uniform speed deployment / virt friendly guest), there are scenarios 
where sloppy timekeeping is appropriate (KVM clock used), and there are 
scenarios where correct timekeeping is appropriate (BSD, earlier 
TSC-based linux, or user-space TSC required).

>    
>>> Would it make sense to add a kvmclock interrupt to notify the guest of such a change?
>>>        
>> kvmclock is immune to frequency changes, so it needs no interrupt, it just has a version controlled shared area, which is reset.
>>      
>
>    
>>>> We indicate to pvclock users that the TSC is being trapped, to allow
>>>> avoiding overhead and directly using RDTSCP (only for SVM).  This
>>>> optimization is not yet implemented.
>>>>          
>>>        
> That doesn't sound to me like they're unaffected?
>    

On Intel RDTSCP traps along with RDTSC.  This means that you can't have 
a trapping, constant rate TSC for userspace without also paying the 
overhead for reading the TSC for kvmclock.  This is not true on SVM, 
where RDTSCP is a separate trap, allowing optimization.

Zach

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [KVM TSC trapping / migration 2/2] Add TSC KHZ MSR
  2011-01-06 11:40       ` Alexander Graf
@ 2011-01-06 20:34         ` Zachary Amsden
  0 siblings, 0 replies; 22+ messages in thread
From: Zachary Amsden @ 2011-01-06 20:34 UTC (permalink / raw)
  To: Alexander Graf; +Cc: kvm, linux-kernel

On 01/06/2011 01:40 AM, Alexander Graf wrote:
> On 06.01.2011, at 12:27, Zachary Amsden wrote:
>
>    
>> On 01/06/2011 12:34 AM, Alexander Graf wrote:
>>      
>>> Am 06.01.2011 um 11:10 schrieb Zachary Amsden<zamsden@redhat.com>:
>>>
>>>
>>>        
>>>> Use an MSR to allow "soft" migration to hosts which do not support
>>>> TSC trapping.  Rather than make this a required element of any
>>>> migration protocol, we allow the TSC rate to be exported as a data
>>>> field (useful in its own right), but we also allow a one time write
>>>> of the MSR during VM creation.  The result is that for the common
>>>> use case, no protocol change is required to communicate TSC rate
>>>> to the receiving host.
>>>>
>>>> This allows administrative tools to configure migration policy
>>>> as they see appropriate.  Rather than dictate this policy with the
>>>> KVM implementation, we properly allow migration to hosts which both
>>>> do and do not support setting of the TSC rate on the receiving end.
>>>> If it is wished to not support migration to a host which lacks
>>>> support for the TSC rate feature, that can be coordinated externally.
>>>>
>>>>          
>>> Isn't there a real hw equivalent of such a register? It might make more sense to just implement that then.
>>>
>>>
>>>        
>> Unfortunately, no.
>>      
> Bleks. I couldn't find anything in AMD documentation either. Intel documentation is usually hard to find and incomplete anyways, so maybe something's hiding there - but if it's hidden so well it's no use to implement either.
>    

While it makes perfect logical sense to us software people, from a 
hardware perspective it is ridiculous.  There is no signal input to the 
processor which can be counted and returned by a "CPU frequency MSR".  
The CPU frequency is simply what the CPU is clocked at.  And you can't 
burn it onto the die (think of overclocking), unless you have your own 
internal reference oscillator.

No, a CPU frequency MSR is impossible to implement sensibly in hardware 
because

1) if you have an internal clock, you have cross-CPU drift and can't 
easily share a common bus.
2) if you have an external clock, you have no way to measure the 
frequency of it without an internal reference, which again will vary, 
leading to different measured frequency on each CPU.

The only thing that sort of makes sense is a one-time writable MSR that 
is programmed by the BIOS to return the CPU frequency, as retrieved from 
the chipset.  This is merely a convenience for software people who can't 
be bothered to query the chipset directly or measure against a reference 
clock.

So as much as we'd like it, it really does make little sense from a 
hardware perspective.

Zach

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [KVM TSC trapping / migration 1/2] Add TSC trapping for SVM and VMX
  2011-01-06 20:24         ` Zachary Amsden
@ 2011-01-06 22:38           ` Alexander Graf
  2011-01-07  3:10             ` Zachary Amsden
  0 siblings, 1 reply; 22+ messages in thread
From: Alexander Graf @ 2011-01-06 22:38 UTC (permalink / raw)
  To: Zachary Amsden; +Cc: kvm, linux-kernel


On 06.01.2011, at 21:24, Zachary Amsden wrote:

> On 01/06/2011 01:38 AM, Alexander Graf wrote:
>> On 06.01.2011, at 12:30, Zachary Amsden wrote:
>> 
>>   
>>> On 01/06/2011 12:41 AM, Alexander Graf wrote:
>>>     
>>>> Am 06.01.2011 um 11:10 schrieb Zachary Amsden<zamsden@redhat.com>:
>>>> 
>>>> 
>>>>       
>>>>> Reasons to trap the TSC are numerous, but we want to avoid it as much
>>>>> as possible for performance reasons.
>>>>> 
>>>>> We provide two conservative modes via modules parameters and userspace
>>>>> hinting.  First, the module can be loaded with "tsc_auto=1" as a module
>>>>> parameter, which turns on conservative TSC trapping only when it is
>>>>> required (when unstable TSC or faster KHZ CPU is detected).
>>>>> 
>>>>> For userspace hinting, we enable trapping only if necessary.  Userspace
>>>>> can hint that a VM needs a fixed frequency TSC, and also that SMP
>>>>> stability will be required.  In that case, we conservatively turn on
>>>>> trapping when it is needed.  In addition, users may now specify the
>>>>> desired TSC rate at which to run.  If this rate differs significantly
>>>>> from the host rate, trapping will be enabled.
>>>>> 
>>>>> There is also an override control to allow TSC trapping to be turned on
>>>>> or off unconditionally for testing.
>>>>> 
>>>>> We indicate to pvclock users that the TSC is being trapped, to allow
>>>>> avoiding overhead and directly using RDTSCP (only for SVM).  This
>>>>> optimization is not yet implemented.
>>>>> 
>>>>>         
>>>> When migrating, the implementation could switch from non-trapped to trapped, making it less attractive. The guest however does not get notified about this change. Same for the other way around.
>>>> 
>>>>       
>>> That's a policy decision to be made by the userspace agent.  It's better than the current situation, where there is no control at all of TSC rate.  Here, we're flexible either way.
>>> 
>>> Also note, moving to a faster processor, trapping kicks in... but the processor is faster, so no actual loss is noticed, and the problem corrects when the VM is power cycled.
>>>     
>> Hrm. But even then the guest should be notified to enable it to act accordingly and just recalibrate instead of reboot, no? I'm not saying this is particularly interesting for kvmclock enabled guests, but think of all the<  2.6.2x Linux, *BSD, Solaris, Windows etc. VMs out there that might have an easy means of triggering recalibration (or at least could introduce it), but writing a new clock source is a lot of work.
>>   
> 
> That's why I implemented trapping.  So they can migrate and we don't need to change the OS.
> 
>> Of course, sending the notification through a userspace agent would also work. That one would have to be notified about the change too though.
>>   
> 
> It's far too complex and far too small of a use case to be worth the effort.  Windows doesn't particularly care, and most HALs can be switched into a mode where TSC is not used.
> 
> Linux actually does support CPU frequency recalibration, but it is triggered differently based on the particular form of CPU frequency switching supported by the platform / chipset.  Since that isn't universal, and we pass through many features of the hardware (CPUID and such), there is no reliable way I know of to emulate CPU frequency switching for the guest without kernel modifications.  The best bet there would be a kernel module providing a KVM cpufreq driver, which could be ported to the relevant non-clocksource kernels.
> 
> This amount of effort, however, begs the question - if you are going to all this trouble, why not port kvmclock support to those kernel?
> 
> Solaris 10 and later do have some better virtualization friendly clock support.  BSD - we'd probably have to trap.
> 
> Again, if the overhead is significant, blah.  Today you have no choice but to accept sloppy timekeeping.  You lose nothing with this patch, but do gain the flexibility to choose either correct TSC timekeeping or native speed TSC.  There are scenarios where both of those can be met (uniform speed deployment / virt friendly guest), there are scenarios where sloppy timekeeping is appropriate (KVM clock used), and there are scenarios where correct timekeeping is appropriate (BSD, earlier TSC-based linux, or user-space TSC required).

Sure, I'm not saying your patch is bad or goes in the wrong direction. I'd just think it'd be awesome to have an easy way for the guest OS to know that something as crucial as TSC reading speed got changed, hopefully even TSC frequency. Having any form of notification leaves open doors for someone to implement something (think proprietary OSs or out-of-service OSs here). Having no notification leaves us with no choice but taking the penalty and keeping the guest less informed than it has to be.

> 
>>   
>>>> Would it make sense to add a kvmclock interrupt to notify the guest of such a change?
>>>>       
>>> kvmclock is immune to frequency changes, so it needs no interrupt, it just has a version controlled shared area, which is reset.
>>>     
>> 
>>   
>>>>> We indicate to pvclock users that the TSC is being trapped, to allow
>>>>> avoiding overhead and directly using RDTSCP (only for SVM).  This
>>>>> optimization is not yet implemented.
>>>>>         
>>>>       
>> That doesn't sound to me like they're unaffected?
>>   
> 
> On Intel RDTSCP traps along with RDTSC.  This means that you can't have a trapping, constant rate TSC for userspace without also paying the overhead for reading the TSC for kvmclock.  This is not true on SVM, where RDTSCP is a separate trap, allowing optimization.

So how does the guest know that something changed when it's migrated from an AMD machine to an Intel machine?


Alex


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [KVM TSC trapping / migration 1/2] Add TSC trapping for SVM and VMX
  2011-01-06 22:38           ` Alexander Graf
@ 2011-01-07  3:10             ` Zachary Amsden
  0 siblings, 0 replies; 22+ messages in thread
From: Zachary Amsden @ 2011-01-07  3:10 UTC (permalink / raw)
  To: Alexander Graf; +Cc: kvm, linux-kernel

On 01/06/2011 12:38 PM, Alexander Graf wrote:

<snip>
>
> Sure, I'm not saying your patch is bad or goes in the wrong direction. I'd just think it'd be awesome to have an easy way for the guest OS to know that something as crucial as TSC reading speed got changed, hopefully even TSC frequency. Having any form of notification leaves open doors for someone to implement something (think proprietary OSs or out-of-service OSs here). Having no notification leaves us with no choice but taking the penalty and keeping the guest less informed than it has to be.
>    

We do - register kvmclock and check to make sure the version before and 
after time computations to be sure the frequency hasn't changed.

This doesn't even require an interrupt.

>    
>>      
>>>
>>>        
>>>>> Would it make sense to add a kvmclock interrupt to notify the guest of such a change?
>>>>>
>>>>>            
>>>> kvmclock is immune to frequency changes, so it needs no interrupt, it just has a version controlled shared area, which is reset.
>>>>
>>>>          
>>>
>>>        
>>>>>> We indicate to pvclock users that the TSC is being trapped, to allow
>>>>>> avoiding overhead and directly using RDTSCP (only for SVM).  This
>>>>>> optimization is not yet implemented.
>>>>>>
>>>>>>              
>>>>>
>>>>>            
>>> That doesn't sound to me like they're unaffected?
>>>
>>>        
>> On Intel RDTSCP traps along with RDTSC.  This means that you can't have a trapping, constant rate TSC for userspace without also paying the overhead for reading the TSC for kvmclock.  This is not true on SVM, where RDTSCP is a separate trap, allowing optimization.
>>      
> So how does the guest know that something changed when it's migrated from an AMD machine to an Intel machine?
>    

That can and never should happen.  Simply too much state in the guest 
depends on CPU type, different workarounds are enabled for things, and 
even different instruction sets are activated.

There is no reward for the kind of complexity involved.

Zach

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [KVM TSC trapping / migration 2/2] Add TSC KHZ MSR
  2011-01-06 10:10 ` [KVM TSC trapping / migration 2/2] Add TSC KHZ MSR Zachary Amsden
  2011-01-06 10:34   ` Alexander Graf
@ 2011-01-07 10:48   ` Marcelo Tosatti
  2011-01-07 20:44     ` Zachary Amsden
  1 sibling, 1 reply; 22+ messages in thread
From: Marcelo Tosatti @ 2011-01-07 10:48 UTC (permalink / raw)
  To: Zachary Amsden; +Cc: kvm, Avi Kivity, Glauber Costa, linux-kernel

On Thu, Jan 06, 2011 at 12:10:45AM -1000, Zachary Amsden wrote:
> Use an MSR to allow "soft" migration to hosts which do not support
> TSC trapping.  Rather than make this a required element of any
> migration protocol, we allow the TSC rate to be exported as a data
> field (useful in its own right), but we also allow a one time write
> of the MSR during VM creation.  The result is that for the common
> use case, no protocol change is required to communicate TSC rate
> to the receiving host.

Migration to hosts which do not support the feature can be achieved by
saving/restoring the TSC rate + flags in a subsection. A subsection
seems more appropriate than an MSR for this.

> This allows administrative tools to configure migration policy
> as they see appropriate.  Rather than dictate this policy with the
> KVM implementation, we properly allow migration to hosts which both
> do and do not support setting of the TSC rate on the receiving end.
> If it is wished to not support migration to a host which lacks
> support for the TSC rate feature, that can be coordinated externally.
> 
> Signed-off-by: Zachary Amsden <zamsden@redhat.com>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [KVM TSC trapping / migration 1/2] Add TSC trapping for SVM and VMX
  2011-01-06 10:10 ` [KVM TSC trapping / migration 1/2] Add TSC trapping for SVM and VMX Zachary Amsden
  2011-01-06 10:41   ` Alexander Graf
  2011-01-06 11:32   ` Avi Kivity
@ 2011-01-07 11:23   ` Marcelo Tosatti
  2011-01-09  8:05     ` Zachary Amsden
  2011-01-10 11:52   ` Joerg Roedel
  3 siblings, 1 reply; 22+ messages in thread
From: Marcelo Tosatti @ 2011-01-07 11:23 UTC (permalink / raw)
  To: Zachary Amsden; +Cc: kvm, Avi Kivity, Glauber Costa, linux-kernel

On Thu, Jan 06, 2011 at 12:10:44AM -1000, Zachary Amsden wrote:
> Reasons to trap the TSC are numerous, but we want to avoid it as much
> as possible for performance reasons.
> 
> We provide two conservative modes via modules parameters and userspace
> hinting.  First, the module can be loaded with "tsc_auto=1" as a module
> parameter, which turns on conservative TSC trapping only when it is
> required (when unstable TSC or faster KHZ CPU is detected).
> 
> For userspace hinting, we enable trapping only if necessary.  Userspace
> can hint that a VM needs a fixed frequency TSC, and also that SMP
> stability will be required.  In that case, we conservatively turn on
> trapping when it is needed.  In addition, users may now specify the
> desired TSC rate at which to run.  If this rate differs significantly
> from the host rate, trapping will be enabled.
> 
> There is also an override control to allow TSC trapping to be turned on
> or off unconditionally for testing.
> 
> We indicate to pvclock users that the TSC is being trapped, to allow
> avoiding overhead and directly using RDTSCP (only for SVM).  This
> optimization is not yet implemented.
> 
> Signed-off-by: Zachary Amsden <zamsden@redhat.com>
> ---
>  arch/x86/include/asm/kvm_host.h    |    6 +-
>  arch/x86/include/asm/pvclock-abi.h |    1 +
>  arch/x86/kvm/svm.c                 |   20 ++++++
>  arch/x86/kvm/vmx.c                 |   21 +++++++
>  arch/x86/kvm/x86.c                 |  113 +++++++++++++++++++++++++++++++++---
>  arch/x86/kvm/x86.h                 |    2 +
>  include/linux/kvm.h                |   15 +++++
>  7 files changed, 168 insertions(+), 10 deletions(-)

- Docs / test case please.
- KVM_TSC_CONTROL ioctl ignores flags field.
- What is the purpose of PVCLOCK_TSC_TRAPPED_BIT?
- Fail to see purpose of module parameters. Configuration from qemu
  should be enough?


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [KVM TSC trapping / migration 2/2] Add TSC KHZ MSR
  2011-01-07 10:48   ` Marcelo Tosatti
@ 2011-01-07 20:44     ` Zachary Amsden
  2011-01-10 13:50       ` Marcelo Tosatti
  0 siblings, 1 reply; 22+ messages in thread
From: Zachary Amsden @ 2011-01-07 20:44 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm, Avi Kivity, Glauber Costa, linux-kernel

On 01/07/2011 12:48 AM, Marcelo Tosatti wrote:
> On Thu, Jan 06, 2011 at 12:10:45AM -1000, Zachary Amsden wrote:
>    
>> Use an MSR to allow "soft" migration to hosts which do not support
>> TSC trapping.  Rather than make this a required element of any
>> migration protocol, we allow the TSC rate to be exported as a data
>> field (useful in its own right), but we also allow a one time write
>> of the MSR during VM creation.  The result is that for the common
>> use case, no protocol change is required to communicate TSC rate
>> to the receiving host.
>>      
> Migration to hosts which do not support the feature can be achieved by
> saving/restoring the TSC rate + flags in a subsection. A subsection
> seems more appropriate than an MSR for this.
>    

Yes, I looked at that, but it looked to me like a subsection was 
intended for an optional feature which MUST be present on the 
destination if the source is using the feature.  This way, newer hosts 
without the feature enabled can migrate to older hosts which do not 
support the feature.

The TSC rate migration is slightly different; we may wish to migrate 
from a host with the TSC rate feature enabled to a host which does not 
support the TSC rate feature.  This is exactly the current behavior, the 
TSC rate will change on that migration, and I wanted to preserve that 
behavior.  I don't advise that mode of usage, but there may be use cases 
for it and it should be decided by policy, not dictated by our feature set.

That said, I'm happy to remove the MSR if we truly don't want to support 
that mode of usage.

Zach

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [KVM TSC trapping / migration 1/2] Add TSC trapping for SVM and VMX
  2011-01-07 11:23   ` Marcelo Tosatti
@ 2011-01-09  8:05     ` Zachary Amsden
  0 siblings, 0 replies; 22+ messages in thread
From: Zachary Amsden @ 2011-01-09  8:05 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm, Avi Kivity, Glauber Costa, linux-kernel

On 01/07/2011 01:23 AM, Marcelo Tosatti wrote:
> On Thu, Jan 06, 2011 at 12:10:44AM -1000, Zachary Amsden wrote:
>    
>> Reasons to trap the TSC are numerous, but we want to avoid it as much
>> as possible for performance reasons.
>>
>> We provide two conservative modes via modules parameters and userspace
>> hinting.  First, the module can be loaded with "tsc_auto=1" as a module
>> parameter, which turns on conservative TSC trapping only when it is
>> required (when unstable TSC or faster KHZ CPU is detected).
>>
>> For userspace hinting, we enable trapping only if necessary.  Userspace
>> can hint that a VM needs a fixed frequency TSC, and also that SMP
>> stability will be required.  In that case, we conservatively turn on
>> trapping when it is needed.  In addition, users may now specify the
>> desired TSC rate at which to run.  If this rate differs significantly
>> from the host rate, trapping will be enabled.
>>
>> There is also an override control to allow TSC trapping to be turned on
>> or off unconditionally for testing.
>>
>> We indicate to pvclock users that the TSC is being trapped, to allow
>> avoiding overhead and directly using RDTSCP (only for SVM).  This
>> optimization is not yet implemented.
>>
>> Signed-off-by: Zachary Amsden<zamsden@redhat.com>
>> ---
>>   arch/x86/include/asm/kvm_host.h    |    6 +-
>>   arch/x86/include/asm/pvclock-abi.h |    1 +
>>   arch/x86/kvm/svm.c                 |   20 ++++++
>>   arch/x86/kvm/vmx.c                 |   21 +++++++
>>   arch/x86/kvm/x86.c                 |  113 +++++++++++++++++++++++++++++++++---
>>   arch/x86/kvm/x86.h                 |    2 +
>>   include/linux/kvm.h                |   15 +++++
>>   7 files changed, 168 insertions(+), 10 deletions(-)
>>      
> - Docs / test case please.
>    

Yes, will do.

> - KVM_TSC_CONTROL ioctl ignores flags field.
>    

Oops.

> - What is the purpose of PVCLOCK_TSC_TRAPPED_BIT?
>    

To allow RDTSCP optimizations for KVM clock when TSC is trapped (because 
a userspace application requires strict TSC).

> - Fail to see purpose of module parameters. Configuration from qemu
>    should be enough?
>    

For users with older versions of qemu who wish to take advantage of the 
feature, or performance / bug testing.  And oops here, the tsc_trap 
should not default to on.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [KVM TSC trapping / migration 1/2] Add TSC trapping for SVM and VMX
  2011-01-06 10:10 ` [KVM TSC trapping / migration 1/2] Add TSC trapping for SVM and VMX Zachary Amsden
                     ` (2 preceding siblings ...)
  2011-01-07 11:23   ` Marcelo Tosatti
@ 2011-01-10 11:52   ` Joerg Roedel
  3 siblings, 0 replies; 22+ messages in thread
From: Joerg Roedel @ 2011-01-10 11:52 UTC (permalink / raw)
  To: Zachary Amsden
  Cc: kvm, Avi Kivity, Marcelo Tosatti, Glauber Costa, linux-kernel

On Thu, Jan 06, 2011 at 12:10:44AM -1000, Zachary Amsden wrote:
> +static void svm_set_tsc_trapping(struct kvm_vcpu *vcpu, bool trap)
> +{
> +	struct vcpu_svm *svm = to_svm(vcpu);
> +	if (trap)
> +		svm->vmcb->control.intercept |= 1ULL << INTERCEPT_RDTSC;
> +	else
> +		svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_RDTSC);
> +}

This needs to update the clean-bits. Please use
set_intercept/clr_intercept instead which already takes care of this.

	Joerg


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [KVM TSC trapping / migration 2/2] Add TSC KHZ MSR
  2011-01-07 20:44     ` Zachary Amsden
@ 2011-01-10 13:50       ` Marcelo Tosatti
  2011-01-14 11:00         ` Juan Quintela
  0 siblings, 1 reply; 22+ messages in thread
From: Marcelo Tosatti @ 2011-01-10 13:50 UTC (permalink / raw)
  To: Zachary Amsden, Juan Quintela
  Cc: kvm, Avi Kivity, Glauber Costa, linux-kernel

On Fri, Jan 07, 2011 at 10:44:20AM -1000, Zachary Amsden wrote:
> On 01/07/2011 12:48 AM, Marcelo Tosatti wrote:
> >On Thu, Jan 06, 2011 at 12:10:45AM -1000, Zachary Amsden wrote:
> >>Use an MSR to allow "soft" migration to hosts which do not support
> >>TSC trapping.  Rather than make this a required element of any
> >>migration protocol, we allow the TSC rate to be exported as a data
> >>field (useful in its own right), but we also allow a one time write
> >>of the MSR during VM creation.  The result is that for the common
> >>use case, no protocol change is required to communicate TSC rate
> >>to the receiving host.
> >Migration to hosts which do not support the feature can be achieved by
> >saving/restoring the TSC rate + flags in a subsection. A subsection
> >seems more appropriate than an MSR for this.
> 
> Yes, I looked at that, but it looked to me like a subsection was
> intended for an optional feature which MUST be present on the
> destination if the source is using the feature.  This way, newer
> hosts without the feature enabled can migrate to older hosts which
> do not support the feature.

Right. But you can use a subsection to achieve the same effect. Just
consider that the source is not using the feature if you want to migrate
to an older host without support for it. Juan, is there a problem to
use subsections in this fashion?

With the MSR scheme, there is no way for management to enforce support
of the feature on the destination (at least not that i can see). And 
you create an MSR that does not exist on real hardware.

> 
> The TSC rate migration is slightly different; we may wish to migrate
> from a host with the TSC rate feature enabled to a host which does
> not support the TSC rate feature.  This is exactly the current
> behavior, the TSC rate will change on that migration, and I wanted
> to preserve that behavior.  I don't advise that mode of usage, but
> there may be use cases for it and it should be decided by policy,
> not dictated by our feature set.
> 
> That said, I'm happy to remove the MSR if we truly don't want to
> support that mode of usage.


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [KVM TSC trapping / migration 2/2] Add TSC KHZ MSR
  2011-01-10 13:50       ` Marcelo Tosatti
@ 2011-01-14 11:00         ` Juan Quintela
  2011-01-18 15:47           ` Zachary Amsden
  0 siblings, 1 reply; 22+ messages in thread
From: Juan Quintela @ 2011-01-14 11:00 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: Zachary Amsden, kvm, Avi Kivity, Glauber Costa, linux-kernel

Marcelo Tosatti <mtosatti@redhat.com> wrote:
> On Fri, Jan 07, 2011 at 10:44:20AM -1000, Zachary Amsden wrote:
>> On 01/07/2011 12:48 AM, Marcelo Tosatti wrote:
>> >On Thu, Jan 06, 2011 at 12:10:45AM -1000, Zachary Amsden wrote:
>> >>Use an MSR to allow "soft" migration to hosts which do not support
>> >>TSC trapping.  Rather than make this a required element of any
>> >>migration protocol, we allow the TSC rate to be exported as a data
>> >>field (useful in its own right), but we also allow a one time write
>> >>of the MSR during VM creation.  The result is that for the common
>> >>use case, no protocol change is required to communicate TSC rate
>> >>to the receiving host.
>> >Migration to hosts which do not support the feature can be achieved by
>> >saving/restoring the TSC rate + flags in a subsection. A subsection
>> >seems more appropriate than an MSR for this.
>> 
>> Yes, I looked at that, but it looked to me like a subsection was
>> intended for an optional feature which MUST be present on the
>> destination if the source is using the feature.  This way, newer
>> hosts without the feature enabled can migrate to older hosts which
>> do not support the feature.
>
> Right. But you can use a subsection to achieve the same effect. Just
> consider that the source is not using the feature if you want to migrate
> to an older host without support for it. Juan, is there a problem to
> use subsections in this fashion?
>
> With the MSR scheme, there is no way for management to enforce support
> of the feature on the destination (at least not that i can see). And 
> you create an MSR that does not exist on real hardware.
>
>> 
>> The TSC rate migration is slightly different; we may wish to migrate
>> from a host with the TSC rate feature enabled to a host which does
>> not support the TSC rate feature.  This is exactly the current
>> behavior, the TSC rate will change on that migration, and I wanted
>> to preserve that behavior.  I don't advise that mode of usage, but
>> there may be use cases for it and it should be decided by policy,
>> not dictated by our feature set.
>> 
>> That said, I'm happy to remove the MSR if we truly don't want to
>> support that mode of usage.

Ok, I chime it late.

We are adding a new MSR to the comunication with userspace.  So far so
good, but this new field, need to be transmited to the "other end" of
the migration.  This means a new field for migration (notice that this
is independtly if this is an MSR or not).

        VMSTATE_UINT64_V(system_time_msr, CPUState, 11),
        VMSTATE_UINT64_V(wall_clock_msr, CPUState, 11),

This are the values that we are sending now.
We are getting now, a new value, the problem is how to migrate it.

Solutions:
- create a new field in a new field in CPUState, and up the version.
  That would make backward migration impossible.
- create a new field, and sent it only it is has been used with a
  subsection.  This makes migration backwards if this was not used.

- but, it appears that there if this features is not "known" on
  destination, we can use the old way to migrate information.

  BIG WARNING HERE: I don't claim to understand how clocks work at all

  There is not a way to convince old qemu/kernels to ignore new fields
  for good reason.  So the only solution here is to encode this new
  vcpu->kvm->arch.virtual_tsc_khz in the two previous fields, in a way
  that is understable for old qemu/new qemu.  old qemu will use old
  method, new qemu will use a new method.

  If there is not a common encoding that will work for both old/new
  method, I can't really think of a way to make things work here :(

And as per the warning, I can't think of a way to encode
"virtual_tsc_khz" into "system_time_msr" and "wall_clock_msr" off-hand.

To make things clearer about optional features, after "lots" of
discussions, it was decided that target of migration will never ignore
anything sent, that means that the only one that can decide not to sent
a feature/value is the "source" of the migration.  There is no way to
express:

- try this method, if you don't know
- try this other second best, ...

and so on.


Later, Juan.






^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [KVM TSC trapping / migration 2/2] Add TSC KHZ MSR
  2011-01-14 11:00         ` Juan Quintela
@ 2011-01-18 15:47           ` Zachary Amsden
  0 siblings, 0 replies; 22+ messages in thread
From: Zachary Amsden @ 2011-01-18 15:47 UTC (permalink / raw)
  To: quintela; +Cc: Marcelo Tosatti, kvm, Avi Kivity, Glauber Costa, linux-kernel

On 01/14/2011 06:00 AM, Juan Quintela wrote:
> Marcelo Tosatti<mtosatti@redhat.com>  wrote:
>    
>> On Fri, Jan 07, 2011 at 10:44:20AM -1000, Zachary Amsden wrote:
>>      
>>> On 01/07/2011 12:48 AM, Marcelo Tosatti wrote:
>>>        
>>>> On Thu, Jan 06, 2011 at 12:10:45AM -1000, Zachary Amsden wrote:
>>>>          
>>>>> Use an MSR to allow "soft" migration to hosts which do not support
>>>>> TSC trapping.  Rather than make this a required element of any
>>>>> migration protocol, we allow the TSC rate to be exported as a data
>>>>> field (useful in its own right), but we also allow a one time write
>>>>> of the MSR during VM creation.  The result is that for the common
>>>>> use case, no protocol change is required to communicate TSC rate
>>>>> to the receiving host.
>>>>>            
>>>> Migration to hosts which do not support the feature can be achieved by
>>>> saving/restoring the TSC rate + flags in a subsection. A subsection
>>>> seems more appropriate than an MSR for this.
>>>>          
>>> Yes, I looked at that, but it looked to me like a subsection was
>>> intended for an optional feature which MUST be present on the
>>> destination if the source is using the feature.  This way, newer
>>> hosts without the feature enabled can migrate to older hosts which
>>> do not support the feature.
>>>        
>> Right. But you can use a subsection to achieve the same effect. Just
>> consider that the source is not using the feature if you want to migrate
>> to an older host without support for it. Juan, is there a problem to
>> use subsections in this fashion?
>>
>> With the MSR scheme, there is no way for management to enforce support
>> of the feature on the destination (at least not that i can see). And
>> you create an MSR that does not exist on real hardware.
>>
>>      
>>> The TSC rate migration is slightly different; we may wish to migrate
>>> from a host with the TSC rate feature enabled to a host which does
>>> not support the TSC rate feature.  This is exactly the current
>>> behavior, the TSC rate will change on that migration, and I wanted
>>> to preserve that behavior.  I don't advise that mode of usage, but
>>> there may be use cases for it and it should be decided by policy,
>>> not dictated by our feature set.
>>>
>>> That said, I'm happy to remove the MSR if we truly don't want to
>>> support that mode of usage.
>>>        
> Ok, I chime it late.
>
> We are adding a new MSR to the comunication with userspace.  So far so
> good, but this new field, need to be transmited to the "other end" of
> the migration.  This means a new field for migration (notice that this
> is independtly if this is an MSR or not).
>
>          VMSTATE_UINT64_V(system_time_msr, CPUState, 11),
>          VMSTATE_UINT64_V(wall_clock_msr, CPUState, 11),
>    

Oh, wow.  I thought the MSRs were sent automatically by qemu based on 
what MSRs the kvm module told it were available.  It looks like EFER and 
STAR and friends are all special cased as part of CPUstate.

So my approach has been doomed from the beginning.

> This are the values that we are sending now.
> We are getting now, a new value, the problem is how to migrate it.
>
> Solutions:
> - create a new field in a new field in CPUState, and up the version.
>    That would make backward migration impossible.
> - create a new field, and sent it only it is has been used with a
>    subsection.  This makes migration backwards if this was not used.
>
> - but, it appears that there if this features is not "known" on
>    destination, we can use the old way to migrate information.
>
>    BIG WARNING HERE: I don't claim to understand how clocks work at all
>
>    There is not a way to convince old qemu/kernels to ignore new fields
>    for good reason.  So the only solution here is to encode this new
>    vcpu->kvm->arch.virtual_tsc_khz in the two previous fields, in a way
>    that is understable for old qemu/new qemu.  old qemu will use old
>    method, new qemu will use a new method.
>
>    If there is not a common encoding that will work for both old/new
>    method, I can't really think of a way to make things work here :(
>
> And as per the warning, I can't think of a way to encode
> "virtual_tsc_khz" into "system_time_msr" and "wall_clock_msr" off-hand.
>
> To make things clearer about optional features, after "lots" of
> discussions, it was decided that target of migration will never ignore
> anything sent, that means that the only one that can decide not to sent
> a feature/value is the "source" of the migration.  There is no way to
> express:
>
> - try this method, if you don't know
> - try this other second best, ...
>    

That decides it then, the feature is migrated in the state it is set if 
it is enabled.

This makes things much simpler all around.

Cheers,

Zach


^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2011-01-18 15:48 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <1294308645-31113-1-git-send-email-zamsden@redhat.com>
2011-01-06 10:10 ` [KVM TSC trapping / migration 1/2] Add TSC trapping for SVM and VMX Zachary Amsden
2011-01-06 10:41   ` Alexander Graf
2011-01-06 11:30     ` Zachary Amsden
2011-01-06 11:38       ` Alexander Graf
2011-01-06 20:24         ` Zachary Amsden
2011-01-06 22:38           ` Alexander Graf
2011-01-07  3:10             ` Zachary Amsden
2011-01-06 11:32   ` Avi Kivity
2011-01-06 20:03     ` Zachary Amsden
2011-01-07 11:23   ` Marcelo Tosatti
2011-01-09  8:05     ` Zachary Amsden
2011-01-10 11:52   ` Joerg Roedel
2011-01-06 10:10 ` [KVM TSC trapping / migration 2/2] Add TSC KHZ MSR Zachary Amsden
2011-01-06 10:34   ` Alexander Graf
2011-01-06 11:27     ` Zachary Amsden
2011-01-06 11:40       ` Alexander Graf
2011-01-06 20:34         ` Zachary Amsden
2011-01-07 10:48   ` Marcelo Tosatti
2011-01-07 20:44     ` Zachary Amsden
2011-01-10 13:50       ` Marcelo Tosatti
2011-01-14 11:00         ` Juan Quintela
2011-01-18 15:47           ` Zachary Amsden

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).