LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH] rdmsr_on_cpu, wrmsr_on_cpu 
@ 2007-01-18 14:45 Alexey Dobriyan
  2007-01-18 22:14 ` H. Peter Anvin
  0 siblings, 1 reply; 6+ messages in thread
From: Alexey Dobriyan @ 2007-01-18 14:45 UTC (permalink / raw)
  To: ak, akpm; +Cc: linux-kernel, davej, devel

There was an OpenVZ-specific bug rendering some cpufreq drivers unusable
on SMP. In short, when cpufreq code thinks it has confined itself to the
needed cpu by means of set_cpus_allowed() to execute rdmsr, some
"virtual cpu" feature can migrate the process anywhere. This triggers
BUG_ON()s and does wrong things in general.

This got fixed by introducing rdmsr_on_cpu and wrmsr_on_cpu, which execute
rdmsr and wrmsr on a given physical cpu by means of
smp_call_function_single().

Dave Jones mentioned cpufreq might not be the only user of rdmsr_on_cpu()
and wrmsr_on_cpu(), so I'm going to put them into arch/i386/lib/
(after the patch gets some more testing other than compile and UP run).

Does this look OK?


 arch/i386/kernel/cpu/cpufreq/p4-clockmod.c |   30 ++----------
 arch/i386/lib/Makefile                     |    2
 arch/i386/lib/msr-on-cpu.c                 |   70 +++++++++++++++++++++++++++++
 include/asm-i386/msr.h                     |    3 +
 4 files changed, 81 insertions(+), 24 deletions(-)

--- a/arch/i386/lib/Makefile
+++ b/arch/i386/lib/Makefile
@@ -7,3 +7,5 @@ lib-y = checksum.o delay.o usercopy.o ge
 	bitops.o semaphore.o
 
 lib-$(CONFIG_X86_USE_3DNOW) += mmx.o
+
+obj-y = msr-on-cpu.o
--- /dev/null
+++ b/arch/i386/lib/msr-on-cpu.c
@@ -0,0 +1,70 @@
+#include <linux/module.h>
+#include <linux/preempt.h>
+#include <linux/smp.h>
+#include <asm/msr.h>
+
+#ifdef CONFIG_SMP
+struct msr_info {
+	u32 msr_no;
+	u32 l, h;
+};
+
+static void __rdmsr_on_cpu(void *info)
+{
+	struct msr_info *rv = info;
+
+	rdmsr(rv->msr_no, rv->l, rv->h);
+}
+
+void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
+{
+	preempt_disable();
+	if (smp_processor_id() == cpu)
+		rdmsr(msr_no, *l, *h);
+	else {
+		struct msr_info rv;
+
+		rv.msr_no = msr_no;
+		smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 0, 1);
+		*l = rv.l;
+		*h = rv.h;
+	}
+	preempt_enable();
+}
+
+static void __wrmsr_on_cpu(void *info)
+{
+	struct msr_info *rv = info;
+
+	wrmsr(rv->msr_no, rv->l, rv->h);
+}
+
+void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+{
+	preempt_disable();
+	if (smp_processor_id() == cpu)
+		wrmsr(msr_no, l, h);
+	else {
+		struct msr_info rv;
+
+		rv.msr_no = msr_no;
+		rv.l = l;
+		rv.h = h;
+		smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 0, 1);
+	}
+	preempt_enable();
+}
+#else
+void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
+{
+	rdmsr(msr_no, *l, *h);
+}
+
+void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+{
+	wrmsr(msr_no, l, h);
+}
+#endif
+
+EXPORT_SYMBOL(rdmsr_on_cpu);
+EXPORT_SYMBOL(wrmsr_on_cpu);
--- a/include/asm-i386/msr.h
+++ b/include/asm-i386/msr.h
@@ -83,6 +83,9 @@ #define rdpmc(counter,low,high) \
 			  : "c" (counter))
 #endif	/* !CONFIG_PARAVIRT */
 
+void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
+void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
+
 /* symbolic names for some interesting MSRs */
 /* Intel defined MSRs. */
 #define MSR_IA32_P5_MC_ADDR		0
--- a/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c
@@ -63,7 +63,7 @@ static int cpufreq_p4_setdc(unsigned int
 	if (!cpu_online(cpu) || (newstate > DC_DISABLE) || (newstate == DC_RESV))
 		return -EINVAL;
 
-	rdmsr(MSR_IA32_THERM_STATUS, l, h);
+	rdmsr_on_cpu(cpu, MSR_IA32_THERM_STATUS, &l, &h);
 
 	if (l & 0x01)
 		dprintk("CPU#%d currently thermal throttled\n", cpu);
@@ -71,10 +71,10 @@ static int cpufreq_p4_setdc(unsigned int
 	if (has_N44_O17_errata[cpu] && (newstate == DC_25PT || newstate == DC_DFLT))
 		newstate = DC_38PT;
 
-	rdmsr(MSR_IA32_THERM_CONTROL, l, h);
+	rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h);
 	if (newstate == DC_DISABLE) {
 		dprintk("CPU#%d disabling modulation\n", cpu);
-		wrmsr(MSR_IA32_THERM_CONTROL, l & ~(1<<4), h);
+		wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l & ~(1<<4), h);
 	} else {
 		dprintk("CPU#%d setting duty cycle to %d%%\n",
 			cpu, ((125 * newstate) / 10));
@@ -85,7 +85,7 @@ static int cpufreq_p4_setdc(unsigned int
 		 */
 		l = (l & ~14);
 		l = l | (1<<4) | ((newstate & 0x7)<<1);
-		wrmsr(MSR_IA32_THERM_CONTROL, l, h);
+		wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l, h);
 	}
 
 	return 0;
@@ -112,7 +112,6 @@ static int cpufreq_p4_target(struct cpuf
 {
 	unsigned int    newstate = DC_RESV;
 	struct cpufreq_freqs freqs;
-	cpumask_t cpus_allowed;
 	int i;
 
 	if (cpufreq_frequency_table_target(policy, &p4clockmod_table[0], target_freq, relation, &newstate))
@@ -133,17 +132,8 @@ static int cpufreq_p4_target(struct cpuf
 	/* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software
 	 * Developer's Manual, Volume 3
 	 */
-	cpus_allowed = current->cpus_allowed;
-
-	for_each_cpu_mask(i, policy->cpus) {
-		cpumask_t this_cpu = cpumask_of_cpu(i);
-
-		set_cpus_allowed(current, this_cpu);
-		BUG_ON(smp_processor_id() != i);
-
+	for_each_cpu_mask(i, policy->cpus)
 		cpufreq_p4_setdc(i, p4clockmod_table[newstate].index);
-	}
-	set_cpus_allowed(current, cpus_allowed);
 
 	/* notifiers */
 	for_each_cpu_mask(i, policy->cpus) {
@@ -265,17 +255,9 @@ static int cpufreq_p4_cpu_exit(struct cp
 
 static unsigned int cpufreq_p4_get(unsigned int cpu)
 {
-	cpumask_t cpus_allowed;
 	u32 l, h;
 
-	cpus_allowed = current->cpus_allowed;
-
-	set_cpus_allowed(current, cpumask_of_cpu(cpu));
-	BUG_ON(smp_processor_id() != cpu);
-
-	rdmsr(MSR_IA32_THERM_CONTROL, l, h);
-
-	set_cpus_allowed(current, cpus_allowed);
+	rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h);
 
 	if (l & 0x10) {
 		l = l >> 1;


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] rdmsr_on_cpu, wrmsr_on_cpu
  2007-01-18 14:45 [PATCH] rdmsr_on_cpu, wrmsr_on_cpu Alexey Dobriyan
@ 2007-01-18 22:14 ` H. Peter Anvin
  2007-01-18 23:21   ` Andi Kleen
  0 siblings, 1 reply; 6+ messages in thread
From: H. Peter Anvin @ 2007-01-18 22:14 UTC (permalink / raw)
  To: Alexey Dobriyan; +Cc: ak, akpm, linux-kernel, davej, devel

Alexey Dobriyan wrote:
> There was OpenVZ specific bug rendering some cpufreq drivers unusable
> on SMP. In short, when cpufreq code thinks it confined itself to
> needed cpu by means of set_cpus_allowed() to execute rdmsr, some
> "virtual cpu" feature can migrate process to anywhere. This triggers
> bugons and does wrong things in general.
> 
> This got fixed by introducing rdmsr_on_cpu and wrmsr_on_cpu executing
> rdmsr and wrmsr on given physical cpu by means of
> smp_call_function_single().
> 
> Dave Jones mentioned cpufreq might not be the only user of rdmsr_on_cpu()
> and wrmsr_on_cpu(), so I'm going to put them into arch/i386/lib/
> (after the patch gets some more testing other than compile and UP run).

The CPUID and MSR drivers need something like this.

HOWEVER -- and this is where things get gnarly -- the CPUID and MSR 
drivers would really like to be able to execute CPUID, WRMSR and RDMSR 
with the entire GPR register set (except the stack pointer) pre-set and 
post-captured, since it's highly likely that there are going to be 
nonstandard MSRs and CPUID levels (already witness Intel breaking the 
CPUID architecture by introducing %ecx dependencies.)

So I would like to see:

/* It probably makes sense to use the same structure on x86 and
    x86-64 */
struct x86_gpr_regs {
	u64 rax, rcx, rdx, rbx;
	u64 rsp, rbp, rsi, rdi;
	u64 r8, r9, r10, r11;
	u64 r12, r13, r14, r15;
};

void rdmsr_on_cpu(unsigned cpu,
	const struct x86_gpr_regs *in, struct x86_gpr_regs *out);
void wrmsr_on_cpu(unsigned cpu,
	const struct x86_gpr_regs *in, struct x86_gpr_regs *out);
void cpuid_on_cpu(unsigned cpu,
	const struct x86_gpr_regs *in, struct x86_gpr_regs *out);

This requires assembly to do in the nonparavirtualized case, of course. 
  I'll try to get that written up in the next day or so.

	-hpa

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] rdmsr_on_cpu, wrmsr_on_cpu
  2007-01-18 22:14 ` H. Peter Anvin
@ 2007-01-18 23:21   ` Andi Kleen
  2007-01-18 23:40     ` H. Peter Anvin
  0 siblings, 1 reply; 6+ messages in thread
From: Andi Kleen @ 2007-01-18 23:21 UTC (permalink / raw)
  To: H. Peter Anvin; +Cc: Alexey Dobriyan, akpm, linux-kernel, davej, devel


> HOWEVER -- and this is where things get gnarly -- the CPUID and MSR
> drivers would really like to be able to execute CPUID, WRMSR and RDMSR
> with the entire GPR register set (except the stack pointer) pre-set and
> post-captured, since it's highly likely that there are going to be
> nonstandard MSRs and CPUID levels (already witness Intel breaking the
> CPUID architecture by introducing %ecx dependencies.)

That looks like such a specialized requirement that I would suggest 
you keep that in the drivers. The interface for most users would be just
too ugly

-Andi


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] rdmsr_on_cpu, wrmsr_on_cpu
  2007-01-18 23:21   ` Andi Kleen
@ 2007-01-18 23:40     ` H. Peter Anvin
  2007-01-19  0:40       ` Andi Kleen
  0 siblings, 1 reply; 6+ messages in thread
From: H. Peter Anvin @ 2007-01-18 23:40 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Alexey Dobriyan, akpm, linux-kernel, davej, devel

Andi Kleen wrote:
>> HOWEVER -- and this is where things get gnarly -- the CPUID and MSR
>> drivers would really like to be able to execute CPUID, WRMSR and RDMSR
>> with the entire GPR register set (except the stack pointer) pre-set and
>> post-captured, since it's highly likely that there are going to be
>> nonstandard MSRs and CPUID levels (already witness Intel breaking the
>> CPUID architecture by introducing %ecx dependencies.)
> 
> That looks like such a specialized requirement that I would suggest 
> you keep that in the drivers. The interface for most users would be just
> too ugly
> 

It would, but rather than having the paravirtualization interfaces 
duplicate out of control, we could/should implement the less generic 
features in terms of the more generic, above the pvz layer.

	-hpa

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] rdmsr_on_cpu, wrmsr_on_cpu
  2007-01-18 23:40     ` H. Peter Anvin
@ 2007-01-19  0:40       ` Andi Kleen
  2007-01-19  0:45         ` H. Peter Anvin
  0 siblings, 1 reply; 6+ messages in thread
From: Andi Kleen @ 2007-01-19  0:40 UTC (permalink / raw)
  To: H. Peter Anvin; +Cc: Alexey Dobriyan, akpm, linux-kernel, davej, devel

On Friday 19 January 2007 10:40, H. Peter Anvin wrote:

> It would, but rather than having the paravirtualization interfaces
> duplicate out of control, we could/should implement the less generic
> features in terms of the more generic, above the pvz layer.

I can't see any hypervisors ever allowing those weird MSRs, so
for paravirtualization it is probably better to just disable them.

-Andi

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] rdmsr_on_cpu, wrmsr_on_cpu
  2007-01-19  0:40       ` Andi Kleen
@ 2007-01-19  0:45         ` H. Peter Anvin
  0 siblings, 0 replies; 6+ messages in thread
From: H. Peter Anvin @ 2007-01-19  0:45 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Alexey Dobriyan, akpm, linux-kernel, davej, devel

Andi Kleen wrote:
> On Friday 19 January 2007 10:40, H. Peter Anvin wrote:
> 
>> It would, but rather than having the paravirtualization interfaces
>> duplicate out of control, we could/should implement the less generic
>> features in terms of the more generic, above the pvz layer.
> 
> I can't see any Hypervisors ever allowing those weird MSRs, so
> for paravirtualization it is probably better to just disable them.
> 

Don't assume they're going to be "weird."  Intel, in particular, is 
notorious for forgetting what they have already documented as architectural.

	-hpa

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2007-01-19  0:45 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-01-18 14:45 [PATCH] rdmsr_on_cpu, wrmsr_on_cpu Alexey Dobriyan
2007-01-18 22:14 ` H. Peter Anvin
2007-01-18 23:21   ` Andi Kleen
2007-01-18 23:40     ` H. Peter Anvin
2007-01-19  0:40       ` Andi Kleen
2007-01-19  0:45         ` H. Peter Anvin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).