LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH 0/4] improve alternative instruction code and optimize get_cycles_sync
@ 2007-02-28 14:05 Joerg Roedel
  2007-02-28 14:14 ` [PATCH 1/4] i386: extend alternative instructions framework Joerg Roedel
                   ` (3 more replies)
  0 siblings, 4 replies; 7+ messages in thread
From: Joerg Roedel @ 2007-02-28 14:05 UTC (permalink / raw)
  To: discuss; +Cc: Andi Kleen, linux-kernel

This series of patches extend the alternative instructions framework on
i386 and x86_64 architectures to support two alternative instruction
replacements. This code is used together with the introduction of the
X86_FEATURE_SYNC_RDTSC flag on i386 to simplify and optimize the
get_cycles_sync() function. The optimization changes this function to
use RDTSCP instead of CPUID;RDTSC if this instruction is available.
Don't use CPUID there is really important if the kernel runs as a KVM
guest, because this instruction is intercepted and causes an expensive
VMEXIT.

-- 
Joerg Roedel
Operating System Research Center
AMD Saxony LLC & Co. KG



^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 1/4] i386: extend alternative instructions framework
  2007-02-28 14:05 [PATCH 0/4] improve alternative instruction code and optimize get_cycles_sync Joerg Roedel
@ 2007-02-28 14:14 ` Joerg Roedel
  2007-02-28 14:18 ` [PATCH 2/4] x86_64: changes to x86_64 architecture for alternative instruction improvements Joerg Roedel
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 7+ messages in thread
From: Joerg Roedel @ 2007-02-28 14:14 UTC (permalink / raw)
  To: discuss; +Cc: Andi Kleen, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 267 bytes --]

From: Joerg Roedel <joerg.roedel@amd.com>

This patch extends the alternative instructions framework to support 2
alternative instructions.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

-- 
Joerg Roedel
Operating System Research Center
AMD Saxony LLC & Co. KG

[-- Attachment #2: two-alternatives-i386.patch --]
[-- Type: text/plain, Size: 5024 bytes --]

diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c
index 9eca21b..59f1770 100644
--- a/arch/i386/kernel/alternative.c
+++ b/arch/i386/kernel/alternative.c
@@ -153,14 +153,23 @@ extern u8 __smp_alt_begin[], __smp_alt_end[];
 void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
 {
 	struct alt_instr *a;
-	u8 *instr;
+	u8 *instr, *replacement;
+	u8 replacementlen;
 	int diff;
 
 	DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end);
 	for (a = start; a < end; a++) {
-		BUG_ON(a->replacementlen > a->instrlen);
-		if (!boot_cpu_has(a->cpuid))
+		if (boot_cpu_has(a->cpuid)) {
+			replacement = a->replacement;
+			replacementlen = a->replacementlen;
+		} else if ((a->replacementlen2 > 0) &&
+			   (boot_cpu_has(a->cpuid2))) {
+			replacement = a->replacement2;
+			replacementlen = a->replacementlen2;
+		} else
 			continue;
+
+		BUG_ON(replacementlen > a->instrlen);
 		instr = a->instr;
 #ifdef CONFIG_X86_64
 		/* vsyscall code is not mapped yet. resolve it manually. */
@@ -170,9 +179,9 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
 				__FUNCTION__, a->instr, instr);
 		}
 #endif
-		memcpy(instr, a->replacement, a->replacementlen);
-		diff = a->instrlen - a->replacementlen;
-		nop_out(instr + a->replacementlen, diff);
+		memcpy(instr, replacement, replacementlen);
+		diff = a->instrlen - replacementlen;
+		nop_out(instr + replacementlen, diff);
 	}
 }
 
diff --git a/include/asm-i386/alternative.h b/include/asm-i386/alternative.h
index b8fa955..4a77e93 100644
--- a/include/asm-i386/alternative.h
+++ b/include/asm-i386/alternative.h
@@ -10,11 +10,14 @@
 struct alt_instr {
 	u8 *instr; 		/* original instruction */
 	u8 *replacement;
+	u8 *replacement2;
 	u8  cpuid;		/* cpuid bit set for replacement */
+	u8  cpuid2;		/* cpuid bit set for replacement2 */
 	u8  instrlen;		/* length of original instruction */
 	u8  replacementlen; 	/* length of new instruction, <= instrlen */
-	u8  pad;
-};
+	u8  replacementlen2;
+	u8  pad[3];
+}  __attribute__ ((packed));
 
 extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
 
@@ -36,6 +39,12 @@ static inline void alternatives_smp_switch(int smp) {}
 #endif
 
 /*
+ * use this macro(s) if you need more than one output parameter
+ * in alternative_io_*
+ */
+#define ASM_OUTPUT2(a, b) a, b
+
+/*
  * Alternative instructions for different CPU types or capabilities.
  *
  * This allows to use optimized instructions even on generic binary
@@ -53,9 +62,12 @@ static inline void alternatives_smp_switch(int smp) {}
 		      "  .align 4\n"					\
 		      "  .long 661b\n"            /* label */		\
 		      "  .long 663f\n"		  /* new instruction */	\
+		      "  .long 0x00\n"					\
 		      "  .byte %c0\n"             /* feature bit */	\
+		      "  .byte 0x00\n"					\
 		      "  .byte 662b-661b\n"       /* sourcelen */	\
 		      "  .byte 664f-663f\n"       /* replacementlen */	\
+		      "  .byte 0x00\n"					\
 		      ".previous\n"					\
 		      ".section .altinstr_replacement,\"ax\"\n"		\
 		      "663:\n\t" newinstr "\n664:\n"   /* replacement */\
@@ -77,14 +89,38 @@ static inline void alternatives_smp_switch(int smp) {}
 		      "  .align 4\n"					\
 		      "  .long 661b\n"            /* label */		\
 		      "  .long 663f\n"		  /* new instruction */ \
+		      "  .long 0x00\n"					\
 		      "  .byte %c0\n"             /* feature bit */	\
+		      "  .byte 0x00\n"					\
 		      "  .byte 662b-661b\n"       /* sourcelen */	\
 		      "  .byte 664f-663f\n"       /* replacementlen */ 	\
+		      "  .byte 0x00\n"					\
 		      ".previous\n"					\
 		      ".section .altinstr_replacement,\"ax\"\n"		\
 		      "663:\n\t" newinstr "\n664:\n"   /* replacement */\
 		      ".previous" :: "i" (feature), ##input)
 
+/* Like alternative_io, but supports 2 possible alternatives */
+#define alternative_io_two(oldinstr, newinstr, feat, newinstr2, feat2,\
+		output, input...) \
+	asm volatile ("661:\n\t" oldinstr "\n662:\n"                    \
+		      ".section .altinstructions,\"a\"\n"               \
+		      "  .align 4\n"                                    \
+		      "  .long 661b\n"            /* label */           \
+		      "  .long 663f\n"            /* new instruction */ \
+		      "  .long 665f\n"            /* new instruction 2 */\
+		      "  .byte %c[f]\n"        /* feature bit */        \
+		      "  .byte %c[f2]\n"          /* feature bit 2*/    \
+		      "  .byte 662b-661b\n"       /* sourcelen */       \
+		      "  .byte 664f-663f\n"       /* replacementlen */  \
+		      "  .byte 666f-665f\n"       /* replacementlen 2 */\
+		      ".previous\n"                                     \
+		      ".section .altinstr_replacement,\"ax\"\n"         \
+		      "663:\n\t" newinstr "\n664:\n"  /* replacement */ \
+		      "665:\n\t" newinstr2 "\n666:\n" /* replacement2 */ \
+		      ".previous" : output : [f] "i" (feat), \
+		      [f2] "i" (feat2),##input)
+
 /*
  * Alternative inline assembly for SMP.
  *

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 2/4] x86_64: changes to x86_64 architecture for alternative instruction improvements
  2007-02-28 14:05 [PATCH 0/4] improve alternative instruction code and optimize get_cycles_sync Joerg Roedel
  2007-02-28 14:14 ` [PATCH 1/4] i386: extend alternative instructions framework Joerg Roedel
@ 2007-02-28 14:18 ` Joerg Roedel
  2007-02-28 14:21 ` [PATCH 3/4] i386: add the X86_FEATURE_SYNC_RDTSC flag Joerg Roedel
  2007-02-28 14:25 ` [PATCH 4/4] optimize and simplify get_cycles_sync() Joerg Roedel
  3 siblings, 0 replies; 7+ messages in thread
From: Joerg Roedel @ 2007-02-28 14:18 UTC (permalink / raw)
  To: discuss; +Cc: Andi Kleen, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 277 bytes --]

From: Joerg Roedel <joerg.roedel@amd.com>

In this patch updates the x86_64 architecture to work with the changes
to alternative instructions in i386

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

-- 
Joerg Roedel
Operating System Research Center
AMD Saxony LLC & Co. KG

[-- Attachment #2: two-alternatives-x86_64.patch --]
[-- Type: text/plain, Size: 5805 bytes --]

diff --git a/arch/x86_64/lib/clear_page.S b/arch/x86_64/lib/clear_page.S
index 9a10a78..ab525ee 100644
--- a/arch/x86_64/lib/clear_page.S
+++ b/arch/x86_64/lib/clear_page.S
@@ -53,7 +53,10 @@ ENDPROC(clear_page)
 	.align 8
 	.quad clear_page
 	.quad 1b
+	.quad 0
 	.byte X86_FEATURE_REP_GOOD
+	.byte 0
 	.byte .Lclear_page_end - clear_page
 	.byte 2b - 1b
+	.byte 0
 	.previous
diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S
index 727a5d4..b4d0329 100644
--- a/arch/x86_64/lib/copy_page.S
+++ b/arch/x86_64/lib/copy_page.S
@@ -113,7 +113,10 @@ ENDPROC(copy_page)
 	.align 8
 	.quad copy_page
 	.quad 1b
+	.quad 0
 	.byte X86_FEATURE_REP_GOOD
+	.byte 0
 	.byte .Lcopy_page_end - copy_page
 	.byte 2b - 1b
+	.byte 0
 	.previous
diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S
index 70bebd3..d505df3 100644
--- a/arch/x86_64/lib/copy_user.S
+++ b/arch/x86_64/lib/copy_user.S
@@ -27,9 +27,12 @@
 	.align 8
 	.quad  0b
 	.quad  2b
+	.quad  0
 	.byte  \feature		     /* when feature is set */
+	.byte  0
 	.byte  5
 	.byte  5
+	.byte  0
 	.previous
 	.endm
 
diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S
index 0ea0ddc..b1e1686 100644
--- a/arch/x86_64/lib/memcpy.S
+++ b/arch/x86_64/lib/memcpy.S
@@ -123,7 +123,10 @@ ENDPROC(__memcpy)
 	.align 8
 	.quad memcpy
 	.quad 1b
+	.quad 0
 	.byte X86_FEATURE_REP_GOOD
+	.byte 0
 	.byte .Lfinal - memcpy
 	.byte 2b - 1b
+	.byte 0
 	.previous
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S
index 2c59481..566e179 100644
--- a/arch/x86_64/lib/memset.S
+++ b/arch/x86_64/lib/memset.S
@@ -127,7 +127,10 @@ ENDPROC(__memset)
 	.align 8
 	.quad memset
 	.quad 1b
+	.quad 0
 	.byte X86_FEATURE_REP_GOOD
+	.byte 0
 	.byte .Lfinal - memset
 	.byte 2b - 1b
+	.byte 0
 	.previous
diff --git a/include/asm-x86_64/alternative.h b/include/asm-x86_64/alternative.h
index a6657b4..63cd8e5 100644
--- a/include/asm-x86_64/alternative.h
+++ b/include/asm-x86_64/alternative.h
@@ -10,11 +10,14 @@
 struct alt_instr {
 	u8 *instr; 		/* original instruction */
 	u8 *replacement;
+	u8 *replacement2;
 	u8  cpuid;		/* cpuid bit set for replacement */
+	u8  cpuid2;		/* cpuid bit set for replacement2 */
 	u8  instrlen;		/* length of original instruction */
 	u8  replacementlen; 	/* length of new instruction, <= instrlen */
-	u8  pad[5];
-};
+	u8  replacementlen2;
+	u8  pad[3];
+} __attribute__ ((packed));
 
 extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
 
@@ -36,6 +39,12 @@ static inline void alternatives_smp_switch(int smp) {}
 
 #endif
 
+/*
+ * use this macro(s) if you need more than one output parameter
+ * in alternative_io_*
+ */
+#define ASM_OUTPUT2(a, b) a, b
+
 /*
  * Alternative instructions for different CPU types or capabilities.
  *
@@ -54,9 +63,12 @@ static inline void alternatives_smp_switch(int smp) {}
 		      "  .align 8\n"				       \
 		      "  .quad 661b\n"            /* label */          \
 		      "  .quad 663f\n"		  /* new instruction */ \
+		      "  .quad 0x00\n"					\
 		      "  .byte %c0\n"             /* feature bit */    \
+		      "  .byte 0x00\n"					\
 		      "  .byte 662b-661b\n"       /* sourcelen */      \
 		      "  .byte 664f-663f\n"       /* replacementlen */ \
+		      "  .byte 0x00\n"					\
 		      ".previous\n"					\
 		      ".section .altinstr_replacement,\"ax\"\n"		\
 		      "663:\n\t" newinstr "\n664:\n"   /* replacement */ \
@@ -78,9 +90,12 @@ static inline void alternatives_smp_switch(int smp) {}
 		      "  .align 8\n"					\
 		      "  .quad 661b\n"            /* label */		\
 		      "  .quad 663f\n"		  /* new instruction */	\
+		      "  .quad 0x00\n"					\
 		      "  .byte %c0\n"             /* feature bit */	\
+		      "  .byte 0x00\n"					\
 		      "  .byte 662b-661b\n"       /* sourcelen */	\
 		      "  .byte 664f-663f\n"       /* replacementlen */	\
+		      "  .byte 0x00\n"					\
 		      ".previous\n"					\
 		      ".section .altinstr_replacement,\"ax\"\n"		\
 		      "663:\n\t" newinstr "\n664:\n"   /* replacement */ \
@@ -93,14 +108,37 @@ static inline void alternatives_smp_switch(int smp) {}
 		      "  .align 8\n"					\
 		      "  .quad 661b\n"            /* label */		\
 		      "  .quad 663f\n"		  /* new instruction */	\
+		      "  .quad 0x00\n"					\
 		      "  .byte %c[feat]\n"        /* feature bit */	\
+		      "  .byte 0x00\n"					\
 		      "  .byte 662b-661b\n"       /* sourcelen */	\
 		      "  .byte 664f-663f\n"       /* replacementlen */	\
+		      "  .byte 0x00\n"					\
 		      ".previous\n"					\
 		      ".section .altinstr_replacement,\"ax\"\n"		\
 		      "663:\n\t" newinstr "\n664:\n"   /* replacement */ \
 		      ".previous" : output : [feat] "i" (feature), ##input)
 
+/* Like alternative_io, but supports 2 possible alternatives */
+#define alternative_io_two(oldinstr, newinstr, feat, newinstr2, feat2,\
+		output, input...) \
+	asm volatile ("661:\n\t" oldinstr "\n662:\n"			\
+		      ".section .altinstructions,\"a\"\n"		\
+		      "  .align 8\n"					\
+		      "  .quad 661b\n"            /* label */		\
+		      "  .quad 663f\n"		  /* new instruction */	\
+		      "  .quad 665f\n"		  /* new instruction 2 */\
+		      "  .byte %c[f]\n"        /* feature bit */	\
+		      "  .byte %c[f2]\n"	  /* feature bit 2*/	\
+		      "  .byte 662b-661b\n"       /* sourcelen */	\
+		      "  .byte 664f-663f\n"       /* replacementlen */	\
+		      "  .byte 666f-665f\n"	  /* replacementlen 2 */\
+		      ".previous\n"					\
+		      ".section .altinstr_replacement,\"ax\"\n"		\
+		      "663:\n\t" newinstr "\n664:\n"  /* replacement */ \
+		      "665:\n\t" newinstr2 "\n666:\n" /* replacement2 */ \
+		      ".previous" : output : [f] "i" (feat), \
+			[f2] "i" (feat2),##input)
 /*
  * Alternative inline assembly for SMP.
  *

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 3/4] i386: add the X86_FEATURE_SYNC_RDTSC flag
  2007-02-28 14:05 [PATCH 0/4] improve alternative instruction code and optimize get_cycles_sync Joerg Roedel
  2007-02-28 14:14 ` [PATCH 1/4] i386: extend alternative instructions framework Joerg Roedel
  2007-02-28 14:18 ` [PATCH 2/4] x86_64: changes to x86_64 architecture for alternative instruction improvements Joerg Roedel
@ 2007-02-28 14:21 ` Joerg Roedel
  2007-02-28 14:25 ` [PATCH 4/4] optimize and simplify get_cycles_sync() Joerg Roedel
  3 siblings, 0 replies; 7+ messages in thread
From: Joerg Roedel @ 2007-02-28 14:21 UTC (permalink / raw)
  To: discuss; +Cc: Andi Kleen, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 337 bytes --]

From: Joerg Roedel <joerg.roedel@amd.com>

This patch adds the  X86_FEATURE_SYNC_RDTSC to the i386 architecture.
This is very helpfull to simplify the get_cycles_sync() function and
remove the #ifdefs from it.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

-- 
Joerg Roedel
Operating System Research Center
AMD Saxony LLC & Co. KG

[-- Attachment #2: i386-x86_sync_rdtsc.patch --]
[-- Type: text/plain, Size: 1673 bytes --]

diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c
index 41cfea5..11f5730 100644
--- a/arch/i386/kernel/cpu/amd.c
+++ b/arch/i386/kernel/cpu/amd.c
@@ -241,6 +241,8 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 
 	if (cpuid_eax(0x80000000) >= 0x80000006)
 		num_cache_leaves = 3;
+
+	clear_bit(X86_FEATURE_SYNC_RDTSC, c->x86_capability);
 }
 
 static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size)
diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c
index 56fe265..403a495 100644
--- a/arch/i386/kernel/cpu/intel.c
+++ b/arch/i386/kernel/cpu/intel.c
@@ -188,8 +188,11 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 	}
 #endif
 
-	if (c->x86 == 15)
+	if (c->x86 == 15) {
 		set_bit(X86_FEATURE_P4, c->x86_capability);
+		set_bit(X86_FEATURE_SYNC_RDTSC, c->x86_capability);
+	} else
+		clear_bit(X86_FEATURE_SYNC_RDTSC, c->x86_capability);
 	if (c->x86 == 6) 
 		set_bit(X86_FEATURE_P3, c->x86_capability);
 	if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
diff --git a/include/asm-i386/cpufeature.h b/include/asm-i386/cpufeature.h
index 3f92b94..a9f1f01 100644
--- a/include/asm-i386/cpufeature.h
+++ b/include/asm-i386/cpufeature.h
@@ -75,6 +76,7 @@
 #define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */
 #define X86_FEATURE_PEBS	(3*32+12)  /* Precise-Event Based Sampling */
 #define X86_FEATURE_BTS		(3*32+13)  /* Branch Trace Store */
+#define X86_FEATURE_SYNC_RDTSC  (3*32+14)  /* RDTSC is serializing */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3	(4*32+ 0) /* Streaming SIMD Extensions-3 */

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 4/4] optimize and simplify get_cycles_sync()
  2007-02-28 14:05 [PATCH 0/4] improve alternative instruction code and optimize get_cycles_sync Joerg Roedel
                   ` (2 preceding siblings ...)
  2007-02-28 14:21 ` [PATCH 3/4] i386: add the X86_FEATURE_SYNC_RDTSC flag Joerg Roedel
@ 2007-02-28 14:25 ` Joerg Roedel
  2007-03-07  0:25   ` Andrew Morton
  3 siblings, 1 reply; 7+ messages in thread
From: Joerg Roedel @ 2007-02-28 14:25 UTC (permalink / raw)
  To: discuss; +Cc: Andi Kleen, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 518 bytes --]

From: Joerg Roedel <joerg.roedel@amd.com>

This patch simplifies the get_cycles_sync() function by removing the
#ifdefs from it. Further it introduces an optimization for AMD
processors. There the RDTSCP instruction is used instead of CPUID;RDTSC
which is helpfull if the kernel runs as a KVM guest. Running as a guest
makes CPUID very expensive because it causes an intercept of the guest.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

-- 
Joerg Roedel
Operating System Research Center
AMD Saxony LLC & Co. KG

[-- Attachment #2: optimized-get_cycles_sync.patch --]
[-- Type: text/plain, Size: 1605 bytes --]

diff --git a/include/asm-i386/cpufeature.h b/include/asm-i386/cpufeature.h
index 3f92b94..a9f1f01 100644
--- a/include/asm-i386/cpufeature.h
+++ b/include/asm-i386/cpufeature.h
@@ -49,6 +49,7 @@
 #define X86_FEATURE_MP		(1*32+19) /* MP Capable. */
 #define X86_FEATURE_NX		(1*32+20) /* Execute Disable */
 #define X86_FEATURE_MMXEXT	(1*32+22) /* AMD MMX extensions */
+#define X86_FEATURE_RDTSCP      (1*32+27) /* RDTSCP */
 #define X86_FEATURE_LM		(1*32+29) /* Long Mode (x86-64) */
 #define X86_FEATURE_3DNOWEXT	(1*32+30) /* AMD 3DNow! extensions */
 #define X86_FEATURE_3DNOW	(1*32+31) /* 3DNow! */
diff --git a/include/asm-x86_64/tsc.h b/include/asm-x86_64/tsc.h
index 9a0a368..05df3f6 100644
--- a/include/asm-x86_64/tsc.h
+++ b/include/asm-x86_64/tsc.h
@@ -34,22 +34,15 @@ static inline cycles_t get_cycles(void)
 /* Like get_cycles, but make sure the CPU is synchronized. */
 static __always_inline cycles_t get_cycles_sync(void)
 {
-	unsigned long long ret;
-#ifdef X86_FEATURE_SYNC_RDTSC
-	unsigned eax;
+	unsigned int a, d;
 
-	/*
-	 * Don't do an additional sync on CPUs where we know
-	 * RDTSC is already synchronous:
-	 */
-	alternative_io("cpuid", ASM_NOP2, X86_FEATURE_SYNC_RDTSC,
-			  "=a" (eax), "0" (1) : "ebx","ecx","edx","memory");
-#else
-	sync_core();
-#endif
-	rdtscll(ret);
+	alternative_io_two("cpuid\nrdtsc",
+			   "rdtsc", X86_FEATURE_SYNC_RDTSC,
+			   "rdtscp", X86_FEATURE_RDTSCP,
+			   ASM_OUTPUT2("=a" (a), "=d" (d)),
+			   "0" (1) : "ecx", "memory");
 
-	return ret;
+	return ((unsigned long long)a) | (((unsigned long long)d)<<32);
 }
 
 extern void tsc_init(void);

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 4/4] optimize and simplify get_cycles_sync()
  2007-02-28 14:25 ` [PATCH 4/4] optimize and simplify get_cycles_sync() Joerg Roedel
@ 2007-03-07  0:25   ` Andrew Morton
  2007-03-09  9:57     ` Joerg Roedel
  0 siblings, 1 reply; 7+ messages in thread
From: Andrew Morton @ 2007-03-07  0:25 UTC (permalink / raw)
  To: Joerg Roedel; +Cc: discuss, Andi Kleen, linux-kernel

On Wed, 28 Feb 2007 15:25:54 +0100
"Joerg Roedel" <joerg.roedel@amd.com> wrote:

> From: Joerg Roedel <joerg.roedel@amd.com>
> 
> This patch simplifies the get_cycles_sync() function by removing the
> #ifdefs from it. Further it introduces an optimization for AMD
> processors. There the RDTSCP instruction is used instead of CPUID;RDTSC
> which is helpfull if the kernel runs as a KVM guest. Running as a guest
> makes CPUID very expensive because it causes an intercept of the guest.

Problem:

http://test.kernel.org/functional/index.html

lots of builds fell over because their binutils versions don't understand
rdtscp.

I don't know how recently rdtscp support was added to binutils, but
this is likely to be a problem.  Perhaps we can change this patch to
embed the hex code for that opcode instead?

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 4/4] optimize and simplify get_cycles_sync()
  2007-03-07  0:25   ` Andrew Morton
@ 2007-03-09  9:57     ` Joerg Roedel
  0 siblings, 0 replies; 7+ messages in thread
From: Joerg Roedel @ 2007-03-09  9:57 UTC (permalink / raw)
  To: Andrew Morton; +Cc: discuss, Andi Kleen, linux-kernel

On Tue, Mar 06, 2007 at 04:25:41PM -0800, Andrew Morton wrote:
> On Wed, 28 Feb 2007 15:25:54 +0100
> "Joerg Roedel" <joerg.roedel@amd.com> wrote:
> 
> > From: Joerg Roedel <joerg.roedel@amd.com>
> > 
> > This patch simplifies the get_cycles_sync() function by removing the
> > #ifdefs from it. Further it introduces an optimization for AMD
> > processors. There the RDTSCP instruction is used instead of CPUID;RDTSC
> > which is helpfull if the kernel runs as a KVM guest. Running as a guest
> > makes CPUID very expensive because it causes an intercept of the guest.
> 
> Problem:
> 
> http://test.kernel.org/functional/index.html
> 
> lots of builds fell over because their binutils versions don't understand
> rdtscp.
> 
> I don't know how recently rdtscp support was added to binutils, but
> this is likely to be a problem.  Perhaps we can change this patch to
> embed the hex code for that opcode instead?

Ok, I will submit an updated version soon. Thanks for pointing this out.

Joerg

-- 
Joerg Roedel
Operating System Research Center
AMD Saxony LLC & Co. KG



^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2007-03-09  9:59 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-02-28 14:05 [PATCH 0/4] improve alternative instruction code and optimize get_cycles_sync Joerg Roedel
2007-02-28 14:14 ` [PATCH 1/4] i386: extend alternative instructions framework Joerg Roedel
2007-02-28 14:18 ` [PATCH 2/4] x86_64: changes to x86_64 architecture for alternative instruction improvements Joerg Roedel
2007-02-28 14:21 ` [PATCH 3/4] i386: add the X86_FEATURE_SYNC_RDTSC flag Joerg Roedel
2007-02-28 14:25 ` [PATCH 4/4] optimize and simplify get_cycles_sync() Joerg Roedel
2007-03-07  0:25   ` Andrew Morton
2007-03-09  9:57     ` Joerg Roedel

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).