LKML Archive on lore.kernel.org
* [RFC] locking/mutex: Optimize __mutex_trylock_fast
From: Peter Zijlstra @ 2018-04-05 9:37 UTC
To: Will Deacon, Ingo Molnar; +Cc: Matthew Wilcox, linux-kernel
Subject: locking/mutex: Optimize __mutex_trylock_fast()
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu Apr 5 11:05:35 CEST 2018
Use try_cmpxchg() to avoid the pointless TEST instruction.
And add the (missing) atomic_long_try_cmpxchg*() wrappery.
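The point, briefly: cmpxchg() returns the value it found, so the caller has
to compare that against the expected value a second time -- on x86 that is
the TEST, even though CMPXCHG already set ZF. try_cmpxchg() instead returns
a boolean (which the arch code can lift straight out of the flags) and
writes the value it observed back through @old. A minimal sketch of the
semantics, with a hypothetical helper name; the real wrappers are
per-architecture:

	/* Sketch only: shows the try_cmpxchg() contract, not the arch asm. */
	static inline bool try_cmpxchg_long_sketch(atomic_long_t *v, long *old, long new)
	{
		long seen = atomic_long_cmpxchg(v, *old, new);

		if (seen == *old)
			return true;	/* swap happened */

		*old = seen;		/* report what was actually found */
		return false;
	}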
On x86_64 this gives:
0000000000000710 <mutex_lock>: 0000000000000710 <mutex_lock>:
710: 65 48 8b 14 25 00 00 mov %gs:0x0,%rdx 710: 65 48 8b 14 25 00 00 mov %gs:0x0,%rdx
717: 00 00 717: 00 00
715: R_X86_64_32S current_task 715: R_X86_64_32S current_task
719: 31 c0 xor %eax,%eax 719: 31 c0 xor %eax,%eax
71b: f0 48 0f b1 17 lock cmpxchg %rdx,(%rdi) 71b: f0 48 0f b1 17 lock cmpxchg %rdx,(%rdi)
720: 48 85 c0 test %rax,%rax 720: 75 02 jne 724 <mutex_lock+0x14>
723: 75 02 jne 727 <mutex_lock+0x17> 722: f3 c3 repz retq
725: f3 c3 repz retq 724: eb da jmp 700 <__mutex_lock_slowpath>
727: eb d7 jmp 700 <__mutex_lock_slowpath>
On ARM64 this gives:
0000000000000638 <mutex_lock>: 0000000000000638 <mutex_lock>:
638: d5384101 mrs x1, sp_el0 638: d5384101 mrs x1, sp_el0
63c: d2800002 mov x2, #0x0 63c: d2800002 mov x2, #0x0
640: f9800011 prfm pstl1strm, [x0] 640: f9800011 prfm pstl1strm, [x0]
644: c85ffc03 ldaxr x3, [x0] 644: c85ffc03 ldaxr x3, [x0]
648: ca020064 eor x4, x3, x2 648: ca020064 eor x4, x3, x2
64c: b5000064 cbnz x4, 658 <mutex_lock+0x20> 64c: b5000064 cbnz x4, 658 <mutex_lock+0x20>
650: c8047c01 stxr w4, x1, [x0] 650: c8047c01 stxr w4, x1, [x0]
654: 35ffff84 cbnz w4, 644 <mutex_lock+0xc> 654: 35ffff84 cbnz w4, 644 <mutex_lock+0xc>
658: b40000c3 cbz x3, 670 <mutex_lock+0x38> 658: b5000043 cbnz x3, 660 <mutex_lock+0x28>
65c: a9bf7bfd stp x29, x30, [sp,#-16]! 65c: d65f03c0 ret
660: 910003fd mov x29, sp 660: a9bf7bfd stp x29, x30, [sp,#-16]!
664: 97ffffef bl 620 <__mutex_lock_slowpath> 664: 910003fd mov x29, sp
668: a8c17bfd ldp x29, x30, [sp],#16 668: 97ffffee bl 620 <__mutex_lock_slowpath>
66c: d65f03c0 ret 66c: a8c17bfd ldp x29, x30, [sp],#16
670: d65f03c0 ret 670: d65f03c0 ret
Which to my untrained eye just looks different, not worse. Will?
Reported-by: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/asm-generic/atomic-long.h | 17 +++++++++++++++++
kernel/locking/mutex.c | 3 ++-
2 files changed, 19 insertions(+), 1 deletion(-)
--- a/include/asm-generic/atomic-long.h
+++ b/include/asm-generic/atomic-long.h
@@ -25,6 +25,7 @@ typedef atomic64_t atomic_long_t;
 
 #define ATOMIC_LONG_INIT(i)	ATOMIC64_INIT(i)
 #define ATOMIC_LONG_PFX(x)	atomic64 ## x
+#define ATOMIC_LONG_TYPE	s64
 
 #else
 
@@ -32,6 +33,7 @@ typedef atomic_t atomic_long_t;
 
 #define ATOMIC_LONG_INIT(i)	ATOMIC_INIT(i)
 #define ATOMIC_LONG_PFX(x)	atomic ## x
+#define ATOMIC_LONG_TYPE	int
 
 #endif
 
@@ -90,6 +92,21 @@ ATOMIC_LONG_ADD_SUB_OP(sub, _release)
 #define atomic_long_cmpxchg(l, old, new) \
 	(ATOMIC_LONG_PFX(_cmpxchg)((ATOMIC_LONG_PFX(_t) *)(l), (old), (new)))
 
+
+#define atomic_long_try_cmpxchg_relaxed(l, old, new) \
+	(ATOMIC_LONG_PFX(_try_cmpxchg_relaxed)((ATOMIC_LONG_PFX(_t) *)(l), \
+					   (ATOMIC_LONG_TYPE *)(old), (ATOMIC_LONG_TYPE)(new)))
+#define atomic_long_try_cmpxchg_acquire(l, old, new) \
+	(ATOMIC_LONG_PFX(_try_cmpxchg_acquire)((ATOMIC_LONG_PFX(_t) *)(l), \
+					   (ATOMIC_LONG_TYPE *)(old), (ATOMIC_LONG_TYPE)(new)))
+#define atomic_long_try_cmpxchg_release(l, old, new) \
+	(ATOMIC_LONG_PFX(_try_cmpxchg_release)((ATOMIC_LONG_PFX(_t) *)(l), \
+					   (ATOMIC_LONG_TYPE *)(old), (ATOMIC_LONG_TYPE)(new)))
+#define atomic_long_try_cmpxchg(l, old, new) \
+	(ATOMIC_LONG_PFX(_try_cmpxchg)((ATOMIC_LONG_PFX(_t) *)(l), \
+					   (ATOMIC_LONG_TYPE *)(old), (ATOMIC_LONG_TYPE)(new)))
+
+
 #define atomic_long_xchg_relaxed(v, new) \
 	(ATOMIC_LONG_PFX(_xchg_relaxed)((ATOMIC_LONG_PFX(_t) *)(v), (new)))
 #define atomic_long_xchg_acquire(v, new) \
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -139,8 +139,9 @@ static inline bool __mutex_trylock(struc
 static __always_inline bool __mutex_trylock_fast(struct mutex *lock)
 {
 	unsigned long curr = (unsigned long)current;
+	unsigned long zero = 0UL;
 
-	if (!atomic_long_cmpxchg_acquire(&lock->owner, 0UL, curr))
+	if (atomic_long_try_cmpxchg_acquire(&lock->owner, &zero, curr))
 		return true;
 
 	return false;
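For reference, the disassembly above is of the mutex_lock() fastpath
wrapper, which looks roughly like the sketch below (annotations elided).
Note that @zero has to be an actual variable: on failure try_cmpxchg()
writes the observed owner back through the pointer, which is simply
discarded here.

	void mutex_lock(struct mutex *lock)
	{
		might_sleep();

		/* one lock cmpxchg (x86) / ldaxr+stxr loop (arm64), then done */
		if (!__mutex_trylock_fast(lock))
			__mutex_lock_slowpath(lock);
	}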
* Re: [RFC] locking/mutex: Optimize __mutex_trylock_fast
From: Will Deacon @ 2018-04-05 9:55 UTC
To: Peter Zijlstra; +Cc: Ingo Molnar, Matthew Wilcox, linux-kernel
Hi Peter,
On Thu, Apr 05, 2018 at 11:37:16AM +0200, Peter Zijlstra wrote:
> Subject: locking/mutex: Optimize __mutex_trylock_fast()
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Thu Apr 5 11:05:35 CEST 2018
>
> Use try_cmpxchg() to avoid the pointless TEST instruction.
> And add the (missing) atomic_long_try_cmpxchg*() wrappery.
>
> On x86_64 this gives:
>
> 0000000000000710 <mutex_lock>: 0000000000000710 <mutex_lock>:
> 710: 65 48 8b 14 25 00 00 mov %gs:0x0,%rdx 710: 65 48 8b 14 25 00 00 mov %gs:0x0,%rdx
> 717: 00 00 717: 00 00
> 715: R_X86_64_32S current_task 715: R_X86_64_32S current_task
> 719: 31 c0 xor %eax,%eax 719: 31 c0 xor %eax,%eax
> 71b: f0 48 0f b1 17 lock cmpxchg %rdx,(%rdi) 71b: f0 48 0f b1 17 lock cmpxchg %rdx,(%rdi)
> 720: 48 85 c0 test %rax,%rax 720: 75 02 jne 724 <mutex_lock+0x14>
> 723: 75 02 jne 727 <mutex_lock+0x17> 722: f3 c3 repz retq
> 725: f3 c3 repz retq 724: eb da jmp 700 <__mutex_lock_slowpath>
> 727: eb d7 jmp 700 <__mutex_lock_slowpath>
>
> On ARM64 this gives:
>
> 0000000000000638 <mutex_lock>: 0000000000000638 <mutex_lock>:
> 638: d5384101 mrs x1, sp_el0 638: d5384101 mrs x1, sp_el0
> 63c: d2800002 mov x2, #0x0 63c: d2800002 mov x2, #0x0
> 640: f9800011 prfm pstl1strm, [x0] 640: f9800011 prfm pstl1strm, [x0]
> 644: c85ffc03 ldaxr x3, [x0] 644: c85ffc03 ldaxr x3, [x0]
> 648: ca020064 eor x4, x3, x2 648: ca020064 eor x4, x3, x2
> 64c: b5000064 cbnz x4, 658 <mutex_lock+0x20> 64c: b5000064 cbnz x4, 658 <mutex_lock+0x20>
> 650: c8047c01 stxr w4, x1, [x0] 650: c8047c01 stxr w4, x1, [x0]
> 654: 35ffff84 cbnz w4, 644 <mutex_lock+0xc> 654: 35ffff84 cbnz w4, 644 <mutex_lock+0xc>
> 658: b40000c3 cbz x3, 670 <mutex_lock+0x38> 658: b5000043 cbnz x3, 660 <mutex_lock+0x28>
> 65c: a9bf7bfd stp x29, x30, [sp,#-16]! 65c: d65f03c0 ret
> 660: 910003fd mov x29, sp 660: a9bf7bfd stp x29, x30, [sp,#-16]!
> 664: 97ffffef bl 620 <__mutex_lock_slowpath> 664: 910003fd mov x29, sp
> 668: a8c17bfd ldp x29, x30, [sp],#16 668: 97ffffee bl 620 <__mutex_lock_slowpath>
> 66c: d65f03c0 ret 66c: a8c17bfd ldp x29, x30, [sp],#16
> 670: d65f03c0 ret 670: d65f03c0 ret
>
> Which to my untrained eye just looks different, not worse. Will?
Yeah, looks fine. The compiler has just decided to move the slowpath out of
line. One comment on the patch:
>
> Reported-by: Matthew Wilcox <mawilcox@microsoft.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
> include/asm-generic/atomic-long.h | 17 +++++++++++++++++
> kernel/locking/mutex.c | 3 ++-
> 2 files changed, 19 insertions(+), 1 deletion(-)
>
> --- a/include/asm-generic/atomic-long.h
> +++ b/include/asm-generic/atomic-long.h
> @@ -25,6 +25,7 @@ typedef atomic64_t atomic_long_t;
>
> #define ATOMIC_LONG_INIT(i) ATOMIC64_INIT(i)
> #define ATOMIC_LONG_PFX(x) atomic64 ## x
> +#define ATOMIC_LONG_TYPE s64
>
> #else
>
> @@ -32,6 +33,7 @@ typedef atomic_t atomic_long_t;
>
> #define ATOMIC_LONG_INIT(i) ATOMIC_INIT(i)
> #define ATOMIC_LONG_PFX(x) atomic ## x
> +#define ATOMIC_LONG_TYPE int
>
> #endif
>
> @@ -90,6 +92,21 @@ ATOMIC_LONG_ADD_SUB_OP(sub, _release)
> #define atomic_long_cmpxchg(l, old, new) \
> (ATOMIC_LONG_PFX(_cmpxchg)((ATOMIC_LONG_PFX(_t) *)(l), (old), (new)))
>
> +
> +#define atomic_long_try_cmpxchg_relaxed(l, old, new) \
> + (ATOMIC_LONG_PFX(_try_cmpxchg_relaxed)((ATOMIC_LONG_PFX(_t) *)(l), \
> + (ATOMIC_LONG_TYPE *)(old), (ATOMIC_LONG_TYPE)(new)))
Are the casts on old and new strictly needed? We don't have them for the
non-try versions...
Will
* Re: [RFC] locking/mutex: Optimize __mutex_trylock_fast
From: Peter Zijlstra @ 2018-04-05 11:22 UTC
To: Will Deacon; +Cc: Ingo Molnar, Matthew Wilcox, linux-kernel
On Thu, Apr 05, 2018 at 10:55:45AM +0100, Will Deacon wrote:
> > +#define atomic_long_try_cmpxchg_relaxed(l, old, new) \
> > + (ATOMIC_LONG_PFX(_try_cmpxchg_relaxed)((ATOMIC_LONG_PFX(_t) *)(l), \
> > + (ATOMIC_LONG_TYPE *)(old), (ATOMIC_LONG_TYPE)(new)))
>
> Are the casts on old and new strictly needed? We don't have them for the
> non-try versions...
It was required for the pointer; the compiler doesn't want to convert
(unsigned long *) into (s64 *) without complaining about it :/
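Concretely: with CONFIG_64BIT, ATOMIC_LONG_TYPE is s64, so
atomic64_try_cmpxchg_acquire() wants an s64 * for @old while the mutex code
passes &zero, an unsigned long *. A minimal illustration, with hypothetical
variables, assuming a 64-bit build where both types are 64 bits wide:

	unsigned long zero = 0UL;

	s64 *p = &zero;		/* warning: incompatible pointer types */
	s64 *q = (s64 *)&zero;	/* same representation; the cast keeps gcc quiet */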