LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH 2.6.20-rc4 1/4] futex priority based wakeup
[not found] <45A3B330.9000104@bull.net>
@ 2007-01-09 16:16 ` Pierre Peiffer
2007-01-09 16:29 ` Ulrich Drepper
` (2 more replies)
2007-01-09 16:16 ` [PATCH 2.6.20-rc4 2/4] Make futex_wait() use an hrtimer for timeout Pierre Peiffer
` (2 subsequent siblings)
3 siblings, 3 replies; 20+ messages in thread
From: Pierre Peiffer @ 2007-01-09 16:16 UTC (permalink / raw)
To: LKML
Cc: Dinakar Guniguntala, Jean-Pierre Dion, Ingo Molnar,
Ulrich Drepper, Jakub Jelinek, Darren Hart,
Sébastien Dugué
Hi,
Today, all threads waiting for a given futex are woken in FIFO order (first
waiter woken first) instead of priority order.
This patch makes use of plist (priority ordered lists) instead of simple list in
futex_hash_bucket.
---
futex.c | 66 +++++++++++++++++++++++++++++++++++-----------------------------
1 file changed, 37 insertions(+), 29 deletions(-)
---
Signed-off-by: Sébastien Dugué <sebastien.dugue@bull.net>
Signed-off-by: Pierre Peiffer <pierre.peiffer@bull.net>
---
Index: linux-2.6/kernel/futex.c
===================================================================
--- linux-2.6.orig/kernel/futex.c 2007-01-08 10:40:22.000000000 +0100
+++ linux-2.6/kernel/futex.c 2007-01-08 10:42:07.000000000 +0100
@@ -106,12 +106,12 @@ struct futex_pi_state {
* we can wake only the relevant ones (hashed queues may be shared).
*
* A futex_q has a woken state, just like tasks have TASK_RUNNING.
- * It is considered woken when list_empty(&q->list) || q->lock_ptr == 0.
+ * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
* The order of wakup is always to make the first condition true, then
* wake up q->waiters, then make the second condition true.
*/
struct futex_q {
- struct list_head list;
+ struct plist_node list;
wait_queue_head_t waiters;
/* Which hash list lock to use: */
@@ -133,8 +133,8 @@ struct futex_q {
* Split the global futex_lock into every hash list lock.
*/
struct futex_hash_bucket {
- spinlock_t lock;
- struct list_head chain;
+ spinlock_t lock;
+ struct plist_head chain;
};
static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
@@ -465,13 +465,13 @@ lookup_pi_state(u32 uval, struct futex_h
{
struct futex_pi_state *pi_state = NULL;
struct futex_q *this, *next;
- struct list_head *head;
+ struct plist_head *head;
struct task_struct *p;
pid_t pid;
head = &hb->chain;
- list_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, head, list) {
if (match_futex(&this->key, &me->key)) {
/*
* Another waiter already exists - bump up
@@ -535,12 +535,12 @@ lookup_pi_state(u32 uval, struct futex_h
*/
static void wake_futex(struct futex_q *q)
{
- list_del_init(&q->list);
+ plist_del(&q->list, &q->list.plist);
if (q->filp)
send_sigio(&q->filp->f_owner, q->fd, POLL_IN);
/*
* The lock in wake_up_all() is a crucial memory barrier after the
- * list_del_init() and also before assigning to q->lock_ptr.
+ * plist_del() and also before assigning to q->lock_ptr.
*/
wake_up_all(&q->waiters);
/*
@@ -653,7 +653,7 @@ static int futex_wake(u32 __user *uaddr,
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
- struct list_head *head;
+ struct plist_head *head;
union futex_key key;
int ret;
@@ -667,7 +667,7 @@ static int futex_wake(u32 __user *uaddr,
spin_lock(&hb->lock);
head = &hb->chain;
- list_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, head, list) {
if (match_futex (&this->key, &key)) {
if (this->pi_state) {
ret = -EINVAL;
@@ -695,7 +695,7 @@ futex_wake_op(u32 __user *uaddr1, u32 __
{
union futex_key key1, key2;
struct futex_hash_bucket *hb1, *hb2;
- struct list_head *head;
+ struct plist_head *head;
struct futex_q *this, *next;
int ret, op_ret, attempt = 0;
@@ -768,7 +768,7 @@ retry:
head = &hb1->chain;
- list_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, head, list) {
if (match_futex (&this->key, &key1)) {
wake_futex(this);
if (++ret >= nr_wake)
@@ -780,7 +780,7 @@ retry:
head = &hb2->chain;
op_ret = 0;
- list_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, head, list) {
if (match_futex (&this->key, &key2)) {
wake_futex(this);
if (++op_ret >= nr_wake2)
@@ -807,7 +807,7 @@ static int futex_requeue(u32 __user *uad
{
union futex_key key1, key2;
struct futex_hash_bucket *hb1, *hb2;
- struct list_head *head1;
+ struct plist_head *head1;
struct futex_q *this, *next;
int ret, drop_count = 0;
@@ -856,7 +856,7 @@ static int futex_requeue(u32 __user *uad
}
head1 = &hb1->chain;
- list_for_each_entry_safe(this, next, head1, list) {
+ plist_for_each_entry_safe(this, next, head1, list) {
if (!match_futex (&this->key, &key1))
continue;
if (++ret <= nr_wake) {
@@ -867,9 +867,13 @@ static int futex_requeue(u32 __user *uad
* requeue.
*/
if (likely(head1 != &hb2->chain)) {
- list_move_tail(&this->list, &hb2->chain);
+ plist_del(&this->list, &hb1->chain);
+ plist_add(&this->list, &hb2->chain);
this->lock_ptr = &hb2->lock;
- }
+#ifdef CONFIG_DEBUG_PI_LIST
+ this->list.plist.lock = &hb2->lock;
+#endif
+ }
this->key = key2;
get_key_refs(&key2);
drop_count++;
@@ -914,7 +918,11 @@ queue_lock(struct futex_q *q, int fd, st
static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
{
- list_add_tail(&q->list, &hb->chain);
+ plist_node_init(&q->list, current->normal_prio);
+#ifdef CONFIG_DEBUG_PI_LIST
+ q->list.plist.lock = &hb->lock;
+#endif
+ plist_add(&q->list, &hb->chain);
q->task = current;
spin_unlock(&hb->lock);
}
@@ -969,8 +977,8 @@ static int unqueue_me(struct futex_q *q)
spin_unlock(lock_ptr);
goto retry;
}
- WARN_ON(list_empty(&q->list));
- list_del(&q->list);
+ WARN_ON(plist_node_empty(&q->list));
+ plist_del(&q->list, &q->list.plist);
BUG_ON(q->pi_state);
@@ -988,8 +996,8 @@ static int unqueue_me(struct futex_q *q)
*/
static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
{
- WARN_ON(list_empty(&q->list));
- list_del(&q->list);
+ WARN_ON(plist_node_empty(&q->list));
+ plist_del(&q->list, &q->list.plist);
BUG_ON(!q->pi_state);
free_pi_state(q->pi_state);
@@ -1082,10 +1090,10 @@ static int futex_wait(u32 __user *uaddr,
__set_current_state(TASK_INTERRUPTIBLE);
add_wait_queue(&q.waiters, &wait);
/*
- * !list_empty() is safe here without any lock.
+ * !plist_node_empty() is safe here without any lock.
* q.lock_ptr != 0 is not safe, because of ordering against wakeup.
*/
- if (likely(!list_empty(&q.list)))
+ if (likely(!plist_node_empty(&q.list)))
time = schedule_timeout(time);
__set_current_state(TASK_RUNNING);
@@ -1358,7 +1366,7 @@ static int futex_unlock_pi(u32 __user *u
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
u32 uval;
- struct list_head *head;
+ struct plist_head *head;
union futex_key key;
int ret, attempt = 0;
@@ -1409,7 +1417,7 @@ retry_locked:
*/
head = &hb->chain;
- list_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, head, list) {
if (!match_futex (&this->key, &key))
continue;
ret = wake_futex_pi(uaddr, uval, this);
@@ -1483,10 +1491,10 @@ static unsigned int futex_poll(struct fi
poll_wait(filp, &q->waiters, wait);
/*
- * list_empty() is safe here without any lock.
+ * plist_node_empty() is safe here without any lock.
* q->lock_ptr != 0 is not safe, because of ordering against wakeup.
*/
- if (list_empty(&q->list))
+ if (plist_node_empty(&q->list))
ret = POLLIN | POLLRDNORM;
return ret;
@@ -1869,7 +1877,7 @@ static int __init init(void)
}
for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
- INIT_LIST_HEAD(&futex_queues[i].chain);
+ plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock);
spin_lock_init(&futex_queues[i].lock);
}
return 0;
--
Pierre Peiffer
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 2.6.20-rc4 2/4] Make futex_wait() use an hrtimer for timeout
[not found] <45A3B330.9000104@bull.net>
2007-01-09 16:16 ` [PATCH 2.6.20-rc4 1/4] futex priority based wakeup Pierre Peiffer
@ 2007-01-09 16:16 ` Pierre Peiffer
2007-01-09 16:20 ` [PATCH 2.6.20-rc4 3/4] futex_requeue_pi optimization Pierre Peiffer
2007-01-09 16:25 ` [PATCH 2.6.20-rc4 4/4][RFC] sys_futex64 : allows 64bit futexes Pierre Peiffer
3 siblings, 0 replies; 20+ messages in thread
From: Pierre Peiffer @ 2007-01-09 16:16 UTC (permalink / raw)
To: LKML
Cc: Dinakar Guniguntala, Jean-Pierre Dion, Ingo Molnar,
Ulrich Drepper, Jakub Jelinek, Darren Hart,
Sébastien Dugué
Hi,
This patch modifies futex_wait() to use an hrtimer + schedule() in place of
schedule_timeout() in an RT kernel.
More details in the patch header.
--------------------------------------------------------------------------------
This patch modifies futex_wait() to use an hrtimer + schedule() in place of
schedule_timeout().
schedule_timeout() is tick based, therefore the timeout granularity is
the tick (1 ms, 4 ms or 10 ms depending on HZ). By using a high resolution
timer for timeout wakeup, we can attain a much finer timeout granularity
(in the microsecond range). This parallels what is already done for
futex_lock_pi().
The timeout passed to the syscall is no longer converted to jiffies
and is therefore passed to do_futex() and futex_wait() as a timespec
therefore keeping nanosecond resolution.
Also this removes the need to pass the nanoseconds timeout part to
futex_lock_pi() in val2.
In futex_wait(), if the timeout is zero then a regular schedule() is
performed. Otherwise, an hrtimer is fired before schedule() is called.
---
include/linux/futex.h | 2 -
kernel/futex.c | 58 ++++++++++++++++++++++++++++++++------------------
kernel/futex_compat.c | 11 +--------
3 files changed, 41 insertions(+), 30 deletions(-)
---
Signed-off-by: Sébastien Dugué <sebastien.dugue@bull.net>
Signed-off-by: Pierre Peiffer <pierre.peiffer@bull.net>
---
Index: linux-2.6/kernel/futex.c
===================================================================
--- linux-2.6.orig/kernel/futex.c 2007-01-08 10:42:07.000000000 +0100
+++ linux-2.6/kernel/futex.c 2007-01-08 10:42:15.000000000 +0100
@@ -1008,7 +1008,7 @@ static void unqueue_me_pi(struct futex_q
drop_key_refs(&q->key);
}
-static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
+static int futex_wait(u32 __user *uaddr, u32 val, struct timespec *time)
{
struct task_struct *curr = current;
DECLARE_WAITQUEUE(wait, curr);
@@ -1016,6 +1016,8 @@ static int futex_wait(u32 __user *uaddr,
struct futex_q q;
u32 uval;
int ret;
+ struct hrtimer_sleeper t;
+ int rem = 0;
q.pi_state = NULL;
retry:
@@ -1093,8 +1095,31 @@ static int futex_wait(u32 __user *uaddr,
* !plist_node_empty() is safe here without any lock.
* q.lock_ptr != 0 is not safe, because of ordering against wakeup.
*/
- if (likely(!plist_node_empty(&q.list)))
- time = schedule_timeout(time);
+ if (likely(!plist_node_empty(&q.list))) {
+ if (time->tv_sec == 0 && time->tv_nsec == 0)
+ schedule();
+ else {
+ hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_REL);
+ hrtimer_init_sleeper(&t, current);
+ t.timer.expires = timespec_to_ktime(*time);
+
+ hrtimer_start(&t.timer, t.timer.expires, HRTIMER_REL);
+
+ /*
+ * the timer could have already expired, in which
+ * case current would be flagged for rescheduling.
+ * Don't bother calling schedule.
+ */
+ if (likely(t.task))
+ schedule();
+
+ hrtimer_cancel(&t.timer);
+
+ /* Flag if a timeout occured */
+ rem = (t.task == NULL);
+ }
+ }
+
__set_current_state(TASK_RUNNING);
/*
@@ -1105,7 +1130,7 @@ static int futex_wait(u32 __user *uaddr,
/* If we were woken (and unqueued), we succeeded, whatever. */
if (!unqueue_me(&q))
return 0;
- if (time == 0)
+ if (rem)
return -ETIMEDOUT;
/*
* We expect signal_pending(current), but another thread may
@@ -1127,8 +1152,8 @@ static int futex_wait(u32 __user *uaddr,
* if there are waiters then it will block, it does PI, etc. (Due to
* races the kernel might see a 0 value of the futex too.)
*/
-static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
- long nsec, int trylock)
+static int futex_lock_pi(u32 __user *uaddr, int detect, struct timespec *time,
+ int trylock)
{
struct hrtimer_sleeper timeout, *to = NULL;
struct task_struct *curr = current;
@@ -1140,11 +1165,11 @@ static int futex_lock_pi(u32 __user *uad
if (refill_pi_state_cache())
return -ENOMEM;
- if (sec != MAX_SCHEDULE_TIMEOUT) {
+ if (time->tv_sec || time->tv_nsec) {
to = &timeout;
hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
hrtimer_init_sleeper(to, current);
- to->timer.expires = ktime_set(sec, nsec);
+ to->timer.expires = timespec_to_ktime(*time);
}
q.pi_state = NULL;
@@ -1780,7 +1805,7 @@ void exit_robust_list(struct task_struct
}
}
-long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
+long do_futex(u32 __user *uaddr, int op, u32 val, struct timespec *timeout,
u32 __user *uaddr2, u32 val2, u32 val3)
{
int ret;
@@ -1806,13 +1831,13 @@ long do_futex(u32 __user *uaddr, int op,
ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
break;
case FUTEX_LOCK_PI:
- ret = futex_lock_pi(uaddr, val, timeout, val2, 0);
+ ret = futex_lock_pi(uaddr, val, timeout, 0);
break;
case FUTEX_UNLOCK_PI:
ret = futex_unlock_pi(uaddr);
break;
case FUTEX_TRYLOCK_PI:
- ret = futex_lock_pi(uaddr, 0, timeout, val2, 1);
+ ret = futex_lock_pi(uaddr, 0, timeout, 1);
break;
default:
ret = -ENOSYS;
@@ -1825,8 +1850,7 @@ asmlinkage long sys_futex(u32 __user *ua
struct timespec __user *utime, u32 __user *uaddr2,
u32 val3)
{
- struct timespec t;
- unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
+ struct timespec t = {.tv_sec = 0, .tv_nsec = 0};
u32 val2 = 0;
if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
@@ -1834,12 +1858,6 @@ asmlinkage long sys_futex(u32 __user *ua
return -EFAULT;
if (!timespec_valid(&t))
return -EINVAL;
- if (op == FUTEX_WAIT)
- timeout = timespec_to_jiffies(&t) + 1;
- else {
- timeout = t.tv_sec;
- val2 = t.tv_nsec;
- }
}
/*
* requeue parameter in 'utime' if op == FUTEX_REQUEUE.
@@ -1847,7 +1865,7 @@ asmlinkage long sys_futex(u32 __user *ua
if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
val2 = (u32) (unsigned long) utime;
- return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
+ return do_futex(uaddr, op, val, &t, uaddr2, val2, val3);
}
static int futexfs_get_sb(struct file_system_type *fs_type,
Index: linux-2.6/kernel/futex_compat.c
===================================================================
--- linux-2.6.orig/kernel/futex_compat.c 2007-01-08 09:05:52.000000000 +0100
+++ linux-2.6/kernel/futex_compat.c 2007-01-08 10:42:15.000000000 +0100
@@ -141,8 +141,7 @@ asmlinkage long compat_sys_futex(u32 __u
struct compat_timespec __user *utime, u32 __user *uaddr2,
u32 val3)
{
- struct timespec t;
- unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
+ struct timespec t = {.tv_sec = 0, .tv_nsec = 0};
int val2 = 0;
if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
@@ -150,15 +149,9 @@ asmlinkage long compat_sys_futex(u32 __u
return -EFAULT;
if (!timespec_valid(&t))
return -EINVAL;
- if (op == FUTEX_WAIT)
- timeout = timespec_to_jiffies(&t) + 1;
- else {
- timeout = t.tv_sec;
- val2 = t.tv_nsec;
- }
}
if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
val2 = (int) (unsigned long) utime;
- return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
+ return do_futex(uaddr, op, val, &t, uaddr2, val2, val3);
}
Index: linux-2.6/include/linux/futex.h
===================================================================
--- linux-2.6.orig/include/linux/futex.h 2007-01-08 10:40:18.000000000 +0100
+++ linux-2.6/include/linux/futex.h 2007-01-08 10:42:15.000000000 +0100
@@ -94,7 +94,7 @@ struct robust_list_head {
#define ROBUST_LIST_LIMIT 2048
#ifdef __KERNEL__
-long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
+long do_futex(u32 __user *uaddr, int op, u32 val, struct timespec *timeout,
u32 __user *uaddr2, u32 val2, u32 val3);
extern int
--
Pierre Peiffer
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 2.6.20-rc4 3/4] futex_requeue_pi optimization
[not found] <45A3B330.9000104@bull.net>
2007-01-09 16:16 ` [PATCH 2.6.20-rc4 1/4] futex priority based wakeup Pierre Peiffer
2007-01-09 16:16 ` [PATCH 2.6.20-rc4 2/4] Make futex_wait() use an hrtimer for timeout Pierre Peiffer
@ 2007-01-09 16:20 ` Pierre Peiffer
2007-01-09 16:33 ` Ulrich Drepper
2007-01-09 16:25 ` [PATCH 2.6.20-rc4 4/4][RFC] sys_futex64 : allows 64bit futexes Pierre Peiffer
3 siblings, 1 reply; 20+ messages in thread
From: Pierre Peiffer @ 2007-01-09 16:20 UTC (permalink / raw)
To: LKML
Cc: Dinakar Guniguntala, Jean-Pierre Dion, Ingo Molnar,
Ulrich Drepper, Jakub Jelinek, Darren Hart,
Sébastien Dugué
Hi,
This patch provides the futex_requeue_pi functionality.
This provides an optimization, already used for (normal) futexes, to be used for
PI-futexes.
This optimization is currently used by the glibc in pthread_cond_broadcast, when
using "normal" mutexes. With futex_requeue_pi, it can be used with PRIO_INHERIT
mutexes too.
---
include/linux/futex.h | 8
kernel/futex.c | 559 +++++++++++++++++++++++++++++++++++++++++++-----
kernel/futex_compat.c | 3
kernel/rtmutex.c | 41 ---
kernel/rtmutex_common.h | 34 ++
5 files changed, 557 insertions(+), 88 deletions(-)
---
Signed-off-by: Pierre Peiffer <pierre.peiffer@bull.net>
---
Index: linux-2.6/include/linux/futex.h
===================================================================
--- linux-2.6.orig/include/linux/futex.h 2007-01-08 10:42:15.000000000 +0100
+++ linux-2.6/include/linux/futex.h 2007-01-08 10:42:21.000000000 +0100
@@ -15,6 +15,7 @@
#define FUTEX_LOCK_PI 6
#define FUTEX_UNLOCK_PI 7
#define FUTEX_TRYLOCK_PI 8
+#define FUTEX_CMP_REQUEUE_PI 9
/*
* Support for robust futexes: the kernel cleans up held futexes at
@@ -83,9 +84,14 @@ struct robust_list_head {
#define FUTEX_OWNER_DIED 0x40000000
/*
+ * Some processes have been requeued on this PI-futex
+ */
+#define FUTEX_WAITER_REQUEUED 0x20000000
+
+/*
* The rest of the robust-futex field is for the TID:
*/
-#define FUTEX_TID_MASK 0x3fffffff
+#define FUTEX_TID_MASK 0x0fffffff
/*
* This limit protects against a deliberately circular list.
Index: linux-2.6/kernel/futex.c
===================================================================
--- linux-2.6.orig/kernel/futex.c 2007-01-08 10:42:15.000000000 +0100
+++ linux-2.6/kernel/futex.c 2007-01-08 10:42:21.000000000 +0100
@@ -52,6 +52,12 @@
#include "rtmutex_common.h"
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# include "rtmutex-debug.h"
+#else
+# include "rtmutex.h"
+#endif
+
#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
/*
@@ -127,6 +133,12 @@ struct futex_q {
/* Optional priority inheritance state: */
struct futex_pi_state *pi_state;
struct task_struct *task;
+
+ /*
+ * This waiter is used in case of requeue from a
+ * normal futex to a PI-futex
+ */
+ struct rt_mutex_waiter waiter;
};
/*
@@ -248,6 +260,25 @@ static int get_futex_key(u32 __user *uad
}
/*
+ * Retrieve the original address used to compute this key
+ */
+static void *get_futex_address(union futex_key *key)
+{
+ void *uaddr;
+
+ if (key->both.offset & 1) {
+ /* shared mapping */
+ uaddr = (void*)((key->shared.pgoff << PAGE_SHIFT)
+ + key->shared.offset - 1);
+ } else {
+ /* private mapping */
+ uaddr = (void*)(key->private.address + key->private.offset);
+ }
+
+ return uaddr;
+}
+
+/*
* Take a reference to the resource addressed by a key.
* Can be called while holding spinlocks.
*
@@ -461,7 +492,8 @@ void exit_pi_state_list(struct task_stru
}
static int
-lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
+lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+ union futex_key *key, struct futex_pi_state **ps)
{
struct futex_pi_state *pi_state = NULL;
struct futex_q *this, *next;
@@ -472,7 +504,7 @@ lookup_pi_state(u32 uval, struct futex_h
head = &hb->chain;
plist_for_each_entry_safe(this, next, head, list) {
- if (match_futex(&this->key, &me->key)) {
+ if (match_futex(&this->key, key)) {
/*
* Another waiter already exists - bump up
* the refcount and return its pi_state:
@@ -487,7 +519,7 @@ lookup_pi_state(u32 uval, struct futex_h
WARN_ON(!atomic_read(&pi_state->refcount));
atomic_inc(&pi_state->refcount);
- me->pi_state = pi_state;
+ *ps = pi_state;
return 0;
}
@@ -514,7 +546,7 @@ lookup_pi_state(u32 uval, struct futex_h
rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
/* Store the key for possible exit cleanups: */
- pi_state->key = me->key;
+ pi_state->key = *key;
spin_lock_irq(&p->pi_lock);
WARN_ON(!list_empty(&pi_state->list));
@@ -524,7 +556,7 @@ lookup_pi_state(u32 uval, struct futex_h
put_task_struct(p);
- me->pi_state = pi_state;
+ *ps = pi_state;
return 0;
}
@@ -583,6 +615,8 @@ static int wake_futex_pi(u32 __user *uad
*/
if (!(uval & FUTEX_OWNER_DIED)) {
newval = FUTEX_WAITERS | new_owner->pid;
+ /* Keep the FUTEX_WAITER_REQUEUED flag if it was set */
+ newval |= (uval & FUTEX_WAITER_REQUEUED);
pagefault_disable();
curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
@@ -646,6 +680,254 @@ double_lock_hb(struct futex_hash_bucket
}
/*
+ * Called from futex_requeue_pi.
+ * Set FUTEX_WAITERS and FUTEX_WAITER_REQUEUED flags on the
+ * PI-futex value; search its associated pi_state if an owner exist
+ * or create a new one without owner.
+ */
+static inline int
+lookup_pi_state_for_requeue(u32 __user *uaddr, struct futex_hash_bucket *hb,
+ union futex_key *key,
+ struct futex_pi_state **pi_state)
+{
+ u32 curval, uval, newval;
+
+retry:
+ /*
+ * We can't handle a fault cleanly because we can't
+ * release the locks here. Simply return the fault.
+ */
+ if (get_futex_value_locked(&curval, uaddr))
+ return -EFAULT;
+
+ /* set the flags FUTEX_WAITERS and FUTEX_WAITER_REQUEUED */
+ if ((curval & (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED))
+ != (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED)) {
+ /*
+ * No waiters yet, we prepare the futex to have some waiters.
+ */
+
+ uval = curval;
+ newval = uval | FUTEX_WAITERS | FUTEX_WAITER_REQUEUED;
+
+ pagefault_disable();
+ curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+ pagefault_enable();
+
+ if (unlikely(curval == -EFAULT))
+ return -EFAULT;
+ if (unlikely(curval != uval))
+ goto retry;
+ }
+
+ if (!(curval & FUTEX_TID_MASK)
+ || lookup_pi_state(curval, hb, key, pi_state)) {
+ /* the futex has no owner (yet) or the lookup failed:
+ allocate one pi_state without owner */
+
+ *pi_state = alloc_pi_state();
+
+ /* Already stores the key: */
+ (*pi_state)->key = *key;
+
+ /* init the mutex without owner */
+ __rt_mutex_init(&(*pi_state)->pi_mutex, NULL);
+ }
+
+ return 0;
+}
+
+/*
+ * Keep the first nr_wake waiter from futex1, wake up one,
+ * and requeue the next nr_requeue waiters following hashed on
+ * one physical page to another physical page (PI-futex uaddr2)
+ */
+static int futex_requeue_pi(u32 __user *uaddr1, u32 __user *uaddr2,
+ int nr_wake, int nr_requeue, u32 *cmpval)
+{
+ union futex_key key1, key2;
+ struct futex_hash_bucket *hb1, *hb2;
+ struct plist_head *head1;
+ struct futex_q *this, *next;
+ struct futex_pi_state *pi_state2 = NULL;
+ struct rt_mutex_waiter *waiter, *top_waiter = NULL;
+ struct rt_mutex *lock2 = NULL;
+ int ret, drop_count = 0;
+
+ if (refill_pi_state_cache())
+ return -ENOMEM;
+
+retry:
+ /*
+ * First take all the futex related locks:
+ */
+ down_read(¤t->mm->mmap_sem);
+
+ ret = get_futex_key(uaddr1, &key1);
+ if (unlikely(ret != 0))
+ goto out;
+ ret = get_futex_key(uaddr2, &key2);
+ if (unlikely(ret != 0))
+ goto out;
+
+ hb1 = hash_futex(&key1);
+ hb2 = hash_futex(&key2);
+
+ double_lock_hb(hb1, hb2);
+
+ if (likely(cmpval != NULL)) {
+ u32 curval;
+
+ ret = get_futex_value_locked(&curval, uaddr1);
+
+ if (unlikely(ret)) {
+ spin_unlock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_unlock(&hb2->lock);
+
+ /*
+ * If we would have faulted, release mmap_sem, fault
+ * it in and start all over again.
+ */
+ up_read(¤t->mm->mmap_sem);
+
+ ret = get_user(curval, uaddr1);
+
+ if (!ret)
+ goto retry;
+
+ return ret;
+ }
+ if (curval != *cmpval) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+ }
+
+ head1 = &hb1->chain;
+ plist_for_each_entry_safe(this, next, head1, list) {
+ if (!match_futex (&this->key, &key1))
+ continue;
+ if (++ret <= nr_wake) {
+ wake_futex(this);
+ } else {
+ /*
+ * FIRST: get and set the pi_state
+ */
+ if (!pi_state2) {
+ int s;
+ /* do this only the first time we requeue someone */
+ s = lookup_pi_state_for_requeue(uaddr2, hb2,
+ &key2, &pi_state2);
+ if (s) {
+ ret = s;
+ goto out_unlock;
+ }
+
+ lock2 = &pi_state2->pi_mutex;
+ spin_lock(&lock2->wait_lock);
+
+ /* Save the top waiter of the wait_list */
+ if (rt_mutex_has_waiters(lock2))
+ top_waiter = rt_mutex_top_waiter(lock2);
+ } else
+ atomic_inc(&pi_state2->refcount);
+
+
+ this->pi_state = pi_state2;
+
+ /*
+ * SECOND: requeue futex_q to the correct hashbucket
+ */
+
+ /*
+ * If key1 and key2 hash to the same bucket, no need to
+ * requeue.
+ */
+ if (likely(head1 != &hb2->chain)) {
+ plist_del(&this->list, &hb1->chain);
+ plist_add(&this->list, &hb2->chain);
+ this->lock_ptr = &hb2->lock;
+#ifdef CONFIG_DEBUG_PI_LIST
+ this->list.plist.lock = &hb2->lock;
+#endif
+ }
+ this->key = key2;
+ get_key_refs(&key2);
+ drop_count++;
+
+
+ /*
+ * THIRD: queue it to lock2
+ */
+ spin_lock_irq(&this->task->pi_lock);
+ waiter = &this->waiter;
+ waiter->task = this->task;
+ waiter->lock = lock2;
+ plist_node_init(&waiter->list_entry, this->task->prio);
+ plist_node_init(&waiter->pi_list_entry, this->task->prio);
+ plist_add(&waiter->list_entry, &lock2->wait_list);
+ this->task->pi_blocked_on = waiter;
+ spin_unlock_irq(&this->task->pi_lock);
+
+ if (ret - nr_wake >= nr_requeue)
+ break;
+ }
+ }
+
+ /* If we've requeued some tasks and the top_waiter of the rt_mutex
+ has changed, we must adjust the priority of the owner, if any */
+ if (drop_count) {
+ struct task_struct *owner = rt_mutex_owner(lock2);
+ if (owner &&
+ (top_waiter != (waiter = rt_mutex_top_waiter(lock2)))) {
+ int chain_walk = 0;
+
+ spin_lock_irq(&owner->pi_lock);
+ if (top_waiter)
+ plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
+ else
+ /*
+ * There was no waiters before the requeue,
+ * the flag must be updated
+ */
+ mark_rt_mutex_waiters(lock2);
+
+ plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
+ __rt_mutex_adjust_prio(owner);
+ if (owner->pi_blocked_on) {
+ chain_walk = 1;
+ get_task_struct(owner);
+ }
+
+ spin_unlock_irq(&owner->pi_lock);
+ spin_unlock(&lock2->wait_lock);
+
+ if (chain_walk)
+ rt_mutex_adjust_prio_chain(owner, 0, lock2, NULL,
+ current);
+ } else {
+ /* No owner or the top_waiter does not change */
+ mark_rt_mutex_waiters(lock2);
+ spin_unlock(&lock2->wait_lock);
+ }
+ }
+
+out_unlock:
+ spin_unlock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_unlock(&hb2->lock);
+
+ /* drop_key_refs() must be called outside the spinlocks. */
+ while (--drop_count >= 0)
+ drop_key_refs(&key1);
+
+out:
+ up_read(¤t->mm->mmap_sem);
+ return ret;
+}
+
+/*
* Wake up all waiters hashed on the physical page that is mapped
* to this virtual address:
*/
@@ -992,9 +1274,10 @@ static int unqueue_me(struct futex_q *q)
/*
* PI futexes can not be requeued and must remove themself from the
- * hash bucket. The hash bucket lock is held on entry and dropped here.
+ * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
+ * and dropped here.
*/
-static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
+static void unqueue_me_pi(struct futex_q *q)
{
WARN_ON(plist_node_empty(&q->list));
plist_del(&q->list, &q->list.plist);
@@ -1003,11 +1286,65 @@ static void unqueue_me_pi(struct futex_q
free_pi_state(q->pi_state);
q->pi_state = NULL;
- spin_unlock(&hb->lock);
+ spin_unlock(q->lock_ptr);
drop_key_refs(&q->key);
}
+/*
+ * Fixup the pi_state owner with current.
+ *
+ * The cur->mm semaphore must be held, it is released at return of this
+ * function.
+ */
+static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
+ struct futex_hash_bucket *hb,
+ struct task_struct *curr)
+{
+ u32 newtid = curr->pid | FUTEX_WAITERS;
+ struct futex_pi_state *pi_state = q->pi_state;
+ u32 uval, curval, newval;
+ int ret;
+
+ /* Owner died? */
+ if (pi_state->owner != NULL) {
+ spin_lock_irq(&pi_state->owner->pi_lock);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ spin_unlock_irq(&pi_state->owner->pi_lock);
+ } else
+ newtid |= FUTEX_OWNER_DIED;
+
+ pi_state->owner = curr;
+
+ spin_lock_irq(&curr->pi_lock);
+ WARN_ON(!list_empty(&pi_state->list));
+ list_add(&pi_state->list, &curr->pi_state_list);
+ spin_unlock_irq(&curr->pi_lock);
+
+ /* Unqueue and drop the lock */
+ unqueue_me_pi(q);
+ up_read(&curr->mm->mmap_sem);
+ /*
+ * We own it, so we have to replace the pending owner
+ * TID. This must be atomic as we have preserve the
+ * owner died bit here.
+ */
+ ret = get_user(uval, uaddr);
+ while (!ret) {
+ newval = (uval & FUTEX_OWNER_DIED) | newtid;
+ newval |= (uval & FUTEX_WAITER_REQUEUED);
+ curval = futex_atomic_cmpxchg_inatomic(uaddr,
+ uval, newval);
+ if (curval == -EFAULT)
+ ret = -EFAULT;
+ if (curval == uval)
+ break;
+ uval = curval;
+ }
+ return ret;
+}
+
static int futex_wait(u32 __user *uaddr, u32 val, struct timespec *time)
{
struct task_struct *curr = current;
@@ -1016,7 +1353,7 @@ static int futex_wait(u32 __user *uaddr,
struct futex_q q;
u32 uval;
int ret;
- struct hrtimer_sleeper t;
+ struct hrtimer_sleeper t, *to = NULL;
int rem = 0;
q.pi_state = NULL;
@@ -1070,6 +1407,14 @@ static int futex_wait(u32 __user *uaddr,
if (uval != val)
goto out_unlock_release_sem;
+ /*
+ * This rt_mutex_waiter structure is prepared here and will
+ * be used only if this task is requeued from a normal futex to
+ * a PI-futex with futex_requeue_pi.
+ */
+ debug_rt_mutex_init_waiter(&q.waiter);
+ q.waiter.task = NULL;
+
/* Only actually queue if *uaddr contained val. */
__queue_me(&q, hb);
@@ -1099,6 +1444,7 @@ static int futex_wait(u32 __user *uaddr,
if (time->tv_sec == 0 && time->tv_nsec == 0)
schedule();
else {
+ to = &t;
hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_REL);
hrtimer_init_sleeper(&t, current);
t.timer.expires = timespec_to_ktime(*time);
@@ -1127,6 +1473,66 @@ static int futex_wait(u32 __user *uaddr,
* we are the only user of it.
*/
+ if (q.pi_state) {
+ /*
+ * We were woken but have been requeued on a PI-futex.
+ * We have to complete the lock acquisition by taking
+ * the rtmutex.
+ */
+
+ struct rt_mutex *lock = &q.pi_state->pi_mutex;
+
+ spin_lock(&lock->wait_lock);
+ if (unlikely(q.waiter.task)) {
+ remove_waiter(lock, &q.waiter);
+ }
+ spin_unlock(&lock->wait_lock);
+
+ if (rem)
+ ret = -ETIMEDOUT;
+ else
+ ret = rt_mutex_timed_lock(lock, to, 1);
+
+ down_read(&curr->mm->mmap_sem);
+ spin_lock(q.lock_ptr);
+
+ /*
+ * Got the lock. We might not be the anticipated owner if we
+ * did a lock-steal - fix up the PI-state in that case.
+ */
+ if (!ret && q.pi_state->owner != curr) {
+ /*
+ * We MUST play with the futex we were requeued on,
+ * NOT the current futex.
+ * We can retrieve it from the key of the pi_state
+ */
+ uaddr = get_futex_address(&q.pi_state->key);
+
+ /* mmap_sem and hash_bucket lock are unlocked at
+ return of this function */
+ ret = fixup_pi_state_owner(uaddr, &q, hb, curr);
+ } else {
+ /*
+ * Catch the rare case, where the lock was released
+ * when we were on the way back before we locked
+ * the hash bucket.
+ */
+ if (ret && q.pi_state->owner == curr) {
+ if (rt_mutex_trylock(&q.pi_state->pi_mutex))
+ ret = 0;
+ }
+ /* Unqueue and drop the lock */
+ unqueue_me_pi(&q);
+ up_read(&curr->mm->mmap_sem);
+ }
+
+ debug_rt_mutex_free_waiter(&q.waiter);
+
+ return ret;
+ }
+
+ debug_rt_mutex_free_waiter(&q.waiter);
+
/* If we were woken (and unqueued), we succeeded, whatever. */
if (!unqueue_me(&q))
return 0;
@@ -1146,6 +1552,53 @@ static int futex_wait(u32 __user *uaddr,
return ret;
}
+static void set_pi_futex_owner(struct futex_hash_bucket *hb,
+ union futex_key *key, struct task_struct *p)
+{
+ struct plist_head *head;
+ struct futex_q *this, *next;
+ struct futex_pi_state *pi_state = NULL;
+ struct rt_mutex *lock;
+
+ /* Search a waiter that should already exists */
+
+ head = &hb->chain;
+
+ plist_for_each_entry_safe(this, next, head, list) {
+ if (match_futex (&this->key, key)) {
+ pi_state = this->pi_state;
+ break;
+ }
+ }
+
+ BUG_ON(!pi_state);
+
+ /* set p as pi_state's owner */
+ atomic_inc(&pi_state->refcount);
+
+ lock = &pi_state->pi_mutex;
+
+ spin_lock(&lock->wait_lock);
+ spin_lock_irq(&p->pi_lock);
+
+ list_add(&pi_state->list, &p->pi_state_list);
+ pi_state->owner = p;
+
+
+ /* set p as pi_mutex's owner */
+ debug_rt_mutex_proxy_lock(lock, p);
+ WARN_ON(rt_mutex_owner(lock));
+ rt_mutex_set_owner(lock, p, 0);
+ rt_mutex_deadlock_account_lock(lock, p);
+
+ plist_add(&rt_mutex_top_waiter(lock)->pi_list_entry,
+ &p->pi_waiters);
+ __rt_mutex_adjust_prio(p);
+
+ spin_unlock_irq(&p->pi_lock);
+ spin_unlock(&lock->wait_lock);
+}
+
/*
* Userspace tried a 0 -> TID atomic transition of the futex value
* and failed. The kernel side here does the whole locking operation:
@@ -1160,7 +1613,7 @@ static int futex_lock_pi(u32 __user *uad
struct futex_hash_bucket *hb;
u32 uval, newval, curval;
struct futex_q q;
- int ret, attempt = 0;
+ int ret, lock_held, attempt = 0;
if (refill_pi_state_cache())
return -ENOMEM;
@@ -1183,6 +1636,8 @@ static int futex_lock_pi(u32 __user *uad
hb = queue_lock(&q, -1, NULL);
retry_locked:
+ lock_held = 0;
+
/*
* To avoid races, we attempt to take the lock here again
* (by doing a 0 -> TID atomic cmpxchg), while holding all
@@ -1201,7 +1656,16 @@ static int futex_lock_pi(u32 __user *uad
if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
if (!detect && 0)
force_sig(SIGKILL, current);
- ret = -EDEADLK;
+ /*
+ * Normally, this check is done in user space.
+ * In case of requeue, the owner may attempt to lock this futex,
+ * even if the ownership has already been given by the previous
+ * waker.
+ * In the usual case, this is a case of deadlock, but not in case
+ * of REQUEUE_PI.
+ */
+ if (!(curval & FUTEX_WAITER_REQUEUED))
+ ret = -EDEADLK;
goto out_unlock_release_sem;
}
@@ -1213,7 +1677,18 @@ static int futex_lock_pi(u32 __user *uad
goto out_unlock_release_sem;
uval = curval;
- newval = uval | FUTEX_WAITERS;
+ /*
+ * In case of a requeue, check if there already is an owner
+ * If not, just take the futex.
+ */
+ if ((curval & FUTEX_WAITER_REQUEUED) && !(curval & FUTEX_TID_MASK)) {
+ /* set current as futex owner */
+ newval = curval | current->pid;
+ lock_held = 1;
+ } else
+ /* Set the WAITERS flag, so the owner will know it has someone
+ to wake at next unlock */
+ newval = curval | FUTEX_WAITERS;
pagefault_disable();
curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
@@ -1224,11 +1699,16 @@ static int futex_lock_pi(u32 __user *uad
if (unlikely(curval != uval))
goto retry_locked;
+ if (lock_held) {
+ set_pi_futex_owner(hb, &q.key, curr);
+ goto out_unlock_release_sem;
+ }
+
/*
* We dont have the lock. Look up the PI state (or create it if
* we are the first waiter):
*/
- ret = lookup_pi_state(uval, hb, &q);
+ ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
if (unlikely(ret)) {
/*
@@ -1291,45 +1771,10 @@ static int futex_lock_pi(u32 __user *uad
* Got the lock. We might not be the anticipated owner if we
* did a lock-steal - fix up the PI-state in that case.
*/
- if (!ret && q.pi_state->owner != curr) {
- u32 newtid = current->pid | FUTEX_WAITERS;
-
- /* Owner died? */
- if (q.pi_state->owner != NULL) {
- spin_lock_irq(&q.pi_state->owner->pi_lock);
- WARN_ON(list_empty(&q.pi_state->list));
- list_del_init(&q.pi_state->list);
- spin_unlock_irq(&q.pi_state->owner->pi_lock);
- } else
- newtid |= FUTEX_OWNER_DIED;
-
- q.pi_state->owner = current;
-
- spin_lock_irq(¤t->pi_lock);
- WARN_ON(!list_empty(&q.pi_state->list));
- list_add(&q.pi_state->list, ¤t->pi_state_list);
- spin_unlock_irq(¤t->pi_lock);
-
- /* Unqueue and drop the lock */
- unqueue_me_pi(&q, hb);
- up_read(&curr->mm->mmap_sem);
- /*
- * We own it, so we have to replace the pending owner
- * TID. This must be atomic as we have preserve the
- * owner died bit here.
- */
- ret = get_user(uval, uaddr);
- while (!ret) {
- newval = (uval & FUTEX_OWNER_DIED) | newtid;
- curval = futex_atomic_cmpxchg_inatomic(uaddr,
- uval, newval);
- if (curval == -EFAULT)
- ret = -EFAULT;
- if (curval == uval)
- break;
- uval = curval;
- }
- } else {
+ if (!ret && q.pi_state->owner != curr)
+ /* mmap_sem is unlocked at return of this function */
+ ret = fixup_pi_state_owner(uaddr, &q, hb, curr);
+ else {
/*
* Catch the rare case, where the lock was released
* when we were on the way back before we locked
@@ -1340,7 +1785,7 @@ static int futex_lock_pi(u32 __user *uad
ret = 0;
}
/* Unqueue and drop the lock */
- unqueue_me_pi(&q, hb);
+ unqueue_me_pi(&q);
up_read(&curr->mm->mmap_sem);
}
@@ -1709,6 +2154,8 @@ retry:
* userspace.
*/
mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
+ /* Also keep the FUTEX_WAITER_REQUEUED flag if set */
+ mval |= (uval & FUTEX_WAITER_REQUEUED);
nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
if (nval == -EFAULT)
@@ -1839,6 +2286,9 @@ long do_futex(u32 __user *uaddr, int op,
case FUTEX_TRYLOCK_PI:
ret = futex_lock_pi(uaddr, 0, timeout, 1);
break;
+ case FUTEX_CMP_REQUEUE_PI:
+ ret = futex_requeue_pi(uaddr, uaddr2, val, val2, &val3);
+ break;
default:
ret = -ENOSYS;
}
@@ -1862,7 +2312,8 @@ asmlinkage long sys_futex(u32 __user *ua
/*
* requeue parameter in 'utime' if op == FUTEX_REQUEUE.
*/
- if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
+ if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE
+ || op == FUTEX_CMP_REQUEUE_PI)
val2 = (u32) (unsigned long) utime;
return do_futex(uaddr, op, val, &t, uaddr2, val2, val3);
Index: linux-2.6/kernel/rtmutex.c
===================================================================
--- linux-2.6.orig/kernel/rtmutex.c 2007-01-08 09:05:50.000000000 +0100
+++ linux-2.6/kernel/rtmutex.c 2007-01-08 10:42:21.000000000 +0100
@@ -56,7 +56,7 @@
* state.
*/
-static void
+void
rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
unsigned long mask)
{
@@ -81,29 +81,6 @@ static void fixup_rt_mutex_waiters(struc
}
/*
- * We can speed up the acquire/release, if the architecture
- * supports cmpxchg and if there's no debugging state to be set up
- */
-#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
-# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
-static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
-{
- unsigned long owner, *p = (unsigned long *) &lock->owner;
-
- do {
- owner = *p;
- } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
-}
-#else
-# define rt_mutex_cmpxchg(l,c,n) (0)
-static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
-{
- lock->owner = (struct task_struct *)
- ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
-}
-#endif
-
-/*
* Calculate task priority from the waiter list priority
*
* Return task->normal_prio when the waiter list is empty or when
@@ -123,7 +100,7 @@ int rt_mutex_getprio(struct task_struct
*
* This can be both boosting and unboosting. task->pi_lock must be held.
*/
-static void __rt_mutex_adjust_prio(struct task_struct *task)
+void __rt_mutex_adjust_prio(struct task_struct *task)
{
int prio = rt_mutex_getprio(task);
@@ -159,11 +136,11 @@ int max_lock_depth = 1024;
* Decreases task's usage by one - may thus free the task.
* Returns 0 or -EDEADLK.
*/
-static int rt_mutex_adjust_prio_chain(struct task_struct *task,
- int deadlock_detect,
- struct rt_mutex *orig_lock,
- struct rt_mutex_waiter *orig_waiter,
- struct task_struct *top_task)
+int rt_mutex_adjust_prio_chain(struct task_struct *task,
+ int deadlock_detect,
+ struct rt_mutex *orig_lock,
+ struct rt_mutex_waiter *orig_waiter,
+ struct task_struct *top_task)
{
struct rt_mutex *lock;
struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
@@ -524,8 +501,8 @@ static void wakeup_next_waiter(struct rt
*
* Must be called with lock->wait_lock held
*/
-static void remove_waiter(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter)
+void remove_waiter(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter)
{
int first = (waiter == rt_mutex_top_waiter(lock));
struct task_struct *owner = rt_mutex_owner(lock);
Index: linux-2.6/kernel/rtmutex_common.h
===================================================================
--- linux-2.6.orig/kernel/rtmutex_common.h 2007-01-08 09:05:50.000000000 +0100
+++ linux-2.6/kernel/rtmutex_common.h 2007-01-08 10:42:21.000000000 +0100
@@ -113,6 +113,29 @@ static inline unsigned long rt_mutex_own
}
/*
+ * We can speed up the acquire/release, if the architecture
+ * supports cmpxchg and if there's no debugging state to be set up
+ */
+#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
+# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ unsigned long owner, *p = (unsigned long *) &lock->owner;
+
+ do {
+ owner = *p;
+ } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
+}
+#else
+# define rt_mutex_cmpxchg(l,c,n) (0)
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ lock->owner = (struct task_struct *)
+ ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
+}
+#endif
+
+/*
* PI-futex support (proxy locking functions, etc.):
*/
extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
@@ -120,4 +143,15 @@ extern void rt_mutex_init_proxy_locked(s
struct task_struct *proxy_owner);
extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
struct task_struct *proxy_owner);
+
+extern void rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
+ unsigned long mask);
+extern void __rt_mutex_adjust_prio(struct task_struct *task);
+extern int rt_mutex_adjust_prio_chain(struct task_struct *task,
+ int deadlock_detect,
+ struct rt_mutex *orig_lock,
+ struct rt_mutex_waiter *orig_waiter,
+ struct task_struct *top_task);
+extern void remove_waiter(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter);
#endif
Index: linux-2.6/kernel/futex_compat.c
===================================================================
--- linux-2.6.orig/kernel/futex_compat.c 2007-01-08 10:42:15.000000000 +0100
+++ linux-2.6/kernel/futex_compat.c 2007-01-08 10:42:21.000000000 +0100
@@ -150,7 +150,8 @@ asmlinkage long compat_sys_futex(u32 __u
if (!timespec_valid(&t))
return -EINVAL;
}
- if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
+ if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE
+ || op == FUTEX_CMP_REQUEUE_PI)
val2 = (int) (unsigned long) utime;
return do_futex(uaddr, op, val, &t, uaddr2, val2, val3);
--
Pierre Peiffer
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 2.6.20-rc4 4/4][RFC] sys_futex64 : allows 64bit futexes
[not found] <45A3B330.9000104@bull.net>
` (2 preceding siblings ...)
2007-01-09 16:20 ` [PATCH 2.6.20-rc4 3/4] futex_requeue_pi optimization Pierre Peiffer
@ 2007-01-09 16:25 ` Pierre Peiffer
2007-01-11 21:49 ` Andrew Morton
2007-01-11 22:14 ` Jakub Jelinek
3 siblings, 2 replies; 20+ messages in thread
From: Pierre Peiffer @ 2007-01-09 16:25 UTC (permalink / raw)
To: LKML
Cc: Dinakar Guniguntala, Jean-Pierre Dion, Ingo Molnar,
Ulrich Drepper, Jakub Jelinek, Darren Hart,
Sébastien Dugué
Hi,
This latest patch is an adaptation of the sys_futex64 syscall provided in the -rt
patch (originally written by Ingo). It allows the use of 64-bit futexes.
I have reworked most of the code to avoid code duplication.
It does not provide the functionality for all architectures, and thus it
cannot be applied "as is".
But, again, feedback and comments are welcome.
---
include/asm-x86_64/futex.h | 113 ++++++++++++++++++++
include/asm-x86_64/unistd.h | 4
include/linux/futex.h | 5
include/linux/syscalls.h | 3
kernel/futex.c | 247 ++++++++++++++++++++++++++++++--------------
kernel/futex_compat.c | 3
kernel/sys_ni.c | 1
7 files changed, 299 insertions(+), 77 deletions(-)
---
Signed-off-by: Pierre Peiffer <pierre.peiffer@bull.net>
---
Index: linux-2.6/include/asm-x86_64/futex.h
===================================================================
--- linux-2.6.orig/include/asm-x86_64/futex.h 2007-01-08 10:40:17.000000000 +0100
+++ linux-2.6/include/asm-x86_64/futex.h 2007-01-08 10:42:27.000000000 +0100
@@ -41,6 +41,39 @@
"=&r" (tem) \
: "r" (oparg), "i" (-EFAULT), "m" (*uaddr), "1" (0))
+#define __futex_atomic_op1_64(insn, ret, oldval, uaddr, oparg) \
+ __asm__ __volatile ( \
+"1: " insn "\n" \
+"2: .section .fixup,\"ax\"\n\
+3: movq %3, %1\n\
+ jmp 2b\n\
+ .previous\n\
+ .section __ex_table,\"a\"\n\
+ .align 8\n\
+ .quad 1b,3b\n\
+ .previous" \
+ : "=r" (oldval), "=r" (ret), "=m" (*uaddr) \
+ : "i" (-EFAULT), "m" (*uaddr), "0" (oparg), "1" (0))
+
+#define __futex_atomic_op2_64(insn, ret, oldval, uaddr, oparg) \
+ __asm__ __volatile ( \
+"1: movq %2, %0\n\
+ movq %0, %3\n" \
+ insn "\n" \
+"2: " LOCK_PREFIX "cmpxchgq %3, %2\n\
+ jnz 1b\n\
+3: .section .fixup,\"ax\"\n\
+4: movq %5, %1\n\
+ jmp 3b\n\
+ .previous\n\
+ .section __ex_table,\"a\"\n\
+ .align 8\n\
+ .quad 1b,4b,2b,4b\n\
+ .previous" \
+ : "=&a" (oldval), "=&r" (ret), "=m" (*uaddr), \
+ "=&r" (tem) \
+ : "r" (oparg), "i" (-EFAULT), "m" (*uaddr), "1" (0))
+
static inline int
futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
{
@@ -95,6 +128,60 @@ futex_atomic_op_inuser (int encoded_op,
}
static inline int
+futex_atomic_op_inuser64 (int encoded_op, u64 __user *uaddr)
+{
+ int op = (encoded_op >> 28) & 7;
+ int cmp = (encoded_op >> 24) & 15;
+ u64 oparg = (encoded_op << 8) >> 20;
+ u64 cmparg = (encoded_op << 20) >> 20;
+ u64 oldval = 0, ret, tem;
+
+ if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+ oparg = 1 << oparg;
+
+ if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u64)))
+ return -EFAULT;
+
+ inc_preempt_count();
+
+ switch (op) {
+ case FUTEX_OP_SET:
+ __futex_atomic_op1_64("xchgq %0, %2", ret, oldval, uaddr, oparg);
+ break;
+ case FUTEX_OP_ADD:
+ __futex_atomic_op1_64(LOCK_PREFIX "xaddq %0, %2", ret, oldval,
+ uaddr, oparg);
+ break;
+ case FUTEX_OP_OR:
+ __futex_atomic_op2_64("orq %4, %3", ret, oldval, uaddr, oparg);
+ break;
+ case FUTEX_OP_ANDN:
+ __futex_atomic_op2_64("andq %4, %3", ret, oldval, uaddr, ~oparg);
+ break;
+ case FUTEX_OP_XOR:
+ __futex_atomic_op2_64("xorq %4, %3", ret, oldval, uaddr, oparg);
+ break;
+ default:
+ ret = -ENOSYS;
+ }
+
+ dec_preempt_count();
+
+ if (!ret) {
+ switch (cmp) {
+ case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+ case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+ case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+ case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+ case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+ case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+ default: ret = -ENOSYS;
+ }
+ }
+ return ret;
+}
+
+static inline int
futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
{
if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
@@ -121,5 +208,31 @@ futex_atomic_cmpxchg_inatomic(int __user
return oldval;
}
+static inline u64
+futex_atomic_cmpxchg_inatomic64(u64 __user *uaddr, u64 oldval, u64 newval)
+{
+ if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u64)))
+ return -EFAULT;
+
+ __asm__ __volatile__(
+ "1: " LOCK_PREFIX "cmpxchgq %3, %1 \n"
+
+ "2: .section .fixup, \"ax\" \n"
+ "3: mov %2, %0 \n"
+ " jmp 2b \n"
+ " .previous \n"
+
+ " .section __ex_table, \"a\" \n"
+ " .align 8 \n"
+ " .quad 1b,3b \n"
+ " .previous \n"
+
+ : "=a" (oldval), "=m" (*uaddr)
+ : "i" (-EFAULT), "r" (newval), "0" (oldval)
+ : "memory"
+ );
+
+ return oldval;
+}
#endif
#endif
Index: linux-2.6/include/asm-x86_64/unistd.h
===================================================================
--- linux-2.6.orig/include/asm-x86_64/unistd.h 2007-01-08 10:40:17.000000000 +0100
+++ linux-2.6/include/asm-x86_64/unistd.h 2007-01-08 10:42:27.000000000 +0100
@@ -619,8 +619,10 @@ __SYSCALL(__NR_sync_file_range, sys_sync
__SYSCALL(__NR_vmsplice, sys_vmsplice)
#define __NR_move_pages 279
__SYSCALL(__NR_move_pages, sys_move_pages)
+#define __NR_futex64 280
+__SYSCALL(__NR_futex64, sys_futex64)
-#define __NR_syscall_max __NR_move_pages
+#define __NR_syscall_max __NR_futex64
#ifndef __NO_STUBS
#define __ARCH_WANT_OLD_READDIR
Index: linux-2.6/include/linux/syscalls.h
===================================================================
--- linux-2.6.orig/include/linux/syscalls.h 2007-01-08 09:05:48.000000000 +0100
+++ linux-2.6/include/linux/syscalls.h 2007-01-08 10:42:27.000000000 +0100
@@ -178,6 +178,9 @@ asmlinkage long sys_set_tid_address(int
asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
struct timespec __user *utime, u32 __user *uaddr2,
u32 val3);
+asmlinkage long sys_futex64(u64 __user *uaddr, int op, u64 val,
+ struct timespec __user *utime, u64 __user *uaddr2,
+ u64 val3);
asmlinkage long sys_init_module(void __user *umod, unsigned long len,
const char __user *uargs);
Index: linux-2.6/kernel/futex.c
===================================================================
--- linux-2.6.orig/kernel/futex.c 2007-01-08 10:42:21.000000000 +0100
+++ linux-2.6/kernel/futex.c 2007-01-09 11:21:49.000000000 +0100
@@ -60,6 +60,44 @@
#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
+#ifdef CONFIG_64BIT
+static inline unsigned long
+futex_cmpxchg_inatomic(unsigned long __user *uaddr, unsigned long oldval,
+ unsigned long newval, int futex64)
+{
+ if (futex64)
+ return futex_atomic_cmpxchg_inatomic64((u64 __user *)uaddr,
+ oldval, newval);
+ else {
+ u32 ov = oldval, nv = newval;
+ return futex_atomic_cmpxchg_inatomic((int __user *)uaddr, ov,
+ nv);
+ }
+}
+
+static inline int
+futex_get_user(unsigned long *val, unsigned long __user *uaddr, int futex64)
+{
+ int ret;
+
+ if (futex64)
+ ret = get_user(*val, uaddr);
+ else {
+ u32 __user *addr = (u32 __user *)uaddr;
+
+ ret = get_user(*val, addr);
+ }
+ return ret;
+}
+
+#else
+#define futex_cmpxchg_inatomic(uaddr, oldval, newval, futex64) \
+ futex_atomic_cmpxchg_inatomic((u32*)uaddr, oldval, newval)
+
+#define futex_get_user(val, uaddr, futex64) get_user(*val, uaddr)
+
+#endif
+
/*
* Futexes are matched on equal values of this key.
* The key type depends on whether it's a shared or private mapping.
@@ -165,6 +203,7 @@ static struct futex_hash_bucket *hash_fu
return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
}
+
/*
* Return 1 if two futex_keys are equal, 0 otherwise.
*/
@@ -187,7 +226,7 @@ static inline int match_futex(union fute
*
* Should be called with ¤t->mm->mmap_sem but NOT any spinlocks.
*/
-static int get_futex_key(u32 __user *uaddr, union futex_key *key)
+static int get_futex_key(void *uaddr, union futex_key *key)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
@@ -309,13 +348,30 @@ static void drop_key_refs(union futex_ke
}
}
-static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
+static inline int
+get_futex_value_locked(unsigned long *dest, unsigned long __user *from,
+ int futex64)
{
int ret;
+#ifdef CONFIG_64BIT
+ if (futex64) {
+ pagefault_disable();
+ ret = __copy_from_user_inatomic(dest, from, sizeof(u64));
+ pagefault_enable();
+ } else {
+ u32 d;
+ pagefault_disable();
+ ret = __copy_from_user_inatomic(&d, from, sizeof(u32));
+ pagefault_enable();
+
+ *dest = d;
+ }
+#else
pagefault_disable();
ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
pagefault_enable();
+#endif
return ret ? -EFAULT : 0;
}
@@ -588,11 +644,12 @@ static void wake_futex(struct futex_q *q
q->lock_ptr = NULL;
}
-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
+static int wake_futex_pi(unsigned long __user *uaddr, unsigned long uval,
+ struct futex_q *this, int futex64)
{
struct task_struct *new_owner;
struct futex_pi_state *pi_state = this->pi_state;
- u32 curval, newval;
+ unsigned long curval, newval;
if (!pi_state)
return -EINVAL;
@@ -619,7 +676,7 @@ static int wake_futex_pi(u32 __user *uad
newval |= (uval & FUTEX_WAITER_REQUEUED);
pagefault_disable();
- curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+ curval = futex_cmpxchg_inatomic(uaddr, uval, newval, futex64);
pagefault_enable();
if (curval == -EFAULT)
return -EFAULT;
@@ -643,16 +700,17 @@ static int wake_futex_pi(u32 __user *uad
return 0;
}
-static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
+static int unlock_futex_pi(unsigned long __user *uaddr, unsigned long uval,
+ int futex64)
{
- u32 oldval;
+ unsigned long oldval;
/*
* There is no waiter, so we unlock the futex. The owner died
* bit has not to be preserved here. We are the owner:
*/
pagefault_disable();
- oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
+ oldval = futex_cmpxchg_inatomic(uaddr, uval, 0, futex64);
pagefault_enable();
if (oldval == -EFAULT)
@@ -686,18 +744,19 @@ double_lock_hb(struct futex_hash_bucket
* or create a new one without owner.
*/
static inline int
-lookup_pi_state_for_requeue(u32 __user *uaddr, struct futex_hash_bucket *hb,
+lookup_pi_state_for_requeue(unsigned long __user *uaddr,
+ struct futex_hash_bucket *hb,
union futex_key *key,
- struct futex_pi_state **pi_state)
+ struct futex_pi_state **pi_state, int futex64)
{
- u32 curval, uval, newval;
+ unsigned long curval, uval, newval;
retry:
/*
* We can't handle a fault cleanly because we can't
* release the locks here. Simply return the fault.
*/
- if (get_futex_value_locked(&curval, uaddr))
+ if (get_futex_value_locked(&curval, uaddr, futex64))
return -EFAULT;
/* set the flags FUTEX_WAITERS and FUTEX_WAITER_REQUEUED */
@@ -711,7 +770,7 @@ retry:
newval = uval | FUTEX_WAITERS | FUTEX_WAITER_REQUEUED;
pagefault_disable();
- curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+ curval = futex_cmpxchg_inatomic(uaddr, uval, newval, futex64);
pagefault_enable();
if (unlikely(curval == -EFAULT))
@@ -742,8 +801,9 @@ retry:
* and requeue the next nr_requeue waiters following hashed on
* one physical page to another physical page (PI-futex uaddr2)
*/
-static int futex_requeue_pi(u32 __user *uaddr1, u32 __user *uaddr2,
- int nr_wake, int nr_requeue, u32 *cmpval)
+static int
+futex_requeue_pi(unsigned long __user *uaddr1, unsigned long __user *uaddr2,
+ int nr_wake, int nr_requeue, unsigned long *cmpval, int futex64)
{
union futex_key key1, key2;
struct futex_hash_bucket *hb1, *hb2;
@@ -776,9 +836,9 @@ retry:
double_lock_hb(hb1, hb2);
if (likely(cmpval != NULL)) {
- u32 curval;
+ unsigned long curval;
- ret = get_futex_value_locked(&curval, uaddr1);
+ ret = get_futex_value_locked(&curval, uaddr1, futex64);
if (unlikely(ret)) {
spin_unlock(&hb1->lock);
@@ -791,7 +851,7 @@ retry:
*/
up_read(¤t->mm->mmap_sem);
- ret = get_user(curval, uaddr1);
+ ret = futex_get_user(&curval, uaddr1, futex64);
if (!ret)
goto retry;
@@ -818,7 +878,8 @@ retry:
int s;
/* do this only the first time we requeue someone */
s = lookup_pi_state_for_requeue(uaddr2, hb2,
- &key2, &pi_state2);
+ &key2, &pi_state2,
+ futex64);
if (s) {
ret = s;
goto out_unlock;
@@ -931,7 +992,7 @@ out:
* Wake up all waiters hashed on the physical page that is mapped
* to this virtual address:
*/
-static int futex_wake(u32 __user *uaddr, int nr_wake)
+static int futex_wake(unsigned long __user *uaddr, int nr_wake)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
@@ -972,8 +1033,8 @@ out:
* to this virtual address:
*/
static int
-futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2,
- int nr_wake, int nr_wake2, int op)
+futex_wake_op(unsigned long __user *uaddr1, unsigned long __user *uaddr2,
+ int nr_wake, int nr_wake2, int op, int futex64)
{
union futex_key key1, key2;
struct futex_hash_bucket *hb1, *hb2;
@@ -997,9 +1058,16 @@ retryfull:
retry:
double_lock_hb(hb1, hb2);
- op_ret = futex_atomic_op_inuser(op, uaddr2);
+#ifdef CONFIG_64BIT
+ if (futex64)
+ op_ret = futex_atomic_op_inuser64(op, (u64 __user *)uaddr2);
+ else
+ op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2);
+#else
+ op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2);
+#endif
if (unlikely(op_ret < 0)) {
- u32 dummy;
+ unsigned long dummy;
spin_unlock(&hb1->lock);
if (hb1 != hb2)
@@ -1041,7 +1109,7 @@ retry:
*/
up_read(¤t->mm->mmap_sem);
- ret = get_user(dummy, uaddr2);
+ ret = futex_get_user(&dummy, uaddr2, futex64);
if (ret)
return ret;
@@ -1084,8 +1152,9 @@ out:
* Requeue all waiters hashed on one physical page to another
* physical page.
*/
-static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
- int nr_wake, int nr_requeue, u32 *cmpval)
+static int
+futex_requeue(unsigned long __user *uaddr1, unsigned long __user *uaddr2,
+ int nr_wake, int nr_requeue, unsigned long *cmpval, int futex64)
{
union futex_key key1, key2;
struct futex_hash_bucket *hb1, *hb2;
@@ -1109,9 +1178,9 @@ static int futex_requeue(u32 __user *uad
double_lock_hb(hb1, hb2);
if (likely(cmpval != NULL)) {
- u32 curval;
+ unsigned long curval;
- ret = get_futex_value_locked(&curval, uaddr1);
+ ret = get_futex_value_locked(&curval, uaddr1, futex64);
if (unlikely(ret)) {
spin_unlock(&hb1->lock);
@@ -1124,7 +1193,7 @@ static int futex_requeue(u32 __user *uad
*/
up_read(¤t->mm->mmap_sem);
- ret = get_user(curval, uaddr1);
+ ret = futex_get_user(&curval, uaddr1, futex64);
if (!ret)
goto retry;
@@ -1297,13 +1366,13 @@ static void unqueue_me_pi(struct futex_q
* The cur->mm semaphore must be held, it is released at return of this
* function.
*/
-static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
+static int fixup_pi_state_owner(unsigned long __user *uaddr, struct futex_q *q,
struct futex_hash_bucket *hb,
- struct task_struct *curr)
+ struct task_struct *curr, int futex64)
{
- u32 newtid = curr->pid | FUTEX_WAITERS;
+ unsigned long newtid = curr->pid | FUTEX_WAITERS;
struct futex_pi_state *pi_state = q->pi_state;
- u32 uval, curval, newval;
+ unsigned long uval, curval, newval;
int ret;
/* Owner died? */
@@ -1330,12 +1399,12 @@ static int fixup_pi_state_owner(u32 __us
* TID. This must be atomic as we have preserve the
* owner died bit here.
*/
- ret = get_user(uval, uaddr);
+ ret = futex_get_user(&uval, uaddr, futex64);
while (!ret) {
newval = (uval & FUTEX_OWNER_DIED) | newtid;
newval |= (uval & FUTEX_WAITER_REQUEUED);
- curval = futex_atomic_cmpxchg_inatomic(uaddr,
- uval, newval);
+ curval = futex_cmpxchg_inatomic(uaddr,uval,
+ newval, futex64);
if (curval == -EFAULT)
ret = -EFAULT;
if (curval == uval)
@@ -1345,13 +1414,14 @@ static int fixup_pi_state_owner(u32 __us
return ret;
}
-static int futex_wait(u32 __user *uaddr, u32 val, struct timespec *time)
+static int futex_wait(unsigned long __user *uaddr, unsigned long val,
+ struct timespec *time, int futex64)
{
struct task_struct *curr = current;
DECLARE_WAITQUEUE(wait, curr);
struct futex_hash_bucket *hb;
struct futex_q q;
- u32 uval;
+ unsigned long uval;
int ret;
struct hrtimer_sleeper t, *to = NULL;
int rem = 0;
@@ -1386,7 +1456,7 @@ static int futex_wait(u32 __user *uaddr,
* We hold the mmap semaphore, so the mapping cannot have changed
* since we looked it up in get_futex_key.
*/
- ret = get_futex_value_locked(&uval, uaddr);
+ ret = get_futex_value_locked(&uval, uaddr, futex64);
if (unlikely(ret)) {
queue_unlock(&q, hb);
@@ -1396,8 +1466,7 @@ static int futex_wait(u32 __user *uaddr,
* start all over again.
*/
up_read(&curr->mm->mmap_sem);
-
- ret = get_user(uval, uaddr);
+ ret = futex_get_user(&uval, uaddr, futex64);
if (!ret)
goto retry;
@@ -1510,7 +1579,7 @@ static int futex_wait(u32 __user *uaddr,
/* mmap_sem and hash_bucket lock are unlocked at
return of this function */
- ret = fixup_pi_state_owner(uaddr, &q, hb, curr);
+ ret = fixup_pi_state_owner(uaddr, &q, hb, curr, futex64);
} else {
/*
* Catch the rare case, where the lock was released
@@ -1605,13 +1674,13 @@ static void set_pi_futex_owner(struct fu
* if there are waiters then it will block, it does PI, etc. (Due to
* races the kernel might see a 0 value of the futex too.)
*/
-static int futex_lock_pi(u32 __user *uaddr, int detect, struct timespec *time,
- int trylock)
+static int futex_lock_pi(unsigned long __user *uaddr, int detect,
+ struct timespec *time, int trylock, int futex64)
{
struct hrtimer_sleeper timeout, *to = NULL;
struct task_struct *curr = current;
struct futex_hash_bucket *hb;
- u32 uval, newval, curval;
+ unsigned long uval, newval, curval;
struct futex_q q;
int ret, lock_held, attempt = 0;
@@ -1646,7 +1715,7 @@ static int futex_lock_pi(u32 __user *uad
newval = current->pid;
pagefault_disable();
- curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
+ curval = futex_cmpxchg_inatomic(uaddr, 0, newval, futex64);
pagefault_enable();
if (unlikely(curval == -EFAULT))
@@ -1691,7 +1760,7 @@ static int futex_lock_pi(u32 __user *uad
newval = curval | FUTEX_WAITERS;
pagefault_disable();
- curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+ curval = futex_cmpxchg_inatomic(uaddr, uval, newval, futex64);
pagefault_enable();
if (unlikely(curval == -EFAULT))
@@ -1728,8 +1797,8 @@ static int futex_lock_pi(u32 __user *uad
FUTEX_OWNER_DIED | FUTEX_WAITERS;
pagefault_disable();
- curval = futex_atomic_cmpxchg_inatomic(uaddr,
- uval, newval);
+ curval = futex_cmpxchg_inatomic(uaddr, uval,
+ newval, futex64);
pagefault_enable();
if (unlikely(curval == -EFAULT))
@@ -1773,7 +1842,7 @@ static int futex_lock_pi(u32 __user *uad
*/
if (!ret && q.pi_state->owner != curr)
/* mmap_sem is unlocked at return of this function */
- ret = fixup_pi_state_owner(uaddr, &q, hb, curr);
+ ret = fixup_pi_state_owner(uaddr, &q, hb, curr, futex64);
else {
/*
* Catch the rare case, where the lock was released
@@ -1819,7 +1888,7 @@ static int futex_lock_pi(u32 __user *uad
queue_unlock(&q, hb);
up_read(&curr->mm->mmap_sem);
- ret = get_user(uval, uaddr);
+ ret = futex_get_user(&uval, uaddr, futex64);
if (!ret && (uval != -EFAULT))
goto retry;
@@ -1831,17 +1900,17 @@ static int futex_lock_pi(u32 __user *uad
* This is the in-kernel slowpath: we look up the PI state (if any),
* and do the rt-mutex unlock.
*/
-static int futex_unlock_pi(u32 __user *uaddr)
+static int futex_unlock_pi(unsigned long __user *uaddr, int futex64)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
- u32 uval;
+ unsigned long uval;
struct plist_head *head;
union futex_key key;
int ret, attempt = 0;
retry:
- if (get_user(uval, uaddr))
+ if (futex_get_user(&uval, uaddr, futex64))
return -EFAULT;
/*
* We release only a lock we actually own:
@@ -1868,7 +1937,7 @@ retry_locked:
*/
if (!(uval & FUTEX_OWNER_DIED)) {
pagefault_disable();
- uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
+ uval = futex_cmpxchg_inatomic(uaddr, current->pid, 0, futex64);
pagefault_enable();
}
@@ -1890,7 +1959,7 @@ retry_locked:
plist_for_each_entry_safe(this, next, head, list) {
if (!match_futex (&this->key, &key))
continue;
- ret = wake_futex_pi(uaddr, uval, this);
+ ret = wake_futex_pi(uaddr, uval, this, futex64);
/*
* The atomic access to the futex value
* generated a pagefault, so retry the
@@ -1904,7 +1973,7 @@ retry_locked:
* No waiters - kernel unlocks the futex:
*/
if (!(uval & FUTEX_OWNER_DIED)) {
- ret = unlock_futex_pi(uaddr, uval);
+ ret = unlock_futex_pi(uaddr, uval, futex64);
if (ret == -EFAULT)
goto pi_faulted;
}
@@ -1934,7 +2003,7 @@ pi_faulted:
spin_unlock(&hb->lock);
up_read(¤t->mm->mmap_sem);
- ret = get_user(uval, uaddr);
+ ret = futex_get_user(&uval, uaddr, futex64);
if (!ret && (uval != -EFAULT))
goto retry;
@@ -2170,7 +2239,7 @@ retry:
*/
if (!pi) {
if (uval & FUTEX_WAITERS)
- futex_wake(uaddr, 1);
+ futex_wake((unsigned long __user *)uaddr, 1);
}
}
return 0;
@@ -2252,42 +2321,46 @@ void exit_robust_list(struct task_struct
}
}
-long do_futex(u32 __user *uaddr, int op, u32 val, struct timespec *timeout,
- u32 __user *uaddr2, u32 val2, u32 val3)
+long do_futex(unsigned long __user *uaddr, int op, unsigned long val,
+ struct timespec *timeout, unsigned long __user *uaddr2,
+ unsigned long val2, unsigned long val3, int fut64)
{
int ret;
switch (op) {
case FUTEX_WAIT:
- ret = futex_wait(uaddr, val, timeout);
+ ret = futex_wait(uaddr, val, timeout, fut64);
break;
case FUTEX_WAKE:
ret = futex_wake(uaddr, val);
break;
case FUTEX_FD:
- /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
- ret = futex_fd(uaddr, val);
+ if (fut64)
+ ret = -ENOSYS;
+ else
+ /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
+ ret = futex_fd((u32 __user *)uaddr, val);
break;
case FUTEX_REQUEUE:
- ret = futex_requeue(uaddr, uaddr2, val, val2, NULL);
+ ret = futex_requeue(uaddr, uaddr2, val, val2, NULL, fut64);
break;
case FUTEX_CMP_REQUEUE:
- ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
+ ret = futex_requeue(uaddr, uaddr2, val, val2, &val3, fut64);
break;
case FUTEX_WAKE_OP:
- ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
+ ret = futex_wake_op(uaddr, uaddr2, val, val2, val3, fut64);
break;
case FUTEX_LOCK_PI:
- ret = futex_lock_pi(uaddr, val, timeout, 0);
+ ret = futex_lock_pi(uaddr, val, timeout, 0, fut64);
break;
case FUTEX_UNLOCK_PI:
- ret = futex_unlock_pi(uaddr);
+ ret = futex_unlock_pi(uaddr, fut64);
break;
case FUTEX_TRYLOCK_PI:
- ret = futex_lock_pi(uaddr, 0, timeout, 1);
+ ret = futex_lock_pi(uaddr, 0, timeout, 1, fut64);
break;
case FUTEX_CMP_REQUEUE_PI:
- ret = futex_requeue_pi(uaddr, uaddr2, val, val2, &val3);
+ ret = futex_requeue_pi(uaddr, uaddr2, val, val2, &val3, fut64);
break;
default:
ret = -ENOSYS;
@@ -2295,6 +2368,33 @@ long do_futex(u32 __user *uaddr, int op,
return ret;
}
+#ifdef CONFIG_64BIT
+
+asmlinkage long
+sys_futex64(u64 __user *uaddr, int op, u64 val,
+ struct timespec __user *utime, u64 __user *uaddr2, u64 val3)
+{
+ struct timespec t = {.tv_sec = 0, .tv_nsec = 0};
+ u64 val2 = 0;
+
+ if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
+ if (copy_from_user(&t, utime, sizeof(t)) != 0)
+ return -EFAULT;
+ if (!timespec_valid(&t))
+ return -EINVAL;
+ }
+ /*
+ * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
+ */
+ if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE
+ || op == FUTEX_CMP_REQUEUE_PI)
+ val2 = (unsigned long) utime;
+
+ return do_futex((unsigned long __user*)uaddr, op, val, &t,
+ (unsigned long __user*)uaddr2, val2, val3, 1);
+}
+
+#endif
asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
struct timespec __user *utime, u32 __user *uaddr2,
@@ -2316,7 +2416,8 @@ asmlinkage long sys_futex(u32 __user *ua
|| op == FUTEX_CMP_REQUEUE_PI)
val2 = (u32) (unsigned long) utime;
- return do_futex(uaddr, op, val, &t, uaddr2, val2, val3);
+ return do_futex((unsigned long __user*)uaddr, op, val, &t,
+ (unsigned long __user*)uaddr2, val2, val3, 0);
}
static int futexfs_get_sb(struct file_system_type *fs_type,
Index: linux-2.6/kernel/sys_ni.c
===================================================================
--- linux-2.6.orig/kernel/sys_ni.c 2007-01-08 09:05:48.000000000 +0100
+++ linux-2.6/kernel/sys_ni.c 2007-01-08 10:42:27.000000000 +0100
@@ -41,6 +41,7 @@ cond_syscall(sys_sendmsg);
cond_syscall(sys_recvmsg);
cond_syscall(sys_socketcall);
cond_syscall(sys_futex);
+cond_syscall(sys_futex64);
cond_syscall(compat_sys_futex);
cond_syscall(sys_set_robust_list);
cond_syscall(compat_sys_set_robust_list);
Index: linux-2.6/include/linux/futex.h
===================================================================
--- linux-2.6.orig/include/linux/futex.h 2007-01-08 10:42:21.000000000 +0100
+++ linux-2.6/include/linux/futex.h 2007-01-09 08:36:25.000000000 +0100
@@ -100,8 +100,9 @@ struct robust_list_head {
#define ROBUST_LIST_LIMIT 2048
#ifdef __KERNEL__
-long do_futex(u32 __user *uaddr, int op, u32 val, struct timespec *timeout,
- u32 __user *uaddr2, u32 val2, u32 val3);
+long do_futex(unsigned long __user *uaddr, int op, unsigned long val,
+ struct timespec *timeout, unsigned long __user *uaddr2,
+ unsigned long val2, unsigned long val3, int futex64);
extern int
handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi);
Index: linux-2.6/kernel/futex_compat.c
===================================================================
--- linux-2.6.orig/kernel/futex_compat.c 2007-01-08 10:42:21.000000000 +0100
+++ linux-2.6/kernel/futex_compat.c 2007-01-09 11:13:52.000000000 +0100
@@ -154,5 +154,6 @@ asmlinkage long compat_sys_futex(u32 __u
|| op == FUTEX_CMP_REQUEUE_PI)
val2 = (int) (unsigned long) utime;
- return do_futex(uaddr, op, val, &t, uaddr2, val2, val3);
+ return do_futex((unsigned long __user*)uaddr, op, val, &t,
+ (unsigned long __user*)uaddr2, val2, val3, 0);
}
--
Pierre Peiffer
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2.6.20-rc4 1/4] futex priority based wakeup
2007-01-09 16:16 ` [PATCH 2.6.20-rc4 1/4] futex priority based wakeup Pierre Peiffer
@ 2007-01-09 16:29 ` Ulrich Drepper
2007-01-10 11:47 ` Pierre Peiffer
2007-01-09 17:59 ` Daniel Walker
2007-01-10 16:11 ` Daniel Walker
2 siblings, 1 reply; 20+ messages in thread
From: Ulrich Drepper @ 2007-01-09 16:29 UTC (permalink / raw)
To: Pierre Peiffer
Cc: LKML, Dinakar Guniguntala, Jean-Pierre Dion, Ingo Molnar,
Jakub Jelinek, Darren Hart, Sébastien Dugué
[-- Attachment #1: Type: text/plain, Size: 378 bytes --]
Pierre Peiffer wrote:
> This patch makes use of plist (priority ordered lists) instead of simple
> list in
> futex_hash_bucket.
I have never seen performance numbers for this. If it is punishing
existing code in a measurable way I think it's not an acceptable default
behavior.
--
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 251 bytes --]
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2.6.20-rc4 3/4] futex_requeue_pi optimization
2007-01-09 16:20 ` [PATCH 2.6.20-rc4 3/4] futex_requeue_pi optimization Pierre Peiffer
@ 2007-01-09 16:33 ` Ulrich Drepper
2007-01-10 8:17 ` Pierre Peiffer
0 siblings, 1 reply; 20+ messages in thread
From: Ulrich Drepper @ 2007-01-09 16:33 UTC (permalink / raw)
To: Pierre Peiffer
Cc: LKML, Dinakar Guniguntala, Jean-Pierre Dion, Ingo Molnar,
Jakub Jelinek, Darren Hart, Sébastien Dugué
[-- Attachment #1: Type: text/plain, Size: 345 bytes --]
Pierre Peiffer wrote:
> This provides an optimization, already used for (normal) futexes, to be
> used for
> PI-futexes.
So, this patch implements requeuing from a non-PI futex to a PI futex?
That's the bare minimum needed. What about PI to PI?
--
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 251 bytes --]
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2.6.20-rc4 1/4] futex priority based wakeup
2007-01-09 16:16 ` [PATCH 2.6.20-rc4 1/4] futex priority based wakeup Pierre Peiffer
2007-01-09 16:29 ` Ulrich Drepper
@ 2007-01-09 17:59 ` Daniel Walker
2007-01-10 16:11 ` Daniel Walker
2 siblings, 0 replies; 20+ messages in thread
From: Daniel Walker @ 2007-01-09 17:59 UTC (permalink / raw)
To: Pierre Peiffer
Cc: LKML, Dinakar Guniguntala, Jean-Pierre Dion, Ingo Molnar,
Ulrich Drepper, Jakub Jelinek, Darren Hart,
Sébastien Dugué
On Tue, 2007-01-09 at 17:16 +0100, Pierre Peiffer wrote:
> @@ -133,8 +133,8 @@ struct futex_q {
> * Split the global futex_lock into every hash list lock.
> */
> struct futex_hash_bucket {
> - spinlock_t lock;
> - struct list_head chain;
> + spinlock_t lock;
> + struct plist_head chain;
Should have tabs between spinlock_t and lock , and plist_head and
chain.. It looks like the original didn't, but as long as your cleaning
up may as well get add them.
Daniel
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2.6.20-rc4 3/4] futex_requeue_pi optimization
2007-01-09 16:33 ` Ulrich Drepper
@ 2007-01-10 8:17 ` Pierre Peiffer
2007-01-10 8:24 ` Ulrich Drepper
0 siblings, 1 reply; 20+ messages in thread
From: Pierre Peiffer @ 2007-01-10 8:17 UTC (permalink / raw)
To: Ulrich Drepper
Cc: LKML, Dinakar Guniguntala, Jean-Pierre Dion, Jakub Jelinek,
Darren Hart, Sébastien Dugué
Ulrich Drepper wrote :
>
> So, this patch implements requeuing from a non-PI futex to a PI futex?
Yes.
> That's the bare minimum needed. What about PI to PI?
I may miss something, but I don't think there is a need for that.
Currently, futex_requeue is (only) used in pthread_cond_broadcast to requeue
some threads from an internal futex (futex1) to another futex (futex2, which is
the futex behind the cond_mutex)
futex1 does not need to be a PI-futex, I think.
--
Pierre
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2.6.20-rc4 3/4] futex_requeue_pi optimization
2007-01-10 8:17 ` Pierre Peiffer
@ 2007-01-10 8:24 ` Ulrich Drepper
0 siblings, 0 replies; 20+ messages in thread
From: Ulrich Drepper @ 2007-01-10 8:24 UTC (permalink / raw)
To: Pierre Peiffer
Cc: LKML, Dinakar Guniguntala, Jean-Pierre Dion, Jakub Jelinek,
Darren Hart, Sébastien Dugué
[-- Attachment #1: Type: text/plain, Size: 296 bytes --]
Pierre Peiffer wrote:
> I may miss something, but I don't think there is a need for that.
Yes, I know, I was asking about it only for completeness. Maybe there
will be a reason to have it some day.
--
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 251 bytes --]
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2.6.20-rc4 1/4] futex priority based wakeup
2007-01-09 16:29 ` Ulrich Drepper
@ 2007-01-10 11:47 ` Pierre Peiffer
2007-01-10 12:03 ` Pierre Peiffer
` (2 more replies)
0 siblings, 3 replies; 20+ messages in thread
From: Pierre Peiffer @ 2007-01-10 11:47 UTC (permalink / raw)
To: Ulrich Drepper
Cc: LKML, Dinakar Guniguntala, Jean-Pierre Dion, Ingo Molnar,
Jakub Jelinek, Darren Hart, Sébastien Dugué
Ulrich Drepper a écrit :
>
> I have never seen performance numbers for this. If it is punishing
> existing code in a measurable way I think it's not an acceptable default
> behavior.
>
Here are some numbers. My test program measures the latency of pthread_broadcast
with 1000 pthreads (all threads are blocked on pthread_cond_wait, the time is
measured between the broadcast call and the last woken pthread).
Here are the average latencies after 5000 measures.
[only this patch is used, not the following.
The system is a dual Xeon 2.80GHz with HT enable]
First case: all threads are SCHED_OTHER
* with simple list:
Iterations=5000
Latency (us) min max avg stddev
3869 7400 6656.73 539.35
* with plist:
Iterations=5000
Latency (us) min max avg stddev
3684 7629 6787.97 479.41
Second case: all threads are SCHED_FIFO with priority equally distributed from
priomin to priomax
* with simple list:
Iterations=5000
Latency (us) min max avg stddev
4548 7197 6656.85 463.30
* with plist:
Iterations=5000
Latency (us) min max avg stddev
8289 11752 9720.12 426.45
So, yes it (logically) has a cost, depending of the number of different
priorities used, so it's specially measurable with real-time threads.
With SCHED_OTHER, I suppose that the priorities are not very distributed.
May be, supposing it makes sense to respect the priority order only for
real-time pthreads, I can register all SCHED_OTHER threads to the same
MAX_RT_PRIO priority ?
Or do you think this must be set behind a CONFIG* option ?
(Or finally not interesting enough for mainline ?)
--
Pierre
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2.6.20-rc4 1/4] futex priority based wakeup
2007-01-10 11:47 ` Pierre Peiffer
@ 2007-01-10 12:03 ` Pierre Peiffer
2007-01-10 12:54 ` Jakub Jelinek
2007-01-11 7:20 ` Ulrich Drepper
2 siblings, 0 replies; 20+ messages in thread
From: Pierre Peiffer @ 2007-01-10 12:03 UTC (permalink / raw)
Cc: Ulrich Drepper, LKML, Dinakar Guniguntala, Jean-Pierre Dion,
Ingo Molnar, Jakub Jelinek, Darren Hart,
Sébastien Dugué
Pierre Peiffer a écrit :
> Ulrich Drepper a écrit :
>>
>> I have never seen performance numbers for this. If it is punishing
>> existing code in a measurable way I think it's not an acceptable default
>> behavior.
> May be, supposing it makes sense to respect the priority order only for
> real-time pthreads, I can register all SCHED_OTHER threads to the same
> MAX_RT_PRIO priority ?
Moreover, the performance must be considered, sure, but after all, "man
pthread_cond_broadcast" says:
<<
If more than one thread is blocked on a condition variable, the
scheduling policy shall determine the order in which threads are
unblocked.
>>
... this is not true today ...
(of course, "shall" does not mean "mandatory", I know ;-) )
--
Pierre
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2.6.20-rc4 1/4] futex priority based wakeup
2007-01-10 11:47 ` Pierre Peiffer
2007-01-10 12:03 ` Pierre Peiffer
@ 2007-01-10 12:54 ` Jakub Jelinek
2007-01-10 15:05 ` Pierre Peiffer
2007-01-11 7:20 ` Ulrich Drepper
2 siblings, 1 reply; 20+ messages in thread
From: Jakub Jelinek @ 2007-01-10 12:54 UTC (permalink / raw)
To: Pierre Peiffer
Cc: Ulrich Drepper, LKML, Dinakar Guniguntala, Jean-Pierre Dion,
Ingo Molnar, Jakub Jelinek, Darren Hart, Sebastien Dugue
On Wed, Jan 10, 2007 at 12:47:21PM +0100, Pierre Peiffer wrote:
> So, yes it (logically) has a cost, depending of the number of different
> priorities used, so it's specially measurable with real-time threads.
> With SCHED_OTHER, I suppose that the priorities are not very distributed.
>
> May be, supposing it makes sense to respect the priority order only for
> real-time pthreads, I can register all SCHED_OTHER threads to the same
> MAX_RT_PRIO priority ?
> Or do you think this must be set behind a CONFIG* option ?
> (Or finally not interesting enough for mainline ?)
As soon as there is at least one non-SCHED_OTHER thread among the waiters,
there is no question about whether plist should be used or not, that's
a correctness issue and if we want to conform to POSIX, we have to use that.
I guess Ulrich's question was mainly about performance differences
with/without plist wakeup when all threads are SCHED_OTHER. I'd say for
that a pure pthread_mutex_{lock,unlock} benchmark or even just a program
which uses futex FUTEX_WAIT/FUTEX_WAKE in a bunch of threads would be
better.
In the past we talked with Ingo about the possibilities here, one is use
plist always and prove that it doesn't add measurable overhead over current
FIFO (when only SCHED_OTHER is involved), the other possibility would be
to start using FIFOs as before, but when the first non-SCHED_OTHER thread
decides to wait on the futex, switch it to plist wakeup mode (convert the
FIFO into a plist) and from that point on just use plist wakeups on it.
Jakub
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2.6.20-rc4 1/4] futex priority based wakeup
2007-01-10 12:54 ` Jakub Jelinek
@ 2007-01-10 15:05 ` Pierre Peiffer
2007-01-10 18:15 ` Ulrich Drepper
0 siblings, 1 reply; 20+ messages in thread
From: Pierre Peiffer @ 2007-01-10 15:05 UTC (permalink / raw)
To: Jakub Jelinek
Cc: Ulrich Drepper, LKML, Dinakar Guniguntala, Jean-Pierre Dion,
Ingo Molnar, Darren Hart, Sebastien Dugue
Yes, I agree with all, that was what I have in mind too.
And by registering all SCHED_OTHER threads with the same priority MAX_RT_PRIO, we
have exactly this behavior, I think :
* the plist, when used with only one priority, behaves exactly as a simple list
(plist is a double simple list: first list contains all nodes sorted by
priority, second list contains the first element of each "priority-based"
sub-list of the first one):
Thus, when only one priority is used, there is nothing to sort at each add
operation. (first list contains all elements by FIFO order, second list contains
one element). So the overhead in this case is minimal and quasi-null.
* Now, if a SCHED_FIFO thread comes to the plist, its priority will be lower
than MAX_RT_PRIO and it will be "naturally" sorted by prio order, and thus, it
will be woken before the SCHED_OTHER threads because of its higher priority
(i.e. lower prio-value in the kernel).
But there can be a performance impact when several processes use different
futexes which have the same hash key.
In fact, the plist contains all waiters _of_all_futexes_ having the same hash
key, not only the waiters of a given futex. This can be more a problem, if one
process uses SCHED_FIFO threads, and the other SCHED_OTHER: the first will
penalize the second... but even in this case, as the second has a lower
priority, this can be acceptable, I think ?
Jakub Jelinek a écrit :
> On Wed, Jan 10, 2007 at 12:47:21PM +0100, Pierre Peiffer wrote:
>> So, yes it (logically) has a cost, depending of the number of different
>> priorities used, so it's specially measurable with real-time threads.
>> With SCHED_OTHER, I suppose that the priorities are not very distributed.
>>
>> May be, supposing it makes sense to respect the priority order only for
>> real-time pthreads, I can register all SCHED_OTHER threads to the same
>> MAX_RT_PRIO priority ?
>> Or do you think this must be set behind a CONFIG* option ?
>> (Or finally not interesting enough for mainline ?)
>
> As soon as there is at least one non-SCHED_OTHER thread among the waiters,
> there is no question about whether plist should be used or not, that's
> a correctness issue and if we want to conform to POSIX, we have to use that.
>
> I guess Ulrich's question was mainly about performance differences
> with/without plist wakeup when all threads are SCHED_OTHER. I'd say for
> that a pure pthread_mutex_{lock,unlock} benchmark or even just a program
> which uses futex FUTEX_WAIT/FUTEX_WAKE in a bunch of threads would be
> better.
>
> In the past we talked with Ingo about the possibilities here, one is use
> plist always and prove that it doesn't add measurable overhead over current
> FIFO (when only SCHED_OTHER is involved), the other possibility would be
> to start using FIFOs as before, but when the first non-SCHED_OTHER thread
> decides to wait on the futex, switch it to plist wakeup mode (convert the
> FIFO into a plist) and from that point on just use plist wakeups on it.
>
> Jakub
>
--
Pierre Peiffer
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2.6.20-rc4 1/4] futex priority based wakeup
2007-01-09 16:16 ` [PATCH 2.6.20-rc4 1/4] futex priority based wakeup Pierre Peiffer
2007-01-09 16:29 ` Ulrich Drepper
2007-01-09 17:59 ` Daniel Walker
@ 2007-01-10 16:11 ` Daniel Walker
2007-01-10 16:29 ` Pierre Peiffer
2 siblings, 1 reply; 20+ messages in thread
From: Daniel Walker @ 2007-01-10 16:11 UTC (permalink / raw)
To: Pierre Peiffer
Cc: LKML, Dinakar Guniguntala, Jean-Pierre Dion, Ingo Molnar,
Ulrich Drepper, Jakub Jelinek, Darren Hart,
Sébastien Dugué
On Tue, 2007-01-09 at 17:16 +0100, Pierre Peiffer wrote:
> @@ -1358,7 +1366,7 @@ static int futex_unlock_pi(u32 __user *u
> struct futex_hash_bucket *hb;
> struct futex_q *this, *next;
> u32 uval;
> - struct list_head *head;
> + struct plist_head *head;
> union futex_key key;
> int ret, attempt = 0;
>
> @@ -1409,7 +1417,7 @@ retry_locked:
> */
> head = &hb->chain;
>
> - list_for_each_entry_safe(this, next, head, list) {
> + plist_for_each_entry_safe(this, next, head, list) {
> if (!match_futex (&this->key, &key))
> continue;
> ret = wake_futex_pi(uaddr, uval, this);
Is this really necessary? The rtmutex will priority sort the waiters
when you enable priority inheritance. Inside the wake_futex_pi() it
actually just pulls the new owner off another plist inside the
rtmutex structure.
Daniel
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2.6.20-rc4 1/4] futex priority based wakeup
2007-01-10 16:11 ` Daniel Walker
@ 2007-01-10 16:29 ` Pierre Peiffer
2007-01-10 16:33 ` Daniel Walker
0 siblings, 1 reply; 20+ messages in thread
From: Pierre Peiffer @ 2007-01-10 16:29 UTC (permalink / raw)
To: Daniel Walker
Cc: LKML, Dinakar Guniguntala, Jean-Pierre Dion, Ingo Molnar,
Ulrich Drepper, Jakub Jelinek, Darren Hart,
Sébastien Dugué
Daniel Walker a écrit :
> On Tue, 2007-01-09 at 17:16 +0100, Pierre Peiffer wrote:
>> @@ -1358,7 +1366,7 @@ static int futex_unlock_pi(u32 __user *u
>> struct futex_hash_bucket *hb;
>> struct futex_q *this, *next;
>> u32 uval;
>> - struct list_head *head;
>> + struct plist_head *head;
>> union futex_key key;
>> int ret, attempt = 0;
>>
>> @@ -1409,7 +1417,7 @@ retry_locked:
>> */
>> head = &hb->chain;
>>
>> - list_for_each_entry_safe(this, next, head, list) {
>> + plist_for_each_entry_safe(this, next, head, list) {
>> if (!match_futex (&this->key, &key))
>> continue;
>> ret = wake_futex_pi(uaddr, uval, this);
>
>
> Is this really necessary? The rtmutex will priority sort the waiters
> when you enable priority inheritance. Inside the wake_futex_pi() it
> actually just pulls the new owner off another plist inside the
> rtmutex structure.
Yes. ... necessary for non-PI-futex (ie "normal" futex)...
As the hash_bucket_list is used and common for both futex and PI-futex, yes, in
case of PI_futex, the task is queued two times in two plist.
--
Pierre
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2.6.20-rc4 1/4] futex priority based wakeup
2007-01-10 16:29 ` Pierre Peiffer
@ 2007-01-10 16:33 ` Daniel Walker
0 siblings, 0 replies; 20+ messages in thread
From: Daniel Walker @ 2007-01-10 16:33 UTC (permalink / raw)
To: Pierre Peiffer
Cc: LKML, Dinakar Guniguntala, Jean-Pierre Dion, Ingo Molnar,
Ulrich Drepper, Jakub Jelinek, Darren Hart,
Sébastien Dugué
On Wed, 2007-01-10 at 17:29 +0100, Pierre Peiffer wrote:
> >
> > Is this really necessary? The rtmutex will priority sort the waiters
> > when you enable priority inheritance. Inside the wake_futex_pi() it
> > actually just pulls the new owner off another plist inside the
> > rtmutex structure.
>
> Yes. ... necessary for non-PI-futex (ie "normal" futex)...
>
> As the hash_bucket_list is used and common for both futex and PI-futex, yes, in
> case of PI_futex, the task is queued two times in two plist.
You could make them distinct. Also, did you consider merging the PI
path and the non-PI code path? Then you could modify the rtmutex to
toggle PI depending.
Daniel
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2.6.20-rc4 1/4] futex priority based wakeup
2007-01-10 15:05 ` Pierre Peiffer
@ 2007-01-10 18:15 ` Ulrich Drepper
0 siblings, 0 replies; 20+ messages in thread
From: Ulrich Drepper @ 2007-01-10 18:15 UTC (permalink / raw)
To: Pierre Peiffer
Cc: Jakub Jelinek, LKML, Dinakar Guniguntala, Jean-Pierre Dion,
Ingo Molnar, Darren Hart, Sebastien Dugue
[-- Attachment #1: Type: text/plain, Size: 689 bytes --]
Pierre Peiffer wrote:
> But there can be a performance impact when several processes use
> different futexes which have the same hash key.
> In fact, the plist contains all waiters _of_all_futexes_ having the same
> hash key, not only the waiters of a given futex. This can be more a
> problem,
s/can be/is/
There are systems with thousands of active futexes, maybe tens of
thousands. Not only is hash collision likely, it's also a matter of
using and administering the plist. We have to make futexes less
connected, not more. Now I definitely want to see real world tests first.
--
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 251 bytes --]
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2.6.20-rc4 1/4] futex priority based wakeup
2007-01-10 11:47 ` Pierre Peiffer
2007-01-10 12:03 ` Pierre Peiffer
2007-01-10 12:54 ` Jakub Jelinek
@ 2007-01-11 7:20 ` Ulrich Drepper
2 siblings, 0 replies; 20+ messages in thread
From: Ulrich Drepper @ 2007-01-11 7:20 UTC (permalink / raw)
To: Pierre Peiffer
Cc: LKML, Dinakar Guniguntala, Jean-Pierre Dion, Ingo Molnar,
Jakub Jelinek, Darren Hart, Sébastien Dugué
[-- Attachment #1: Type: text/plain, Size: 355 bytes --]
Pierre Peiffer wrote:
> Here are the average latencies after 5000 measures.
> [...]
Use something more realistic. I suggest using the Volano benchmark
under your favorite JVM. I found it to be quite representative and you
get a nice number you can show.
--
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 251 bytes --]
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2.6.20-rc4 4/4][RFC] sys_futex64 : allows 64bit futexes
2007-01-09 16:25 ` [PATCH 2.6.20-rc4 4/4][RFC] sys_futex64 : allows 64bit futexes Pierre Peiffer
@ 2007-01-11 21:49 ` Andrew Morton
2007-01-11 22:14 ` Jakub Jelinek
1 sibling, 0 replies; 20+ messages in thread
From: Andrew Morton @ 2007-01-11 21:49 UTC (permalink / raw)
To: Pierre Peiffer
Cc: LKML, Dinakar Guniguntala, Jean-Pierre Dion, Ingo Molnar,
Ulrich Drepper, Jakub Jelinek, Darren Hart,
Sébastien Dugué
On Tue, 09 Jan 2007 17:25:26 +0100
Pierre Peiffer <pierre.peiffer@bull.net> wrote:
> static inline int
> +futex_atomic_op_inuser64 (int encoded_op, u64 __user *uaddr)
Your email client performs space-stuffing. Please see if you can turn that
off. (It's fixable at my end with s/^ /^/g but it's a nuisance).
> +{
> + int op = (encoded_op >> 28) & 7;
> + int cmp = (encoded_op >> 24) & 15;
> + u64 oparg = (encoded_op << 8) >> 20;
> + u64 cmparg = (encoded_op << 20) >> 20;
> + u64 oldval = 0, ret, tem;
> +
> + if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
> + oparg = 1 << oparg;
> +
> + if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u64)))
> + return -EFAULT;
> +
> + inc_preempt_count();
What is that open-coded, uncommented inc_preempt_count() doing in there?
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2.6.20-rc4 4/4][RFC] sys_futex64 : allows 64bit futexes
2007-01-09 16:25 ` [PATCH 2.6.20-rc4 4/4][RFC] sys_futex64 : allows 64bit futexes Pierre Peiffer
2007-01-11 21:49 ` Andrew Morton
@ 2007-01-11 22:14 ` Jakub Jelinek
1 sibling, 0 replies; 20+ messages in thread
From: Jakub Jelinek @ 2007-01-11 22:14 UTC (permalink / raw)
To: Pierre Peiffer
Cc: LKML, Dinakar Guniguntala, Jean-Pierre Dion, Ingo Molnar,
Ulrich Drepper, Darren Hart
On Tue, Jan 09, 2007 at 05:25:26PM +0100, Pierre Peiffer wrote:
> This latest patch is an adaptation of the sys_futex64 syscall provided in
> -rt
> patch (originally written by Ingo). It allows the use of 64bit futex.
>
> I have re-worked most of the code to avoid the duplication of the code.
>
> It does not provide the functionality for all architectures, and thus, it
> can
> not be applied "as is".
> But, again, feedbacks and comments are welcome.
Why do you support all operations for 64-bit futexes?
IMHO PI futexes don't make sense for 64-bit futexes, PI futexes have
hardcoded bit layout of the 32-bit word. Similarly, FUTEX_WAKE
is not really necessary for 64-bit futexes, 32-bit futex's FUTEX_WAKE
can wake it equally well (it never reads anything, all it cares
is about the futex's address). Similarly, I don't see a need for
FUTEX_WAKE_OP (and this could simplify the patch quite a lot, no
need to change asm*/futex.h headers at all).
All that's needed is 64-bit FUTEX_WAIT and perhaps FUTEX_CMP_REQUEUE.
Jakub
^ permalink raw reply [flat|nested] 20+ messages in thread
end of thread, other threads:[~2007-01-11 22:16 UTC | newest]
Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
[not found] <45A3B330.9000104@bull.net>
2007-01-09 16:16 ` [PATCH 2.6.20-rc4 1/4] futex priority based wakeup Pierre Peiffer
2007-01-09 16:29 ` Ulrich Drepper
2007-01-10 11:47 ` Pierre Peiffer
2007-01-10 12:03 ` Pierre Peiffer
2007-01-10 12:54 ` Jakub Jelinek
2007-01-10 15:05 ` Pierre Peiffer
2007-01-10 18:15 ` Ulrich Drepper
2007-01-11 7:20 ` Ulrich Drepper
2007-01-09 17:59 ` Daniel Walker
2007-01-10 16:11 ` Daniel Walker
2007-01-10 16:29 ` Pierre Peiffer
2007-01-10 16:33 ` Daniel Walker
2007-01-09 16:16 ` [PATCH 2.6.20-rc4 2/4] Make futex_wait() use an hrtimer for timeout Pierre Peiffer
2007-01-09 16:20 ` [PATCH 2.6.20-rc4 3/4] futex_requeue_pi optimization Pierre Peiffer
2007-01-09 16:33 ` Ulrich Drepper
2007-01-10 8:17 ` Pierre Peiffer
2007-01-10 8:24 ` Ulrich Drepper
2007-01-09 16:25 ` [PATCH 2.6.20-rc4 4/4][RFC] sys_futex64 : allows 64bit futexes Pierre Peiffer
2007-01-11 21:49 ` Andrew Morton
2007-01-11 22:14 ` Jakub Jelinek
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).