LKML Archive on lore.kernel.org
* [PATCH 1/8] sched: fix incorrect irq lock usage in normalize_rt_tasks()
[not found] <20080204210258.118479000@chello.nl>
@ 2008-02-04 21:02 ` Peter Zijlstra
2008-02-04 21:03 ` [PATCH 2/8] sched: rt-group: deal with PI Peter Zijlstra
` (6 subsequent siblings)
7 siblings, 0 replies; 14+ messages in thread
From: Peter Zijlstra @ 2008-02-04 21:02 UTC (permalink / raw)
To: Ingo Molnar, linux-kernel; +Cc: tong.n.li, Peter Zijlstra
[-- Attachment #1: sched-fix-normalize-rt-tasks.patch --]
[-- Type: text/plain, Size: 1114 bytes --]
lockdep spotted this bogus irq locking: normalize_rt_tasks() can be called
from hardirq context through sysrq-n.
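[ A minimal sketch of the pattern this switches to; the irqsave/irqrestore
  pair records and restores the caller's interrupt state instead of
  unconditionally re-enabling interrupts on unlock, which is what makes it
  safe when the caller may already be in hardirq context: ]

	unsigned long flags;

	read_lock_irqsave(&tasklist_lock, flags);	/* ok in any context */
	/* ... walk the task list; inner locks need not save flags again ... */
	read_unlock_irqrestore(&tasklist_lock, flags);	/* restores prior state */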
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
kernel/sched.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -7345,7 +7345,7 @@ void normalize_rt_tasks(void)
unsigned long flags;
struct rq *rq;
- read_lock_irq(&tasklist_lock);
+ read_lock_irqsave(&tasklist_lock, flags);
do_each_thread(g, p) {
/*
* Only normalize user tasks:
@@ -7371,16 +7371,16 @@ void normalize_rt_tasks(void)
continue;
}
- spin_lock_irqsave(&p->pi_lock, flags);
+ spin_lock(&p->pi_lock);
rq = __task_rq_lock(p);
normalize_task(rq, p);
__task_rq_unlock(rq);
- spin_unlock_irqrestore(&p->pi_lock, flags);
+ spin_unlock(&p->pi_lock);
} while_each_thread(g, p);
- read_unlock_irq(&tasklist_lock);
+ read_unlock_irqrestore(&tasklist_lock, flags);
}
#endif /* CONFIG_MAGIC_SYSRQ */
--
* [PATCH 2/8] sched: rt-group: deal with PI
[not found] <20080204210258.118479000@chello.nl>
2008-02-04 21:02 ` [PATCH 1/8] sched: fix incorrect irq lock usage in normalize_rt_tasks() Peter Zijlstra
@ 2008-02-04 21:03 ` Peter Zijlstra
2008-02-04 21:03 ` [PATCH 3/8] sched: rt-group: interface Peter Zijlstra
` (5 subsequent siblings)
7 siblings, 0 replies; 14+ messages in thread
From: Peter Zijlstra @ 2008-02-04 21:03 UTC (permalink / raw)
To: Ingo Molnar, linux-kernel; +Cc: tong.n.li, Peter Zijlstra
[-- Attachment #1: sched-rt-group-pi.patch --]
[-- Type: text/plain, Size: 3807 bytes --]
Steven mentioned the fun case where a lock-holding task will be throttled.
Simple fix: allow groups that have boosted tasks to run anyway.
If a runnable task in a throttled group gets boosted, the dequeue/enqueue
done by rt_mutex_setprio() is enough to unthrottle the group (sketched below).
This is of course not quite correct. Two possible ways forward are:
- second prio array for boosted tasks
- boost to a prio ceiling (this would also work for deadline scheduling)
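[ A simplified sketch of that boost path; the real rt_mutex_setprio() also
  takes the rq lock and handles the currently-running case: ]

	void rt_mutex_setprio(struct task_struct *p, int prio)
	{
		struct rq *rq = task_rq(p);	/* the real code uses task_rq_lock() */

		dequeue_task(rq, p, 0);		/* dec_rt_tasks() at the old prio */
		p->prio = prio;			/* boosted: prio != p->normal_prio */
		enqueue_task(rq, p, 0);		/* inc_rt_tasks() sees rt_se_boosted(),
						 * rt_nr_boosted++ makes
						 * rt_rq_throttled() return 0 */
	}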
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
kernel/sched.c | 3 +++
kernel/sched_rt.c | 43 ++++++++++++++++++++++++++++++++++++++-----
2 files changed, 41 insertions(+), 5 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -374,6 +374,8 @@ struct rt_rq {
u64 rt_time;
#ifdef CONFIG_FAIR_GROUP_SCHED
+ unsigned long rt_nr_boosted;
+
struct rq *rq;
struct list_head leaf_rt_rq_list;
struct task_group *tg;
@@ -7116,6 +7118,7 @@ static void init_rt_rq(struct rt_rq *rt_
rt_rq->rt_throttled = 0;
#ifdef CONFIG_FAIR_GROUP_SCHED
+ rt_rq->rt_nr_boosted = 0;
rt_rq->rq = rq;
#endif
}
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -110,6 +110,23 @@ static void sched_rt_ratio_dequeue(struc
dequeue_rt_entity(rt_se);
}
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+ return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
+}
+
+static int rt_se_boosted(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *rt_rq = group_rt_rq(rt_se);
+ struct task_struct *p;
+
+ if (rt_rq)
+ return !!rt_rq->rt_nr_boosted;
+
+ p = rt_task_of(rt_se);
+ return p->prio != p->normal_prio;
+}
+
#else
static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
@@ -149,6 +166,10 @@ static inline void sched_rt_ratio_dequeu
{
}
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+ return rt_rq->rt_throttled;
+}
#endif
static inline int rt_se_prio(struct sched_rt_entity *rt_se)
@@ -172,7 +193,7 @@ static int sched_rt_ratio_exceeded(struc
return 0;
if (rt_rq->rt_throttled)
- return 1;
+ return rt_rq_throttled(rt_rq);
period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
@@ -183,8 +204,10 @@ static int sched_rt_ratio_exceeded(struc
rq->rt_throttled = 1;
rt_rq->rt_throttled = 1;
- sched_rt_ratio_dequeue(rt_rq);
- return 1;
+ if (rt_rq_throttled(rt_rq)) {
+ sched_rt_ratio_dequeue(rt_rq);
+ return 1;
+ }
}
return 0;
@@ -265,6 +288,10 @@ void inc_rt_tasks(struct sched_rt_entity
update_rt_migration(rq_of_rt_rq(rt_rq));
#endif
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ if (rt_se_boosted(rt_se))
+ rt_rq->rt_nr_boosted++;
+#endif
}
static inline
@@ -295,6 +322,12 @@ void dec_rt_tasks(struct sched_rt_entity
update_rt_migration(rq_of_rt_rq(rt_rq));
#endif /* CONFIG_SMP */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ if (rt_se_boosted(rt_se))
+ rt_rq->rt_nr_boosted--;
+
+ WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
+#endif
}
static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
@@ -303,7 +336,7 @@ static void enqueue_rt_entity(struct sch
struct rt_prio_array *array = &rt_rq->active;
struct rt_rq *group_rq = group_rt_rq(rt_se);
- if (group_rq && group_rq->rt_throttled)
+ if (group_rq && rt_rq_throttled(group_rq))
return;
list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
@@ -496,7 +529,7 @@ static struct task_struct *pick_next_tas
if (unlikely(!rt_rq->rt_nr_running))
return NULL;
- if (sched_rt_ratio_exceeded(rt_rq))
+ if (rt_rq_throttled(rt_rq))
return NULL;
do {
--
* [PATCH 3/8] sched: rt-group: interface
[not found] <20080204210258.118479000@chello.nl>
2008-02-04 21:02 ` [PATCH 1/8] sched: fix incorrect irq lock usage in normalize_rt_tasks() Peter Zijlstra
2008-02-04 21:03 ` [PATCH 2/8] sched: rt-group: deal with PI Peter Zijlstra
@ 2008-02-04 21:03 ` Peter Zijlstra
2008-02-06 1:31 ` Randy Dunlap
2008-02-23 19:48 ` Paul Menage
2008-02-04 21:03 ` [PATCH 4/8] sched: rt-group: make rt groups scheduling configurable Peter Zijlstra
` (4 subsequent siblings)
7 siblings, 2 replies; 14+ messages in thread
From: Peter Zijlstra @ 2008-02-04 21:03 UTC (permalink / raw)
To: Ingo Molnar, linux-kernel; +Cc: tong.n.li, Peter Zijlstra
[-- Attachment #1: sched-rt-group-interface.patch --]
[-- Type: text/plain, Size: 17165 bytes --]
Change the rt_ratio interface to rt_runtime_us, to match rt_period_us.
This avoids picking a granularity for the ratio.
Extend the /sys/kernel/uids/<uid>/ interface to allow setting
the group's rt_runtime.
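[ For reference: the schedulability check keeps working on a 16.16
  fixed-point ratio internally and converts each runtime/period pair on
  the fly, so the old default ratio falls straight out of the new
  defaults: ]

	/* as added by this patch: express runtime/period as 16.16 fixed point */
	static unsigned long to_ratio(u64 period, u64 runtime)
	{
		if (runtime == RUNTIME_INF)
			return 1ULL << 16;

		runtime *= (1ULL << 16);
		do_div(runtime, period);
		return runtime;
	}

	/* to_ratio(1000000, 950000) == 62259, the old sysctl_sched_rt_ratio */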
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
Documentation/ABI/testing/sysfs-kernel-uids | 6 +
Documentation/sched-rt-group.txt | 59 +++++++++++
include/linux/sched.h | 7 -
kernel/sched.c | 145 +++++++++++++++++++++-------
kernel/sched_rt.c | 53 ++++------
kernel/sysctl.c | 32 +++---
kernel/user.c | 28 +++++
7 files changed, 250 insertions(+), 80 deletions(-)
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1507,8 +1507,6 @@ extern unsigned int sysctl_sched_child_r
extern unsigned int sysctl_sched_features;
extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate;
-extern unsigned int sysctl_sched_rt_period;
-extern unsigned int sysctl_sched_rt_ratio;
#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
extern unsigned int sysctl_sched_min_bal_int_shares;
extern unsigned int sysctl_sched_max_bal_int_shares;
@@ -1518,6 +1516,8 @@ int sched_nr_latency_handler(struct ctl_
struct file *file, void __user *buffer, size_t *length,
loff_t *ppos);
#endif
+extern unsigned int sysctl_sched_rt_period;
+extern int sysctl_sched_rt_runtime;
extern unsigned int sysctl_sched_compat_yield;
@@ -1997,6 +1997,9 @@ extern void sched_destroy_group(struct t
extern void sched_move_task(struct task_struct *tsk);
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
extern unsigned long sched_group_shares(struct task_group *tg);
+extern int sched_group_set_rt_runtime(struct task_group *tg,
+ long rt_runtime_us);
+extern long sched_group_rt_runtime(struct task_group *tg);
#endif
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -176,7 +176,7 @@ struct task_group {
struct sched_rt_entity **rt_se;
struct rt_rq **rt_rq;
- unsigned int rt_ratio;
+ u64 rt_runtime;
/*
* shares assigned to a task group governs how much of cpu bandwidth
@@ -654,19 +654,21 @@ const_debug unsigned int sysctl_sched_fe
const_debug unsigned int sysctl_sched_nr_migrate = 32;
/*
- * period over which we measure -rt task cpu usage in ms.
+ * period over which we measure -rt task cpu usage in us.
* default: 1s
*/
-const_debug unsigned int sysctl_sched_rt_period = 1000;
+unsigned int sysctl_sched_rt_period = 1000000;
-#define SCHED_RT_FRAC_SHIFT 16
-#define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT)
+/*
+ * part of the period that we allow rt tasks to run in us.
+ * default: 0.95s
+ */
+int sysctl_sched_rt_runtime = 950000;
/*
- * ratio of time -rt tasks may consume.
- * default: 95%
+ * single value that denotes runtime == period, ie unlimited time.
*/
-const_debug unsigned int sysctl_sched_rt_ratio = 62259;
+#define RUNTIME_INF ((u64)~0ULL)
/*
* For kernel-internal use: high-speed (but slightly incorrect) per-cpu
@@ -7191,7 +7193,8 @@ void __init sched_init(void)
&per_cpu(init_cfs_rq, i),
&per_cpu(init_sched_entity, i), i, 1);
- init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */
+ init_task_group.rt_runtime =
+ sysctl_sched_rt_runtime * NSEC_PER_USEC;
INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
init_tg_rt_entry(rq, &init_task_group,
&per_cpu(init_rt_rq, i),
@@ -7586,7 +7589,7 @@ struct task_group *sched_create_group(vo
goto err;
tg->shares = NICE_0_LOAD;
- tg->rt_ratio = 0; /* XXX */
+ tg->rt_runtime = 0;
for_each_possible_cpu(i) {
rq = cpu_rq(i);
@@ -7780,30 +7783,76 @@ unsigned long sched_group_shares(struct
}
/*
- * Ensure the total rt_ratio <= sysctl_sched_rt_ratio
+ * Ensure that the real time constraints are schedulable.
*/
-int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio)
+static DEFINE_MUTEX(rt_constraints_mutex);
+
+static unsigned long to_ratio(u64 period, u64 runtime)
+{
+ if (runtime == RUNTIME_INF)
+ return 1ULL << 16;
+
+ runtime *= (1ULL << 16);
+ do_div(runtime, period);
+ return runtime;
+}
+
+static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
{
struct task_group *tgi;
unsigned long total = 0;
+ unsigned long global_ratio =
+ to_ratio(sysctl_sched_rt_period,
+ sysctl_sched_rt_runtime < 0 ?
+ RUNTIME_INF : sysctl_sched_rt_runtime);
rcu_read_lock();
- list_for_each_entry_rcu(tgi, &task_groups, list)
- total += tgi->rt_ratio;
- rcu_read_unlock();
+ list_for_each_entry_rcu(tgi, &task_groups, list) {
+ if (tgi == tg)
+ continue;
- if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio)
- return -EINVAL;
+ total += to_ratio(period, tgi->rt_runtime);
+ }
+ rcu_read_unlock();
- tg->rt_ratio = rt_ratio;
- return 0;
+ return total + to_ratio(period, runtime) < global_ratio;
}
-unsigned long sched_group_rt_ratio(struct task_group *tg)
+int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
{
- return tg->rt_ratio;
+ u64 rt_runtime, rt_period;
+ int err = 0;
+
+ rt_period = sysctl_sched_rt_period * NSEC_PER_USEC;
+ rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
+ if (rt_runtime_us == -1)
+ rt_runtime = rt_period;
+
+ mutex_lock(&rt_constraints_mutex);
+ if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
+ err = -EINVAL;
+ goto unlock;
+ }
+ if (rt_runtime_us == -1)
+ rt_runtime = RUNTIME_INF;
+ tg->rt_runtime = rt_runtime;
+ unlock:
+ mutex_unlock(&rt_constraints_mutex);
+
+ return err;
}
+long sched_group_rt_runtime(struct task_group *tg)
+{
+ u64 rt_runtime_us;
+
+ if (tg->rt_runtime == RUNTIME_INF)
+ return -1;
+
+ rt_runtime_us = tg->rt_runtime;
+ do_div(rt_runtime_us, NSEC_PER_USEC);
+ return rt_runtime_us;
+}
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_FAIR_CGROUP_SCHED
@@ -7879,17 +7928,49 @@ static u64 cpu_shares_read_uint(struct c
return (u64) tg->shares;
}
-static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype,
- u64 rt_ratio_val)
-{
- return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val);
+static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
+ struct file *file,
+ const char __user *userbuf,
+ size_t nbytes, loff_t *unused_ppos)
+{
+ char buffer[64];
+ int retval = 0;
+ s64 val;
+ char *end;
+
+ if (!nbytes)
+ return -EINVAL;
+ if (nbytes >= sizeof(buffer))
+ return -E2BIG;
+ if (copy_from_user(buffer, userbuf, nbytes))
+ return -EFAULT;
+
+ buffer[nbytes] = 0; /* nul-terminate */
+
+ /* strip newline if necessary */
+ if (nbytes && (buffer[nbytes-1] == '\n'))
+ buffer[nbytes-1] = 0;
+ val = simple_strtoll(buffer, &end, 0);
+ if (*end)
+ return -EINVAL;
+
+ /* Pass to subsystem */
+ retval = sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
+ if (!retval)
+ retval = nbytes;
+ return retval;
}
-static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft)
-{
- struct task_group *tg = cgroup_tg(cgrp);
+static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft,
+ struct file *file,
+ char __user *buf, size_t nbytes,
+ loff_t *ppos)
+{
+ char tmp[64];
+ long val = sched_group_rt_runtime(cgroup_tg(cgrp));
+ int len = sprintf(tmp, "%ld\n", val);
- return (u64) tg->rt_ratio;
+ return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
}
static struct cftype cpu_files[] = {
@@ -7899,9 +7980,9 @@ static struct cftype cpu_files[] = {
.write_uint = cpu_shares_write_uint,
},
{
- .name = "rt_ratio",
- .read_uint = cpu_rt_ratio_read_uint,
- .write_uint = cpu_rt_ratio_write_uint,
+ .name = "rt_runtime_us",
+ .read = cpu_rt_runtime_read,
+ .write = cpu_rt_runtime_write,
},
};
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -57,12 +57,12 @@ static inline int on_rt_rq(struct sched_
#ifdef CONFIG_FAIR_GROUP_SCHED
-static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
+static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
if (!rt_rq->tg)
- return SCHED_RT_FRAC;
+ return RUNTIME_INF;
- return rt_rq->tg->rt_ratio;
+ return rt_rq->tg->rt_runtime;
}
#define for_each_leaf_rt_rq(rt_rq, rq) \
@@ -89,7 +89,7 @@ static inline struct rt_rq *group_rt_rq(
static void enqueue_rt_entity(struct sched_rt_entity *rt_se);
static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
-static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
+static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
struct sched_rt_entity *rt_se = rt_rq->rt_se;
@@ -102,7 +102,7 @@ static void sched_rt_ratio_enqueue(struc
}
}
-static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
+static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
struct sched_rt_entity *rt_se = rt_rq->rt_se;
@@ -129,9 +129,12 @@ static int rt_se_boosted(struct sched_rt
#else
-static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
+static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
- return sysctl_sched_rt_ratio;
+ if (sysctl_sched_rt_runtime == -1)
+ return RUNTIME_INF;
+
+ return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}
#define for_each_leaf_rt_rq(rt_rq, rq) \
@@ -158,11 +161,11 @@ static inline struct rt_rq *group_rt_rq(
return NULL;
}
-static inline void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
+static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
}
-static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
+static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
}
@@ -184,28 +187,24 @@ static inline int rt_se_prio(struct sche
return rt_task_of(rt_se)->prio;
}
-static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq)
+static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{
- unsigned int rt_ratio = sched_rt_ratio(rt_rq);
- u64 period, ratio;
+ u64 runtime = sched_rt_runtime(rt_rq);
- if (rt_ratio == SCHED_RT_FRAC)
+ if (runtime == RUNTIME_INF)
return 0;
if (rt_rq->rt_throttled)
return rt_rq_throttled(rt_rq);
- period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
- ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
-
- if (rt_rq->rt_time > ratio) {
+ if (rt_rq->rt_time > runtime) {
struct rq *rq = rq_of_rt_rq(rt_rq);
rq->rt_throttled = 1;
rt_rq->rt_throttled = 1;
if (rt_rq_throttled(rt_rq)) {
- sched_rt_ratio_dequeue(rt_rq);
+ sched_rt_rq_dequeue(rt_rq);
return 1;
}
}
@@ -219,17 +218,16 @@ static void update_sched_rt_period(struc
u64 period;
while (rq->clock > rq->rt_period_expire) {
- period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
+ period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
rq->rt_period_expire += period;
for_each_leaf_rt_rq(rt_rq, rq) {
- unsigned long rt_ratio = sched_rt_ratio(rt_rq);
- u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
+ u64 runtime = sched_rt_runtime(rt_rq);
- rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
- if (rt_rq->rt_throttled) {
+ rt_rq->rt_time -= min(rt_rq->rt_time, runtime);
+ if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
rt_rq->rt_throttled = 0;
- sched_rt_ratio_enqueue(rt_rq);
+ sched_rt_rq_enqueue(rt_rq);
}
}
@@ -262,12 +260,7 @@ static void update_curr_rt(struct rq *rq
cpuacct_charge(curr, delta_exec);
rt_rq->rt_time += delta_exec;
- /*
- * might make it a tad more accurate:
- *
- * update_sched_rt_period(rq);
- */
- if (sched_rt_ratio_exceeded(rt_rq))
+ if (sched_rt_runtime_exceeded(rt_rq))
resched_task(curr);
}
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -311,22 +311,6 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
- {
- .ctl_name = CTL_UNNUMBERED,
- .procname = "sched_rt_period_ms",
- .data = &sysctl_sched_rt_period,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = CTL_UNNUMBERED,
- .procname = "sched_rt_ratio",
- .data = &sysctl_sched_rt_ratio,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
{
.ctl_name = CTL_UNNUMBERED,
@@ -348,6 +332,22 @@ static struct ctl_table kern_table[] = {
#endif
{
.ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_rt_period_us",
+ .data = &sysctl_sched_rt_period,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_rt_runtime_us",
+ .data = &sysctl_sched_rt_runtime,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
.procname = "sched_compat_yield",
.data = &sysctl_sched_compat_yield,
.maxlen = sizeof(unsigned int),
Index: linux-2.6/Documentation/ABI/testing/sysfs-kernel-uids
===================================================================
--- linux-2.6.orig/Documentation/ABI/testing/sysfs-kernel-uids
+++ linux-2.6/Documentation/ABI/testing/sysfs-kernel-uids
@@ -12,3 +12,9 @@ Description:
B has shares = 2048, User B will get twice the CPU
bandwidth user A will. For more details refer
Documentation/sched-design-CFS.txt
+
+What: /sys/kernel/uids/<uid>/cpu_rt_runtime_us
+Date: January 2008
+Contact: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Description: See Documentation/sched-rt-group.txt
+
Index: linux-2.6/Documentation/sched-rt-group.txt
===================================================================
--- /dev/null
+++ linux-2.6/Documentation/sched-rt-group.txt
@@ -0,0 +1,59 @@
+
+
+Real-Time group scheduling.
+
+The problem space:
+
+In order to schedule multiple groups of realtime tasks each group must
+be assigned a fixed portion of the cpu time available. Without a minimum
+guarantee a realtime group can obviously fall short. A fuzzy upper limit
+is of no use since it cannot be relied upon. Which leaves us with just
+the single fixed portion.
+
+CPU time is divided by means of specifying how much time can be spent
+running in a given period. Say a fixed frame rate realtime renderer must
+deliver 25 frames a second, which yields a period of 0.04s. Now say
+it will also have to play some music and respond to input, leaving it
+with around 80% for the graphics. We can then give this group a runtime
+of 0.8 * 0.04s = 0.032s.
+
+This way the graphics group will have a 0.04s period with a 0.032s runtime
+limit.
+
+Now if the audio thread needs to refill the dma buffer every 0.005s, but
+needs only about 3% cpu time to do so, it can do with a runtime of
+0.03 * 0.005s = 0.00015s.
+
+
+The Interface:
+
+system wide:
+
+/proc/sys/kernel/sched_rt_period_us
+/proc/sys/kernel/sched_rt_runtime_us
+
+CONFIG_FAIR_USER_SCHED
+
+/sys/kernel/uids/<uid>/cpu_rt_runtime_us
+
+or
+
+CONFIG_FAIR_CGROUP_SCHED
+
+/cgroup/<cgroup>/cpu.rt_runtime_us
+
+[ time is specified in us because the interface is s32; this gives an
+ operating range of ~35 minutes down to 1us ]
+
+The period takes values in [ 1, INT_MAX ], runtime in [ -1, INT_MAX - 1 ].
+
+A runtime of -1 specifies runtime == period, ie. no limit.
+
+New groups get the period from /proc/sys/kernel/sched_rt_period_us and
+a runtime of 0.
+
+Settings are constrained to:
+
+ \Sum_{i} runtime_{i} / global_period <= global_runtime / global_period
+
+in order to keep the configuration schedulable.
Index: linux-2.6/kernel/user.c
===================================================================
--- linux-2.6.orig/kernel/user.c
+++ linux-2.6/kernel/user.c
@@ -156,9 +156,37 @@ static ssize_t cpu_shares_store(struct k
static struct kobj_attribute cpu_share_attr =
__ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
+static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ struct user_struct *up = container_of(kobj, struct user_struct, kobj);
+
+ return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
+}
+
+static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t size)
+{
+ struct user_struct *up = container_of(kobj, struct user_struct, kobj);
+ unsigned long rt_runtime;
+ int rc;
+
+ sscanf(buf, "%lu", &rt_runtime);
+
+ rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
+
+ return (rc ? rc : size);
+}
+
+static struct kobj_attribute cpu_rt_runtime_attr =
+ __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
+
/* default attributes per uid directory */
static struct attribute *uids_attributes[] = {
&cpu_share_attr.attr,
+ &cpu_rt_runtime_attr.attr,
NULL
};
--
* [PATCH 4/8] sched: rt-group: make rt groups scheduling configurable
[not found] <20080204210258.118479000@chello.nl>
` (2 preceding siblings ...)
2008-02-04 21:03 ` [PATCH 3/8] sched: rt-group: interface Peter Zijlstra
@ 2008-02-04 21:03 ` Peter Zijlstra
2008-02-04 21:03 ` [PATCH 5/8] sched: rt-group: clean up the ifdeffery Peter Zijlstra
` (3 subsequent siblings)
7 siblings, 0 replies; 14+ messages in thread
From: Peter Zijlstra @ 2008-02-04 21:03 UTC (permalink / raw)
To: Ingo Molnar, linux-kernel; +Cc: tong.n.li, Peter Zijlstra
[-- Attachment #1: sched-rt-group-config.patch --]
[-- Type: text/plain, Size: 15975 bytes --]
Make the rt group scheduler compile-time configurable.
Enable it by default for cgroup scheduling.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/cgroup_subsys.h | 2
init/Kconfig | 23 +++++--
kernel/sched.c | 130 +++++++++++++++++++++++++++++++-----------
kernel/sched_rt.c | 12 +--
4 files changed, 120 insertions(+), 47 deletions(-)
Index: linux-2.6/include/linux/cgroup_subsys.h
===================================================================
--- linux-2.6.orig/include/linux/cgroup_subsys.h
+++ linux-2.6/include/linux/cgroup_subsys.h
@@ -25,7 +25,7 @@ SUBSYS(ns)
/* */
-#ifdef CONFIG_FAIR_CGROUP_SCHED
+#ifdef CONFIG_CGROUP_SCHED
SUBSYS(cpu_cgroup)
#endif
Index: linux-2.6/init/Kconfig
===================================================================
--- linux-2.6.orig/init/Kconfig
+++ linux-2.6/init/Kconfig
@@ -332,25 +332,36 @@ config CPUSETS
Say N if unsure.
-config FAIR_GROUP_SCHED
- bool "Fair group CPU scheduler"
+config GROUP_SCHED
+ bool "Group CPU scheduler"
default y
help
This feature lets CPU scheduler recognize task groups and control CPU
bandwidth allocation to such task groups.
+config FAIR_GROUP_SCHED
+ bool "Group scheduling for SCHED_OTHER"
+ depends on GROUP_SCHED
+ default y
+
+config RT_GROUP_SCHED
+ bool "Group scheduling for SCHED_RR/FIFO"
+ depends on EXPERIMENTAL
+ depends on GROUP_SCHED
+ default n
+
choice
- depends on FAIR_GROUP_SCHED
+ depends on GROUP_SCHED
prompt "Basis for grouping tasks"
- default FAIR_USER_SCHED
+ default USER_SCHED
-config FAIR_USER_SCHED
+config USER_SCHED
bool "user id"
help
This option will choose userid as the basis for grouping
tasks, thus providing equal CPU bandwidth to each user.
-config FAIR_CGROUP_SCHED
+config CGROUP_SCHED
bool "Control groups"
depends on CGROUPS
help
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -155,7 +155,7 @@ struct rt_prio_array {
struct list_head queue[MAX_RT_PRIO];
};
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_GROUP_SCHED
#include <linux/cgroup.h>
@@ -165,19 +165,16 @@ static LIST_HEAD(task_groups);
/* task group related information */
struct task_group {
-#ifdef CONFIG_FAIR_CGROUP_SCHED
+#ifdef CONFIG_CGROUP_SCHED
struct cgroup_subsys_state css;
#endif
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
/* schedulable entities of this group on each cpu */
struct sched_entity **se;
/* runqueue "owned" by this group on each cpu */
struct cfs_rq **cfs_rq;
- struct sched_rt_entity **rt_se;
- struct rt_rq **rt_rq;
-
- u64 rt_runtime;
-
/*
* shares assigned to a task group governs how much of cpu bandwidth
* is allocated to the group. The more shares a group has, the more is
@@ -213,24 +210,36 @@ struct task_group {
*
*/
unsigned long shares;
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ struct sched_rt_entity **rt_se;
+ struct rt_rq **rt_rq;
+
+ u64 rt_runtime;
+#endif
struct rcu_head rcu;
struct list_head list;
};
+#ifdef CONFIG_FAIR_GROUP_SCHED
/* Default task group's sched entity on each cpu */
static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
/* Default task group's cfs_rq on each cpu */
static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
-static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
-static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
-
static struct sched_entity *init_sched_entity_p[NR_CPUS];
static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
+static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS];
static struct rt_rq *init_rt_rq_p[NR_CPUS];
+#endif
/* task_group_mutex serializes add/remove of task groups and also changes to
* a task group's cpu shares.
@@ -240,6 +249,7 @@ static DEFINE_MUTEX(task_group_mutex);
/* doms_cur_mutex serializes access to doms_cur[] array */
static DEFINE_MUTEX(doms_cur_mutex);
+#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP
/* kernel thread that runs rebalance_shares() periodically */
static struct task_struct *lb_monitor_task;
@@ -247,19 +257,24 @@ static int load_balance_monitor(void *un
#endif
static void set_se_shares(struct sched_entity *se, unsigned long shares);
+#endif
/* Default task group.
* Every task in system belong to this group at bootup.
*/
struct task_group init_task_group = {
+#ifdef CONFIG_FAIR_GROUP_SCHED
.se = init_sched_entity_p,
.cfs_rq = init_cfs_rq_p,
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
.rt_se = init_sched_rt_entity_p,
.rt_rq = init_rt_rq_p,
+#endif
};
-#ifdef CONFIG_FAIR_USER_SCHED
+#ifdef CONFIG_USER_SCHED
# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
#else
# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
@@ -274,9 +289,9 @@ static inline struct task_group *task_gr
{
struct task_group *tg;
-#ifdef CONFIG_FAIR_USER_SCHED
+#ifdef CONFIG_USER_SCHED
tg = p->user->tg;
-#elif defined(CONFIG_FAIR_CGROUP_SCHED)
+#elif defined(CONFIG_CGROUP_SCHED)
tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
struct task_group, css);
#else
@@ -288,11 +303,15 @@ static inline struct task_group *task_gr
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
+#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
p->se.parent = task_group(p)->se[cpu];
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
p->rt.rt_rq = task_group(p)->rt_rq[cpu];
p->rt.parent = task_group(p)->rt_se[cpu];
+#endif
}
static inline void lock_task_group_list(void)
@@ -323,7 +342,7 @@ static inline void unlock_task_group_lis
static inline void lock_doms_cur(void) { }
static inline void unlock_doms_cur(void) { }
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* CONFIG_GROUP_SCHED */
/* CFS-related fields in a runqueue */
struct cfs_rq {
@@ -363,7 +382,7 @@ struct cfs_rq {
struct rt_rq {
struct rt_prio_array active;
unsigned long rt_nr_running;
-#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
int highest_prio; /* highest queued rt task prio */
#endif
#ifdef CONFIG_SMP
@@ -373,7 +392,7 @@ struct rt_rq {
int rt_throttled;
u64 rt_time;
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_RT_GROUP_SCHED
unsigned long rt_nr_boosted;
struct rq *rq;
@@ -449,6 +468,8 @@ struct rq {
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
struct list_head leaf_rt_rq_list;
#endif
@@ -7108,7 +7129,7 @@ static void init_rt_rq(struct rt_rq *rt_
/* delimiter for bitsearch: */
__set_bit(MAX_RT_PRIO, array->bitmap);
-#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
rt_rq->highest_prio = MAX_RT_PRIO;
#endif
#ifdef CONFIG_SMP
@@ -7119,7 +7140,7 @@ static void init_rt_rq(struct rt_rq *rt_
rt_rq->rt_time = 0;
rt_rq->rt_throttled = 0;
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_RT_GROUP_SCHED
rt_rq->rt_nr_boosted = 0;
rt_rq->rq = rq;
#endif
@@ -7143,7 +7164,9 @@ static void init_tg_cfs_entry(struct rq
se->load.inv_weight = div64_64(1ULL<<32, se->load.weight);
se->parent = NULL;
}
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
static void init_tg_rt_entry(struct rq *rq, struct task_group *tg,
struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
int cpu, int add)
@@ -7172,7 +7195,7 @@ void __init sched_init(void)
init_defrootdomain();
#endif
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_GROUP_SCHED
list_add(&init_task_group.list, &task_groups);
#endif
@@ -7193,6 +7216,8 @@ void __init sched_init(void)
&per_cpu(init_cfs_rq, i),
&per_cpu(init_sched_entity, i), i, 1);
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
init_task_group.rt_runtime =
sysctl_sched_rt_runtime * NSEC_PER_USEC;
INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
@@ -7385,9 +7410,9 @@ void set_curr_task(int cpu, struct task_
#endif
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_GROUP_SCHED
-#ifdef CONFIG_SMP
+#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
/*
* distribute shares of all task groups among their schedulable entities,
* to reflect load distribution across cpus.
@@ -7543,20 +7568,28 @@ static void free_sched_group(struct task
int i;
for_each_possible_cpu(i) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
if (tg->cfs_rq)
kfree(tg->cfs_rq[i]);
if (tg->se)
kfree(tg->se[i]);
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
if (tg->rt_rq)
kfree(tg->rt_rq[i]);
if (tg->rt_se)
kfree(tg->rt_se[i]);
+#endif
}
+#ifdef CONFIG_FAIR_GROUP_SCHED
kfree(tg->cfs_rq);
kfree(tg->se);
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
kfree(tg->rt_rq);
kfree(tg->rt_se);
+#endif
kfree(tg);
}
@@ -7564,10 +7597,14 @@ static void free_sched_group(struct task
struct task_group *sched_create_group(void)
{
struct task_group *tg;
+#ifdef CONFIG_FAIR_GROUP_SCHED
struct cfs_rq *cfs_rq;
struct sched_entity *se;
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
struct rt_rq *rt_rq;
struct sched_rt_entity *rt_se;
+#endif
struct rq *rq;
int i;
@@ -7575,12 +7612,18 @@ struct task_group *sched_create_group(vo
if (!tg)
return ERR_PTR(-ENOMEM);
+#ifdef CONFIG_FAIR_GROUP_SCHED
tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL);
if (!tg->cfs_rq)
goto err;
tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL);
if (!tg->se)
goto err;
+
+ tg->shares = NICE_0_LOAD;
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
if (!tg->rt_rq)
goto err;
@@ -7588,12 +7631,13 @@ struct task_group *sched_create_group(vo
if (!tg->rt_se)
goto err;
- tg->shares = NICE_0_LOAD;
tg->rt_runtime = 0;
+#endif
for_each_possible_cpu(i) {
rq = cpu_rq(i);
+#ifdef CONFIG_FAIR_GROUP_SCHED
cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
if (!cfs_rq)
@@ -7604,6 +7648,10 @@ struct task_group *sched_create_group(vo
if (!se)
goto err;
+ init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
rt_rq = kmalloc_node(sizeof(struct rt_rq),
GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
if (!rt_rq)
@@ -7614,17 +7662,21 @@ struct task_group *sched_create_group(vo
if (!rt_se)
goto err;
- init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0);
+#endif
}
lock_task_group_list();
for_each_possible_cpu(i) {
rq = cpu_rq(i);
+#ifdef CONFIG_FAIR_GROUP_SCHED
cfs_rq = tg->cfs_rq[i];
list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
rt_rq = tg->rt_rq[i];
list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
+#endif
}
list_add_rcu(&tg->list, &task_groups);
unlock_task_group_list();
@@ -7646,22 +7698,20 @@ static void free_sched_group_rcu(struct
/* Destroy runqueue etc associated with a task group */
void sched_destroy_group(struct task_group *tg)
{
- struct cfs_rq *cfs_rq = NULL;
- struct rt_rq *rt_rq = NULL;
int i;
lock_task_group_list();
for_each_possible_cpu(i) {
- cfs_rq = tg->cfs_rq[i];
- list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
- rt_rq = tg->rt_rq[i];
- list_del_rcu(&rt_rq->leaf_rt_rq_list);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ list_del_rcu(&tg->cfs_rq[i]->leaf_cfs_rq_list);
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+ list_del_rcu(&tg->rt_rq[i]->leaf_rt_rq_list);
+#endif
}
list_del_rcu(&tg->list);
unlock_task_group_list();
- BUG_ON(!cfs_rq);
-
/* wait for possible concurrent references to cfs_rqs complete */
call_rcu(&tg->rcu, free_sched_group_rcu);
}
@@ -7701,6 +7751,7 @@ void sched_move_task(struct task_struct
task_rq_unlock(rq, &flags);
}
+#ifdef CONFIG_FAIR_GROUP_SCHED
/* rq->lock to be locked by caller */
static void set_se_shares(struct sched_entity *se, unsigned long shares)
{
@@ -7781,7 +7832,9 @@ unsigned long sched_group_shares(struct
{
return tg->shares;
}
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
/*
* Ensure that the real time constraints are schedulable.
*/
@@ -7853,9 +7906,10 @@ long sched_group_rt_runtime(struct task_
do_div(rt_runtime_us, NSEC_PER_USEC);
return rt_runtime_us;
}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif
+#endif /* CONFIG_GROUP_SCHED */
-#ifdef CONFIG_FAIR_CGROUP_SCHED
+#ifdef CONFIG_CGROUP_SCHED
/* return corresponding task_group object of a cgroup */
static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
@@ -7915,6 +7969,7 @@ cpu_cgroup_attach(struct cgroup_subsys *
sched_move_task(tsk);
}
+#ifdef CONFIG_FAIR_GROUP_SCHED
static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype,
u64 shareval)
{
@@ -7927,7 +7982,9 @@ static u64 cpu_shares_read_uint(struct c
return (u64) tg->shares;
}
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
struct file *file,
const char __user *userbuf,
@@ -7972,18 +8029,23 @@ static ssize_t cpu_rt_runtime_read(struc
return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
}
+#endif
static struct cftype cpu_files[] = {
+#ifdef CONFIG_FAIR_GROUP_SCHED
{
.name = "shares",
.read_uint = cpu_shares_read_uint,
.write_uint = cpu_shares_write_uint,
},
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
{
.name = "rt_runtime_us",
.read = cpu_rt_runtime_read,
.write = cpu_rt_runtime_write,
},
+#endif
};
static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
@@ -8002,7 +8064,7 @@ struct cgroup_subsys cpu_cgroup_subsys =
.early_init = 1,
};
-#endif /* CONFIG_FAIR_CGROUP_SCHED */
+#endif /* CONFIG_CGROUP_SCHED */
#ifdef CONFIG_CGROUP_CPUACCT
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -55,7 +55,7 @@ static inline int on_rt_rq(struct sched_
return !list_empty(&rt_se->run_list);
}
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_RT_GROUP_SCHED
static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
@@ -177,7 +177,7 @@ static inline int rt_rq_throttled(struct
static inline int rt_se_prio(struct sched_rt_entity *rt_se)
{
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_RT_GROUP_SCHED
struct rt_rq *rt_rq = group_rt_rq(rt_se);
if (rt_rq)
@@ -269,7 +269,7 @@ void inc_rt_tasks(struct sched_rt_entity
{
WARN_ON(!rt_prio(rt_se_prio(rt_se)));
rt_rq->rt_nr_running++;
-#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
if (rt_se_prio(rt_se) < rt_rq->highest_prio)
rt_rq->highest_prio = rt_se_prio(rt_se);
#endif
@@ -281,7 +281,7 @@ void inc_rt_tasks(struct sched_rt_entity
update_rt_migration(rq_of_rt_rq(rt_rq));
#endif
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_RT_GROUP_SCHED
if (rt_se_boosted(rt_se))
rt_rq->rt_nr_boosted++;
#endif
@@ -293,7 +293,7 @@ void dec_rt_tasks(struct sched_rt_entity
WARN_ON(!rt_prio(rt_se_prio(rt_se)));
WARN_ON(!rt_rq->rt_nr_running);
rt_rq->rt_nr_running--;
-#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
if (rt_rq->rt_nr_running) {
struct rt_prio_array *array;
@@ -315,7 +315,7 @@ void dec_rt_tasks(struct sched_rt_entity
update_rt_migration(rq_of_rt_rq(rt_rq));
#endif /* CONFIG_SMP */
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_RT_GROUP_SCHED
if (rt_se_boosted(rt_se))
rt_rq->rt_nr_boosted--;
--
* [PATCH 5/8] sched: rt-group: clean up the ifdeffery
[not found] <20080204210258.118479000@chello.nl>
` (3 preceding siblings ...)
2008-02-04 21:03 ` [PATCH 4/8] sched: rt-group: make rt groups scheduling configurable Peter Zijlstra
@ 2008-02-04 21:03 ` Peter Zijlstra
2008-02-04 21:03 ` [PATCH 6/8] sched: rt-group: refuse unrunnable tasks Peter Zijlstra
` (2 subsequent siblings)
7 siblings, 0 replies; 14+ messages in thread
From: Peter Zijlstra @ 2008-02-04 21:03 UTC (permalink / raw)
To: Ingo Molnar, linux-kernel; +Cc: tong.n.li, Peter Zijlstra
[-- Attachment #1: sched-rt-group-unclutter.patch --]
[-- Type: text/plain, Size: 4872 bytes --]
Clean up some of the excessive ifdeffery introduced by the previous patch.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
kernel/sched.c | 150 ++++++++++++++++++++++++++++++++++++++-------------------
1 file changed, 100 insertions(+), 50 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -7563,56 +7563,29 @@ static int load_balance_monitor(void *un
}
#endif /* CONFIG_SMP */
-static void free_sched_group(struct task_group *tg)
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void free_fair_sched_group(struct task_group *tg)
{
int i;
for_each_possible_cpu(i) {
-#ifdef CONFIG_FAIR_GROUP_SCHED
if (tg->cfs_rq)
kfree(tg->cfs_rq[i]);
if (tg->se)
kfree(tg->se[i]);
-#endif
-#ifdef CONFIG_RT_GROUP_SCHED
- if (tg->rt_rq)
- kfree(tg->rt_rq[i]);
- if (tg->rt_se)
- kfree(tg->rt_se[i]);
-#endif
}
-#ifdef CONFIG_FAIR_GROUP_SCHED
kfree(tg->cfs_rq);
kfree(tg->se);
-#endif
-#ifdef CONFIG_RT_GROUP_SCHED
- kfree(tg->rt_rq);
- kfree(tg->rt_se);
-#endif
- kfree(tg);
}
-/* allocate runqueue etc for a new task group */
-struct task_group *sched_create_group(void)
+static int alloc_fair_sched_group(struct task_group *tg)
{
- struct task_group *tg;
-#ifdef CONFIG_FAIR_GROUP_SCHED
struct cfs_rq *cfs_rq;
struct sched_entity *se;
-#endif
-#ifdef CONFIG_RT_GROUP_SCHED
- struct rt_rq *rt_rq;
- struct sched_rt_entity *rt_se;
-#endif
struct rq *rq;
int i;
- tg = kzalloc(sizeof(*tg), GFP_KERNEL);
- if (!tg)
- return ERR_PTR(-ENOMEM);
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL);
if (!tg->cfs_rq)
goto err;
@@ -7621,23 +7594,10 @@ struct task_group *sched_create_group(vo
goto err;
tg->shares = NICE_0_LOAD;
-#endif
-
-#ifdef CONFIG_RT_GROUP_SCHED
- tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
- if (!tg->rt_rq)
- goto err;
- tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL);
- if (!tg->rt_se)
- goto err;
-
- tg->rt_runtime = 0;
-#endif
for_each_possible_cpu(i) {
rq = cpu_rq(i);
-#ifdef CONFIG_FAIR_GROUP_SCHED
cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
if (!cfs_rq)
@@ -7649,9 +7609,68 @@ struct task_group *sched_create_group(vo
goto err;
init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
+ }
+
+ lock_task_group_list();
+ for_each_possible_cpu(i) {
+ rq = cpu_rq(i);
+ cfs_rq = tg->cfs_rq[i];
+ list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+ }
+ list_add_rcu(&tg->list, &task_groups);
+ unlock_task_group_list();
+
+ return 1;
+
+ err:
+ return 0;
+}
+#else
+static inline void free_fair_sched_group(struct task_group *tg)
+{
+}
+
+static inline int alloc_fair_sched_group(struct task_group *tg)
+{
+ return 1;
+}
#endif
#ifdef CONFIG_RT_GROUP_SCHED
+static void free_rt_sched_group(struct task_group *tg)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ if (tg->rt_rq)
+ kfree(tg->rt_rq[i]);
+ if (tg->rt_se)
+ kfree(tg->rt_se[i]);
+ }
+
+ kfree(tg->rt_rq);
+ kfree(tg->rt_se);
+}
+
+static int alloc_rt_sched_group(struct task_group *tg)
+{
+ struct rt_rq *rt_rq;
+ struct sched_rt_entity *rt_se;
+ struct rq *rq;
+ int i;
+
+ tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
+ if (!tg->rt_rq)
+ goto err;
+ tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL);
+ if (!tg->rt_se)
+ goto err;
+
+ tg->rt_runtime = 0;
+
+ for_each_possible_cpu(i) {
+ rq = cpu_rq(i);
+
rt_rq = kmalloc_node(sizeof(struct rt_rq),
GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
if (!rt_rq)
@@ -7663,24 +7682,55 @@ struct task_group *sched_create_group(vo
goto err;
init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0);
-#endif
}
lock_task_group_list();
for_each_possible_cpu(i) {
rq = cpu_rq(i);
-#ifdef CONFIG_FAIR_GROUP_SCHED
- cfs_rq = tg->cfs_rq[i];
- list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
-#endif
-#ifdef CONFIG_RT_GROUP_SCHED
rt_rq = tg->rt_rq[i];
list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
-#endif
}
list_add_rcu(&tg->list, &task_groups);
unlock_task_group_list();
+ return 1;
+
+ err:
+ return 0;
+}
+#else
+static inline void free_rt_sched_group(struct task_group *tg)
+{
+}
+
+static inline int alloc_rt_sched_group(struct task_group *tg)
+{
+ return 1;
+}
+#endif
+
+static void free_sched_group(struct task_group *tg)
+{
+ free_fair_sched_group(tg);
+ free_rt_sched_group(tg);
+ kfree(tg);
+}
+
+/* allocate runqueue etc for a new task group */
+struct task_group *sched_create_group(void)
+{
+ struct task_group *tg;
+
+ tg = kzalloc(sizeof(*tg), GFP_KERNEL);
+ if (!tg)
+ return ERR_PTR(-ENOMEM);
+
+ if (!alloc_fair_sched_group(tg))
+ goto err;
+
+ if (!alloc_rt_sched_group(tg))
+ goto err;
+
return tg;
err:
--
* [PATCH 6/8] sched: rt-group: refuse unrunnable tasks
[not found] <20080204210258.118479000@chello.nl>
` (4 preceding siblings ...)
2008-02-04 21:03 ` [PATCH 5/8] sched: rt-group: clean up the ifdeffery Peter Zijlstra
@ 2008-02-04 21:03 ` Peter Zijlstra
2008-02-04 21:03 ` [PATCH 7/8] sched: rt-group: synchronised bandwidth period Peter Zijlstra
2008-02-04 21:03 ` [PATCH 8/8] sched: rt-group: smp balancing Peter Zijlstra
7 siblings, 0 replies; 14+ messages in thread
From: Peter Zijlstra @ 2008-02-04 21:03 UTC (permalink / raw)
To: Ingo Molnar, linux-kernel; +Cc: tong.n.li, Peter Zijlstra
[-- Attachment #1: sched-rt-group-accept.patch --]
[-- Type: text/plain, Size: 1198 bytes --]
Refuse to accept or create RT tasks in groups that can't run them.
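[ From userspace this shows up as -EPERM from sched_setscheduler(); a
  minimal illustration, assuming the calling task sits in a group that has
  no runtime assigned: ]

	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		struct sched_param sp = { .sched_priority = 10 };

		/* rejected while the caller's task group has rt_runtime == 0 */
		if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1)
			perror("sched_setscheduler");	/* EPERM */
		return 0;
	}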
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
kernel/sched.c | 15 +++++++++++++++
1 file changed, 15 insertions(+)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -4588,6 +4588,15 @@ recheck:
return -EPERM;
}
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * Do not allow realtime tasks into groups that have no runtime
+ * assigned.
+ */
+ if (rt_policy(policy) && task_group(p)->rt_runtime == 0)
+ return -EPERM;
+#endif
+
retval = security_task_setscheduler(p, policy, param);
if (retval)
return retval;
@@ -8005,9 +8014,15 @@ static int
cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
struct task_struct *tsk)
{
+#ifdef CONFIG_RT_GROUP_SCHED
+ /* Don't accept realtime tasks when there is no way for them to run */
+ if (rt_task(tsk) && cgroup_tg(cgrp)->rt_runtime == 0)
+ return -EINVAL;
+#else
/* We don't support RT-tasks being in separate groups */
if (tsk->sched_class != &fair_sched_class)
return -EINVAL;
+#endif
return 0;
}
--
* [PATCH 7/8] sched: rt-group: synchronised bandwidth period
[not found] <20080204210258.118479000@chello.nl>
` (5 preceding siblings ...)
2008-02-04 21:03 ` [PATCH 6/8] sched: rt-group: refuse unrunnable tasks Peter Zijlstra
@ 2008-02-04 21:03 ` Peter Zijlstra
2008-02-04 21:03 ` [PATCH 8/8] sched: rt-group: smp balancing Peter Zijlstra
7 siblings, 0 replies; 14+ messages in thread
From: Peter Zijlstra @ 2008-02-04 21:03 UTC (permalink / raw)
To: Ingo Molnar, linux-kernel; +Cc: tong.n.li, Peter Zijlstra
[-- Attachment #1: sched-rt-group-period.patch --]
[-- Type: text/plain, Size: 17867 bytes --]
Various SMP balancing algorithms require that the bandwidth periods on all
cpus run in sync.
Possible improvements are moving the rt_bandwidth structure into root_domain
and keeping a span per rt_bandwidth that marks throttled cpus.
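[ The per-period replenishment rule itself is unchanged; what changes is
  who runs it: instead of scheduler_tick() checking rq->rt_period_expire
  on every cpu, one hrtimer per rt_bandwidth fires every rt_period and
  applies, roughly, the following to each rt_rq in the period mask: ]

	rt_rq->rt_time -= min(rt_rq->rt_time, runtime);
	if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
		rt_rq->rt_throttled = 0;
		sched_rt_rq_enqueue(rt_rq);
	}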
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/sched.h | 7 +
kernel/sched.c | 263 +++++++++++++++++++++++++++++++++++++----------
kernel/sched_rt.c | 104 ++++++++++++------
kernel/sysctl.c | 4
kernel/time/tick-sched.c | 5
5 files changed, 293 insertions(+), 90 deletions(-)
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1519,6 +1519,10 @@ int sched_nr_latency_handler(struct ctl_
extern unsigned int sysctl_sched_rt_period;
extern int sysctl_sched_rt_runtime;
+int sched_rt_handler(struct ctl_table *table, int write,
+ struct file *filp, void __user *buffer, size_t *lenp,
+ loff_t *ppos);
+
extern unsigned int sysctl_sched_compat_yield;
#ifdef CONFIG_RT_MUTEXES
@@ -2000,6 +2004,9 @@ extern unsigned long sched_group_shares(
extern int sched_group_set_rt_runtime(struct task_group *tg,
long rt_runtime_us);
extern long sched_group_rt_runtime(struct task_group *tg);
+extern int sched_group_set_rt_period(struct task_group *tg,
+ long rt_period_us);
+extern long sched_group_rt_period(struct task_group *tg);
#endif
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -114,6 +114,11 @@ unsigned long long __attribute__((weak))
*/
#define DEF_TIMESLICE (100 * HZ / 1000)
+/*
+ * single value that denotes runtime == period, ie unlimited time.
+ */
+#define RUNTIME_INF ((u64)~0ULL)
+
#ifdef CONFIG_SMP
/*
* Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
@@ -155,6 +160,81 @@ struct rt_prio_array {
struct list_head queue[MAX_RT_PRIO];
};
+static inline ktime_t ns_to_ktime(u64 ns)
+{
+ static const ktime_t ktime_zero = { .tv64 = 0 };
+ return ktime_add_ns(ktime_zero, ns);
+}
+
+struct rt_bandwidth {
+ ktime_t rt_period;
+ u64 rt_runtime;
+ struct hrtimer rt_period_timer;
+};
+
+static struct rt_bandwidth def_rt_bandwidth;
+
+static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
+
+static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
+{
+ struct rt_bandwidth *rt_b =
+ container_of(timer, struct rt_bandwidth, rt_period_timer);
+ ktime_t now;
+ int overrun;
+ int idle = 0;
+
+ for (;;) {
+ now = hrtimer_cb_get_time(timer);
+ overrun = hrtimer_forward(timer, now, rt_b->rt_period);
+
+ if (!overrun)
+ break;
+
+ idle = do_sched_rt_period_timer(rt_b, overrun);
+ }
+
+ return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
+}
+
+static
+void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
+{
+ rt_b->rt_period = ns_to_ktime(period);
+ rt_b->rt_runtime = runtime;
+
+ hrtimer_init(&rt_b->rt_period_timer,
+ CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ rt_b->rt_period_timer.function = sched_rt_period_timer;
+ rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+}
+
+static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+{
+ ktime_t now;
+
+ if (rt_b->rt_runtime == RUNTIME_INF)
+ return;
+
+ for (;;) {
+ if (hrtimer_active(&rt_b->rt_period_timer))
+ break;
+
+ now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
+ hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
+ hrtimer_start(&rt_b->rt_period_timer,
+ rt_b->rt_period_timer.expires,
+ HRTIMER_MODE_ABS);
+ }
+}
+
+#ifdef CONFIG_RT_GROUP_SCHED
+static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
+{
+ hrtimer_cancel(&rt_b->rt_period_timer);
+}
+#endif
+
#ifdef CONFIG_GROUP_SCHED
#include <linux/cgroup.h>
@@ -216,7 +296,7 @@ struct task_group {
struct sched_rt_entity **rt_se;
struct rt_rq **rt_rq;
- u64 rt_runtime;
+ struct rt_bandwidth rt_bandwidth;
#endif
struct rcu_head rcu;
@@ -462,8 +542,6 @@ struct rq {
struct cfs_rq cfs;
struct rt_rq rt;
- u64 rt_period_expire;
- int rt_throttled;
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
@@ -618,23 +696,6 @@ static void update_rq_clock(struct rq *r
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
-unsigned long rt_needs_cpu(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
- u64 delta;
-
- if (!rq->rt_throttled)
- return 0;
-
- if (rq->clock > rq->rt_period_expire)
- return 1;
-
- delta = rq->rt_period_expire - rq->clock;
- do_div(delta, NSEC_PER_SEC / HZ);
-
- return (unsigned long)delta;
-}
-
/*
* Tunables that become constants when CONFIG_SCHED_DEBUG is off:
*/
@@ -686,10 +747,18 @@ unsigned int sysctl_sched_rt_period = 10
*/
int sysctl_sched_rt_runtime = 950000;
-/*
- * single value that denotes runtime == period, ie unlimited time.
- */
-#define RUNTIME_INF ((u64)~0ULL)
+static inline u64 global_rt_period(void)
+{
+ return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
+}
+
+static inline u64 global_rt_runtime(void)
+{
+ if (sysctl_sched_rt_runtime < 0)
+ return RUNTIME_INF;
+
+ return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
+}
/*
* For kernel-internal use: high-speed (but slightly incorrect) per-cpu
@@ -3768,7 +3837,6 @@ void scheduler_tick(void)
rq->tick_timestamp = rq->clock;
update_cpu_load(rq);
curr->sched_class->task_tick(rq, curr, 0);
- update_sched_rt_period(rq);
spin_unlock(&rq->lock);
#ifdef CONFIG_SMP
@@ -4593,7 +4661,7 @@ recheck:
* Do not allow realtime tasks into groups that have no runtime
* assigned.
*/
- if (rt_policy(policy) && task_group(p)->rt_runtime == 0)
+ if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
return -EPERM;
#endif
@@ -7204,6 +7272,14 @@ void __init sched_init(void)
init_defrootdomain();
#endif
+ init_rt_bandwidth(&def_rt_bandwidth,
+ global_rt_period(), global_rt_runtime());
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ init_rt_bandwidth(&init_task_group.rt_bandwidth,
+ global_rt_period(), global_rt_runtime());
+#endif
+
#ifdef CONFIG_GROUP_SCHED
list_add(&init_task_group.list, &task_groups);
#endif
@@ -7227,15 +7303,11 @@ void __init sched_init(void)
#endif
#ifdef CONFIG_RT_GROUP_SCHED
- init_task_group.rt_runtime =
- sysctl_sched_rt_runtime * NSEC_PER_USEC;
INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
init_tg_rt_entry(rq, &init_task_group,
&per_cpu(init_rt_rq, i),
&per_cpu(init_sched_rt_entity, i), i, 1);
#endif
- rq->rt_period_expire = 0;
- rq->rt_throttled = 0;
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
rq->cpu_load[j] = 0;
@@ -7419,8 +7491,6 @@ void set_curr_task(int cpu, struct task_
#endif
-#ifdef CONFIG_GROUP_SCHED
-
#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
/*
* distribute shares of all task groups among their schedulable entities,
@@ -7650,6 +7720,8 @@ static void free_rt_sched_group(struct t
{
int i;
+ destroy_rt_bandwidth(&tg->rt_bandwidth);
+
for_each_possible_cpu(i) {
if (tg->rt_rq)
kfree(tg->rt_rq[i]);
@@ -7675,7 +7747,8 @@ static int alloc_rt_sched_group(struct t
if (!tg->rt_se)
goto err;
- tg->rt_runtime = 0;
+ init_rt_bandwidth(&tg->rt_bandwidth,
+ ktime_to_ns(def_rt_bandwidth.rt_period), 0);
for_each_possible_cpu(i) {
rq = cpu_rq(i);
@@ -7718,6 +7791,7 @@ static inline int alloc_rt_sched_group(s
}
#endif
+#ifdef CONFIG_GROUP_SCHED
static void free_sched_group(struct task_group *tg)
{
free_fair_sched_group(tg);
@@ -7809,6 +7883,7 @@ void sched_move_task(struct task_struct
task_rq_unlock(rq, &flags);
}
+#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
/* rq->lock to be locked by caller */
@@ -7914,59 +7989,129 @@ static int __rt_schedulable(struct task_
struct task_group *tgi;
unsigned long total = 0;
unsigned long global_ratio =
- to_ratio(sysctl_sched_rt_period,
- sysctl_sched_rt_runtime < 0 ?
- RUNTIME_INF : sysctl_sched_rt_runtime);
+ to_ratio(global_rt_period(), global_rt_runtime());
rcu_read_lock();
list_for_each_entry_rcu(tgi, &task_groups, list) {
if (tgi == tg)
continue;
- total += to_ratio(period, tgi->rt_runtime);
+ total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
+ tgi->rt_bandwidth.rt_runtime);
}
rcu_read_unlock();
return total + to_ratio(period, runtime) < global_ratio;
}
-int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
+static int tg_set_bandwidth(struct task_group *tg,
+ u64 rt_period, u64 rt_runtime)
{
- u64 rt_runtime, rt_period;
int err = 0;
- rt_period = sysctl_sched_rt_period * NSEC_PER_USEC;
- rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
- if (rt_runtime_us == -1)
- rt_runtime = rt_period;
-
mutex_lock(&rt_constraints_mutex);
if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
err = -EINVAL;
goto unlock;
}
- if (rt_runtime_us == -1)
- rt_runtime = RUNTIME_INF;
- tg->rt_runtime = rt_runtime;
+ tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
+ tg->rt_bandwidth.rt_runtime = rt_runtime;
unlock:
mutex_unlock(&rt_constraints_mutex);
return err;
}
+int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
+{
+ u64 rt_runtime, rt_period;
+
+ rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+ rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
+ if (rt_runtime_us < 0)
+ rt_runtime = RUNTIME_INF;
+
+ return tg_set_bandwidth(tg, rt_period, rt_runtime);
+}
+
long sched_group_rt_runtime(struct task_group *tg)
{
u64 rt_runtime_us;
- if (tg->rt_runtime == RUNTIME_INF)
+ if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
return -1;
- rt_runtime_us = tg->rt_runtime;
+ rt_runtime_us = tg->rt_bandwidth.rt_runtime;
do_div(rt_runtime_us, NSEC_PER_USEC);
return rt_runtime_us;
}
+
+int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
+{
+ u64 rt_runtime, rt_period;
+
+ rt_period = (u64)rt_period_us * NSEC_PER_USEC;
+ rt_runtime = tg->rt_bandwidth.rt_runtime;
+
+ return tg_set_bandwidth(tg, rt_period, rt_runtime);
+}
+
+long sched_group_rt_period(struct task_group *tg)
+{
+ u64 rt_period_us;
+
+ rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
+ do_div(rt_period_us, NSEC_PER_USEC);
+ return rt_period_us;
+}
+
+static int sched_rt_global_constraints(void)
+{
+ int ret = 0;
+
+ mutex_lock(&rt_constraints_mutex);
+ if (!__rt_schedulable(NULL, 1, 0))
+ ret = -EINVAL;
+ mutex_unlock(&rt_constraints_mutex);
+
+ return ret;
+}
+#else
+static int sched_rt_global_constraints(void)
+{
+ return 0;
+}
#endif
-#endif /* CONFIG_GROUP_SCHED */
+
+int sched_rt_handler(struct ctl_table *table, int write,
+ struct file *filp, void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+ int old_period, old_runtime;
+ static DEFINE_MUTEX(mutex);
+
+ mutex_lock(&mutex);
+ old_period = sysctl_sched_rt_period;
+ old_runtime = sysctl_sched_rt_runtime;
+
+ ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+
+ if (!ret && write) {
+ ret = sched_rt_global_constraints();
+ if (ret) {
+ sysctl_sched_rt_period = old_period;
+ sysctl_sched_rt_runtime = old_runtime;
+ } else {
+ def_rt_bandwidth.rt_runtime = global_rt_runtime();
+ def_rt_bandwidth.rt_period =
+ ns_to_ktime(global_rt_period());
+ }
+ }
+ mutex_unlock(&mutex);
+
+ return ret;
+}
#ifdef CONFIG_CGROUP_SCHED
@@ -8016,7 +8161,7 @@ cpu_cgroup_can_attach(struct cgroup_subs
{
#ifdef CONFIG_RT_GROUP_SCHED
/* Don't accept realtime tasks when there is no way for them to run */
- if (rt_task(tsk) && cgroup_tg(cgrp)->rt_runtime == 0)
+ if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
return -EINVAL;
#else
/* We don't support RT-tasks being in separate groups */
@@ -8094,6 +8239,17 @@ static ssize_t cpu_rt_runtime_read(struc
return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
}
+
+static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
+ u64 rt_period_us)
+{
+ return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
+}
+
+static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
+{
+ return sched_group_rt_period(cgroup_tg(cgrp));
+}
#endif
static struct cftype cpu_files[] = {
@@ -8110,6 +8266,11 @@ static struct cftype cpu_files[] = {
.read = cpu_rt_runtime_read,
.write = cpu_rt_runtime_write,
},
+ {
+ .name = "rt_period_us",
+ .read_uint = cpu_rt_period_read_uint,
+ .write_uint = cpu_rt_period_write_uint,
+ },
#endif
};
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -62,7 +62,7 @@ static inline u64 sched_rt_runtime(struc
if (!rt_rq->tg)
return RUNTIME_INF;
- return rt_rq->tg->rt_runtime;
+ return rt_rq->tg->rt_bandwidth.rt_runtime;
}
#define for_each_leaf_rt_rq(rt_rq, rq) \
@@ -127,14 +127,29 @@ static int rt_se_boosted(struct sched_rt
return p->prio != p->normal_prio;
}
+#ifdef CONFIG_SMP
+static inline cpumask_t sched_rt_period_mask(void)
+{
+ return cpu_rq(smp_processor_id())->rd->span;
+}
#else
+static inline cpumask_t sched_rt_period_mask(void)
+{
+ return cpu_online_map;
+}
+#endif
-static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
+static inline
+struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
- if (sysctl_sched_rt_runtime == -1)
- return RUNTIME_INF;
+ return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
+}
- return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
+#else
+
+static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
+{
+ return def_rt_bandwidth.rt_runtime;
}
#define for_each_leaf_rt_rq(rt_rq, rq) \
@@ -173,8 +188,55 @@ static inline int rt_rq_throttled(struct
{
return rt_rq->rt_throttled;
}
+
+static inline cpumask_t sched_rt_period_mask(void)
+{
+ return cpu_online_map;
+}
+
+static inline
+struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
+{
+ return &cpu_rq(cpu)->rt;
+}
+
#endif
+static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
+{
+ int i, idle = 1;
+ cpumask_t span;
+
+ if (rt_b->rt_runtime == RUNTIME_INF)
+ return 1;
+
+ span = sched_rt_period_mask();
+ for_each_cpu_mask(i, span) {
+ int enqueue = 0;
+ struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ spin_lock(&rq->lock);
+ if (rt_rq->rt_time) {
+ u64 runtime = rt_b->rt_runtime;
+
+ rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
+ if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
+ rt_rq->rt_throttled = 0;
+ enqueue = 1;
+ }
+ if (rt_rq->rt_time || rt_rq->rt_nr_running)
+ idle = 0;
+ }
+
+ if (enqueue)
+ sched_rt_rq_enqueue(rt_rq);
+ spin_unlock(&rq->lock);
+ }
+
+ return idle;
+}
+
static inline int rt_se_prio(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_RT_GROUP_SCHED
@@ -198,11 +260,7 @@ static int sched_rt_runtime_exceeded(str
return rt_rq_throttled(rt_rq);
if (rt_rq->rt_time > runtime) {
- struct rq *rq = rq_of_rt_rq(rt_rq);
-
- rq->rt_throttled = 1;
rt_rq->rt_throttled = 1;
-
if (rt_rq_throttled(rt_rq)) {
sched_rt_rq_dequeue(rt_rq);
return 1;
@@ -212,29 +270,6 @@ static int sched_rt_runtime_exceeded(str
return 0;
}
-static void update_sched_rt_period(struct rq *rq)
-{
- struct rt_rq *rt_rq;
- u64 period;
-
- while (rq->clock > rq->rt_period_expire) {
- period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
- rq->rt_period_expire += period;
-
- for_each_leaf_rt_rq(rt_rq, rq) {
- u64 runtime = sched_rt_runtime(rt_rq);
-
- rt_rq->rt_time -= min(rt_rq->rt_time, runtime);
- if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
- rt_rq->rt_throttled = 0;
- sched_rt_rq_enqueue(rt_rq);
- }
- }
-
- rq->rt_throttled = 0;
- }
-}
-
/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
@@ -284,6 +319,11 @@ void inc_rt_tasks(struct sched_rt_entity
#ifdef CONFIG_RT_GROUP_SCHED
if (rt_se_boosted(rt_se))
rt_rq->rt_nr_boosted++;
+
+ if (rt_rq->tg)
+ start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
+#else
+ start_rt_bandwidth(&def_rt_bandwidth);
#endif
}
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -336,7 +336,7 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_sched_rt_period,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = &sched_rt_handler,
},
{
.ctl_name = CTL_UNNUMBERED,
@@ -344,7 +344,7 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_sched_rt_runtime,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = &sched_rt_handler,
},
{
.ctl_name = CTL_UNNUMBERED,
Index: linux-2.6/kernel/time/tick-sched.c
===================================================================
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -191,7 +191,6 @@ u64 get_cpu_idle_time_us(int cpu, u64 *l
void tick_nohz_stop_sched_tick(void)
{
unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
- unsigned long rt_jiffies;
struct tick_sched *ts;
ktime_t last_update, expires, now;
struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
@@ -244,10 +243,6 @@ void tick_nohz_stop_sched_tick(void)
next_jiffies = get_next_timer_interrupt(last_jiffies);
delta_jiffies = next_jiffies - last_jiffies;
- rt_jiffies = rt_needs_cpu(cpu);
- if (rt_jiffies && rt_jiffies < delta_jiffies)
- delta_jiffies = rt_jiffies;
-
if (rcu_needs_cpu(cpu))
delta_jiffies = 1;
/*
--
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH 8/8] sched: rt-group: smp balancing
[not found] <20080204210258.118479000@chello.nl>
` (6 preceding siblings ...)
2008-02-04 21:03 ` [PATCH 7/8] sched: rt-group: synchonised bandwidth period Peter Zijlstra
@ 2008-02-04 21:03 ` Peter Zijlstra
7 siblings, 0 replies; 14+ messages in thread
From: Peter Zijlstra @ 2008-02-04 21:03 UTC (permalink / raw)
To: Ingo Molnar, linux-kernel; +Cc: tong.n.li, Peter Zijlstra
[-- Attachment #1: sched-rt-group-smp.patch --]
[-- Type: text/plain, Size: 7868 bytes --]
Currently the rt group scheduling does a per-cpu runtime limit; however,
the rt load balancer makes no guarantees about an equal spread of real-
time tasks, just that at any one time the highest priority tasks run.
Solve this by making the runtime limit a global property: borrow excess
runtime from the other cpus once the local limit runs out.
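In rough terms, the borrowing works like this (a stand-alone user-space
sketch with made-up names and numbers, not the kernel code; the real
locking and the rd->span iteration are in the sched_rt.c hunk below):

/* Stand-alone sketch of the borrowing idea; names and numbers are made up. */
#include <stdio.h>

#define NR_CPUS 4

struct rt_budget {
	long long runtime;	/* runtime allotted this period (ns) */
	long long used;		/* runtime consumed so far (ns) */
};

/* Pull a weighted share of spare runtime for @cpu from its peers. */
static int balance_runtime(struct rt_budget *b, int cpu, long long period)
{
	int i, more = 0;

	for (i = 0; i < NR_CPUS; i++) {
		long long spare;

		if (i == cpu)
			continue;

		spare = (b[i].runtime - b[i].used) / NR_CPUS;
		if (spare <= 0)
			continue;

		if (b[cpu].runtime + spare > period)
			spare = period - b[cpu].runtime;

		b[i].runtime -= spare;
		b[cpu].runtime += spare;
		more = 1;

		if (b[cpu].runtime == period)
			break;
	}
	return more;
}

int main(void)
{
	struct rt_budget b[NR_CPUS] = {
		{ 250, 250 }, { 250, 50 }, { 250, 0 }, { 250, 100 },
	};

	balance_runtime(b, 0, 1000);	/* cpu0 hit its limit, borrow */
	printf("cpu0 runtime is now %lld\n", b[0].runtime);
	return 0;
}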
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
kernel/sched.c | 38 ++++++++++++++++++++++-
kernel/sched_rt.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
2 files changed, 121 insertions(+), 5 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -169,6 +169,7 @@ static inline ktime_t ns_to_ktime(u64 ns
struct rt_bandwidth {
ktime_t rt_period;
u64 rt_runtime;
+ spinlock_t rt_runtime_lock;
struct hrtimer rt_period_timer;
};
@@ -203,6 +204,8 @@ void init_rt_bandwidth(struct rt_bandwid
rt_b->rt_period = ns_to_ktime(period);
rt_b->rt_runtime = runtime;
+ spin_lock_init(&rt_b->rt_runtime_lock);
+
hrtimer_init(&rt_b->rt_period_timer,
CLOCK_MONOTONIC, HRTIMER_MODE_REL);
rt_b->rt_period_timer.function = sched_rt_period_timer;
@@ -471,6 +474,8 @@ struct rt_rq {
#endif
int rt_throttled;
u64 rt_time;
+ u64 rt_runtime;
+ spinlock_t rt_runtime_lock;
#ifdef CONFIG_RT_GROUP_SCHED
unsigned long rt_nr_boosted;
@@ -7216,6 +7221,8 @@ static void init_rt_rq(struct rt_rq *rt_
rt_rq->rt_time = 0;
rt_rq->rt_throttled = 0;
+ rt_rq->rt_runtime = 0;
+ spin_lock_init(&rt_rq->rt_runtime_lock);
#ifdef CONFIG_RT_GROUP_SCHED
rt_rq->rt_nr_boosted = 0;
@@ -7252,6 +7259,7 @@ static void init_tg_rt_entry(struct rq *
init_rt_rq(rt_rq, rq);
rt_rq->tg = tg;
rt_rq->rt_se = rt_se;
+ rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
if (add)
list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
@@ -7307,6 +7315,8 @@ void __init sched_init(void)
init_tg_rt_entry(rq, &init_task_group,
&per_cpu(init_rt_rq, i),
&per_cpu(init_sched_rt_entity, i), i, 1);
+#else
+ rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#endif
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -8007,15 +8017,26 @@ static int __rt_schedulable(struct task_
static int tg_set_bandwidth(struct task_group *tg,
u64 rt_period, u64 rt_runtime)
{
- int err = 0;
+ int i, err = 0;
mutex_lock(&rt_constraints_mutex);
if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
err = -EINVAL;
goto unlock;
}
+
+ spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
tg->rt_bandwidth.rt_runtime = rt_runtime;
+
+ for_each_possible_cpu(i) {
+ struct rt_rq *rt_rq = tg->rt_rq[i];
+
+ spin_lock(&rt_rq->rt_runtime_lock);
+ rt_rq->rt_runtime = rt_runtime;
+ spin_unlock(&rt_rq->rt_runtime_lock);
+ }
+ spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
unlock:
mutex_unlock(&rt_constraints_mutex);
@@ -8079,6 +8100,19 @@ static int sched_rt_global_constraints(v
#else
static int sched_rt_global_constraints(void)
{
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
+ for_each_possible_cpu(i) {
+ struct rt_rq *rt_rq = &cpu_rq(i)->rt;
+
+ spin_lock(&rt_rq->rt_runtime_lock);
+ rt_rq->rt_runtime = global_rt_runtime();
+ spin_unlock(&rt_rq->rt_runtime_lock);
+ }
+ spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
+
return 0;
}
#endif
@@ -8195,7 +8229,7 @@ static u64 cpu_shares_read_uint(struct c
#endif
#ifdef CONFIG_RT_GROUP_SCHED
-static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
+static ssize_t cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
struct file *file,
const char __user *userbuf,
size_t nbytes, loff_t *unused_ppos)
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -62,7 +62,12 @@ static inline u64 sched_rt_runtime(struc
if (!rt_rq->tg)
return RUNTIME_INF;
- return rt_rq->tg->rt_bandwidth.rt_runtime;
+ return rt_rq->rt_runtime;
+}
+
+static inline u64 sched_rt_period(struct rt_rq *rt_rq)
+{
+ return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
}
#define for_each_leaf_rt_rq(rt_rq, rq) \
@@ -145,11 +150,21 @@ struct rt_rq *sched_rt_period_rt_rq(stru
return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
}
+static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
+{
+ return &rt_rq->tg->rt_bandwidth;
+}
+
#else
static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
- return def_rt_bandwidth.rt_runtime;
+ return rt_rq->rt_runtime;
+}
+
+static inline u64 sched_rt_period(struct rt_rq *rt_rq)
+{
+ return ktime_to_ns(def_rt_bandwidth.rt_period);
}
#define for_each_leaf_rt_rq(rt_rq, rq) \
@@ -200,6 +215,11 @@ struct rt_rq *sched_rt_period_rt_rq(stru
return &cpu_rq(cpu)->rt;
}
+static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
+{
+ return &def_rt_bandwidth;
+}
+
#endif
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
@@ -218,8 +238,10 @@ static int do_sched_rt_period_timer(stru
spin_lock(&rq->lock);
if (rt_rq->rt_time) {
- u64 runtime = rt_b->rt_runtime;
+ u64 runtime;
+ spin_lock(&rt_rq->rt_runtime_lock);
+ runtime = rt_rq->rt_runtime;
rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
rt_rq->rt_throttled = 0;
@@ -227,6 +249,7 @@ static int do_sched_rt_period_timer(stru
}
if (rt_rq->rt_time || rt_rq->rt_nr_running)
idle = 0;
+ spin_unlock(&rt_rq->rt_runtime_lock);
}
if (enqueue)
@@ -237,6 +260,47 @@ static int do_sched_rt_period_timer(stru
return idle;
}
+#ifdef CONFIG_SMP
+static int balance_runtime(struct rt_rq *rt_rq)
+{
+ struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+ struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+ int i, weight, more = 0;
+ u64 rt_period;
+
+ weight = cpus_weight(rd->span);
+
+ spin_lock(&rt_b->rt_runtime_lock);
+ rt_period = ktime_to_ns(rt_b->rt_period);
+ for_each_cpu_mask(i, rd->span) {
+ struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
+ s64 diff;
+
+ if (iter == rt_rq)
+ continue;
+
+ spin_lock(&iter->rt_runtime_lock);
+ diff = iter->rt_runtime - iter->rt_time;
+ if (diff > 0) {
+ diff /= weight;
+ if (rt_rq->rt_runtime + diff > rt_period)
+ diff = rt_period - rt_rq->rt_runtime;
+ iter->rt_runtime -= diff;
+ rt_rq->rt_runtime += diff;
+ more = 1;
+ if (rt_rq->rt_runtime == rt_period) {
+ spin_unlock(&iter->rt_runtime_lock);
+ break;
+ }
+ }
+ spin_unlock(&iter->rt_runtime_lock);
+ }
+ spin_unlock(&rt_b->rt_runtime_lock);
+
+ return more;
+}
+#endif
+
static inline int rt_se_prio(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_RT_GROUP_SCHED
@@ -259,6 +323,22 @@ static int sched_rt_runtime_exceeded(str
if (rt_rq->rt_throttled)
return rt_rq_throttled(rt_rq);
+ if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq))
+ return 0;
+
+#ifdef CONFIG_SMP
+ if (rt_rq->rt_time > runtime) {
+ int more;
+
+ spin_unlock(&rt_rq->rt_runtime_lock);
+ more = balance_runtime(rt_rq);
+ spin_lock(&rt_rq->rt_runtime_lock);
+
+ if (more)
+ runtime = sched_rt_runtime(rt_rq);
+ }
+#endif
+
if (rt_rq->rt_time > runtime) {
rt_rq->rt_throttled = 1;
if (rt_rq_throttled(rt_rq)) {
@@ -294,9 +374,11 @@ static void update_curr_rt(struct rq *rq
curr->se.exec_start = rq->clock;
cpuacct_charge(curr, delta_exec);
+ spin_lock(&rt_rq->rt_runtime_lock);
rt_rq->rt_time += delta_exec;
if (sched_rt_runtime_exceeded(rt_rq))
resched_task(curr);
+ spin_unlock(&rt_rq->rt_runtime_lock);
}
static inline
--
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 3/8] sched: rt-group: interface
2008-02-04 21:03 ` [PATCH 3/8] sched: rt-group: interface Peter Zijlstra
@ 2008-02-06 1:31 ` Randy Dunlap
2008-02-23 19:48 ` Paul Menage
1 sibling, 0 replies; 14+ messages in thread
From: Randy Dunlap @ 2008-02-06 1:31 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Ingo Molnar, linux-kernel, tong.n.li
On Mon, 04 Feb 2008 22:03:01 +0100 Peter Zijlstra wrote:
> Change the rt_ratio interface to rt_runtime_us, to match rt_period_us.
> This avoids picking a granularity for the ratio.
>
> Extend the /sys/kernel/uids/<uid>/ interface to allow setting
> the group's rt_runtime.
>
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> ---
> Documentation/ABI/testing/sysfs-kernel-uids | 6 +
> Documentation/sched-rt-group.txt | 59 +++++++++++
> include/linux/sched.h | 7 -
> kernel/sched.c | 145 +++++++++++++++++++++-------
> kernel/sched_rt.c | 53 ++++------
> kernel/sysctl.c | 32 +++---
> kernel/user.c | 28 +++++
> 7 files changed, 250 insertions(+), 80 deletions(-)
> Index: linux-2.6/kernel/sched.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched.c
> +++ linux-2.6/kernel/sched.c
> @@ -7780,30 +7783,76 @@ unsigned long sched_group_shares(struct
> }
>
> /*
> - * Ensure the total rt_ratio <= sysctl_sched_rt_ratio
> + * Ensure that the real time constraints are schedulable.
> */
> -int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio)
> +static DEFINE_MUTEX(rt_constraints_mutex);
> +
> +static unsigned long to_ratio(u64 period, u64 runtime)
> +{
> + if (runtime == RUNTIME_INF)
> + return 1ULL << 16;
> +
> + runtime *= (1ULL << 16);
> + do_div(runtime, period);
Isn't do_div() defined as taking (uint64_t, uint32_t) ?
> + return runtime;
> +}
> +
> Index: linux-2.6/Documentation/sched-rt-group.txt
> ===================================================================
> --- /dev/null
> +++ linux-2.6/Documentation/sched-rt-group.txt
> @@ -0,0 +1,59 @@
> +
> +
> +Real-Time group scheduling.
> +
> +The problem space:
> +
> +In order to schedule multiple groups of realtime tasks each group must
> +be assigned a fixed portion of the cpu time available. Without a minimum
Use "cpu" or "CPU" consistently, please. (I prefer CPU, but ....)
> +guarantee a realtime group can obviously fall short. A fuzzy upper limit
> +is of no use since it cannot be relied upon. Which leaves us with just
> +the single fixed portion.
> +
> +CPU time is divided by means of specifying how much time can be spend
s/spend/spent/
> +running in a given period. Say a frame fixed realtime renderer must
> +deliver a 25 frames a second, which yields a period of 0.04s. Now say
drop "a"^
> +it will also have to play some music and respond to input, leaving it
> +with around 80% for the graphics. We can then give this group a runtime
> +of 0.8 * 0.04s = 0.032s.
> +
> +This way the graphics group will have a 0.04s period with a 0.032s runtime
> +limit.
> +
> +Now if the audio thread needs to refill the dma buffer every 0.005s, but
DMA preferably.
> +needs only about 3% cpu time to do so, it will can do with a 0.03 * 0.005s
s/will can do/can do/
> += 0.00015s.
> +
> +
> +The Interface:
> +
> +system wide:
> +
> +/proc/sys/kernel/sched_rt_period_ms
> +/proc/sys/kernel/sched_rt_runtime_us
> +
> +CONFIG_FAIR_USER_SCHED
> +
> +/sys/kernel/uids/<uid>/cpu_rt_runtime_us
> +
> +or
> +
> +CONFIG_FAIR_CGROUP_SCHED
> +
> +/cgroup/<cgroup>/cpu.rt_runtime_us
> +
> +[ time is specified in us because the interface is s32, this gives an
s/,/;/
> + operating range of ~35m to 1us ]
> +
> +The period takes values in [ 1, INT_MAX ], runtime in [ -1, INT_MAX - 1 ].
> +
> +A runtime of -1 specifies runtime == period, ie. no limit.
> +
> +New groups get the period from /proc/sys/kernel/sched_rt_period_us and
> +a runtime of 0.
> +
> +Settings are constrainted to:
constrained
> +
> + \Sum_{i} runtime_{i} / global_period <= global_runtime / global_period
> +
> +in order to keep the configuration schedulable.
---
~Randy
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 3/8] sched: rt-group: interface
2008-02-04 21:03 ` [PATCH 3/8] sched: rt-group: interface Peter Zijlstra
2008-02-06 1:31 ` Randy Dunlap
@ 2008-02-23 19:48 ` Paul Menage
2008-02-23 19:57 ` Peter Zijlstra
1 sibling, 1 reply; 14+ messages in thread
From: Paul Menage @ 2008-02-23 19:48 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Ingo Molnar, linux-kernel, tong.n.li
On Mon, Feb 4, 2008 at 1:03 PM, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> +static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
> + struct file *file,
> + const char __user *userbuf,
> + size_t nbytes, loff_t *unused_ppos)
> +{
> + char buffer[64];
> + int retval = 0;
> + s64 val;
> + char *end;
> +
> + if (!nbytes)
> + return -EINVAL;
> + if (nbytes >= sizeof(buffer))
> + return -E2BIG;
> + if (copy_from_user(buffer, userbuf, nbytes))
> + return -EFAULT;
> +
> + buffer[nbytes] = 0; /* nul-terminate */
> +
> + /* strip newline if necessary */
> + if (nbytes && (buffer[nbytes-1] == '\n'))
> + buffer[nbytes-1] = 0;
> + val = simple_strtoll(buffer, &end, 0);
> + if (*end)
> + return -EINVAL;
> +
> + /* Pass to subsystem */
> + retval = sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
> + if (!retval)
> + retval = nbytes;
> + return retval;
> }
>
> -static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft)
> -{
> - struct task_group *tg = cgroup_tg(cgrp);
> +static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft,
> + struct file *file,
> + char __user *buf, size_t nbytes,
> + loff_t *ppos)
> +{
> + char tmp[64];
> + long val = sched_group_rt_runtime(cgroup_tg(cgrp));
> + int len = sprintf(tmp, "%ld\n", val);
>
> - return (u64) tg->rt_ratio;
> + return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
> }
What's the reason that you can't use the cgroup read_uint/write_uint
methods for this? Is it just because you have -1 as your "unlimited"
value?
If so, could we avoid that problem by using 0 rather than -1 as the
"unlimited" value? It looks from what I've read in the Documentation
changes as though 0 isn't really a meaningful value.
Paul
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 3/8] sched: rt-group: interface
2008-02-23 19:48 ` Paul Menage
@ 2008-02-23 19:57 ` Peter Zijlstra
2008-02-23 20:02 ` Paul Menage
0 siblings, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2008-02-23 19:57 UTC (permalink / raw)
To: Paul Menage; +Cc: Ingo Molnar, linux-kernel, tong.n.li
On Sat, 2008-02-23 at 11:48 -0800, Paul Menage wrote:
> On Mon, Feb 4, 2008 at 1:03 PM, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> > +static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
> > + struct file *file,
> > + const char __user *userbuf,
> > + size_t nbytes, loff_t *unused_ppos)
> > +{
> > + char buffer[64];
> > + int retval = 0;
> > + s64 val;
> > + char *end;
> > +
> > + if (!nbytes)
> > + return -EINVAL;
> > + if (nbytes >= sizeof(buffer))
> > + return -E2BIG;
> > + if (copy_from_user(buffer, userbuf, nbytes))
> > + return -EFAULT;
> > +
> > + buffer[nbytes] = 0; /* nul-terminate */
> > +
> > + /* strip newline if necessary */
> > + if (nbytes && (buffer[nbytes-1] == '\n'))
> > + buffer[nbytes-1] = 0;
> > + val = simple_strtoll(buffer, &end, 0);
> > + if (*end)
> > + return -EINVAL;
> > +
> > + /* Pass to subsystem */
> > + retval = sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
> > + if (!retval)
> > + retval = nbytes;
> > + return retval;
> > }
> >
> > -static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft)
> > -{
> > - struct task_group *tg = cgroup_tg(cgrp);
> > +static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft,
> > + struct file *file,
> > + char __user *buf, size_t nbytes,
> > + loff_t *ppos)
> > +{
> > + char tmp[64];
> > + long val = sched_group_rt_runtime(cgroup_tg(cgrp));
> > + int len = sprintf(tmp, "%ld\n", val);
> >
> > - return (u64) tg->rt_ratio;
> > + return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
> > }
>
> What's the reason that you can't use the cgroup read_uint/write_uint
> methods for this? Is it just because you have -1 as your "unlimited"
> value.
Yes.
> If so, could we avoid that problem by using 0 rather than -1 as the
> "unlimited" value? It looks from what I've read in the Documentation
> changes as though 0 isn't really a meaningful value.
0 means no time, which is quite useful and clearly distinct from infinite time.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 3/8] sched: rt-group: interface
2008-02-23 19:57 ` Peter Zijlstra
@ 2008-02-23 20:02 ` Paul Menage
2008-02-23 20:26 ` Peter Zijlstra
0 siblings, 1 reply; 14+ messages in thread
From: Paul Menage @ 2008-02-23 20:02 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Ingo Molnar, linux-kernel, tong.n.li
On Sat, Feb 23, 2008 at 11:57 AM, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> > If so, could we avoid that problem by using 0 rather than -1 as the
> > "unlimited" value? It looks from what I've read in the Documentation
> > changes as though 0 isn't really a meaningful value.
>
> 0 means no time, quite useful and clearly distinct from inf. time.
>
So a real-time task in a cgroup with a 0 rt_runtime can be in the R
state but never actually get to run? OK, if people need to be able to
do that, then fair enough.
In that case I guess I'll have to add signed versions of the
read_uint/write_uint methods.
Paul
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 3/8] sched: rt-group: interface
2008-02-23 20:02 ` Paul Menage
@ 2008-02-23 20:26 ` Peter Zijlstra
2008-02-23 20:36 ` Paul Menage
0 siblings, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2008-02-23 20:26 UTC (permalink / raw)
To: Paul Menage; +Cc: Ingo Molnar, linux-kernel, tong.n.li
On Sat, 2008-02-23 at 12:02 -0800, Paul Menage wrote:
> On Sat, Feb 23, 2008 at 11:57 AM, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> > > If so, could we avoid that problem by using 0 rather than -1 as the
> > > "unlimited" value? It looks from what I've read in the Documentation
> > > changes as though 0 isn't really a meaningful value.
> >
> > 0 means no time, quite useful and clearly distinct from inf. time.
> >
>
> So a real-time task in a cgroup with a 0 rt_runtime can be in the R
> state but never actually get to run? OK, if people need to be able to
> do that then fair enough.
Yeah, it's an awkward situation, and we refuse new rt tasks in such
groups. But the 0 value is needed so you can have groups that don't
participate in realtime scheduling, because we enforce a
schedulability constraint over the groups.
Each group has a runtime ratio, namely rt_runtime / rt_period.
The sum of this ratio over all groups must be smaller than or equal to
the global ratio, which must itself be smaller than or equal to 1.
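Concretely, the admission check amounts to something like this (a
stand-alone sketch; the group names and values are hypothetical,
loosely following the graphics/audio example in the documentation
quoted earlier, and a 95% global ratio is assumed for illustration):

/* Stand-alone sketch of the admission check; groups and values are made up. */
#include <stdio.h>

/* Scale the runtime/period ratio by 2^16, like the kernel's to_ratio(). */
static unsigned long to_ratio(unsigned long long period,
			      unsigned long long runtime)
{
	return (unsigned long)((runtime << 16) / period);
}

int main(void)
{
	/* three groups: { runtime_us, period_us } */
	unsigned long long grp[3][2] = {
		{ 32000, 40000 },	/* 80%: the graphics group */
		{   150,  5000 },	/*  3%: the audio group */
		{     0, 50000 },	/*  0%: never runs rt tasks */
	};
	unsigned long total = 0;
	unsigned long global = to_ratio(1000000, 950000);	/* 95% assumed */
	int i;

	for (i = 0; i < 3; i++)
		total += to_ratio(grp[i][1], grp[i][0]);

	printf("schedulable: %s\n", total <= global ? "yes" : "no");
	return 0;
}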
> In that case I guess I'll have to add signed versions of the
> read_uint/write_uint methods.
Yes, I looked at that. I found the interface somewhat unfortunate, since it
would mean growing the struct with two more function pointers. Perhaps a
read and write function taking abstract data would be better suited; that
would allow for this and more. Sadly it loses type information.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 3/8] sched: rt-group: interface
2008-02-23 20:26 ` Peter Zijlstra
@ 2008-02-23 20:36 ` Paul Menage
0 siblings, 0 replies; 14+ messages in thread
From: Paul Menage @ 2008-02-23 20:36 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Ingo Molnar, linux-kernel, tong.n.li
On Sat, Feb 23, 2008 at 12:26 PM, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
>
> > In that case I guess I'll have to add signed versions of the
> > read_uint/write_uint methods.
>
> Yes, I looked at that, I found the interface somewhat unfortunate, it
> would mean growing the struct with two more function pointers.
Is that really a big deal? We're talking about a structure that has a
small number (<10 in the current tree) of instances per cgroup
subsystem.
> Perhaps a
> read and write function with abstract data would be better suited. That
> would allow for this and more. Sadly it looses type information.
If the size of the struct cftype really became a problem, I think the
cleanest way to fix it would be to have a union of the potential
function pointers, and add a field to specify which one is in use.
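Hypothetically, something along these lines (a sketch only, with
invented names; not the actual cftype definition):

/* Hypothetical tagged-union control file type; all names are invented. */
enum cftype_handler {
	CFT_U64,	/* u64_ops member of the union is in use */
	CFT_S64,	/* s64_ops member is in use */
	CFT_RAW,	/* free-form raw_ops member is in use */
};

struct cftype_sketch {
	char name[64];
	enum cftype_handler which;	/* selects the union member below */
	union {
		struct {
			unsigned long long (*read)(void *state);
			int (*write)(void *state, unsigned long long val);
		} u64_ops;
		struct {
			long long (*read)(void *state);
			int (*write)(void *state, long long val);
		} s64_ops;
		struct {
			long (*read)(void *state, char *buf,
				     unsigned long len);
			long (*write)(void *state, const char *buf,
				      unsigned long len);
		} raw_ops;
	} ops;
};

The tag keeps the struct down to one union plus an enum, at the cost of
an extra switch in the read/write dispatch.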
Paul
^ permalink raw reply [flat|nested] 14+ messages in thread
end of thread, other threads:[~2008-02-23 20:37 UTC | newest]
Thread overview: 14+ messages
[not found] <20080204210258.118479000@chello.nl>
2008-02-04 21:02 ` [PATCH 1/8] sched: fix incorrect irq lock usage in normalize_rt_tasks() Peter Zijlstra
2008-02-04 21:03 ` [PATCH 2/8] sched: rt-group: deal with PI Peter Zijlstra
2008-02-04 21:03 ` [PATCH 3/8] sched: rt-group: interface Peter Zijlstra
2008-02-06 1:31 ` Randy Dunlap
2008-02-23 19:48 ` Paul Menage
2008-02-23 19:57 ` Peter Zijlstra
2008-02-23 20:02 ` Paul Menage
2008-02-23 20:26 ` Peter Zijlstra
2008-02-23 20:36 ` Paul Menage
2008-02-04 21:03 ` [PATCH 4/8] sched: rt-group: make rt groups scheduling configurable Peter Zijlstra
2008-02-04 21:03 ` [PATCH 5/8] sched: rt-group: clean up the ifdeffery Peter Zijlstra
2008-02-04 21:03 ` [PATCH 6/8] sched: rt-group: refure unrunnable tasks Peter Zijlstra
2008-02-04 21:03 ` [PATCH 7/8] sched: rt-group: synchonised bandwidth period Peter Zijlstra
2008-02-04 21:03 ` [PATCH 8/8] sched: rt-group: smp balancing Peter Zijlstra