LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [git pull] scheduler fixes
@ 2008-10-28 10:50 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2008-10-28 10:50 UTC (permalink / raw)
To: Linus Torvalds; +Cc: linux-kernel, Andrew Morton, Peter Zijlstra
Linus,
Please pull the latest sched-fixes-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git sched-fixes-for-linus
Thanks,
Ingo
------------------>
Jiri Kosina (1):
sched: fix documentation reference for sched_min_granularity_ns
Li Zefan (1):
sched: add CONFIG_SMP consistency
Mike Galbraith (1):
sched: weaken sync hint
Peter Zijlstra (4):
sched: fix a find_busiest_group buglet
sched: more accurate min_vruntime accounting
sched: re-instate vruntime based wakeup preemption
sched: virtual time buddy preemption
Documentation/scheduler/sched-design-CFS.txt | 2 +-
include/linux/sched.h | 12 +-
kernel/sched.c | 3 +-
kernel/sched_fair.c | 169 +++++++++++++++++++-------
kernel/sched_idletask.c | 5 +-
kernel/sched_rt.c | 5 +-
6 files changed, 139 insertions(+), 57 deletions(-)
diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt
index 9d8eb55..eb471c7 100644
--- a/Documentation/scheduler/sched-design-CFS.txt
+++ b/Documentation/scheduler/sched-design-CFS.txt
@@ -92,7 +92,7 @@ other HZ detail. Thus the CFS scheduler has no notion of "timeslices" in the
way the previous scheduler had, and has no heuristics whatsoever. There is
only one central tunable (you have to switch on CONFIG_SCHED_DEBUG):
- /proc/sys/kernel/sched_granularity_ns
+ /proc/sys/kernel/sched_min_granularity_ns
which can be used to tune the scheduler from "desktop" (i.e., low latencies) to
"server" (i.e., good batching) workloads. It defaults to a setting suitable
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8478f33..b483f39 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -936,7 +936,6 @@ struct sched_class {
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup);
void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
void (*yield_task) (struct rq *rq);
- int (*select_task_rq)(struct task_struct *p, int sync);
void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int sync);
@@ -944,6 +943,8 @@ struct sched_class {
void (*put_prev_task) (struct rq *rq, struct task_struct *p);
#ifdef CONFIG_SMP
+ int (*select_task_rq)(struct task_struct *p, int sync);
+
unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
struct rq *busiest, unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle,
@@ -955,16 +956,17 @@ struct sched_class {
void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
void (*post_schedule) (struct rq *this_rq);
void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
-#endif
- void (*set_curr_task) (struct rq *rq);
- void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
- void (*task_new) (struct rq *rq, struct task_struct *p);
void (*set_cpus_allowed)(struct task_struct *p,
const cpumask_t *newmask);
void (*rq_online)(struct rq *rq);
void (*rq_offline)(struct rq *rq);
+#endif
+
+ void (*set_curr_task) (struct rq *rq);
+ void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
+ void (*task_new) (struct rq *rq, struct task_struct *p);
void (*switched_from) (struct rq *this_rq, struct task_struct *task,
int running);
diff --git a/kernel/sched.c b/kernel/sched.c
index 6625c3c..e8819bc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -386,7 +386,6 @@ struct cfs_rq {
u64 exec_clock;
u64 min_vruntime;
- u64 pair_start;
struct rb_root tasks_timeline;
struct rb_node *rb_leftmost;
@@ -3344,7 +3343,7 @@ small_imbalance:
} else
this_load_per_task = cpu_avg_load_per_task(this_cpu);
- if (max_load - this_load + 2*busiest_load_per_task >=
+ if (max_load - this_load + busiest_load_per_task >=
busiest_load_per_task * imbn) {
*imbalance = busiest_load_per_task;
return busiest;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 9573c33..ce514af 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -143,6 +143,49 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
return se->parent;
}
+/* return depth at which a sched entity is present in the hierarchy */
+static inline int depth_se(struct sched_entity *se)
+{
+ int depth = 0;
+
+ for_each_sched_entity(se)
+ depth++;
+
+ return depth;
+}
+
+static void
+find_matching_se(struct sched_entity **se, struct sched_entity **pse)
+{
+ int se_depth, pse_depth;
+
+ /*
+ * preemption test can be made between sibling entities who are in the
+ * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
+ * both tasks until we find their ancestors who are siblings of common
+ * parent.
+ */
+
+ /* First walk up until both entities are at same depth */
+ se_depth = depth_se(*se);
+ pse_depth = depth_se(*pse);
+
+ while (se_depth > pse_depth) {
+ se_depth--;
+ *se = parent_entity(*se);
+ }
+
+ while (pse_depth > se_depth) {
+ pse_depth--;
+ *pse = parent_entity(*pse);
+ }
+
+ while (!is_same_group(*se, *pse)) {
+ *se = parent_entity(*se);
+ *pse = parent_entity(*pse);
+ }
+}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
@@ -193,6 +236,11 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
return NULL;
}
+static inline void
+find_matching_se(struct sched_entity **se, struct sched_entity **pse)
+{
+}
+
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -223,6 +271,27 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
return se->vruntime - cfs_rq->min_vruntime;
}
+static void update_min_vruntime(struct cfs_rq *cfs_rq)
+{
+ u64 vruntime = cfs_rq->min_vruntime;
+
+ if (cfs_rq->curr)
+ vruntime = cfs_rq->curr->vruntime;
+
+ if (cfs_rq->rb_leftmost) {
+ struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
+ struct sched_entity,
+ run_node);
+
+ if (vruntime == cfs_rq->min_vruntime)
+ vruntime = se->vruntime;
+ else
+ vruntime = min_vruntime(vruntime, se->vruntime);
+ }
+
+ cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
+}
+
/*
* Enqueue an entity into the rb-tree:
*/
@@ -256,15 +325,8 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Maintain a cache of leftmost tree entries (it is frequently
* used):
*/
- if (leftmost) {
+ if (leftmost)
cfs_rq->rb_leftmost = &se->run_node;
- /*
- * maintain cfs_rq->min_vruntime to be a monotonic increasing
- * value tracking the leftmost vruntime in the tree.
- */
- cfs_rq->min_vruntime =
- max_vruntime(cfs_rq->min_vruntime, se->vruntime);
- }
rb_link_node(&se->run_node, parent, link);
rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
@@ -274,18 +336,9 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if (cfs_rq->rb_leftmost == &se->run_node) {
struct rb_node *next_node;
- struct sched_entity *next;
next_node = rb_next(&se->run_node);
cfs_rq->rb_leftmost = next_node;
-
- if (next_node) {
- next = rb_entry(next_node,
- struct sched_entity, run_node);
- cfs_rq->min_vruntime =
- max_vruntime(cfs_rq->min_vruntime,
- next->vruntime);
- }
}
if (cfs_rq->next == se)
@@ -424,6 +477,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
schedstat_add(cfs_rq, exec_clock, delta_exec);
delta_exec_weighted = calc_delta_fair(delta_exec, curr);
curr->vruntime += delta_exec_weighted;
+ update_min_vruntime(cfs_rq);
}
static void update_curr(struct cfs_rq *cfs_rq)
@@ -613,13 +667,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
- u64 vruntime;
-
- if (first_fair(cfs_rq)) {
- vruntime = min_vruntime(cfs_rq->min_vruntime,
- __pick_next_entity(cfs_rq)->vruntime);
- } else
- vruntime = cfs_rq->min_vruntime;
+ u64 vruntime = cfs_rq->min_vruntime;
/*
* The 'current' period is already promised to the current tasks,
@@ -696,6 +744,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
account_entity_dequeue(cfs_rq, se);
+ update_min_vruntime(cfs_rq);
}
/*
@@ -742,16 +791,14 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
+static int
+wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
+
static struct sched_entity *
pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- struct rq *rq = rq_of(cfs_rq);
- u64 pair_slice = rq->clock - cfs_rq->pair_start;
-
- if (!cfs_rq->next || pair_slice > sysctl_sched_min_granularity) {
- cfs_rq->pair_start = rq->clock;
+ if (!cfs_rq->next || wakeup_preempt_entity(cfs_rq->next, se) == 1)
return se;
- }
return cfs_rq->next;
}
@@ -1122,10 +1169,9 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
return 0;
- if (!sync && sched_feat(SYNC_WAKEUPS) &&
- curr->se.avg_overlap < sysctl_sched_migration_cost &&
- p->se.avg_overlap < sysctl_sched_migration_cost)
- sync = 1;
+ if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
+ p->se.avg_overlap > sysctl_sched_migration_cost))
+ sync = 0;
/*
* If sync wakeup then subtract the (maximum possible)
@@ -1244,13 +1290,42 @@ static unsigned long wakeup_gran(struct sched_entity *se)
* More easily preempt - nice tasks, while not making it harder for
* + nice tasks.
*/
- if (sched_feat(ASYM_GRAN))
- gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load);
+ if (!sched_feat(ASYM_GRAN) || se->load.weight > NICE_0_LOAD)
+ gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
return gran;
}
/*
+ * Should 'se' preempt 'curr'.
+ *
+ * |s1
+ * |s2
+ * |s3
+ * g
+ * |<--->|c
+ *
+ * w(c, s1) = -1
+ * w(c, s2) = 0
+ * w(c, s3) = 1
+ *
+ */
+static int
+wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
+{
+ s64 gran, vdiff = curr->vruntime - se->vruntime;
+
+ if (vdiff <= 0)
+ return -1;
+
+ gran = wakeup_gran(curr);
+ if (vdiff > gran)
+ return 1;
+
+ return 0;
+}
+
+/*
* Preempt the current task with a newly woken task if needed:
*/
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
@@ -1258,7 +1333,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
struct task_struct *curr = rq->curr;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
struct sched_entity *se = &curr->se, *pse = &p->se;
- s64 delta_exec;
if (unlikely(rt_prio(p->prio))) {
update_rq_clock(rq);
@@ -1296,9 +1370,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
return;
}
- delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
- if (delta_exec > wakeup_gran(pse))
- resched_task(curr);
+ find_matching_se(&se, &pse);
+
+ while (se) {
+ BUG_ON(!pse);
+
+ if (wakeup_preempt_entity(se, pse) == 1) {
+ resched_task(curr);
+ break;
+ }
+
+ se = parent_entity(se);
+ pse = parent_entity(pse);
+ }
}
static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1594,9 +1678,6 @@ static const struct sched_class fair_sched_class = {
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
-#ifdef CONFIG_SMP
- .select_task_rq = select_task_rq_fair,
-#endif /* CONFIG_SMP */
.check_preempt_curr = check_preempt_wakeup,
@@ -1604,6 +1685,8 @@ static const struct sched_class fair_sched_class = {
.put_prev_task = put_prev_task_fair,
#ifdef CONFIG_SMP
+ .select_task_rq = select_task_rq_fair,
+
.load_balance = load_balance_fair,
.move_one_task = move_one_task_fair,
#endif
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index dec4cca..8a21a2e 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -105,9 +105,6 @@ static const struct sched_class idle_sched_class = {
/* dequeue is not valid, we print a debug message there: */
.dequeue_task = dequeue_task_idle,
-#ifdef CONFIG_SMP
- .select_task_rq = select_task_rq_idle,
-#endif /* CONFIG_SMP */
.check_preempt_curr = check_preempt_curr_idle,
@@ -115,6 +112,8 @@ static const struct sched_class idle_sched_class = {
.put_prev_task = put_prev_task_idle,
#ifdef CONFIG_SMP
+ .select_task_rq = select_task_rq_idle,
+
.load_balance = load_balance_idle,
.move_one_task = move_one_task_idle,
#endif
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index b446dc8..d9ba9d5 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1504,9 +1504,6 @@ static const struct sched_class rt_sched_class = {
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt,
-#ifdef CONFIG_SMP
- .select_task_rq = select_task_rq_rt,
-#endif /* CONFIG_SMP */
.check_preempt_curr = check_preempt_curr_rt,
@@ -1514,6 +1511,8 @@ static const struct sched_class rt_sched_class = {
.put_prev_task = put_prev_task_rt,
#ifdef CONFIG_SMP
+ .select_task_rq = select_task_rq_rt,
+
.load_balance = load_balance_rt,
.move_one_task = move_one_task_rt,
.set_cpus_allowed = set_cpus_allowed_rt,
^ permalink raw reply related [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2023-09-17 17:39 [GIT PULL] " Ingo Molnar
@ 2023-09-17 18:24 ` pr-tracker-bot
0 siblings, 0 replies; 384+ messages in thread
From: pr-tracker-bot @ 2023-09-17 18:24 UTC (permalink / raw)
To: Ingo Molnar
Cc: Linus Torvalds, linux-kernel, Peter Zijlstra, Thomas Gleixner,
Borislav Petkov, Andrew Morton, Juri Lelli, Vincent Guittot,
Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
Daniel Bristot de Oliveira, Valentin Schneider
The pull request you sent on Sun, 17 Sep 2023 19:39:13 +0200:
> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-2023-09-17
has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/e5a710d13247975444c9a18e28413e566c334271
Thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/prtracker.html
^ permalink raw reply [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2023-09-17 17:39 Ingo Molnar
2023-09-17 18:24 ` pr-tracker-bot
0 siblings, 1 reply; 384+ messages in thread
From: Ingo Molnar @ 2023-09-17 17:39 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Borislav Petkov,
Andrew Morton, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
Steven Rostedt, Ben Segall, Mel Gorman,
Daniel Bristot de Oliveira, Valentin Schneider
Linus,
Please pull the latest sched/urgent git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-2023-09-17
# HEAD: 108af4b4bd3813610701379a58538e3339b162e4 x86/sched: Restore the SD_ASYM_PACKING flag in the DIE domain
Fix a performance regression on large SMT systems, an Intel SMT4
balancing bug, and a topology setup bug on (Intel) hybrid processors.
Thanks,
Ingo
------------------>
Ricardo Neri (1):
x86/sched: Restore the SD_ASYM_PACKING flag in the DIE domain
Shrikanth Hegde (1):
sched/fair: Optimize should_we_balance() for large SMT systems
Tim Chen (1):
sched/fair: Fix SMT4 group_smt_balance handling
arch/x86/kernel/smpboot.c | 12 +++++++++---
kernel/sched/fair.c | 27 +++++++++++++++++++++++++--
2 files changed, 34 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index d40ed3a7dc23..266d05e22ac3 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -579,7 +579,6 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
}
-#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_CLUSTER) || defined(CONFIG_SCHED_MC)
static inline int x86_sched_itmt_flags(void)
{
return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0;
@@ -603,7 +602,14 @@ static int x86_cluster_flags(void)
return cpu_cluster_flags() | x86_sched_itmt_flags();
}
#endif
-#endif
+
+static int x86_die_flags(void)
+{
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
+ return x86_sched_itmt_flags();
+
+ return 0;
+}
/*
* Set if a package/die has multiple NUMA nodes inside.
@@ -640,7 +646,7 @@ static void __init build_sched_topology(void)
*/
if (!x86_has_numa_in_package) {
x86_topology[i++] = (struct sched_domain_topology_level){
- cpu_cpu_mask, SD_INIT_NAME(DIE)
+ cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(DIE)
};
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8dbff6e7ad4f..cb225921bbca 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6619,6 +6619,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
/* Working cpumask for: load_balance, load_balance_newidle. */
static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
+static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
#ifdef CONFIG_NO_HZ_COMMON
@@ -9579,7 +9580,7 @@ static inline long sibling_imbalance(struct lb_env *env,
imbalance /= ncores_local + ncores_busiest;
/* Take advantage of resource in an empty sched group */
- if (imbalance == 0 && local->sum_nr_running == 0 &&
+ if (imbalance <= 1 && local->sum_nr_running == 0 &&
busiest->sum_nr_running > 1)
imbalance = 2;
@@ -9767,6 +9768,15 @@ static bool update_sd_pick_busiest(struct lb_env *env,
break;
case group_smt_balance:
+ /*
+ * Check if we have spare CPUs on either SMT group to
+ * choose has spare or fully busy handling.
+ */
+ if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0)
+ goto has_spare;
+
+ fallthrough;
+
case group_fully_busy:
/*
* Select the fully busy group with highest avg_load. In
@@ -9806,6 +9816,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
else
return true;
}
+has_spare:
/*
* Select not overloaded group with lowest number of idle cpus
@@ -10917,6 +10928,7 @@ static int active_load_balance_cpu_stop(void *data);
static int should_we_balance(struct lb_env *env)
{
+ struct cpumask *swb_cpus = this_cpu_cpumask_var_ptr(should_we_balance_tmpmask);
struct sched_group *sg = env->sd->groups;
int cpu, idle_smt = -1;
@@ -10940,8 +10952,9 @@ static int should_we_balance(struct lb_env *env)
return 1;
}
+ cpumask_copy(swb_cpus, group_balance_mask(sg));
/* Try to find first idle CPU */
- for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
+ for_each_cpu_and(cpu, swb_cpus, env->cpus) {
if (!idle_cpu(cpu))
continue;
@@ -10953,6 +10966,14 @@ static int should_we_balance(struct lb_env *env)
if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) {
if (idle_smt == -1)
idle_smt = cpu;
+ /*
+ * If the core is not idle, and first SMT sibling which is
+ * idle has been found, then its not needed to check other
+ * SMT siblings for idleness:
+ */
+#ifdef CONFIG_SCHED_SMT
+ cpumask_andnot(swb_cpus, swb_cpus, cpu_smt_mask(cpu));
+#endif
continue;
}
@@ -12918,6 +12939,8 @@ __init void init_sched_fair_class(void)
for_each_possible_cpu(i) {
zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i));
+ zalloc_cpumask_var_node(&per_cpu(should_we_balance_tmpmask, i),
+ GFP_KERNEL, cpu_to_node(i));
#ifdef CONFIG_CFS_BANDWIDTH
INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
^ permalink raw reply related [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2023-09-02 10:09 Ingo Molnar
@ 2023-09-02 16:13 ` pr-tracker-bot
0 siblings, 0 replies; 384+ messages in thread
From: pr-tracker-bot @ 2023-09-02 16:13 UTC (permalink / raw)
To: Ingo Molnar
Cc: Linus Torvalds, linux-kernel, Peter Zijlstra, Thomas Gleixner,
Borislav Petkov, Andrew Morton, Juri Lelli, Vincent Guittot,
Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
Daniel Bristot de Oliveira, Valentin Schneider
The pull request you sent on Sat, 2 Sep 2023 12:09:50 +0200:
> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-2023-09-02
has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/c39cbc5b604c7282f210c4a3a743c9f026ed8002
Thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/prtracker.html
^ permalink raw reply [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2023-09-02 10:09 Ingo Molnar
2023-09-02 16:13 ` pr-tracker-bot
0 siblings, 1 reply; 384+ messages in thread
From: Ingo Molnar @ 2023-09-02 10:09 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Borislav Petkov,
Andrew Morton, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
Steven Rostedt, Ben Segall, Mel Gorman,
Daniel Bristot de Oliveira, Valentin Schneider
Linus,
Please pull the latest sched/urgent git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-2023-09-02
# HEAD: 0d6b35283bcf1a379cf20066544af8e6a6b16b46 sched/core: Report correct state for TASK_IDLE | TASK_FREEZABLE
Miscellaneous scheduler fixes: a reporting fix, a static symbol fix,
and a kernel-doc fix.
Thanks,
Ingo
------------------>
Costa Shulyupin (1):
sched/core: Add kernel-doc for set_cpus_allowed_ptr()
Hao Jia (1):
sched/fair: Make update_entity_lag() static
NeilBrown (1):
sched/core: Report correct state for TASK_IDLE | TASK_FREEZABLE
include/linux/sched.h | 14 ++++++++++++--
kernel/sched/fair.c | 2 +-
2 files changed, 13 insertions(+), 3 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 177b3f3676ef..77f01ac385f7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1671,7 +1671,7 @@ static inline unsigned int __task_state_index(unsigned int tsk_state,
BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX);
- if (tsk_state == TASK_IDLE)
+ if ((tsk_state & TASK_IDLE) == TASK_IDLE)
state = TASK_REPORT_IDLE;
/*
@@ -1679,7 +1679,7 @@ static inline unsigned int __task_state_index(unsigned int tsk_state,
* to userspace, we can make this appear as if the task has gone through
* a regular rt_mutex_lock() call.
*/
- if (tsk_state == TASK_RTLOCK_WAIT)
+ if (tsk_state & TASK_RTLOCK_WAIT)
state = TASK_UNINTERRUPTIBLE;
return fls(state);
@@ -1858,7 +1858,17 @@ extern int task_can_attach(struct task_struct *p);
extern int dl_bw_alloc(int cpu, u64 dl_bw);
extern void dl_bw_free(int cpu, u64 dl_bw);
#ifdef CONFIG_SMP
+
+/* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */
extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
+
+/**
+ * set_cpus_allowed_ptr - set CPU affinity mask of a task
+ * @p: the task
+ * @new_mask: CPU affinity mask
+ *
+ * Return: zero if successful, or a negative error code
+ */
extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node);
extern void release_user_cpus_ptr(struct task_struct *p);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 911d0063763c..8dbff6e7ad4f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -699,7 +699,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
*
* XXX could add max_slice to the augmented data to track this.
*/
-void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
s64 lag, limit;
^ permalink raw reply related [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2023-02-17 7:42 Ingo Molnar
@ 2023-02-17 22:47 ` pr-tracker-bot
0 siblings, 0 replies; 384+ messages in thread
From: pr-tracker-bot @ 2023-02-17 22:47 UTC (permalink / raw)
To: Ingo Molnar
Cc: Linus Torvalds, linux-kernel, Peter Zijlstra, Thomas Gleixner,
Borislav Petkov, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
Steven Rostedt, Ben Segall, Mel Gorman,
Daniel Bristot de Oliveira, Valentin Schneider
The pull request you sent on Fri, 17 Feb 2023 08:42:08 +0100:
> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-2023-02-17
has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/64e0253df67d5e1557e82b923c78f1bad185eb9a
Thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/prtracker.html
^ permalink raw reply [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2023-02-17 7:42 Ingo Molnar
2023-02-17 22:47 ` pr-tracker-bot
0 siblings, 1 reply; 384+ messages in thread
From: Ingo Molnar @ 2023-02-17 7:42 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Borislav Petkov,
Juri Lelli, Vincent Guittot, Dietmar Eggemann, Steven Rostedt,
Ben Segall, Mel Gorman, Daniel Bristot de Oliveira,
Valentin Schneider
Linus,
Please pull the latest sched/urgent git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-2023-02-17
# HEAD: c2dbe32d5db5c4ead121cf86dabd5ab691fb47fe sched/psi: Fix use-after-free in ep_remove_wait_queue()
Misc scheduler fixes:
- Fix user-after-free bug in call_usermodehelper_exec()
- Fix missing user_cpus_ptr update in __set_cpus_allowed_ptr_locked()
- Fix PSI use-after-free bug in ep_remove_wait_queue()
Thanks,
Ingo
------------------>
Munehisa Kamata (1):
sched/psi: Fix use-after-free in ep_remove_wait_queue()
Peter Zijlstra (1):
freezer,umh: Fix call_usermode_helper_exec() vs SIGKILL
Waiman Long (1):
sched/core: Fix a missed update of user_cpus_ptr
kernel/sched/core.c | 5 ++++-
kernel/sched/psi.c | 7 ++++---
kernel/umh.c | 20 +++++++++++++-------
3 files changed, 21 insertions(+), 11 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e838feb6adc5..2a4918a1faa9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2951,8 +2951,11 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
}
if (!(ctx->flags & SCA_MIGRATE_ENABLE)) {
- if (cpumask_equal(&p->cpus_mask, ctx->new_mask))
+ if (cpumask_equal(&p->cpus_mask, ctx->new_mask)) {
+ if (ctx->flags & SCA_USER)
+ swap(p->user_cpus_ptr, ctx->user_mask);
goto out;
+ }
if (WARN_ON_ONCE(p == current &&
is_migration_disabled(p) &&
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 8ac8b81bfee6..02e011cabe91 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1343,10 +1343,11 @@ void psi_trigger_destroy(struct psi_trigger *t)
group = t->group;
/*
- * Wakeup waiters to stop polling. Can happen if cgroup is deleted
- * from under a polling process.
+ * Wakeup waiters to stop polling and clear the queue to prevent it from
+ * being accessed later. Can happen if cgroup is deleted from under a
+ * polling process.
*/
- wake_up_interruptible(&t->event_wait);
+ wake_up_pollfree(&t->event_wait);
mutex_lock(&group->trigger_lock);
diff --git a/kernel/umh.c b/kernel/umh.c
index 850631518665..fbf872c624cb 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -438,21 +438,27 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
if (wait == UMH_NO_WAIT) /* task has freed sub_info */
goto unlock;
- if (wait & UMH_KILLABLE)
- state |= TASK_KILLABLE;
-
if (wait & UMH_FREEZABLE)
state |= TASK_FREEZABLE;
- retval = wait_for_completion_state(&done, state);
- if (!retval)
- goto wait_done;
-
if (wait & UMH_KILLABLE) {
+ retval = wait_for_completion_state(&done, state | TASK_KILLABLE);
+ if (!retval)
+ goto wait_done;
+
/* umh_complete() will see NULL and free sub_info */
if (xchg(&sub_info->complete, NULL))
goto unlock;
+
+ /*
+ * fallthrough; in case of -ERESTARTSYS now do uninterruptible
+ * wait_for_completion_state(). Since umh_complete() shall call
+ * complete() in a moment if xchg() above returned NULL, this
+ * uninterruptible wait_for_completion_state() will not block
+ * SIGKILL'ed processes for long.
+ */
}
+ wait_for_completion_state(&done, state);
wait_done:
retval = sub_info->retval;
^ permalink raw reply related [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2023-01-12 14:25 Ingo Molnar
@ 2023-01-12 23:01 ` pr-tracker-bot
0 siblings, 0 replies; 384+ messages in thread
From: pr-tracker-bot @ 2023-01-12 23:01 UTC (permalink / raw)
To: Ingo Molnar
Cc: Linus Torvalds, linux-kernel, Peter Zijlstra, Thomas Gleixner,
Borislav Petkov, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
Steven Rostedt, Ben Segall, Mel Gorman,
Daniel Bristot de Oliveira, Valentin Schneider
The pull request you sent on Thu, 12 Jan 2023 15:25:31 +0100:
> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-2023-01-12
has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/ea66bf86536d4d98932843896e9d940110a06701
Thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/prtracker.html
^ permalink raw reply [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2023-01-12 14:25 Ingo Molnar
2023-01-12 23:01 ` pr-tracker-bot
0 siblings, 1 reply; 384+ messages in thread
From: Ingo Molnar @ 2023-01-12 14:25 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Borislav Petkov,
Juri Lelli, Vincent Guittot, Dietmar Eggemann, Steven Rostedt,
Ben Segall, Mel Gorman, Daniel Bristot de Oliveira,
Valentin Schneider
Linus,
Please pull the latest sched/urgent git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-2023-01-12
# HEAD: 9a5418bc48babb313d2a62df29ebe21ce8c06c59 sched/core: Use kfree_rcu() in do_set_cpus_allowed()
- Fix scheduler frequency invariance bug related to overly long tickless periods
triggering an integer overflow and disabling the feature.
- Fix use-after-free bug in dup_user_cpus_ptr().
- Fix do_set_cpus_allowed() deadlock scenarios related to calling kfree() with
the pi_lock held. NOTE: the rcu_free() is the 'lazy' solution here - we looked
at patches to free the structure after the pi_lock got dropped, but that looked
quite a bit messier - and none of this is truly performance critical. We can
revisit this if it's too lazy of a solution ...
Thanks,
Ingo
------------------>
Waiman Long (2):
sched/core: Fix use-after-free bug in dup_user_cpus_ptr()
sched/core: Use kfree_rcu() in do_set_cpus_allowed()
Yair Podemsky (1):
sched/core: Fix arch_scale_freq_tick() on tickless systems
kernel/sched/core.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 60 insertions(+), 9 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 25b582b6ee5f..bb1ee6d7bdde 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2604,27 +2604,71 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
.user_mask = NULL,
.flags = SCA_USER, /* clear the user requested mask */
};
+ union cpumask_rcuhead {
+ cpumask_t cpumask;
+ struct rcu_head rcu;
+ };
__do_set_cpus_allowed(p, &ac);
- kfree(ac.user_mask);
+
+ /*
+ * Because this is called with p->pi_lock held, it is not possible
+ * to use kfree() here (when PREEMPT_RT=y), therefore punt to using
+ * kfree_rcu().
+ */
+ kfree_rcu((union cpumask_rcuhead *)ac.user_mask, rcu);
+}
+
+static cpumask_t *alloc_user_cpus_ptr(int node)
+{
+ /*
+ * See do_set_cpus_allowed() above for the rcu_head usage.
+ */
+ int size = max_t(int, cpumask_size(), sizeof(struct rcu_head));
+
+ return kmalloc_node(size, GFP_KERNEL, node);
}
int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
int node)
{
+ cpumask_t *user_mask;
unsigned long flags;
- if (!src->user_cpus_ptr)
+ /*
+ * Always clear dst->user_cpus_ptr first as their user_cpus_ptr's
+ * may differ by now due to racing.
+ */
+ dst->user_cpus_ptr = NULL;
+
+ /*
+ * This check is racy and losing the race is a valid situation.
+ * It is not worth the extra overhead of taking the pi_lock on
+ * every fork/clone.
+ */
+ if (data_race(!src->user_cpus_ptr))
return 0;
- dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node);
- if (!dst->user_cpus_ptr)
+ user_mask = alloc_user_cpus_ptr(node);
+ if (!user_mask)
return -ENOMEM;
- /* Use pi_lock to protect content of user_cpus_ptr */
+ /*
+ * Use pi_lock to protect content of user_cpus_ptr
+ *
+ * Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent
+ * do_set_cpus_allowed().
+ */
raw_spin_lock_irqsave(&src->pi_lock, flags);
- cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
+ if (src->user_cpus_ptr) {
+ swap(dst->user_cpus_ptr, user_mask);
+ cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
+ }
raw_spin_unlock_irqrestore(&src->pi_lock, flags);
+
+ if (unlikely(user_mask))
+ kfree(user_mask);
+
return 0;
}
@@ -3581,6 +3625,11 @@ static inline bool rq_has_pinned_tasks(struct rq *rq)
return false;
}
+static inline cpumask_t *alloc_user_cpus_ptr(int node)
+{
+ return NULL;
+}
+
#endif /* !CONFIG_SMP */
static void
@@ -5504,7 +5553,9 @@ void scheduler_tick(void)
unsigned long thermal_pressure;
u64 resched_latency;
- arch_scale_freq_tick();
+ if (housekeeping_cpu(cpu, HK_TYPE_TICK))
+ arch_scale_freq_tick();
+
sched_clock_tick();
rq_lock(rq, &rf);
@@ -8239,8 +8290,8 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
if (retval)
goto out_put_task;
- user_mask = kmalloc(cpumask_size(), GFP_KERNEL);
- if (!user_mask) {
+ user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
+ if (IS_ENABLED(CONFIG_SMP) && !user_mask) {
retval = -ENOMEM;
goto out_put_task;
}
^ permalink raw reply related [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2022-08-06 19:21 Ingo Molnar
@ 2022-08-07 0:50 ` pr-tracker-bot
0 siblings, 0 replies; 384+ messages in thread
From: pr-tracker-bot @ 2022-08-07 0:50 UTC (permalink / raw)
To: Ingo Molnar
Cc: Linus Torvalds, linux-kernel, Peter Zijlstra, Juri Lelli,
Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
Mel Gorman, Daniel Bristot de Oliveira, Valentin Schneider,
Thomas Gleixner
The pull request you sent on Sat, 6 Aug 2022 21:21:12 +0200:
> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-2022-08-06
has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/cac03ac368fabff0122853de2422d4e17a32de08
Thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/prtracker.html
^ permalink raw reply [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2022-08-06 19:21 Ingo Molnar
2022-08-07 0:50 ` pr-tracker-bot
0 siblings, 1 reply; 384+ messages in thread
From: Ingo Molnar @ 2022-08-06 19:21 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Juri Lelli, Vincent Guittot,
Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
Daniel Bristot de Oliveira, Valentin Schneider, Thomas Gleixner
Linus,
Please pull the latest sched/urgent git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-2022-08-06
# HEAD: 751d4cbc43879229dbc124afefe240b70fd29a85 sched/core: Do not requeue task on CPU excluded from cpus_mask
Various fixes: a deadline scheduler fix, a migration fix, a Sparse fix and a comment fix.
Thanks,
Ingo
------------------>
Ben Dooks (1):
sched/rt: Fix Sparse warnings due to undefined rt.c declarations
Ingo Molnar (1):
exit: Fix typo in comment: s/sub-theads/sub-threads
Mel Gorman (1):
sched/core: Do not requeue task on CPU excluded from cpus_mask
Waiman Long (1):
sched, cpuset: Fix dl_cpu_busy() panic due to empty cs->cpus_allowed
include/linux/sched.h | 2 +-
kernel/cgroup/cpuset.c | 2 +-
kernel/exit.c | 2 +-
kernel/sched/core.c | 16 +++++++++++-----
kernel/sched/sched.h | 7 ++++---
5 files changed, 18 insertions(+), 11 deletions(-)
^ permalink raw reply [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2021-07-11 13:32 Ingo Molnar
2021-07-11 18:16 ` Linus Torvalds
@ 2021-07-11 18:22 ` pr-tracker-bot
1 sibling, 0 replies; 384+ messages in thread
From: pr-tracker-bot @ 2021-07-11 18:22 UTC (permalink / raw)
To: Ingo Molnar
Cc: Linus Torvalds, linux-kernel, Peter Zijlstra, Juri Lelli,
Vincent Guittot, Andrew Morton
The pull request you sent on Sun, 11 Jul 2021 15:32:54 +0200:
> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-2021-07-11
has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/877029d9216dcc842f50d37571f318cd17a30a2d
Thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/prtracker.html
^ permalink raw reply [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2021-07-11 13:32 Ingo Molnar
@ 2021-07-11 18:16 ` Linus Torvalds
2021-07-11 18:22 ` pr-tracker-bot
1 sibling, 0 replies; 384+ messages in thread
From: Linus Torvalds @ 2021-07-11 18:16 UTC (permalink / raw)
To: Ingo Molnar
Cc: Linux Kernel Mailing List, Peter Zijlstra, Juri Lelli,
Vincent Guittot, Andrew Morton
On Sun, Jul 11, 2021 at 6:32 AM Ingo Molnar <mingo@kernel.org> wrote:
>
> - Fix load tracking bug/inconsistency
I was going to report that I still see that
cfs_rq->avg.load_avg || cfs_rq->avg.util_avg || cfs_rq->avg.runnable_avg
WARNING: CPU: 8 PID: 52 at kernel/sched/fair.c:3308
update_blocked_averages+0x4c1/0x5f0
but I'm hoping/assuming that commit ceb6ba45dc80 ("sched/fair: Sync
load_sum with load_avg after dequeue") finally fixes it for good.
Hmm?
Linus
^ permalink raw reply [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2021-07-11 13:32 Ingo Molnar
2021-07-11 18:16 ` Linus Torvalds
2021-07-11 18:22 ` pr-tracker-bot
0 siblings, 2 replies; 384+ messages in thread
From: Ingo Molnar @ 2021-07-11 13:32 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Juri Lelli, Vincent Guittot, Andrew Morton
Linus,
Please pull the latest sched/urgent git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-2021-07-11
# HEAD: 3e1493f46390618ea78607cb30c58fc19e2a5035 sched/uclamp: Ignore max aggregation if rq is idle
Three fixes:
- Fix load tracking bug/inconsistency
- Fix a sporadic CFS bandwidth constraints enforcement bug
- Fix a uclamp utilization tracking bug for newly woken tasks
Thanks,
Ingo
------------------>
Odin Ugedal (1):
sched/fair: Fix CFS bandwidth hrtimer expiry type
Vincent Guittot (1):
sched/fair: Sync load_sum with load_avg after dequeue
Xuewen Yan (1):
sched/uclamp: Ignore max aggregation if rq is idle
kernel/sched/fair.c | 7 ++++---
kernel/sched/sched.h | 21 ++++++++++++++-------
2 files changed, 18 insertions(+), 10 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 45edf61eed73..1b15a19910a3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3037,8 +3037,9 @@ enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ u32 divider = get_pelt_divider(&se->avg);
sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
- sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
+ cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
}
#else
static inline void
@@ -5053,7 +5054,7 @@ static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
{
struct hrtimer *refresh_timer = &cfs_b->period_timer;
- u64 remaining;
+ s64 remaining;
/* if the call-back is running a quota refresh is already occurring */
if (hrtimer_callback_running(refresh_timer))
@@ -5061,7 +5062,7 @@ static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
/* is a quota refresh about to occur? */
remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
- if (remaining < min_expire)
+ if (remaining < (s64)min_expire)
return 1;
return 0;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c80d42e9589b..14a41a243f7b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2818,20 +2818,27 @@ static __always_inline
unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
struct task_struct *p)
{
- unsigned long min_util;
- unsigned long max_util;
+ unsigned long min_util = 0;
+ unsigned long max_util = 0;
if (!static_branch_likely(&sched_uclamp_used))
return util;
- min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
- max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
-
if (p) {
- min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN));
- max_util = max(max_util, uclamp_eff_value(p, UCLAMP_MAX));
+ min_util = uclamp_eff_value(p, UCLAMP_MIN);
+ max_util = uclamp_eff_value(p, UCLAMP_MAX);
+
+ /*
+ * Ignore last runnable task's max clamp, as this task will
+ * reset it. Similarly, no need to read the rq's min clamp.
+ */
+ if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
+ goto out;
}
+ min_util = max_t(unsigned long, min_util, READ_ONCE(rq->uclamp[UCLAMP_MIN].value));
+ max_util = max_t(unsigned long, max_util, READ_ONCE(rq->uclamp[UCLAMP_MAX].value));
+out:
/*
* Since CPU's {min,max}_util clamps are MAX aggregated considering
* RUNNABLE tasks with _different_ clamps, we can end up with an
^ permalink raw reply related [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2021-06-12 13:02 Ingo Molnar
@ 2021-06-12 19:09 ` pr-tracker-bot
0 siblings, 0 replies; 384+ messages in thread
From: pr-tracker-bot @ 2021-06-12 19:09 UTC (permalink / raw)
To: Ingo Molnar
Cc: Linus Torvalds, linux-kernel, Peter Zijlstra, Thomas Gleixner,
Andrew Morton
The pull request you sent on Sat, 12 Jun 2021 15:02:06 +0200:
> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-2021-06-12
has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/99f925947ab0fd5c17b74460d8b32f1aa1c86e3a
Thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/prtracker.html
^ permalink raw reply [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2021-06-12 13:02 Ingo Molnar
2021-06-12 19:09 ` pr-tracker-bot
0 siblings, 1 reply; 384+ messages in thread
From: Ingo Molnar @ 2021-06-12 13:02 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched/urgent git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-2021-06-12
# HEAD: 68d7a190682aa4eb02db477328088ebad15acc83 sched/fair: Fix util_est UTIL_AVG_UNCHANGED handling
Misc fixes:
- Fix performance regression caused by lack of intended
batching of RCU callbacks by over-eager NOHZ-full code.
- Fix cgroups related corruption of load_avg and load_sum metrics.
- Three fixes to fix blocked load, util_sum/runnable_sum and
util_est tracking bugs.
Thanks,
Ingo
------------------>
Dietmar Eggemann (1):
sched/fair: Fix util_est UTIL_AVG_UNCHANGED handling
Frederic Weisbecker (1):
tick/nohz: Only check for RCU deferred wakeup on user/guest entry when needed
Vincent Guittot (3):
sched/fair: Keep load_avg and load_sum synced
sched/fair: Make sure to update tg contrib for blocked load
sched/pelt: Ensure that *_sum is always synced with *_avg
include/linux/entry-kvm.h | 3 ++-
include/linux/sched.h | 8 ++++++++
include/linux/tick.h | 7 +++++++
kernel/entry/common.c | 5 +++--
kernel/sched/debug.c | 3 ++-
kernel/sched/fair.c | 28 +++++++++++++++++-----------
kernel/sched/pelt.h | 11 +----------
kernel/time/tick-sched.c | 1 +
8 files changed, 41 insertions(+), 25 deletions(-)
diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h
index 8b2b1d68b954..136b8d97d8c0 100644
--- a/include/linux/entry-kvm.h
+++ b/include/linux/entry-kvm.h
@@ -3,6 +3,7 @@
#define __LINUX_ENTRYKVM_H
#include <linux/entry-common.h>
+#include <linux/tick.h>
/* Transfer to guest mode work */
#ifdef CONFIG_KVM_XFER_TO_GUEST_WORK
@@ -57,7 +58,7 @@ int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu);
static inline void xfer_to_guest_mode_prepare(void)
{
lockdep_assert_irqs_disabled();
- rcu_nocb_flush_deferred_wakeup();
+ tick_nohz_user_enter_prepare();
}
/**
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d2c881384517..28a98fc4ded4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -350,11 +350,19 @@ struct load_weight {
* Only for tasks we track a moving average of the past instantaneous
* estimated utilization. This allows to absorb sporadic drops in utilization
* of an otherwise almost periodic task.
+ *
+ * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
+ * updates. When a task is dequeued, its util_est should not be updated if its
+ * util_avg has not been updated in the meantime.
+ * This information is mapped into the MSB bit of util_est.enqueued at dequeue
+ * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg
+ * for a task) it is safe to use MSB.
*/
struct util_est {
unsigned int enqueued;
unsigned int ewma;
#define UTIL_EST_WEIGHT_SHIFT 2
+#define UTIL_AVG_UNCHANGED 0x80000000
} __attribute__((__aligned__(sizeof(u64))));
/*
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 7340613c7eff..1a0ff88fa107 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -11,6 +11,7 @@
#include <linux/context_tracking_state.h>
#include <linux/cpumask.h>
#include <linux/sched.h>
+#include <linux/rcupdate.h>
#ifdef CONFIG_GENERIC_CLOCKEVENTS
extern void __init tick_init(void);
@@ -300,4 +301,10 @@ static inline void tick_nohz_task_switch(void)
__tick_nohz_task_switch();
}
+static inline void tick_nohz_user_enter_prepare(void)
+{
+ if (tick_nohz_full_cpu(smp_processor_id()))
+ rcu_nocb_flush_deferred_wakeup();
+}
+
#endif
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index a0b3b04fb596..bf16395b9e13 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -5,6 +5,7 @@
#include <linux/highmem.h>
#include <linux/livepatch.h>
#include <linux/audit.h>
+#include <linux/tick.h>
#include "common.h"
@@ -186,7 +187,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
local_irq_disable_exit_to_user();
/* Check if any of the above work has queued a deferred wakeup */
- rcu_nocb_flush_deferred_wakeup();
+ tick_nohz_user_enter_prepare();
ti_work = READ_ONCE(current_thread_info()->flags);
}
@@ -202,7 +203,7 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs)
lockdep_assert_irqs_disabled();
/* Flush pending rcuog wakeup before the last need_resched() check */
- rcu_nocb_flush_deferred_wakeup();
+ tick_nohz_user_enter_prepare();
if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
ti_work = exit_to_user_mode_loop(regs, ti_work);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 9c882f20803e..c5aacbd492a1 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -885,6 +885,7 @@ static const struct seq_operations sched_debug_sops = {
#define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F))
#define __P(F) __PS(#F, F)
#define P(F) __PS(#F, p->F)
+#define PM(F, M) __PS(#F, p->F & (M))
#define __PSN(S, F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", S, SPLIT_NS((long long)(F)))
#define __PN(F) __PSN(#F, F)
#define PN(F) __PSN(#F, p->F)
@@ -1011,7 +1012,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
P(se.avg.util_avg);
P(se.avg.last_update_time);
P(se.avg.util_est.ewma);
- P(se.avg.util_est.enqueued);
+ PM(se.avg.util_est.enqueued, ~UTIL_AVG_UNCHANGED);
#endif
#ifdef CONFIG_UCLAMP_TASK
__PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3248e24a90b0..2c8a9352590d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3499,10 +3499,9 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
static inline void
update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
+ long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
unsigned long load_avg;
u64 load_sum = 0;
- s64 delta_sum;
u32 divider;
if (!runnable_sum)
@@ -3549,13 +3548,13 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
load_sum = (s64)se_weight(se) * runnable_sum;
load_avg = div_s64(load_sum, divider);
- delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
- delta_avg = load_avg - se->avg.load_avg;
+ delta = load_avg - se->avg.load_avg;
se->avg.load_sum = runnable_sum;
se->avg.load_avg = load_avg;
- add_positive(&cfs_rq->avg.load_avg, delta_avg);
- add_positive(&cfs_rq->avg.load_sum, delta_sum);
+
+ add_positive(&cfs_rq->avg.load_avg, delta);
+ cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
}
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -3766,11 +3765,17 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
*/
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ /*
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+ * See ___update_load_avg() for details.
+ */
+ u32 divider = get_pelt_divider(&cfs_rq->avg);
+
dequeue_load_avg(cfs_rq, se);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
- sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+ cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
- sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
+ cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
@@ -3902,7 +3907,7 @@ static inline unsigned long _task_util_est(struct task_struct *p)
{
struct util_est ue = READ_ONCE(p->se.avg.util_est);
- return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED);
+ return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
}
static inline unsigned long task_util_est(struct task_struct *p)
@@ -4002,7 +4007,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
* Reset EWMA on utilization increases, the moving average is used only
* to smooth utilization decreases.
*/
- ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
+ ue.enqueued = task_util(p);
if (sched_feat(UTIL_EST_FASTUP)) {
if (ue.ewma < ue.enqueued) {
ue.ewma = ue.enqueued;
@@ -4051,6 +4056,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
ue.ewma += last_ewma_diff;
ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
done:
+ ue.enqueued |= UTIL_AVG_UNCHANGED;
WRITE_ONCE(p->se.avg.util_est, ue);
trace_sched_util_est_se_tp(&p->se);
@@ -8030,7 +8036,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
/* Propagate pending load changes to the parent, if any: */
se = cfs_rq->tg->se[cpu];
if (se && !skip_blocked_update(se))
- update_load_avg(cfs_rq_of(se), se, 0);
+ update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
/*
* There can be a lot of idle CPU cgroups. Don't let fully
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 1462846d244e..cfe94ffd2b38 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -42,15 +42,6 @@ static inline u32 get_pelt_divider(struct sched_avg *avg)
return LOAD_AVG_MAX - 1024 + avg->period_contrib;
}
-/*
- * When a task is dequeued, its estimated utilization should not be update if
- * its util_avg has not been updated at least once.
- * This flag is used to synchronize util_avg updates with util_est updates.
- * We map this information into the LSB bit of the utilization saved at
- * dequeue time (i.e. util_est.dequeued).
- */
-#define UTIL_AVG_UNCHANGED 0x1
-
static inline void cfs_se_util_change(struct sched_avg *avg)
{
unsigned int enqueued;
@@ -58,7 +49,7 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
if (!sched_feat(UTIL_EST))
return;
- /* Avoid store if the flag has been already set */
+ /* Avoid store if the flag has been already reset */
enqueued = avg->util_est.enqueued;
if (!(enqueued & UTIL_AVG_UNCHANGED))
return;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 828b091501ca..6784f27a3099 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -230,6 +230,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
#ifdef CONFIG_NO_HZ_FULL
cpumask_var_t tick_nohz_full_mask;
+EXPORT_SYMBOL_GPL(tick_nohz_full_mask);
bool tick_nohz_full_running;
EXPORT_SYMBOL_GPL(tick_nohz_full_running);
static atomic_t tick_dep_mask;
^ permalink raw reply related [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2021-05-15 7:50 Ingo Molnar
@ 2021-05-15 17:55 ` pr-tracker-bot
0 siblings, 0 replies; 384+ messages in thread
From: pr-tracker-bot @ 2021-05-15 17:55 UTC (permalink / raw)
To: Ingo Molnar
Cc: Linus Torvalds, linux-kernel, Peter Zijlstra, Thomas Gleixner,
Borislav Petkov, juri, Vincent Guittot, Andrew Morton
The pull request you sent on Sat, 15 May 2021 09:50:41 +0200:
> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-2021-05-15
has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/c12a29ed9094b4b9cde8965c12850460b9a79d7c
Thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/prtracker.html
^ permalink raw reply [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2021-05-15 7:50 Ingo Molnar
2021-05-15 17:55 ` pr-tracker-bot
0 siblings, 1 reply; 384+ messages in thread
From: Ingo Molnar @ 2021-05-15 7:50 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Borislav Petkov,
juri, Vincent Guittot, Andrew Morton
Linus,
Please pull the latest sched/urgent git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-2021-05-15
# HEAD: 3743d55b289c203d8f77b7cd47c24926b9d186ae x86, sched: Fix the AMD CPPC maximum performance value on certain AMD Ryzen generations
Fix an idle CPU selection bug, and an AMD Ryzen maximum frequency enumeration bug.
Thanks,
Ingo
------------------>
Gautham R. Shenoy (1):
sched/fair: Fix clearing of has_idle_cores flag in select_idle_cpu()
Huang Rui (1):
x86, sched: Fix the AMD CPPC maximum performance value on certain AMD Ryzen generations
arch/x86/include/asm/processor.h | 2 ++
arch/x86/kernel/cpu/amd.c | 16 ++++++++++++++++
arch/x86/kernel/smpboot.c | 2 +-
drivers/cpufreq/acpi-cpufreq.c | 6 +++++-
kernel/sched/fair.c | 2 +-
5 files changed, 25 insertions(+), 3 deletions(-)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 154321d29050..556b2b17c3e2 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -787,8 +787,10 @@ DECLARE_PER_CPU(u64, msr_misc_features_shadow);
#ifdef CONFIG_CPU_SUP_AMD
extern u32 amd_get_nodes_per_socket(void);
+extern u32 amd_get_highest_perf(void);
#else
static inline u32 amd_get_nodes_per_socket(void) { return 0; }
+static inline u32 amd_get_highest_perf(void) { return 0; }
#endif
static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 2d11384dc9ab..6d7b3b3ea80b 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1165,3 +1165,19 @@ void set_dr_addr_mask(unsigned long mask, int dr)
break;
}
}
+
+u32 amd_get_highest_perf(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) ||
+ (c->x86_model >= 0x70 && c->x86_model < 0x80)))
+ return 166;
+
+ if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) ||
+ (c->x86_model >= 0x40 && c->x86_model < 0x70)))
+ return 166;
+
+ return 255;
+}
+EXPORT_SYMBOL_GPL(amd_get_highest_perf);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 0ad5214f598a..7770245cc7fa 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -2043,7 +2043,7 @@ static bool amd_set_max_freq_ratio(void)
return false;
}
- highest_perf = perf_caps.highest_perf;
+ highest_perf = amd_get_highest_perf();
nominal_perf = perf_caps.nominal_perf;
if (!highest_perf || !nominal_perf) {
diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
index d1bbc16fba4b..7e7450453714 100644
--- a/drivers/cpufreq/acpi-cpufreq.c
+++ b/drivers/cpufreq/acpi-cpufreq.c
@@ -646,7 +646,11 @@ static u64 get_max_boost_ratio(unsigned int cpu)
return 0;
}
- highest_perf = perf_caps.highest_perf;
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ highest_perf = amd_get_highest_perf();
+ else
+ highest_perf = perf_caps.highest_perf;
+
nominal_perf = perf_caps.nominal_perf;
if (!highest_perf || !nominal_perf) {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 20aa234ffe04..3248e24a90b0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6217,7 +6217,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
}
if (has_idle_core)
- set_idle_cores(this, false);
+ set_idle_cores(target, false);
if (sched_feat(SIS_PROP) && !has_idle_core) {
time = cpu_clock(this) - time;
^ permalink raw reply related [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2020-08-15 11:27 Ingo Molnar
@ 2020-08-16 1:55 ` pr-tracker-bot
0 siblings, 0 replies; 384+ messages in thread
From: pr-tracker-bot @ 2020-08-16 1:55 UTC (permalink / raw)
To: Ingo Molnar
Cc: Linus Torvalds, linux-kernel, Peter Zijlstra, Thomas Gleixner,
Andrew Morton, Juri Lelli, Vincent Guittot
The pull request you sent on Sat, 15 Aug 2020 13:27:31 +0200:
> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-2020-08-15
has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/1195d58f003bf84d37fc5c30017600e09ba567b2
Thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/prtracker.html
^ permalink raw reply [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2020-08-15 11:27 Ingo Molnar
2020-08-16 1:55 ` pr-tracker-bot
0 siblings, 1 reply; 384+ messages in thread
From: Ingo Molnar @ 2020-08-15 11:27 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton,
Juri Lelli, Vincent Guittot
Linus,
Please pull the latest sched/urgent git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-2020-08-15
# HEAD: cc172ff301d8079e941a6eb31758951a6d764084 sched/debug: Fix the alignment of the show-state debug output
Two fixes: fix a new tracepoint's output value, and fix the formatting
of show-state syslog printouts.
Thanks,
Ingo
------------------>
Libing Zhou (1):
sched/debug: Fix the alignment of the show-state debug output
Phil Auld (1):
sched: Fix use of count for nr_running tracepoint
kernel/sched/core.c | 15 ++++-----------
kernel/sched/sched.h | 2 +-
2 files changed, 5 insertions(+), 12 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4a0e7b449b88..09fd62568ba9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6387,10 +6387,10 @@ void sched_show_task(struct task_struct *p)
if (!try_get_task_stack(p))
return;
- printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
+ pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
if (p->state == TASK_RUNNING)
- printk(KERN_CONT " running task ");
+ pr_cont(" running task ");
#ifdef CONFIG_DEBUG_STACK_USAGE
free = stack_not_used(p);
#endif
@@ -6399,8 +6399,8 @@ void sched_show_task(struct task_struct *p)
if (pid_alive(p))
ppid = task_pid_nr(rcu_dereference(p->real_parent));
rcu_read_unlock();
- printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
- task_pid_nr(p), ppid,
+ pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
+ free, task_pid_nr(p), ppid,
(unsigned long)task_thread_info(p)->flags);
print_worker_info(KERN_INFO, p);
@@ -6435,13 +6435,6 @@ void show_state_filter(unsigned long state_filter)
{
struct task_struct *g, *p;
-#if BITS_PER_LONG == 32
- printk(KERN_INFO
- " task PC stack pid father\n");
-#else
- printk(KERN_INFO
- " task PC stack pid father\n");
-#endif
rcu_read_lock();
for_each_process_thread(g, p) {
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3fd283892761..28709f6b0975 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1999,7 +1999,7 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
{
rq->nr_running -= count;
if (trace_sched_update_nr_running_tp_enabled()) {
- call_trace_sched_update_nr_running(rq, count);
+ call_trace_sched_update_nr_running(rq, -count);
}
/* Check if we still need preemption */
^ permalink raw reply related [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2020-07-25 10:47 Ingo Molnar
@ 2020-07-25 22:30 ` pr-tracker-bot
0 siblings, 0 replies; 384+ messages in thread
From: pr-tracker-bot @ 2020-07-25 22:30 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Linus Torvalds, linux-kernel, Peter Zijlstra, Thomas Gleixner
The pull request you sent on Sat, 25 Jul 2020 12:47:20 +0200:
> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-2020-07-25
has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/3077805eebb63d75ad716e36777a54a3321d34a5
Thank you!
--
Deet-doot-dot, I am a bot.
https://korg.wiki.kernel.org/userdoc/prtracker
^ permalink raw reply [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2020-07-25 10:47 Ingo Molnar
2020-07-25 22:30 ` pr-tracker-bot
0 siblings, 1 reply; 384+ messages in thread
From: Ingo Molnar @ 2020-07-25 10:47 UTC (permalink / raw)
To: Linus Torvalds; +Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner
Linus,
Please pull the latest sched/urgent git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-2020-07-25
# HEAD: 062d3f95b630113e1156a31f376ad36e25da29a7 sched: Warn if garbage is passed to default_wake_function()
Fix a race introduced by the recent loadavg race fix, plus add a
debug check for a hard to debug case of bogus wakeup function flags.
Thanks,
Ingo
------------------>
Chris Wilson (1):
sched: Warn if garbage is passed to default_wake_function()
Peter Zijlstra (1):
sched: Fix race against ptrace_freeze_trace()
kernel/sched/core.c | 25 +++++++++++++++----------
1 file changed, 15 insertions(+), 10 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e15543cb8481..2142c6767682 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4119,9 +4119,6 @@ static void __sched notrace __schedule(bool preempt)
local_irq_disable();
rcu_note_context_switch(preempt);
- /* See deactivate_task() below. */
- prev_state = prev->state;
-
/*
* Make sure that signal_pending_state()->signal_pending() below
* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
@@ -4145,11 +4142,16 @@ static void __sched notrace __schedule(bool preempt)
update_rq_clock(rq);
switch_count = &prev->nivcsw;
+
/*
- * We must re-load prev->state in case ttwu_remote() changed it
- * before we acquired rq->lock.
+ * We must load prev->state once (task_struct::state is volatile), such
+ * that:
+ *
+ * - we form a control dependency vs deactivate_task() below.
+ * - ptrace_{,un}freeze_traced() can change ->state underneath us.
*/
- if (!preempt && prev_state && prev_state == prev->state) {
+ prev_state = prev->state;
+ if (!preempt && prev_state) {
if (signal_pending_state(prev_state, prev)) {
prev->state = TASK_RUNNING;
} else {
@@ -4163,10 +4165,12 @@ static void __sched notrace __schedule(bool preempt)
/*
* __schedule() ttwu()
- * prev_state = prev->state; if (READ_ONCE(p->on_rq) && ...)
- * LOCK rq->lock goto out;
- * smp_mb__after_spinlock(); smp_acquire__after_ctrl_dep();
- * p->on_rq = 0; p->state = TASK_WAKING;
+ * prev_state = prev->state; if (p->on_rq && ...)
+ * if (prev_state) goto out;
+ * p->on_rq = 0; smp_acquire__after_ctrl_dep();
+ * p->state = TASK_WAKING
+ *
+ * Where __schedule() and ttwu() have matching control dependencies.
*
* After this, schedule() must not care about p->state any more.
*/
@@ -4481,6 +4485,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
void *key)
{
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
return try_to_wake_up(curr->private, mode, wake_flags);
}
EXPORT_SYMBOL(default_wake_function);
^ permalink raw reply related [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2020-04-25 9:28 Ingo Molnar
@ 2020-04-25 19:30 ` pr-tracker-bot
0 siblings, 0 replies; 384+ messages in thread
From: pr-tracker-bot @ 2020-04-25 19:30 UTC (permalink / raw)
To: Ingo Molnar
Cc: Linus Torvalds, linux-kernel, Peter Zijlstra, Thomas Gleixner,
Andrew Morton
The pull request you sent on Sat, 25 Apr 2020 11:28:18 +0200:
> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-2020-04-25
has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/05db498ad94431315c73efe623eab7a7d2d961c6
Thank you!
--
Deet-doot-dot, I am a bot.
https://korg.wiki.kernel.org/userdoc/prtracker
^ permalink raw reply [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2020-04-25 9:28 Ingo Molnar
2020-04-25 19:30 ` pr-tracker-bot
0 siblings, 1 reply; 384+ messages in thread
From: Ingo Molnar @ 2020-04-25 9:28 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched/urgent git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-2020-04-25
# HEAD: eaf5a92ebde5bca3bb2565616115bd6d579486cd sched/core: Fix reset-on-fork from RT with uclamp
Misc fixes:
- an uclamp accounting fix
- three frequency invariance fixes and a readability improvement
Thanks,
Ingo
------------------>
Giovanni Gherdovich (3):
x86, sched: Bail out of frequency invariance if base frequency is unknown
x86, sched: Account for CPUs with less than 4 cores in freq. invariance
x86, sched: Move check for CPU type to caller function
Peter Zijlstra (Intel) (1):
x86, sched: Don't enable static key when starting secondary CPUs
Quentin Perret (1):
sched/core: Fix reset-on-fork from RT with uclamp
arch/x86/kernel/smpboot.c | 47 +++++++++++++++++++++++++++++++++--------------
kernel/sched/core.c | 9 ++-------
2 files changed, 35 insertions(+), 21 deletions(-)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index fe3ab9632f3b..8c89e4d9ad28 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -147,7 +147,7 @@ static inline void smpboot_restore_warm_reset_vector(void)
*((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
}
-static void init_freq_invariance(void);
+static void init_freq_invariance(bool secondary);
/*
* Report back to the Boot Processor during boot time or to the caller processor
@@ -185,7 +185,7 @@ static void smp_callin(void)
*/
set_cpu_sibling_map(raw_smp_processor_id());
- init_freq_invariance();
+ init_freq_invariance(true);
/*
* Get our bogomips.
@@ -1341,7 +1341,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
set_sched_topology(x86_topology);
set_cpu_sibling_map(0);
- init_freq_invariance();
+ init_freq_invariance(false);
smp_sanity_check();
switch (apic_intr_mode) {
@@ -1877,9 +1877,6 @@ static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
int err, i;
u64 msr;
- if (!x86_match_cpu(has_knl_turbo_ratio_limits))
- return false;
-
err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
if (err)
return false;
@@ -1945,18 +1942,23 @@ static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
+ u64 msr;
int err;
err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
if (err)
return false;
- err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, turbo_freq);
+ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
if (err)
return false;
- *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
- *turbo_freq = (*turbo_freq >> 24) & 0xFF; /* 4C turbo */
+ *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
+ *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */
+
+ /* The CPU may have less than 4 cores */
+ if (!*turbo_freq)
+ *turbo_freq = msr & 0xFF; /* 1C turbo */
return true;
}
@@ -1972,7 +1974,8 @@ static bool intel_set_max_freq_ratio(void)
skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
goto out;
- if (knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
+ if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
+ knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
goto out;
if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
@@ -1985,13 +1988,22 @@ static bool intel_set_max_freq_ratio(void)
return false;
out:
+ /*
+ * Some hypervisors advertise X86_FEATURE_APERFMPERF
+ * but then fill all MSR's with zeroes.
+ */
+ if (!base_freq) {
+ pr_debug("Couldn't determine cpu base frequency, necessary for scale-invariant accounting.\n");
+ return false;
+ }
+
arch_turbo_freq_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE,
base_freq);
arch_set_max_freq_ratio(turbo_disabled());
return true;
}
-static void init_counter_refs(void *arg)
+static void init_counter_refs(void)
{
u64 aperf, mperf;
@@ -2002,18 +2014,25 @@ static void init_counter_refs(void *arg)
this_cpu_write(arch_prev_mperf, mperf);
}
-static void init_freq_invariance(void)
+static void init_freq_invariance(bool secondary)
{
bool ret = false;
- if (smp_processor_id() != 0 || !boot_cpu_has(X86_FEATURE_APERFMPERF))
+ if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
return;
+ if (secondary) {
+ if (static_branch_likely(&arch_scale_freq_key)) {
+ init_counter_refs();
+ }
+ return;
+ }
+
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
ret = intel_set_max_freq_ratio();
if (ret) {
- on_each_cpu(init_counter_refs, NULL, 1);
+ init_counter_refs();
static_branch_enable(&arch_scale_freq_key);
} else {
pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3a61a3b8eaa9..9a2fbf98fd6f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1232,13 +1232,8 @@ static void uclamp_fork(struct task_struct *p)
return;
for_each_clamp_id(clamp_id) {
- unsigned int clamp_value = uclamp_none(clamp_id);
-
- /* By default, RT tasks always get 100% boost */
- if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
- clamp_value = uclamp_none(UCLAMP_MAX);
-
- uclamp_se_set(&p->uclamp_req[clamp_id], clamp_value, false);
+ uclamp_se_set(&p->uclamp_req[clamp_id],
+ uclamp_none(clamp_id), false);
}
}
^ permalink raw reply related [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2020-02-15 9:44 Ingo Molnar
@ 2020-02-15 21:25 ` pr-tracker-bot
0 siblings, 0 replies; 384+ messages in thread
From: pr-tracker-bot @ 2020-02-15 21:25 UTC (permalink / raw)
To: Ingo Molnar
Cc: Linus Torvalds, linux-kernel, Thomas Gleixner, Peter Zijlstra,
Andrew Morton
The pull request you sent on Sat, 15 Feb 2020 10:44:17 +0100:
> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/ef78e5b7de5da49769c337a17dcff7d4e82ee6cd
Thank you!
--
Deet-doot-dot, I am a bot.
https://korg.wiki.kernel.org/userdoc/prtracker
^ permalink raw reply [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2020-02-15 9:44 Ingo Molnar
2020-02-15 21:25 ` pr-tracker-bot
0 siblings, 1 reply; 384+ messages in thread
From: Ingo Molnar @ 2020-02-15 9:44 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Thomas Gleixner, Peter Zijlstra, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: e9f5490c3574b435ce7fe7a71724aa3866babc7f sched/fair: Fix kernel-doc warning in attach_entity_load_avg()
Misc fixes all over the place:
- Fix NUMA over-balancing between lightly loaded nodes. This is fallout
of the big load-balancer rewrite.
- Fix the NOHZ remote loadavg update logic, which fixes anomalies
like reported 150 loadavg on mostly idle CPUs.
- Fix XFS performance/scalability
- Fix throttled groups unbound task-execution bug
- Fix PSI procfs boundary condition
- Fix the cpu.uclamp.{min,max} cgroup configuration write checks
- Fix DocBook annotations
- Fix RCU annotations
- Fix overly CPU-intensive housekeeper CPU logic loop on large CPU counts
Thanks,
Ingo
------------------>
Madhuparna Bhowmik (1):
sched/core: Annotate curr pointer in rq with __rcu
Mel Gorman (2):
sched/fair: Allow a small load imbalance between low utilisation SD_NUMA domains
sched/fair: Allow a per-CPU kthread waking a task to stack on the same CPU, to fix XFS performance regression
Peter Zijlstra (Intel) (1):
timers/nohz: Update NOHZ load in remote tick
Qais Yousef (1):
sched/uclamp: Reject negative values in cpu_uclamp_write()
Randy Dunlap (1):
sched/fair: Fix kernel-doc warning in attach_entity_load_avg()
Scott Wood (1):
sched/core: Don't skip remote tick for idle CPUs
Suren Baghdasaryan (1):
sched/psi: Fix OOB write when writing 0 bytes to PSI files
Vincent Guittot (1):
sched/fair: Prevent unlimited runtime on throttled group
Wanpeng Li (1):
sched/nohz: Optimize get_nohz_timer_target()
include/linux/sched/nohz.h | 2 ++
kernel/sched/core.c | 63 +++++++++++++++++++++++++---------------------
kernel/sched/fair.c | 56 +++++++++++++++++++++++++++++++----------
kernel/sched/loadavg.c | 33 ++++++++++++++++--------
kernel/sched/psi.c | 3 +++
kernel/sched/sched.h | 15 ++++++++++-
6 files changed, 119 insertions(+), 53 deletions(-)
diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h
index 1abe91ff6e4a..6d67e9a5af6b 100644
--- a/include/linux/sched/nohz.h
+++ b/include/linux/sched/nohz.h
@@ -15,9 +15,11 @@ static inline void nohz_balance_enter_idle(int cpu) { }
#ifdef CONFIG_NO_HZ_COMMON
void calc_load_nohz_start(void);
+void calc_load_nohz_remote(struct rq *rq);
void calc_load_nohz_stop(void);
#else
static inline void calc_load_nohz_start(void) { }
+static inline void calc_load_nohz_remote(struct rq *rq) { }
static inline void calc_load_nohz_stop(void) { }
#endif /* CONFIG_NO_HZ_COMMON */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fc1dfc007604..1a9983da4408 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -552,27 +552,32 @@ void resched_cpu(int cpu)
*/
int get_nohz_timer_target(void)
{
- int i, cpu = smp_processor_id();
+ int i, cpu = smp_processor_id(), default_cpu = -1;
struct sched_domain *sd;
- if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
- return cpu;
+ if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
+ if (!idle_cpu(cpu))
+ return cpu;
+ default_cpu = cpu;
+ }
rcu_read_lock();
for_each_domain(cpu, sd) {
- for_each_cpu(i, sched_domain_span(sd)) {
+ for_each_cpu_and(i, sched_domain_span(sd),
+ housekeeping_cpumask(HK_FLAG_TIMER)) {
if (cpu == i)
continue;
- if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
+ if (!idle_cpu(i)) {
cpu = i;
goto unlock;
}
}
}
- if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
- cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
+ if (default_cpu == -1)
+ default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
+ cpu = default_cpu;
unlock:
rcu_read_unlock();
return cpu;
@@ -1442,17 +1447,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
#ifdef CONFIG_SMP
-static inline bool is_per_cpu_kthread(struct task_struct *p)
-{
- if (!(p->flags & PF_KTHREAD))
- return false;
-
- if (p->nr_cpus_allowed != 1)
- return false;
-
- return true;
-}
-
/*
* Per-CPU kthreads are allowed to run on !active && online CPUs, see
* __set_cpus_allowed_ptr() and select_fallback_rq().
@@ -3669,28 +3663,32 @@ static void sched_tick_remote(struct work_struct *work)
* statistics and checks timeslices in a time-independent way, regardless
* of when exactly it is running.
*/
- if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
+ if (!tick_nohz_tick_stopped_cpu(cpu))
goto out_requeue;
rq_lock_irq(rq, &rf);
curr = rq->curr;
- if (is_idle_task(curr) || cpu_is_offline(cpu))
+ if (cpu_is_offline(cpu))
goto out_unlock;
+ curr = rq->curr;
update_rq_clock(rq);
- delta = rq_clock_task(rq) - curr->se.exec_start;
- /*
- * Make sure the next tick runs within a reasonable
- * amount of time.
- */
- WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+ if (!is_idle_task(curr)) {
+ /*
+ * Make sure the next tick runs within a reasonable
+ * amount of time.
+ */
+ delta = rq_clock_task(rq) - curr->se.exec_start;
+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+ }
curr->sched_class->task_tick(rq, curr, 0);
+ calc_load_nohz_remote(rq);
out_unlock:
rq_unlock_irq(rq, &rf);
-
out_requeue:
+
/*
* Run the remote tick once per second (1Hz). This arbitrary
* frequency is large enough to avoid overload but short enough
@@ -7063,8 +7061,15 @@ void sched_move_task(struct task_struct *tsk)
if (queued)
enqueue_task(rq, tsk, queue_flags);
- if (running)
+ if (running) {
set_next_task(rq, tsk);
+ /*
+ * After changing group, the running task may have joined a
+ * throttled one but it's still the running task. Trigger a
+ * resched to make sure that task can still run.
+ */
+ resched_curr(rq);
+ }
task_rq_unlock(rq, tsk, &rf);
}
@@ -7260,7 +7265,7 @@ capacity_from_percent(char *buf)
&req.percent);
if (req.ret)
return req;
- if (req.percent > UCLAMP_PERCENT_SCALE) {
+ if ((u64)req.percent > UCLAMP_PERCENT_SCALE) {
req.ret = -ERANGE;
return req;
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fe4e0d775375..3c8a379c357e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3516,7 +3516,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
* attach_entity_load_avg - attach this entity to its cfs_rq load avg
* @cfs_rq: cfs_rq to attach to
* @se: sched_entity to attach
- * @flags: migration hints
*
* Must call update_cfs_rq_load_avg() before this, since we rely on
* cfs_rq->avg.last_update_time being current.
@@ -5912,6 +5911,20 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
(available_idle_cpu(prev) || sched_idle_cpu(prev)))
return prev;
+ /*
+ * Allow a per-cpu kthread to stack with the wakee if the
+ * kworker thread and the tasks previous CPUs are the same.
+ * The assumption is that the wakee queued work for the
+ * per-cpu kthread that is now complete and the wakeup is
+ * essentially a sync wakeup. An obvious example of this
+ * pattern is IO completions.
+ */
+ if (is_per_cpu_kthread(current) &&
+ prev == smp_processor_id() &&
+ this_rq()->nr_running <= 1) {
+ return prev;
+ }
+
/* Check a recently used CPU as a potential idle candidate: */
recent_used_cpu = p->recent_used_cpu;
if (recent_used_cpu != prev &&
@@ -8658,10 +8671,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
/*
* Try to use spare capacity of local group without overloading it or
* emptying busiest.
- * XXX Spreading tasks across NUMA nodes is not always the best policy
- * and special care should be taken for SD_NUMA domain level before
- * spreading the tasks. For now, load_balance() fully relies on
- * NUMA_BALANCING and fbq_classify_group/rq to override the decision.
*/
if (local->group_type == group_has_spare) {
if (busiest->group_type > group_fully_busy) {
@@ -8701,16 +8710,37 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
env->migration_type = migrate_task;
lsub_positive(&nr_diff, local->sum_nr_running);
env->imbalance = nr_diff >> 1;
- return;
- }
+ } else {
- /*
- * If there is no overload, we just want to even the number of
- * idle cpus.
- */
- env->migration_type = migrate_task;
- env->imbalance = max_t(long, 0, (local->idle_cpus -
+ /*
+ * If there is no overload, we just want to even the number of
+ * idle cpus.
+ */
+ env->migration_type = migrate_task;
+ env->imbalance = max_t(long, 0, (local->idle_cpus -
busiest->idle_cpus) >> 1);
+ }
+
+ /* Consider allowing a small imbalance between NUMA groups */
+ if (env->sd->flags & SD_NUMA) {
+ unsigned int imbalance_min;
+
+ /*
+ * Compute an allowed imbalance based on a simple
+ * pair of communicating tasks that should remain
+ * local and ignore them.
+ *
+ * NOTE: Generally this would have been based on
+ * the domain size and this was evaluated. However,
+ * the benefit is similar across a range of workloads
+ * and machines but scaling by the domain size adds
+ * the risk that lower domains have to be rebalanced.
+ */
+ imbalance_min = 2;
+ if (busiest->sum_nr_running <= imbalance_min)
+ env->imbalance = 0;
+ }
+
return;
}
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index 28a516575c18..de22da666ac7 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -231,16 +231,11 @@ static inline int calc_load_read_idx(void)
return calc_load_idx & 1;
}
-void calc_load_nohz_start(void)
+static void calc_load_nohz_fold(struct rq *rq)
{
- struct rq *this_rq = this_rq();
long delta;
- /*
- * We're going into NO_HZ mode, if there's any pending delta, fold it
- * into the pending NO_HZ delta.
- */
- delta = calc_load_fold_active(this_rq, 0);
+ delta = calc_load_fold_active(rq, 0);
if (delta) {
int idx = calc_load_write_idx();
@@ -248,6 +243,24 @@ void calc_load_nohz_start(void)
}
}
+void calc_load_nohz_start(void)
+{
+ /*
+ * We're going into NO_HZ mode, if there's any pending delta, fold it
+ * into the pending NO_HZ delta.
+ */
+ calc_load_nohz_fold(this_rq());
+}
+
+/*
+ * Keep track of the load for NOHZ_FULL, must be called between
+ * calc_load_nohz_{start,stop}().
+ */
+void calc_load_nohz_remote(struct rq *rq)
+{
+ calc_load_nohz_fold(rq);
+}
+
void calc_load_nohz_stop(void)
{
struct rq *this_rq = this_rq();
@@ -268,7 +281,7 @@ void calc_load_nohz_stop(void)
this_rq->calc_load_update += LOAD_FREQ;
}
-static long calc_load_nohz_fold(void)
+static long calc_load_nohz_read(void)
{
int idx = calc_load_read_idx();
long delta = 0;
@@ -323,7 +336,7 @@ static void calc_global_nohz(void)
}
#else /* !CONFIG_NO_HZ_COMMON */
-static inline long calc_load_nohz_fold(void) { return 0; }
+static inline long calc_load_nohz_read(void) { return 0; }
static inline void calc_global_nohz(void) { }
#endif /* CONFIG_NO_HZ_COMMON */
@@ -346,7 +359,7 @@ void calc_global_load(unsigned long ticks)
/*
* Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs.
*/
- delta = calc_load_nohz_fold();
+ delta = calc_load_nohz_read();
if (delta)
atomic_long_add(delta, &calc_load_tasks);
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index db7b50bba3f1..38ccd49b9bf6 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1199,6 +1199,9 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
if (static_branch_likely(&psi_disabled))
return -EOPNOTSUPP;
+ if (!nbytes)
+ return -EINVAL;
+
buf_size = min(nbytes, sizeof(buf));
if (copy_from_user(buf, user_buf, buf_size))
return -EFAULT;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1a88dc8ad11b..9ea647835fd6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -896,7 +896,7 @@ struct rq {
*/
unsigned long nr_uninterruptible;
- struct task_struct *curr;
+ struct task_struct __rcu *curr;
struct task_struct *idle;
struct task_struct *stop;
unsigned long next_balance;
@@ -2479,3 +2479,16 @@ static inline void membarrier_switch_mm(struct rq *rq,
{
}
#endif
+
+#ifdef CONFIG_SMP
+static inline bool is_per_cpu_kthread(struct task_struct *p)
+{
+ if (!(p->flags & PF_KTHREAD))
+ return false;
+
+ if (p->nr_cpus_allowed != 1)
+ return false;
+
+ return true;
+}
+#endif
^ permalink raw reply related [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2019-12-21 16:19 Ingo Molnar
@ 2019-12-21 18:55 ` pr-tracker-bot
0 siblings, 0 replies; 384+ messages in thread
From: pr-tracker-bot @ 2019-12-21 18:55 UTC (permalink / raw)
To: Ingo Molnar
Cc: Linus Torvalds, linux-kernel, Peter Zijlstra, Thomas Gleixner,
Andrew Morton
The pull request you sent on Sat, 21 Dec 2019 17:19:58 +0100:
> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/fd7a6d2b8f1d67df76d0e863f003162b931074a1
Thank you!
--
Deet-doot-dot, I am a bot.
https://korg.wiki.kernel.org/userdoc/prtracker
^ permalink raw reply [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2019-12-21 16:19 Ingo Molnar
2019-12-21 18:55 ` pr-tracker-bot
0 siblings, 1 reply; 384+ messages in thread
From: Ingo Molnar @ 2019-12-21 16:19 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 6cf82d559e1a1d89f06ff4d428aca479c1dd0be6 sched/cfs: fix spurious active migration
Misc fixes: a (rare) PSI crash fix, a CPU affinity related balancing fix,
and a toning down of active migration attempts.
Thanks,
Ingo
------------------>
Johannes Weiner (2):
sched/psi: Fix sampling error and rare div0 crashes with cgroups and high uptime
psi: Fix a division error in psi poll()
Vincent Guittot (2):
sched/fair: Fix find_idlest_group() to handle CPU affinity
sched/cfs: fix spurious active migration
kernel/sched/fair.c | 13 ++++++++++++-
kernel/sched/psi.c | 5 +++--
2 files changed, 15 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 08a233e97a01..ba749f579714 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7328,7 +7328,14 @@ static int detach_tasks(struct lb_env *env)
load < 16 && !env->sd->nr_balance_failed)
goto next;
- if (load/2 > env->imbalance)
+ /*
+ * Make sure that we don't migrate too much load.
+ * Nevertheless, let relax the constraint if
+ * scheduler fails to find a good waiting task to
+ * migrate.
+ */
+ if (load/2 > env->imbalance &&
+ env->sd->nr_balance_failed <= env->sd->cache_nice_tries)
goto next;
env->imbalance -= load;
@@ -8417,6 +8424,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
if (!idlest)
return NULL;
+ /* The local group has been skipped because of CPU affinity */
+ if (!local)
+ return idlest;
+
/*
* If the local group is idler than the selected idlest group
* don't try and push the task.
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 517e3719027e..ce8f6748678a 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -185,7 +185,8 @@ static void group_init(struct psi_group *group)
for_each_possible_cpu(cpu)
seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
- group->avg_next_update = sched_clock() + psi_period;
+ group->avg_last_update = sched_clock();
+ group->avg_next_update = group->avg_last_update + psi_period;
INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
mutex_init(&group->avgs_lock);
/* Init trigger-related members */
@@ -481,7 +482,7 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value)
u32 remaining;
remaining = win->size - elapsed;
- growth += div_u64(win->prev_growth * remaining, win->size);
+ growth += div64_u64(win->prev_growth * remaining, win->size);
}
return growth;
^ permalink raw reply related [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2019-11-17 16:29 ` Linus Torvalds
2019-11-17 20:43 ` Valentin Schneider
@ 2019-11-18 8:03 ` Ingo Molnar
1 sibling, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2019-11-18 8:03 UTC (permalink / raw)
To: Linus Torvalds
Cc: Valentin Schneider, Linux Kernel Mailing List, Peter Zijlstra,
Thomas Gleixner, Andrew Morton
* Linus Torvalds <torvalds@linux-foundation.org> wrote:
> On Sun, Nov 17, 2019 at 2:20 AM Valentin Schneider
> <valentin.schneider@arm.com> wrote:
> >
> > AFAIUI the requirement for the enum type is that it has to be an int
> > type that covers all its values, so I could see some funky
> > optimization (e.g. check the returned value is < 512 but it's assumed
> > the type for the enum is 8 bits so this becomes always true). Then
> > again we don't have any explicit check on those returned values, plus
> > they fit in 11 bits, so as you say it's mostly likely inconsequential
> > (and I didn't see any compile diff).
>
> Gcc can - and does - narrow enums to smaller integer types with the
> '-fshort-enums' flag.
Good point - but at least according to the GCC 9.2.1 documentation,
-fshort-enums is a non-default code generation option:
Options for Code Generation Conventions
These machine-independent options control the interface
conventions used in code generation.
Most of them have both positive and negative forms; the negative
form of -ffoo is -fno-foo. In the table below, only one of the
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
forms is listed---the one that is not the default. You can figure
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
out the other form by either removing no- or adding it.
[...]
-fshort-enums
Allocate to an "enum" type only as many bytes as it needs for
the declared range of possible values. Specifically, the
"enum" type is equivalent to the smallest integer type that
has enough room.
Warning: the -fshort-enums switch causes GCC to generate code
that is not binary compatible with code generated without that
switch. Use it to conform to a non-default application binary
interface.
Unless this option is used AFAIK GCC will treat enums as "int" if at
least one enumeration constant is negative, it's "unsigned int"
otherwise.
The only current use reference to the non-standard -fshort-enums option
within the kernel source is the Hexagon arch, which (seemingly
unnecessarily) disables the option:
arch/hexagon/Makefile:KBUILD_CFLAGS += -fno-short-enums
That flag came with the original Hexagon commits, 8 years ago:
e95bf452a9e22 (Richard Kuo 2011-10-31 18:55:58 -0500 10)# Do not use single-byte enums; these will overflow.
e95bf452a9e22 (Richard Kuo 2011-10-31 18:55:58 -0500 11)KBUILD_CFLAGS += -fno-short-enums
Maybe they had a GCC build where it was on by default? Or GCC changed
this option sometime in the past? Or it's simply an unnecessary but
harmless code generation flag out of paranoia?
Out of curiosity I searched all the historic trees, none ever made use of
the -f*short-enums option, so I don't think this is a GCC option we ever
actively utilized or ran into.
> However, in practice nobody uses that, and it can cause interop
> problems. So I think for us, enums are always at least 'int' (they can
> be bigger).
Yeah, the GCC documentation specifically warns that it breaks the ABI:
the size of structs using enums will generally change from 4 bytes to 1
or 2 bytes, and function call signatures will change incompatibly as
well.
BTW., -fshort-enum looks like a bad code generation option to me, on x86
at least, because it will also use 16-bit width, which is generally a bad
idea on x86. If it limited itself to u8 and 32-bit types it could even be
useful.
Also, I wouldn't be surprised if the kernel ABI broke if we attempted to
use -short-enum, I bet there's a lot of accidental reliance on
enum=int/uint.
> That said, mixing enums and values that are bigger than the enumerated
> ones is just a bad idea
>
> It will, for example, cause us to miss compiler warnings (eg switch
> statements with an enum will warn if you don't handle all cases, but
> the 'all cases' is based on the actual enum range, not on the
> _possible_ invalid values).
That's true. Will check whether we can do something about improving the
affected uclamp data structures.
Thanks,
Ingo
^ permalink raw reply [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2019-11-17 16:29 ` Linus Torvalds
@ 2019-11-17 20:43 ` Valentin Schneider
2019-11-18 8:03 ` Ingo Molnar
1 sibling, 0 replies; 384+ messages in thread
From: Valentin Schneider @ 2019-11-17 20:43 UTC (permalink / raw)
To: Linus Torvalds
Cc: Ingo Molnar, Linux Kernel Mailing List, Peter Zijlstra,
Thomas Gleixner, Andrew Morton
On 17/11/2019 16:29, Linus Torvalds wrote:
> Gcc can - and does - narrow enums to smaller integer types with the
> '-fshort-enums' flag.
>
> However, in practice nobody uses that, and it can cause interop
> problems. So I think for us, enums are always at least 'int' (they can
> be bigger).
>
> That said, mixing enums and values that are bigger than the enumerated
> ones is just a bad idea
>
> It will, for example, cause us to miss compiler warnings (eg switch
> statements with an enum will warn if you don't handle all cases, but
> the 'all cases' is based on the actual enum range, not on the
> _possible_ invalid values).
>
Oh, yet another gcc flag...
Thanks for the detailed write-up.
> Linus
>
^ permalink raw reply [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2019-11-17 10:19 ` Valentin Schneider
2019-11-17 10:29 ` Ingo Molnar
@ 2019-11-17 16:29 ` Linus Torvalds
2019-11-17 20:43 ` Valentin Schneider
2019-11-18 8:03 ` Ingo Molnar
1 sibling, 2 replies; 384+ messages in thread
From: Linus Torvalds @ 2019-11-17 16:29 UTC (permalink / raw)
To: Valentin Schneider
Cc: Ingo Molnar, Linux Kernel Mailing List, Peter Zijlstra,
Thomas Gleixner, Andrew Morton
On Sun, Nov 17, 2019 at 2:20 AM Valentin Schneider
<valentin.schneider@arm.com> wrote:
>
> AFAIUI the requirement for the enum type is that it has to be an int type that
> covers all its values, so I could see some funky optimization (e.g. check the
> returned value is < 512 but it's assumed the type for the enum is 8 bits so
> this becomes always true). Then again we don't have any explicit check on
> those returned values, plus they fit in 11 bits, so as you say it's
> mostly likely inconsequential (and I didn't see any compile diff).
Gcc can - and does - narrow enums to smaller integer types with the
'-fshort-enums' flag.
However, in practice nobody uses that, and it can cause interop
problems. So I think for us, enums are always at least 'int' (they can
be bigger).
That said, mixing enums and values that are bigger than the enumerated
ones is just a bad idea
It will, for example, cause us to miss compiler warnings (eg switch
statements with an enum will warn if you don't handle all cases, but
the 'all cases' is based on the actual enum range, not on the
_possible_ invalid values).
Linus
^ permalink raw reply [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2019-11-17 10:19 ` Valentin Schneider
@ 2019-11-17 10:29 ` Ingo Molnar
2019-11-17 16:29 ` Linus Torvalds
1 sibling, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2019-11-17 10:29 UTC (permalink / raw)
To: Valentin Schneider
Cc: Linus Torvalds, linux-kernel, Peter Zijlstra, Thomas Gleixner,
Andrew Morton
* Valentin Schneider <valentin.schneider@arm.com> wrote:
> On 17/11/2019 09:45, Ingo Molnar wrote:
> > I've picked v2 up instead. I suspect it's not really consequential as
> > enums don't really get truncated by compilers, right? Is there any other
> > negative runtime side effect possible from the imprecise enum/uint
> > typing?
> >
>
> AFAIUI the requirement for the enum type is that it has to be an int type that
> covers all its values, so I could see some funky optimization (e.g. check the
> returned value is < 512 but it's assumed the type for the enum is 8 bits so
> this becomes always true). Then again we don't have any explicit check on
> those returned values, plus they fit in 11 bits, so as you say it's
> mostly likely inconsequential (and I didn't see any compile diff).
Yeah, so unless there's evidence of there being a nonzero chance of this
being misbuilt I'd gravitate towards doing this via via sched/core,
especially so late in the cycle.
> My "worry" wasn't really about this patch, it was more about the
> following one - it didn't like the idea of merging an unneeded patch
> (with a Fixes: tag on top of it).
Yeah, agreed - should be fixed now.
> >>> sched/topology, cpuset: Account for housekeeping CPUs to avoid empty cpumasks
> >>
> >> And this one is no longer needed, as Michal & I understood (IOW the fix in
> >> rc6 is sufficient), see:
> >>
> >> c425c5cb-ba8a-e5f6-d91c-5479779cfb7a@arm.com
> >
> > Ok.
> >
> > I'm inclined to just reduce sched/urgent back to these three fixes:
> >
> > 6e1ff0773f49: sched/uclamp: Fix incorrect condition
> > b90f7c9d2198: sched/pelt: Fix update of blocked PELT ordering
> > ff51ff84d82a: sched/core: Avoid spurious lock dependencies
> >
> > and apply v2 of the uclamp_id type fix to sched/core. This would reduce
> > the risks of a Sunday pull request ...
> >
>
> This sounds good to me. Sorry for the hassle.
No hassle at all - thanks for catching these!
Thanks,
Ingo
^ permalink raw reply [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2019-11-17 9:45 ` Ingo Molnar
@ 2019-11-17 10:19 ` Valentin Schneider
2019-11-17 10:29 ` Ingo Molnar
2019-11-17 16:29 ` Linus Torvalds
0 siblings, 2 replies; 384+ messages in thread
From: Valentin Schneider @ 2019-11-17 10:19 UTC (permalink / raw)
To: Ingo Molnar
Cc: Linus Torvalds, linux-kernel, Peter Zijlstra, Thomas Gleixner,
Andrew Morton
On 17/11/2019 09:45, Ingo Molnar wrote:
> I've picked v2 up instead. I suspect it's not really consequential as
> enums don't really get truncated by compilers, right? Is there any other
> negative runtime side effect possible from the imprecise enum/uint
> typing?
>
AFAIUI the requirement for the enum type is that it has to be an int type that
covers all its values, so I could see some funky optimization (e.g. check the
returned value is < 512 but it's assumed the type for the enum is 8 bits so
this becomes always true). Then again we don't have any explicit check on
those returned values, plus they fit in 11 bits, so as you say it's
mostly likely inconsequential (and I didn't see any compile diff).
My "worry" wasn't really about this patch, it was more about the following
one - it didn't like the idea of merging an unneeded patch (with a Fixes:
tag on top of it).
>>> sched/topology, cpuset: Account for housekeeping CPUs to avoid empty cpumasks
>>
>> And this one is no longer needed, as Michal & I understood (IOW the fix in
>> rc6 is sufficient), see:
>>
>> c425c5cb-ba8a-e5f6-d91c-5479779cfb7a@arm.com
>
> Ok.
>
> I'm inclined to just reduce sched/urgent back to these three fixes:
>
> 6e1ff0773f49: sched/uclamp: Fix incorrect condition
> b90f7c9d2198: sched/pelt: Fix update of blocked PELT ordering
> ff51ff84d82a: sched/core: Avoid spurious lock dependencies
>
> and apply v2 of the uclamp_id type fix to sched/core. This would reduce
> the risks of a Sunday pull request ...
>
This sounds good to me. Sorry for the hassle.
> Thanks,
>
> Ingo
>
^ permalink raw reply [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2019-11-16 22:44 ` Valentin Schneider
2019-11-17 0:10 ` Linus Torvalds
@ 2019-11-17 9:45 ` Ingo Molnar
2019-11-17 10:19 ` Valentin Schneider
1 sibling, 1 reply; 384+ messages in thread
From: Ingo Molnar @ 2019-11-17 9:45 UTC (permalink / raw)
To: Valentin Schneider
Cc: Linus Torvalds, linux-kernel, Peter Zijlstra, Thomas Gleixner,
Andrew Morton
* Valentin Schneider <valentin.schneider@arm.com> wrote:
> Hi,
>
> On 16/11/2019 21:37, Ingo Molnar wrote:
> > Peter Zijlstra (1):
> > sched/core: Avoid spurious lock dependencies
> >
> > Qais Yousef (1):
> > sched/uclamp: Fix incorrect condition
> >
> > Valentin Schneider (2):
> > sched/uclamp: Fix overzealous type replacement
>
> This one got a v2 (was missing one location), acked by Vincent:
>
> 20191115103908.27610-1-valentin.schneider@arm.com
I've picked v2 up instead. I suspect it's not really consequential as
enums don't really get truncated by compilers, right? Is there any other
negative runtime side effect possible from the imprecise enum/uint
typing?
> > sched/topology, cpuset: Account for housekeeping CPUs to avoid empty cpumasks
>
> And this one is no longer needed, as Michal & I understood (IOW the fix in
> rc6 is sufficient), see:
>
> c425c5cb-ba8a-e5f6-d91c-5479779cfb7a@arm.com
Ok.
I'm inclined to just reduce sched/urgent back to these three fixes:
6e1ff0773f49: sched/uclamp: Fix incorrect condition
b90f7c9d2198: sched/pelt: Fix update of blocked PELT ordering
ff51ff84d82a: sched/core: Avoid spurious lock dependencies
and apply v2 of the uclamp_id type fix to sched/core. This would reduce
the risks of a Sunday pull request ...
Thanks,
Ingo
^ permalink raw reply [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2019-11-17 0:10 ` Linus Torvalds
@ 2019-11-17 9:31 ` Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2019-11-17 9:31 UTC (permalink / raw)
To: Linus Torvalds
Cc: Valentin Schneider, Linux Kernel Mailing List, Peter Zijlstra,
Thomas Gleixner, Andrew Morton
* Linus Torvalds <torvalds@linux-foundation.org> wrote:
> On Sat, Nov 16, 2019 at 2:44 PM Valentin Schneider
> <valentin.schneider@arm.com> wrote:
> >
> > > Valentin Schneider (2):
> > > sched/uclamp: Fix overzealous type replacement
> >
> > This one got a v2 (was missing one location), acked by Vincent:
> >
> > 20191115103908.27610-1-valentin.schneider@arm.com
> >
> > > sched/topology, cpuset: Account for housekeeping CPUs to avoid empty cpumasks
> >
> > And this one is no longer needed, as Michal & I understood (IOW the fix in
> > rc6 is sufficient), see:
> >
> > c425c5cb-ba8a-e5f6-d91c-5479779cfb7a@arm.com
>
> Ingo, what do you want me to do? Pull it anyway and send updates
> later? Or skip this pull request?
>
> I'll leave it pending for now,
Yeah, please don't pull - will rework it. Sorry ...
Thanks,
Ingo
^ permalink raw reply [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2019-11-16 22:44 ` Valentin Schneider
@ 2019-11-17 0:10 ` Linus Torvalds
2019-11-17 9:31 ` Ingo Molnar
2019-11-17 9:45 ` Ingo Molnar
1 sibling, 1 reply; 384+ messages in thread
From: Linus Torvalds @ 2019-11-17 0:10 UTC (permalink / raw)
To: Valentin Schneider
Cc: Ingo Molnar, Linux Kernel Mailing List, Peter Zijlstra,
Thomas Gleixner, Andrew Morton
On Sat, Nov 16, 2019 at 2:44 PM Valentin Schneider
<valentin.schneider@arm.com> wrote:
>
> > Valentin Schneider (2):
> > sched/uclamp: Fix overzealous type replacement
>
> This one got a v2 (was missing one location), acked by Vincent:
>
> 20191115103908.27610-1-valentin.schneider@arm.com
>
> > sched/topology, cpuset: Account for housekeeping CPUs to avoid empty cpumasks
>
> And this one is no longer needed, as Michal & I understood (IOW the fix in
> rc6 is sufficient), see:
>
> c425c5cb-ba8a-e5f6-d91c-5479779cfb7a@arm.com
Ingo, what do you want me to do? Pull it anyway and send updates
later? Or skip this pull request?
I'll leave it pending for now,
Linus
^ permalink raw reply [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2019-11-16 21:37 Ingo Molnar
@ 2019-11-16 22:44 ` Valentin Schneider
2019-11-17 0:10 ` Linus Torvalds
2019-11-17 9:45 ` Ingo Molnar
0 siblings, 2 replies; 384+ messages in thread
From: Valentin Schneider @ 2019-11-16 22:44 UTC (permalink / raw)
To: Ingo Molnar, Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Hi,
On 16/11/2019 21:37, Ingo Molnar wrote:
> Peter Zijlstra (1):
> sched/core: Avoid spurious lock dependencies
>
> Qais Yousef (1):
> sched/uclamp: Fix incorrect condition
>
> Valentin Schneider (2):
> sched/uclamp: Fix overzealous type replacement
This one got a v2 (was missing one location), acked by Vincent:
20191115103908.27610-1-valentin.schneider@arm.com
> sched/topology, cpuset: Account for housekeeping CPUs to avoid empty cpumasks
And this one is no longer needed, as Michal & I understood (IOW the fix in
rc6 is sufficient), see:
c425c5cb-ba8a-e5f6-d91c-5479779cfb7a@arm.com
>
> Vincent Guittot (1):
> sched/pelt: Fix update of blocked PELT ordering
>
>
> kernel/cgroup/cpuset.c | 8 +++++++-
> kernel/sched/core.c | 9 +++++----
> kernel/sched/fair.c | 29 ++++++++++++++++++++---------
> kernel/sched/sched.h | 2 +-
> 4 files changed, 33 insertions(+), 15 deletions(-)
>
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index c87ee6412b36..e4c10785dc7c 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -798,8 +798,14 @@ static int generate_sched_domains(cpumask_var_t **domains,
> cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
> continue;
>
> + /*
> + * Skip cpusets that would lead to an empty sched domain.
> + * That could be because effective_cpus is empty, or because
> + * it's only spanning CPUs outside the housekeeping mask.
> + */
> if (is_sched_load_balance(cp) &&
> - !cpumask_empty(cp->effective_cpus))
> + cpumask_intersects(cp->effective_cpus,
> + housekeeping_cpumask(HK_FLAG_DOMAIN)))
> csa[csn++] = cp;
>
> /* skip @cp's subtree if not a partition root */
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 0f2eb3629070..a4f76d3f5011 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -853,7 +853,7 @@ static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
> }
>
> static inline
> -enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
> +unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
> unsigned int clamp_value)
> {
> struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
> @@ -918,7 +918,7 @@ uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
> return uc_req;
> }
>
> -enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
> +unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
> {
> struct uclamp_se uc_eff;
>
> @@ -1065,7 +1065,7 @@ uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
> * affecting a valid clamp bucket, the next time it's enqueued,
> * it will already see the updated clamp bucket value.
> */
> - if (!p->uclamp[clamp_id].active) {
> + if (p->uclamp[clamp_id].active) {
> uclamp_rq_dec_id(rq, p, clamp_id);
> uclamp_rq_inc_id(rq, p, clamp_id);
> }
> @@ -6019,10 +6019,11 @@ void init_idle(struct task_struct *idle, int cpu)
> struct rq *rq = cpu_rq(cpu);
> unsigned long flags;
>
> + __sched_fork(0, idle);
> +
> raw_spin_lock_irqsave(&idle->pi_lock, flags);
> raw_spin_lock(&rq->lock);
>
> - __sched_fork(0, idle);
> idle->state = TASK_RUNNING;
> idle->se.exec_start = sched_clock();
> idle->flags |= PF_IDLE;
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 22a2fed29054..69a81a5709ff 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7547,6 +7547,19 @@ static void update_blocked_averages(int cpu)
> rq_lock_irqsave(rq, &rf);
> update_rq_clock(rq);
>
> + /*
> + * update_cfs_rq_load_avg() can call cpufreq_update_util(). Make sure
> + * that RT, DL and IRQ signals have been updated before updating CFS.
> + */
> + curr_class = rq->curr->sched_class;
> + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
> + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
> + update_irq_load_avg(rq, 0);
> +
> + /* Don't need periodic decay once load/util_avg are null */
> + if (others_have_blocked(rq))
> + done = false;
> +
> /*
> * Iterates the task_group tree in a bottom up fashion, see
> * list_add_leaf_cfs_rq() for details.
> @@ -7574,14 +7587,6 @@ static void update_blocked_averages(int cpu)
> done = false;
> }
>
> - curr_class = rq->curr->sched_class;
> - update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
> - update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
> - update_irq_load_avg(rq, 0);
> - /* Don't need periodic decay once load/util_avg are null */
> - if (others_have_blocked(rq))
> - done = false;
> -
> update_blocked_load_status(rq, !done);
> rq_unlock_irqrestore(rq, &rf);
> }
> @@ -7642,12 +7647,18 @@ static inline void update_blocked_averages(int cpu)
>
> rq_lock_irqsave(rq, &rf);
> update_rq_clock(rq);
> - update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
>
> + /*
> + * update_cfs_rq_load_avg() can call cpufreq_update_util(). Make sure
> + * that RT, DL and IRQ signals have been updated before updating CFS.
> + */
> curr_class = rq->curr->sched_class;
> update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
> update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
> update_irq_load_avg(rq, 0);
> +
> + update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
> +
> update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq));
> rq_unlock_irqrestore(rq, &rf);
> }
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index c8870c5bd7df..49ed949f850c 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2309,7 +2309,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
> #endif /* CONFIG_CPU_FREQ */
>
> #ifdef CONFIG_UCLAMP_TASK
> -enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
> +unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
>
> static __always_inline
> unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
>
^ permalink raw reply [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2019-11-16 21:37 Ingo Molnar
2019-11-16 22:44 ` Valentin Schneider
0 siblings, 1 reply; 384+ messages in thread
From: Ingo Molnar @ 2019-11-16 21:37 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 48a723d23b0d957e5b5861b974864e53c6841de8 sched/topology, cpuset: Account for housekeeping CPUs to avoid empty cpumasks
Misc fixes:
- Fix potential deadlock under CONFIG_DEBUG_OBJECTS=y
- PELT metrics update ordering fix
- uclamp logic fix
- a type casting fix
- final fix (hopefully) for Juno r0 2+4 big.LITTLE systems.
Thanks,
Ingo
------------------>
Peter Zijlstra (1):
sched/core: Avoid spurious lock dependencies
Qais Yousef (1):
sched/uclamp: Fix incorrect condition
Valentin Schneider (2):
sched/uclamp: Fix overzealous type replacement
sched/topology, cpuset: Account for housekeeping CPUs to avoid empty cpumasks
Vincent Guittot (1):
sched/pelt: Fix update of blocked PELT ordering
kernel/cgroup/cpuset.c | 8 +++++++-
kernel/sched/core.c | 9 +++++----
kernel/sched/fair.c | 29 ++++++++++++++++++++---------
kernel/sched/sched.h | 2 +-
4 files changed, 33 insertions(+), 15 deletions(-)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index c87ee6412b36..e4c10785dc7c 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -798,8 +798,14 @@ static int generate_sched_domains(cpumask_var_t **domains,
cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
continue;
+ /*
+ * Skip cpusets that would lead to an empty sched domain.
+ * That could be because effective_cpus is empty, or because
+ * it's only spanning CPUs outside the housekeeping mask.
+ */
if (is_sched_load_balance(cp) &&
- !cpumask_empty(cp->effective_cpus))
+ cpumask_intersects(cp->effective_cpus,
+ housekeeping_cpumask(HK_FLAG_DOMAIN)))
csa[csn++] = cp;
/* skip @cp's subtree if not a partition root */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0f2eb3629070..a4f76d3f5011 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -853,7 +853,7 @@ static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
}
static inline
-enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
+unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
unsigned int clamp_value)
{
struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
@@ -918,7 +918,7 @@ uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
return uc_req;
}
-enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
+unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
{
struct uclamp_se uc_eff;
@@ -1065,7 +1065,7 @@ uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
* affecting a valid clamp bucket, the next time it's enqueued,
* it will already see the updated clamp bucket value.
*/
- if (!p->uclamp[clamp_id].active) {
+ if (p->uclamp[clamp_id].active) {
uclamp_rq_dec_id(rq, p, clamp_id);
uclamp_rq_inc_id(rq, p, clamp_id);
}
@@ -6019,10 +6019,11 @@ void init_idle(struct task_struct *idle, int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
+ __sched_fork(0, idle);
+
raw_spin_lock_irqsave(&idle->pi_lock, flags);
raw_spin_lock(&rq->lock);
- __sched_fork(0, idle);
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
idle->flags |= PF_IDLE;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 22a2fed29054..69a81a5709ff 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7547,6 +7547,19 @@ static void update_blocked_averages(int cpu)
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
+ /*
+ * update_cfs_rq_load_avg() can call cpufreq_update_util(). Make sure
+ * that RT, DL and IRQ signals have been updated before updating CFS.
+ */
+ curr_class = rq->curr->sched_class;
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
+ update_irq_load_avg(rq, 0);
+
+ /* Don't need periodic decay once load/util_avg are null */
+ if (others_have_blocked(rq))
+ done = false;
+
/*
* Iterates the task_group tree in a bottom up fashion, see
* list_add_leaf_cfs_rq() for details.
@@ -7574,14 +7587,6 @@ static void update_blocked_averages(int cpu)
done = false;
}
- curr_class = rq->curr->sched_class;
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
- update_irq_load_avg(rq, 0);
- /* Don't need periodic decay once load/util_avg are null */
- if (others_have_blocked(rq))
- done = false;
-
update_blocked_load_status(rq, !done);
rq_unlock_irqrestore(rq, &rf);
}
@@ -7642,12 +7647,18 @@ static inline void update_blocked_averages(int cpu)
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
- update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
+ /*
+ * update_cfs_rq_load_avg() can call cpufreq_update_util(). Make sure
+ * that RT, DL and IRQ signals have been updated before updating CFS.
+ */
curr_class = rq->curr->sched_class;
update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
update_irq_load_avg(rq, 0);
+
+ update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
+
update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq));
rq_unlock_irqrestore(rq, &rf);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c8870c5bd7df..49ed949f850c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2309,7 +2309,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */
#ifdef CONFIG_UCLAMP_TASK
-enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
+unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
static __always_inline
unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
^ permalink raw reply related [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2019-11-01 17:55 Ingo Molnar
2019-11-01 19:10 ` pr-tracker-bot
@ 2019-11-02 0:15 ` Valentin Schneider
1 sibling, 0 replies; 384+ messages in thread
From: Valentin Schneider @ 2019-11-02 0:15 UTC (permalink / raw)
To: Ingo Molnar, Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Vincent Guittot, Thomas Gleixner
Hi,
On 01/11/2019 18:55, Ingo Molnar wrote:
> Linus,
>
> Please pull the latest sched-urgent-for-linus git tree from:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
>
> # HEAD: e284df705cf1eeedb5ec3a66ed82d17a64659150 sched/topology: Allow sched_asym_cpucapacity to be disabled
>
> Fix two scheduler topology bugs/oversights on Juno r0 2+4 big.LITTLE
> systems.
>
I was hoping to fix this up before the PR got out, but as I've been traveling
that didn't happen. FWIW Michal spotted that the empty cpumask fix is not
complete, it also needs to check against the housekeeping cpumask. I've done
that at 20191102001406.10208-1-valentin.schneider@arm.com.
Sorry about that,
Valentin
> Thanks,
>
> Ingo
>
> ------------------>
> Valentin Schneider (2):
> sched/topology: Don't try to build empty sched domains
> sched/topology: Allow sched_asym_cpucapacity to be disabled
>
>
> kernel/cgroup/cpuset.c | 3 ++-
> kernel/sched/topology.c | 11 +++++++++--
> 2 files changed, 11 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index c52bc91f882b..c87ee6412b36 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -798,7 +798,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
> cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
> continue;
>
> - if (is_sched_load_balance(cp))
> + if (is_sched_load_balance(cp) &&
> + !cpumask_empty(cp->effective_cpus))
> csa[csn++] = cp;
>
> /* skip @cp's subtree if not a partition root */
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index b5667a273bf6..49b835f1305f 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -1948,7 +1948,7 @@ static struct sched_domain_topology_level
> static int
> build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
> {
> - enum s_alloc alloc_state;
> + enum s_alloc alloc_state = sa_none;
> struct sched_domain *sd;
> struct s_data d;
> struct rq *rq = NULL;
> @@ -1956,6 +1956,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> struct sched_domain_topology_level *tl_asym;
> bool has_asym = false;
>
> + if (WARN_ON(cpumask_empty(cpu_map)))
> + goto error;
> +
> alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
> if (alloc_state != sa_rootdomain)
> goto error;
> @@ -2026,7 +2029,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> rcu_read_unlock();
>
> if (has_asym)
> - static_branch_enable_cpuslocked(&sched_asym_cpucapacity);
> + static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
>
> if (rq && sched_debug_enabled) {
> pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
> @@ -2121,8 +2124,12 @@ int sched_init_domains(const struct cpumask *cpu_map)
> */
> static void detach_destroy_domains(const struct cpumask *cpu_map)
> {
> + unsigned int cpu = cpumask_any(cpu_map);
> int i;
>
> + if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
> + static_branch_dec_cpuslocked(&sched_asym_cpucapacity);
> +
> rcu_read_lock();
> for_each_cpu(i, cpu_map)
> cpu_attach_domain(NULL, &def_root_domain, i);
>
^ permalink raw reply [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2019-11-01 17:55 Ingo Molnar
@ 2019-11-01 19:10 ` pr-tracker-bot
2019-11-02 0:15 ` Valentin Schneider
1 sibling, 0 replies; 384+ messages in thread
From: pr-tracker-bot @ 2019-11-01 19:10 UTC (permalink / raw)
To: Ingo Molnar
Cc: Linus Torvalds, linux-kernel, Peter Zijlstra, Vincent Guittot,
Thomas Gleixner
The pull request you sent on Fri, 1 Nov 2019 18:55:08 +0100:
> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/0dbe6cb8f7e05bc9611602ef45980a6c57b245a3
Thank you!
--
Deet-doot-dot, I am a bot.
https://korg.wiki.kernel.org/userdoc/prtracker
^ permalink raw reply [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2019-11-01 17:55 Ingo Molnar
2019-11-01 19:10 ` pr-tracker-bot
2019-11-02 0:15 ` Valentin Schneider
0 siblings, 2 replies; 384+ messages in thread
From: Ingo Molnar @ 2019-11-01 17:55 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Vincent Guittot, Thomas Gleixner
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: e284df705cf1eeedb5ec3a66ed82d17a64659150 sched/topology: Allow sched_asym_cpucapacity to be disabled
Fix two scheduler topology bugs/oversights on Juno r0 2+4 big.LITTLE
systems.
Thanks,
Ingo
------------------>
Valentin Schneider (2):
sched/topology: Don't try to build empty sched domains
sched/topology: Allow sched_asym_cpucapacity to be disabled
kernel/cgroup/cpuset.c | 3 ++-
kernel/sched/topology.c | 11 +++++++++--
2 files changed, 11 insertions(+), 3 deletions(-)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index c52bc91f882b..c87ee6412b36 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -798,7 +798,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
continue;
- if (is_sched_load_balance(cp))
+ if (is_sched_load_balance(cp) &&
+ !cpumask_empty(cp->effective_cpus))
csa[csn++] = cp;
/* skip @cp's subtree if not a partition root */
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index b5667a273bf6..49b835f1305f 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1948,7 +1948,7 @@ static struct sched_domain_topology_level
static int
build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
{
- enum s_alloc alloc_state;
+ enum s_alloc alloc_state = sa_none;
struct sched_domain *sd;
struct s_data d;
struct rq *rq = NULL;
@@ -1956,6 +1956,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
struct sched_domain_topology_level *tl_asym;
bool has_asym = false;
+ if (WARN_ON(cpumask_empty(cpu_map)))
+ goto error;
+
alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
if (alloc_state != sa_rootdomain)
goto error;
@@ -2026,7 +2029,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
rcu_read_unlock();
if (has_asym)
- static_branch_enable_cpuslocked(&sched_asym_cpucapacity);
+ static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
if (rq && sched_debug_enabled) {
pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
@@ -2121,8 +2124,12 @@ int sched_init_domains(const struct cpumask *cpu_map)
*/
static void detach_destroy_domains(const struct cpumask *cpu_map)
{
+ unsigned int cpu = cpumask_any(cpu_map);
int i;
+ if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
+ static_branch_dec_cpuslocked(&sched_asym_cpucapacity);
+
rcu_read_lock();
for_each_cpu(i, cpu_map)
cpu_attach_domain(NULL, &def_root_domain, i);
^ permalink raw reply related [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2019-10-12 14:58 Ingo Molnar
@ 2019-10-12 22:35 ` pr-tracker-bot
0 siblings, 0 replies; 384+ messages in thread
From: pr-tracker-bot @ 2019-10-12 22:35 UTC (permalink / raw)
To: Ingo Molnar
Cc: Linus Torvalds, linux-kernel, Peter Zijlstra, Thomas Gleixner,
Andrew Morton, Vincent Guittot, Juri Lelli
The pull request you sent on Sat, 12 Oct 2019 16:58:36 +0200:
> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/328fefadd9cfa15cd6ab746553d9ef13303c11a6
Thank you!
--
Deet-doot-dot, I am a bot.
https://korg.wiki.kernel.org/userdoc/prtracker
^ permalink raw reply [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2019-10-12 14:58 Ingo Molnar
2019-10-12 22:35 ` pr-tracker-bot
0 siblings, 1 reply; 384+ messages in thread
From: Ingo Molnar @ 2019-10-12 14:58 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton,
Vincent Guittot, Juri Lelli
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 68e7a4d66b0ce04bf18ff2ffded5596ab3618585 sched/vtime: Fix guest/system mis-accounting on task switch
Two fixes: a guest-cputime accounting fix, and a cgroup bandwidth quota
precision fix.
Thanks,
Ingo
------------------>
Frederic Weisbecker (1):
sched/vtime: Fix guest/system mis-accounting on task switch
Xuewei Zhang (1):
sched/fair: Scale bandwidth quota and period without losing quota/period ratio precision
kernel/sched/cputime.c | 6 +++---
kernel/sched/fair.c | 36 ++++++++++++++++++++++--------------
2 files changed, 25 insertions(+), 17 deletions(-)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 2305ce89a26c..46ed4e1383e2 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -740,7 +740,7 @@ void vtime_account_system(struct task_struct *tsk)
write_seqcount_begin(&vtime->seqcount);
/* We might have scheduled out from guest path */
- if (current->flags & PF_VCPU)
+ if (tsk->flags & PF_VCPU)
vtime_account_guest(tsk, vtime);
else
__vtime_account_system(tsk, vtime);
@@ -783,7 +783,7 @@ void vtime_guest_enter(struct task_struct *tsk)
*/
write_seqcount_begin(&vtime->seqcount);
__vtime_account_system(tsk, vtime);
- current->flags |= PF_VCPU;
+ tsk->flags |= PF_VCPU;
write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);
@@ -794,7 +794,7 @@ void vtime_guest_exit(struct task_struct *tsk)
write_seqcount_begin(&vtime->seqcount);
vtime_account_guest(tsk, vtime);
- current->flags &= ~PF_VCPU;
+ tsk->flags &= ~PF_VCPU;
write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 83ab35e2374f..682a754ea3e1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4926,20 +4926,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
if (++count > 3) {
u64 new, old = ktime_to_ns(cfs_b->period);
- new = (old * 147) / 128; /* ~115% */
- new = min(new, max_cfs_quota_period);
-
- cfs_b->period = ns_to_ktime(new);
-
- /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
- cfs_b->quota *= new;
- cfs_b->quota = div64_u64(cfs_b->quota, old);
-
- pr_warn_ratelimited(
- "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
- smp_processor_id(),
- div_u64(new, NSEC_PER_USEC),
- div_u64(cfs_b->quota, NSEC_PER_USEC));
+ /*
+ * Grow period by a factor of 2 to avoid losing precision.
+ * Precision loss in the quota/period ratio can cause __cfs_schedulable
+ * to fail.
+ */
+ new = old * 2;
+ if (new < max_cfs_quota_period) {
+ cfs_b->period = ns_to_ktime(new);
+ cfs_b->quota *= 2;
+
+ pr_warn_ratelimited(
+ "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
+ smp_processor_id(),
+ div_u64(new, NSEC_PER_USEC),
+ div_u64(cfs_b->quota, NSEC_PER_USEC));
+ } else {
+ pr_warn_ratelimited(
+ "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
+ smp_processor_id(),
+ div_u64(old, NSEC_PER_USEC),
+ div_u64(cfs_b->quota, NSEC_PER_USEC));
+ }
/* reset count so we don't come right back in here */
count = 0;
^ permalink raw reply related [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2019-10-01 18:15 ` John Stultz
@ 2019-10-01 20:39 ` Joel Fernandes
0 siblings, 0 replies; 384+ messages in thread
From: Joel Fernandes @ 2019-10-01 20:39 UTC (permalink / raw)
To: John Stultz
Cc: Peter Zijlstra, Ingo Molnar, Linus Torvalds,
Linux Kernel Mailing List, Thomas Gleixner, Andrew Morton,
Mathieu Desnoyers, Eric W. Biederman, Alistair Delva
On Tue, Oct 01, 2019 at 11:15:01AM -0700, John Stultz wrote:
> On Tue, Oct 1, 2019 at 12:19 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Mon, Sep 30, 2019 at 04:45:49PM -0700, John Stultz wrote:
> > > Reverting the following patches:
> >
> > > "sched/membarrier: Fix p->mm->membarrier_state racy load"
> >
> > ARGH, I fudged it... please try:
> >
> >
> > diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
> > index a39bed2c784f..168479a7d61b 100644
> > --- a/kernel/sched/membarrier.c
> > +++ b/kernel/sched/membarrier.c
> > @@ -174,7 +174,6 @@ static int membarrier_private_expedited(int flags)
> > */
> > if (cpu == raw_smp_processor_id())
> > continue;
> > - rcu_read_lock();
> > p = rcu_dereference(cpu_rq(cpu)->curr);
> > if (p && p->mm == mm)
> > __cpumask_set_cpu(cpu, tmpmask);
>
>
> Yep. Looks like that solves it!
> Tested-by: John Stultz <john.stultz@linaro.org>
Makes sense.
And here I was wondering yesterday why I was seeing bug reports with
t->read_lock_nesting as non-zero when the task was interrupted in user mode ;-)
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
thanks,
- Joel
^ permalink raw reply [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2019-10-01 7:19 ` Peter Zijlstra
@ 2019-10-01 18:15 ` John Stultz
2019-10-01 20:39 ` Joel Fernandes
0 siblings, 1 reply; 384+ messages in thread
From: John Stultz @ 2019-10-01 18:15 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Ingo Molnar, Linus Torvalds, Linux Kernel Mailing List,
Thomas Gleixner, Andrew Morton, Mathieu Desnoyers,
Eric W. Biederman, Joel Fernandes, Alistair Delva
On Tue, Oct 1, 2019 at 12:19 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Sep 30, 2019 at 04:45:49PM -0700, John Stultz wrote:
> > Reverting the following patches:
>
> > "sched/membarrier: Fix p->mm->membarrier_state racy load"
>
> ARGH, I fudged it... please try:
>
>
> diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
> index a39bed2c784f..168479a7d61b 100644
> --- a/kernel/sched/membarrier.c
> +++ b/kernel/sched/membarrier.c
> @@ -174,7 +174,6 @@ static int membarrier_private_expedited(int flags)
> */
> if (cpu == raw_smp_processor_id())
> continue;
> - rcu_read_lock();
> p = rcu_dereference(cpu_rq(cpu)->curr);
> if (p && p->mm == mm)
> __cpumask_set_cpu(cpu, tmpmask);
Yep. Looks like that solves it!
Tested-by: John Stultz <john.stultz@linaro.org>
Thanks so much for the quick turnaround!
-john
^ permalink raw reply [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2019-09-30 23:45 ` John Stultz
@ 2019-10-01 7:19 ` Peter Zijlstra
2019-10-01 18:15 ` John Stultz
0 siblings, 1 reply; 384+ messages in thread
From: Peter Zijlstra @ 2019-10-01 7:19 UTC (permalink / raw)
To: John Stultz
Cc: Ingo Molnar, Linus Torvalds, Linux Kernel Mailing List,
Thomas Gleixner, Andrew Morton, Mathieu Desnoyers,
Eric W. Biederman, Joel Fernandes, Alistair Delva
On Mon, Sep 30, 2019 at 04:45:49PM -0700, John Stultz wrote:
> Reverting the following patches:
> "sched/membarrier: Fix p->mm->membarrier_state racy load"
ARGH, I fudged it... please try:
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index a39bed2c784f..168479a7d61b 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -174,7 +174,6 @@ static int membarrier_private_expedited(int flags)
*/
if (cpu == raw_smp_processor_id())
continue;
- rcu_read_lock();
p = rcu_dereference(cpu_rq(cpu)->curr);
if (p && p->mm == mm)
__cpumask_set_cpu(cpu, tmpmask);
^ permalink raw reply related [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2019-09-28 12:39 Ingo Molnar
2019-09-28 20:50 ` pr-tracker-bot
@ 2019-09-30 23:45 ` John Stultz
2019-10-01 7:19 ` Peter Zijlstra
1 sibling, 1 reply; 384+ messages in thread
From: John Stultz @ 2019-09-30 23:45 UTC (permalink / raw)
To: Ingo Molnar
Cc: Linus Torvalds, Linux Kernel Mailing List, Peter Zijlstra,
Thomas Gleixner, Andrew Morton, Mathieu Desnoyers,
Eric W. Biederman, Joel Fernandes, Alistair Delva
On Sat, Sep 28, 2019 at 5:40 AM Ingo Molnar <mingo@kernel.org> wrote:
>
> Please pull the latest sched-urgent-for-linus git tree from:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
>
> # HEAD: 4892f51ad54ddff2883a60b6ad4323c1f632a9d6 sched/fair: Avoid redundant EAS calculation
>
> The changes are:
>
> - Apply a number of membarrier related fixes and cleanups, which fixes a
> use-after-free race in the membarrier code.
>
> - Introduce proper RCU protection for tasks on the runqueue - to get rid
> of the subtle task_rcu_dereference() interface that was easy to get
> wrong.
>
> - Misc fixes, but also an EAS speedup.
>
> Thanks,
>
> Ingo
>
> ------------------>
> Eric W. Biederman (4):
> tasks: Add a count of task RCU users
> tasks, sched/core: Ensure tasks are available for a grace period after leaving the runqueue
> tasks, sched/core: With a grace period after finish_task_switch(), remove unnecessary code
> tasks, sched/core: RCUify the assignment of rq->curr
>
> KeMeng Shi (1):
> sched/core: Fix migration to invalid CPU in __set_cpus_allowed_ptr()
>
> Mathieu Desnoyers (7):
> sched/membarrier: Fix private expedited registration check
> sched/membarrier: Remove redundant check
> sched/membarrier: Call sync_core only before usermode for same mm
> sched/membarrier: Fix p->mm->membarrier_state racy load
> selftests, sched/membarrier: Add multi-threaded test
> sched/membarrier: Skip IPIs when mm->mm_users == 1
> sched/membarrier: Return -ENOMEM to userspace on memory allocation failure
>
> Qian Cai (3):
> sched/fair: Remove unused cfs_rq_clock_task() function
> sched/core: Convert vcpu_is_preempted() from macro to an inline function
> sched/fair: Fix -Wunused-but-set-variable warnings
>
> Quentin Perret (1):
> sched/fair: Avoid redundant EAS calculation
>
> Valentin Schneider (2):
> sched/core: Fix preempt_schedule() interrupt return comment
> sched/core: Remove double update_max_interval() call on CPU startup
Hey all,
After rebasing my hikey960 patches onto v5.4-rc1, I started seeing
boot hangs/stalls trying boot AOSP:
[ 9.788182] ------------[ cut here ]------------
[ 9.792829] WARNING: CPU: 7 PID: 516 at
kernel/rcu/tree_plugin.h:293 rcu_note_context_switch+0x48/0x4a8
[ 9.802229] Modules linked in:
[ 9.805298] CPU: 7 PID: 516 Comm: Jit thread pool Not tainted
5.3.0-13104-g0dbefe07634f #1126
[ 9.813822] Hardware name: HiKey960 (DT)
[ 9.817742] pstate: 20400085 (nzCv daIf +PAN -UAO)
[ 9.822530] pc : rcu_note_context_switch+0x48/0x4a8
[ 9.827403] lr : rcu_note_context_switch+0x1c/0x4a8
[ 9.832273] sp : ffffffc012ee3a60
[ 9.835581] x29: ffffffc012ee3a60 x28: ffffff82192d4140
[ 9.840889] x27: 0000000000000000 x26: ffffff821f7b38c0
[ 9.846195] x25: 00000000efb51cf8 x24: ffffffc0117ba000
[ 9.851501] x23: 0000000000000000 x22: ffffff82192d4140
[ 9.856806] x21: 0000000000000000 x20: ffffff821f7b38c0
[ 9.862111] x19: ffffff821f7b44c0 x18: 0000000000000000
[ 9.867416] x17: 0000000000000000 x16: 0000000000000000
[ 9.872721] x15: 0000000000000000 x14: 0000000000000000
[ 9.878026] x13: 0000000000000000 x12: 0000000000000000
[ 9.883331] x11: 0000000000000000 x10: 0000000000000000
[ 9.888636] x9 : 0000000000000000 x8 : ffffffc012ee3c60
[ 9.893941] x7 : ffffffc012ee3c70 x6 : ffffff8219026788
[ 9.899246] x5 : 00000000014a2000 x4 : 0000000000000000
[ 9.904551] x3 : ffffffc20e1fe000 x2 : 0000000000000001
[ 9.909856] x1 : ffffffc0117ba428 x0 : 0000000000000023
[ 9.915163] Call trace:
[ 9.917605] rcu_note_context_switch+0x48/0x4a8
[ 9.922134] __schedule+0x90/0x7d8
[ 9.925530] schedule+0x38/0xc0
[ 9.928667] futex_wait_queue_me+0xc0/0x140
[ 9.932847] futex_wait+0xe0/0x210
[ 9.936242] do_futex+0x618/0xdf8
[ 9.939551] __arm64_sys_futex_time32+0xfc/0x148
[ 9.944167] el0_svc_common.constprop.1+0x64/0x188
[ 9.948955] el0_svc_compat_handler+0x18/0x38
[ 9.953307] el0_svc_compat+0x8/0x2c
[ 9.956876] ---[ end trace cdf2ffd45270a24d ]---
Usually followed by:
[ 30.807092] rcu: INFO: rcu_preempt detected stalls on CPUs/tasks:
[ 30.813207] rcu: Tasks blocked on level-0 rcu_node (CPUs 0-7): P521 P519
[ 30.819998] (detected by 4, t=5255 jiffies, g=169, q=5967)
[ 30.825568] Jit thread pool S 0 521 1 0x00000000
[ 30.831050] Call trace:
[ 30.833498] __switch_to+0xd4/0x230
[ 30.836984] __schedule+0x320/0x7d8
[ 30.840464] schedule+0x38/0xc0
[ 30.843600] futex_wait_queue_me+0xc0/0x140
[ 30.847776] futex_wait+0xe0/0x210
[ 30.851169] do_futex+0x618/0xdf8
[ 30.854476] __arm64_sys_futex+0xfc/0x148
[ 30.858479] el0_svc_common.constprop.1+0x64/0x188
[ 30.863262] el0_svc_handler+0x20/0x80
[ 30.867003] el0_svc+0x8/0xc
[ 30.869876] Jit thread pool S 0 519 1 0x00400000
[ 30.875353] Call trace:
[ 30.877790] __switch_to+0xd4/0x230
[ 30.881271] __schedule+0x320/0x7d8
[ 30.884750] schedule+0x38/0xc0
[ 30.887883] futex_wait_queue_me+0xc0/0x140
[ 30.892057] futex_wait+0xe0/0x210
[ 30.895450] do_futex+0x618/0xdf8
[ 30.898755] __arm64_sys_futex_time32+0xfc/0x148
[ 30.903364] el0_svc_common.constprop.1+0x64/0x188
[ 30.908146] el0_svc_compat_handler+0x18/0x38
[ 30.912494] el0_svc_compat+0x8/0x2c
[ 31.711121] rcu: INFO: rcu_preempt detected expedited stalls on
CPUs/tasks: { P521 P519 } 5440 jiffies s: 77 root: 0x0/T
[ 31.722030] rcu: blocking rcu_node structures:
None of which seems particularly informative as to what might be going
awry. So I bisected the regression down to this merge.
Reverting the following patches:
"sched/membarrier: Return -ENOMEM to userspace on memory allocation failure"
"sched/membarrier: Skip IPIs when mm->mm_users == 1"
"sched/membarrier: Fix p->mm->membarrier_state racy load"
Seems to get things working again, but I've not been able to narrow it
down further yet as I start hitting build issues.
Not sure whats wrong here, but I'm happy to try any patches, or help
with debugging this.
thanks
-john
^ permalink raw reply [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2019-09-28 12:39 Ingo Molnar
@ 2019-09-28 20:50 ` pr-tracker-bot
2019-09-30 23:45 ` John Stultz
1 sibling, 0 replies; 384+ messages in thread
From: pr-tracker-bot @ 2019-09-28 20:50 UTC (permalink / raw)
To: Ingo Molnar
Cc: Linus Torvalds, linux-kernel, Peter Zijlstra, Thomas Gleixner,
Andrew Morton
The pull request you sent on Sat, 28 Sep 2019 14:39:05 +0200:
> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/9c5efe9ae7df78600c0ee7bcce27516eb687fa6e
Thank you!
--
Deet-doot-dot, I am a bot.
https://korg.wiki.kernel.org/userdoc/prtracker
^ permalink raw reply [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2019-09-28 12:39 Ingo Molnar
2019-09-28 20:50 ` pr-tracker-bot
2019-09-30 23:45 ` John Stultz
0 siblings, 2 replies; 384+ messages in thread
From: Ingo Molnar @ 2019-09-28 12:39 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 4892f51ad54ddff2883a60b6ad4323c1f632a9d6 sched/fair: Avoid redundant EAS calculation
The changes are:
- Apply a number of membarrier related fixes and cleanups, which fixes a
use-after-free race in the membarrier code.
- Introduce proper RCU protection for tasks on the runqueue - to get rid
of the subtle task_rcu_dereference() interface that was easy to get
wrong.
- Misc fixes, but also an EAS speedup.
Thanks,
Ingo
------------------>
Eric W. Biederman (4):
tasks: Add a count of task RCU users
tasks, sched/core: Ensure tasks are available for a grace period after leaving the runqueue
tasks, sched/core: With a grace period after finish_task_switch(), remove unnecessary code
tasks, sched/core: RCUify the assignment of rq->curr
KeMeng Shi (1):
sched/core: Fix migration to invalid CPU in __set_cpus_allowed_ptr()
Mathieu Desnoyers (7):
sched/membarrier: Fix private expedited registration check
sched/membarrier: Remove redundant check
sched/membarrier: Call sync_core only before usermode for same mm
sched/membarrier: Fix p->mm->membarrier_state racy load
selftests, sched/membarrier: Add multi-threaded test
sched/membarrier: Skip IPIs when mm->mm_users == 1
sched/membarrier: Return -ENOMEM to userspace on memory allocation failure
Qian Cai (3):
sched/fair: Remove unused cfs_rq_clock_task() function
sched/core: Convert vcpu_is_preempted() from macro to an inline function
sched/fair: Fix -Wunused-but-set-variable warnings
Quentin Perret (1):
sched/fair: Avoid redundant EAS calculation
Valentin Schneider (2):
sched/core: Fix preempt_schedule() interrupt return comment
sched/core: Remove double update_max_interval() call on CPU startup
fs/exec.c | 2 +-
include/linux/mm_types.h | 14 +-
include/linux/rcuwait.h | 20 +-
include/linux/sched.h | 10 +-
include/linux/sched/mm.h | 10 +-
include/linux/sched/task.h | 2 +-
kernel/exit.c | 74 +------
kernel/fork.c | 8 +-
kernel/sched/core.c | 28 +--
kernel/sched/fair.c | 39 +---
kernel/sched/membarrier.c | 239 +++++++++++++--------
kernel/sched/sched.h | 34 +++
tools/testing/selftests/membarrier/.gitignore | 3 +-
tools/testing/selftests/membarrier/Makefile | 5 +-
.../{membarrier_test.c => membarrier_test_impl.h} | 40 ++--
.../membarrier/membarrier_test_multi_thread.c | 73 +++++++
.../membarrier/membarrier_test_single_thread.c | 24 +++
17 files changed, 375 insertions(+), 250 deletions(-)
rename tools/testing/selftests/membarrier/{membarrier_test.c => membarrier_test_impl.h} (95%)
create mode 100644 tools/testing/selftests/membarrier/membarrier_test_multi_thread.c
create mode 100644 tools/testing/selftests/membarrier/membarrier_test_single_thread.c
^ permalink raw reply [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2019-09-05 8:02 Ingo Molnar
@ 2019-09-05 21:15 ` pr-tracker-bot
0 siblings, 0 replies; 384+ messages in thread
From: pr-tracker-bot @ 2019-09-05 21:15 UTC (permalink / raw)
To: Ingo Molnar
Cc: Linus Torvalds, linux-kernel, Peter Zijlstra, Thomas Gleixner,
Andrew Morton
The pull request you sent on Thu, 5 Sep 2019 10:02:52 +0200:
> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/262f7eeddc85c657e3087561dc8c3cfe227363ff
Thank you!
--
Deet-doot-dot, I am a bot.
https://korg.wiki.kernel.org/userdoc/prtracker
^ permalink raw reply [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2019-09-05 8:02 Ingo Molnar
2019-09-05 21:15 ` pr-tracker-bot
0 siblings, 1 reply; 384+ messages in thread
From: Ingo Molnar @ 2019-09-05 8:02 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 1251201c0d34fadf69d56efa675c2b7dd0a90eca sched/core: Fix uclamp ABI bug, clean up and robustify sched_read_attr() ABI logic and code
This fixes an ABI bug introduced this cycle, plus fixes a throttling bug.
Thanks,
Ingo
------------------>
Ingo Molnar (1):
sched/core: Fix uclamp ABI bug, clean up and robustify sched_read_attr() ABI logic and code
Liangyan (1):
sched/fair: Don't assign runtime for throttled cfs_rq
kernel/sched/core.c | 78 ++++++++++++++++++++++++++---------------------------
kernel/sched/fair.c | 5 ++++
2 files changed, 44 insertions(+), 39 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 010d578118d6..df9f1fe5689b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5105,37 +5105,40 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
return retval;
}
-static int sched_read_attr(struct sched_attr __user *uattr,
- struct sched_attr *attr,
- unsigned int usize)
+/*
+ * Copy the kernel size attribute structure (which might be larger
+ * than what user-space knows about) to user-space.
+ *
+ * Note that all cases are valid: user-space buffer can be larger or
+ * smaller than the kernel-space buffer. The usual case is that both
+ * have the same size.
+ */
+static int
+sched_attr_copy_to_user(struct sched_attr __user *uattr,
+ struct sched_attr *kattr,
+ unsigned int usize)
{
- int ret;
+ unsigned int ksize = sizeof(*kattr);
if (!access_ok(uattr, usize))
return -EFAULT;
/*
- * If we're handed a smaller struct than we know of,
- * ensure all the unknown bits are 0 - i.e. old
- * user-space does not get uncomplete information.
+ * sched_getattr() ABI forwards and backwards compatibility:
+ *
+ * If usize == ksize then we just copy everything to user-space and all is good.
+ *
+ * If usize < ksize then we only copy as much as user-space has space for,
+ * this keeps ABI compatibility as well. We skip the rest.
+ *
+ * If usize > ksize then user-space is using a newer version of the ABI,
+ * which part the kernel doesn't know about. Just ignore it - tooling can
+ * detect the kernel's knowledge of attributes from the attr->size value
+ * which is set to ksize in this case.
*/
- if (usize < sizeof(*attr)) {
- unsigned char *addr;
- unsigned char *end;
+ kattr->size = min(usize, ksize);
- addr = (void *)attr + usize;
- end = (void *)attr + sizeof(*attr);
-
- for (; addr < end; addr++) {
- if (*addr)
- return -EFBIG;
- }
-
- attr->size = usize;
- }
-
- ret = copy_to_user(uattr, attr, attr->size);
- if (ret)
+ if (copy_to_user(uattr, kattr, kattr->size))
return -EFAULT;
return 0;
@@ -5145,20 +5148,18 @@ static int sched_read_attr(struct sched_attr __user *uattr,
* sys_sched_getattr - similar to sched_getparam, but with sched_attr
* @pid: the pid in question.
* @uattr: structure containing the extended parameters.
- * @size: sizeof(attr) for fwd/bwd comp.
+ * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility.
* @flags: for future extension.
*/
SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
- unsigned int, size, unsigned int, flags)
+ unsigned int, usize, unsigned int, flags)
{
- struct sched_attr attr = {
- .size = sizeof(struct sched_attr),
- };
+ struct sched_attr kattr = { };
struct task_struct *p;
int retval;
- if (!uattr || pid < 0 || size > PAGE_SIZE ||
- size < SCHED_ATTR_SIZE_VER0 || flags)
+ if (!uattr || pid < 0 || usize > PAGE_SIZE ||
+ usize < SCHED_ATTR_SIZE_VER0 || flags)
return -EINVAL;
rcu_read_lock();
@@ -5171,25 +5172,24 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
if (retval)
goto out_unlock;
- attr.sched_policy = p->policy;
+ kattr.sched_policy = p->policy;
if (p->sched_reset_on_fork)
- attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
if (task_has_dl_policy(p))
- __getparam_dl(p, &attr);
+ __getparam_dl(p, &kattr);
else if (task_has_rt_policy(p))
- attr.sched_priority = p->rt_priority;
+ kattr.sched_priority = p->rt_priority;
else
- attr.sched_nice = task_nice(p);
+ kattr.sched_nice = task_nice(p);
#ifdef CONFIG_UCLAMP_TASK
- attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
- attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
+ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
+ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
#endif
rcu_read_unlock();
- retval = sched_read_attr(uattr, &attr, size);
- return retval;
+ return sched_attr_copy_to_user(uattr, &kattr, usize);
out_unlock:
rcu_read_unlock();
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bc9cfeaac8bd..500f5db0de0b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4470,6 +4470,8 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
if (likely(cfs_rq->runtime_remaining > 0))
return;
+ if (cfs_rq->throttled)
+ return;
/*
* if we're unable to extend our runtime we resched so that the active
* hierarchy can be throttled
@@ -4673,6 +4675,9 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
if (!cfs_rq_throttled(cfs_rq))
goto next;
+ /* By the above check, this should never be true */
+ SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
+
runtime = -cfs_rq->runtime_remaining + 1;
if (runtime > remaining)
runtime = remaining;
^ permalink raw reply related [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2019-04-20 7:33 Ingo Molnar
@ 2019-04-20 19:25 ` pr-tracker-bot
0 siblings, 0 replies; 384+ messages in thread
From: pr-tracker-bot @ 2019-04-20 19:25 UTC (permalink / raw)
To: Ingo Molnar
Cc: Linus Torvalds, linux-kernel, Peter Zijlstra, Thomas Gleixner,
Andrew Morton
The pull request you sent on Sat, 20 Apr 2019 09:33:41 +0200:
> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/2b4cf5850db6acef2bbef52e3011f9bf93484209
Thank you!
--
Deet-doot-dot, I am a bot.
https://korg.wiki.kernel.org/userdoc/prtracker
^ permalink raw reply [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2019-04-20 7:33 Ingo Molnar
2019-04-20 19:25 ` pr-tracker-bot
0 siblings, 1 reply; 384+ messages in thread
From: Ingo Molnar @ 2019-04-20 7:33 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 1b02cd6a2d7f3e2a6a5262887d2cb2912083e42f sched/deadline: Correctly handle active 0-lag timers
A deadline scheduler warning/race fix, and a cfs_period_us quota
calculation workaround where the real fix looks too involved to merge
immediately.
Thanks,
Ingo
------------------>
Phil Auld (1):
sched/fair: Limit sched_cfs_period_timer() loop to avoid hard lockup
luca abeni (1):
sched/deadline: Correctly handle active 0-lag timers
kernel/sched/deadline.c | 3 +--
kernel/sched/fair.c | 25 +++++++++++++++++++++++++
2 files changed, 26 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 6a73e41a2016..43901fa3f269 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -252,7 +252,6 @@ static void task_non_contending(struct task_struct *p)
if (dl_entity_is_special(dl_se))
return;
- WARN_ON(hrtimer_active(&dl_se->inactive_timer));
WARN_ON(dl_se->dl_non_contending);
zerolag_time = dl_se->deadline -
@@ -269,7 +268,7 @@ static void task_non_contending(struct task_struct *p)
* If the "0-lag time" already passed, decrease the active
* utilization now, instead of starting a timer
*/
- if (zerolag_time < 0) {
+ if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) {
if (dl_task(p))
sub_running_bw(dl_se, dl_rq);
if (!dl_task(p) || p->state == TASK_DEAD) {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 40bd1e27b1b7..a4d9e14bf138 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4885,6 +4885,8 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
+extern const u64 max_cfs_quota_period;
+
static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
@@ -4892,6 +4894,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
unsigned long flags;
int overrun;
int idle = 0;
+ int count = 0;
raw_spin_lock_irqsave(&cfs_b->lock, flags);
for (;;) {
@@ -4899,6 +4902,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
if (!overrun)
break;
+ if (++count > 3) {
+ u64 new, old = ktime_to_ns(cfs_b->period);
+
+ new = (old * 147) / 128; /* ~115% */
+ new = min(new, max_cfs_quota_period);
+
+ cfs_b->period = ns_to_ktime(new);
+
+ /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
+ cfs_b->quota *= new;
+ cfs_b->quota = div64_u64(cfs_b->quota, old);
+
+ pr_warn_ratelimited(
+ "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
+ smp_processor_id(),
+ div_u64(new, NSEC_PER_USEC),
+ div_u64(cfs_b->quota, NSEC_PER_USEC));
+
+ /* reset count so we don't come right back in here */
+ count = 0;
+ }
+
idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
}
if (idle)
^ permalink raw reply related [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2018-11-03 23:52 Ingo Molnar
@ 2018-11-04 1:38 ` Linus Torvalds
0 siblings, 0 replies; 384+ messages in thread
From: Linus Torvalds @ 2018-11-04 1:38 UTC (permalink / raw)
To: Ingo Molnar
Cc: Linux Kernel Mailing List, Peter Zijlstra, tglx, Andrew Morton
On Sat, Nov 3, 2018 at 4:52 PM Ingo Molnar <mingo@kernel.org> wrote:
>
> A memory (under-)allocation fix and a comment fix.
Pulled,
Linus
^ permalink raw reply [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2018-11-03 23:52 Ingo Molnar
2018-11-04 1:38 ` Linus Torvalds
0 siblings, 1 reply; 384+ messages in thread
From: Ingo Molnar @ 2018-11-03 23:52 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 993f0b0510dad98b4e6e39506834dab0d13fd539 sched/topology: Fix off by one bug
A memory (under-)allocation fix and a comment fix.
Thanks,
Ingo
------------------>
Muchun Song (1):
sched/rt: Update comment in pick_next_task_rt()
Peter Zijlstra (1):
sched/topology: Fix off by one bug
kernel/sched/rt.c | 2 +-
kernel/sched/topology.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 2e2955a8cf8f..a21ea6021929 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1561,7 +1561,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/*
* We may dequeue prev's rt_rq in put_prev_task().
- * So, we update time before rt_nr_running check.
+ * So, we update time before rt_queued check.
*/
if (prev->sched_class == &rt_sched_class)
update_curr_rt(rq);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 9d74371e4aad..8d7f15ba5916 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1337,7 +1337,7 @@ void sched_init_numa(void)
int level = 0;
int i, j, k;
- sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
+ sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL);
if (!sched_domains_numa_distance)
return;
^ permalink raw reply related [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2018-10-20 8:45 Ingo Molnar
@ 2018-10-20 13:28 ` Greg Kroah-Hartman
0 siblings, 0 replies; 384+ messages in thread
From: Greg Kroah-Hartman @ 2018-10-20 13:28 UTC (permalink / raw)
To: Ingo Molnar
Cc: linux-kernel, Linus Torvalds, Peter Zijlstra, Thomas Gleixner,
Andrew Morton, Mike Galbraith
On Sat, Oct 20, 2018 at 10:45:16AM +0200, Ingo Molnar wrote:
> Greg,
>
> Please pull the latest sched-urgent-for-linus git tree from:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
Now merged, thanks.
greg k-h
^ permalink raw reply [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2018-10-20 8:45 Ingo Molnar
2018-10-20 13:28 ` Greg Kroah-Hartman
0 siblings, 1 reply; 384+ messages in thread
From: Ingo Molnar @ 2018-10-20 8:45 UTC (permalink / raw)
To: Greg Kroah-Hartman
Cc: linux-kernel, Linus Torvalds, Peter Zijlstra, Thomas Gleixner,
Andrew Morton, Mike Galbraith
Greg,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 9845c49cc9bbb317a0bc9e9cf78d8e09d54c9af0 sched/fair: Fix the min_vruntime update logic in dequeue_entity()
Two fixes: a CFS-throttling bug fix, and an interactivity fix.
Thanks,
Ingo
------------------>
Phil Auld (1):
sched/fair: Fix throttle_list starvation with low CFS quota
Song Muchun (1):
sched/fair: Fix the min_vruntime update logic in dequeue_entity()
kernel/sched/fair.c | 24 ++++++++++++++++++++----
kernel/sched/sched.h | 2 ++
2 files changed, 22 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7fc4a371bdd2..908c9cdae2f0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4001,7 +4001,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* put back on, and if we advance min_vruntime, we'll be placed back
* further than we started -- ie. we'll be penalized.
*/
- if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
+ if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
update_min_vruntime(cfs_rq);
}
@@ -4476,9 +4476,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
/*
* Add to the _head_ of the list, so that an already-started
- * distribute_cfs_runtime will not see us
+ * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is
+ * not running add to the tail so that later runqueues don't get starved.
*/
- list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+ if (cfs_b->distribute_running)
+ list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+ else
+ list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
/*
* If we're the first throttled task, make sure the bandwidth
@@ -4622,14 +4626,16 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
* in us over-using our runtime if it is all used during this loop, but
* only by limited amounts in that extreme case.
*/
- while (throttled && cfs_b->runtime > 0) {
+ while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
runtime = cfs_b->runtime;
+ cfs_b->distribute_running = 1;
raw_spin_unlock(&cfs_b->lock);
/* we can't nest cfs_b->lock while distributing bandwidth */
runtime = distribute_cfs_runtime(cfs_b, runtime,
runtime_expires);
raw_spin_lock(&cfs_b->lock);
+ cfs_b->distribute_running = 0;
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
cfs_b->runtime -= min(runtime, cfs_b->runtime);
@@ -4740,6 +4746,11 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
/* confirm we're still not at a refresh boundary */
raw_spin_lock(&cfs_b->lock);
+ if (cfs_b->distribute_running) {
+ raw_spin_unlock(&cfs_b->lock);
+ return;
+ }
+
if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
raw_spin_unlock(&cfs_b->lock);
return;
@@ -4749,6 +4760,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
runtime = cfs_b->runtime;
expires = cfs_b->runtime_expires;
+ if (runtime)
+ cfs_b->distribute_running = 1;
+
raw_spin_unlock(&cfs_b->lock);
if (!runtime)
@@ -4759,6 +4773,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
raw_spin_lock(&cfs_b->lock);
if (expires == cfs_b->runtime_expires)
cfs_b->runtime -= min(runtime, cfs_b->runtime);
+ cfs_b->distribute_running = 0;
raw_spin_unlock(&cfs_b->lock);
}
@@ -4867,6 +4882,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
cfs_b->period_timer.function = sched_cfs_period_timer;
hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cfs_b->slack_timer.function = sched_cfs_slack_timer;
+ cfs_b->distribute_running = 0;
}
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 455fa330de04..9683f458aec7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -346,6 +346,8 @@ struct cfs_bandwidth {
int nr_periods;
int nr_throttled;
u64 throttled_time;
+
+ bool distribute_running;
#endif
};
^ permalink raw reply related [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2018-10-05 9:50 Ingo Molnar
@ 2018-10-05 23:06 ` Greg Kroah-Hartman
0 siblings, 0 replies; 384+ messages in thread
From: Greg Kroah-Hartman @ 2018-10-05 23:06 UTC (permalink / raw)
To: Ingo Molnar
Cc: linux-kernel, Linus Torvalds, Peter Zijlstra, Thomas Gleixner,
Andrew Morton, mel, Jirka Hladky, Srikar Dronamraju, Mel Gorman
On Fri, Oct 05, 2018 at 11:50:17AM +0200, Ingo Molnar wrote:
> Greg,
>
> Please pull the latest sched-urgent-for-linus git tree from:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
Now merged, thanks.
greg k-h
^ permalink raw reply [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2018-10-05 9:50 Ingo Molnar
2018-10-05 23:06 ` Greg Kroah-Hartman
0 siblings, 1 reply; 384+ messages in thread
From: Ingo Molnar @ 2018-10-05 9:50 UTC (permalink / raw)
To: Greg Kroah-Hartman
Cc: linux-kernel, Linus Torvalds, Peter Zijlstra, Thomas Gleixner,
Andrew Morton, mel, Jirka Hladky, Srikar Dronamraju, Mel Gorman
Greg,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 37355bdc5a129899f6b245900a8eb944a092f7fd sched/numa: Migrate pages to local nodes quicker early in the lifetime of a task
These fixes address a rather involved performance regression between v4.17->v4.19 in the
sched/numa auto-balancing code. Since distros really need this fix we accelerated it to
sched/urgent for a faster upstream merge.
NUMA scheduling and balancing performance is now largely back to v4.17 levels, without
reintroducing the NUMA placement bugs that v4.18 and v4.19 fixed.
Many thanks to Srikar Dronamraju, Mel Gorman and Jirka Hladky, for reporting, testing,
re-testing and solving this rather complex set of bugs.
Thanks,
Ingo
------------------>
Mel Gorman (3):
sched/numa: Limit the conditions where scan period is reset
mm, sched/numa: Remove rate-limiting of automatic NUMA balancing migration
sched/numa: Migrate pages to local nodes quicker early in the lifetime of a task
Srikar Dronamraju (5):
sched/numa: Stop multiple tasks from moving to the CPU at the same time
sched/numa: Pass destination CPU as a parameter to migrate_task_rq
sched/numa: Reset scan rate whenever task moves across nodes
mm/migrate: Use spin_trylock() while resetting rate limit
sched/numa: Avoid task migration for small NUMA improvement
include/linux/mmzone.h | 6 ---
include/trace/events/migrate.h | 27 -----------
kernel/sched/core.c | 2 +-
kernel/sched/deadline.c | 2 +-
kernel/sched/fair.c | 104 +++++++++++++++++++++++++++++++++++------
kernel/sched/sched.h | 3 +-
mm/migrate.c | 57 ----------------------
mm/page_alloc.c | 2 -
8 files changed, 95 insertions(+), 108 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 1e22d96734e0..3f4c0b167333 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -671,12 +671,6 @@ typedef struct pglist_data {
#ifdef CONFIG_NUMA_BALANCING
/* Lock serializing the migrate rate limiting window */
spinlock_t numabalancing_migrate_lock;
-
- /* Rate limiting time interval */
- unsigned long numabalancing_migrate_next_window;
-
- /* Number of pages migrated during the rate limiting time interval */
- unsigned long numabalancing_migrate_nr_pages;
#endif
/*
* This is a per-node reserve of pages that are not available
diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
index 711372845945..705b33d1e395 100644
--- a/include/trace/events/migrate.h
+++ b/include/trace/events/migrate.h
@@ -70,33 +70,6 @@ TRACE_EVENT(mm_migrate_pages,
__print_symbolic(__entry->mode, MIGRATE_MODE),
__print_symbolic(__entry->reason, MIGRATE_REASON))
);
-
-TRACE_EVENT(mm_numa_migrate_ratelimit,
-
- TP_PROTO(struct task_struct *p, int dst_nid, unsigned long nr_pages),
-
- TP_ARGS(p, dst_nid, nr_pages),
-
- TP_STRUCT__entry(
- __array( char, comm, TASK_COMM_LEN)
- __field( pid_t, pid)
- __field( int, dst_nid)
- __field( unsigned long, nr_pages)
- ),
-
- TP_fast_assign(
- memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
- __entry->pid = p->pid;
- __entry->dst_nid = dst_nid;
- __entry->nr_pages = nr_pages;
- ),
-
- TP_printk("comm=%s pid=%d dst_nid=%d nr_pages=%lu",
- __entry->comm,
- __entry->pid,
- __entry->dst_nid,
- __entry->nr_pages)
-);
#endif /* _TRACE_MIGRATE_H */
/* This part must be outside protection */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 625bc9897f62..ad97f3ba5ec5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1167,7 +1167,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
if (task_cpu(p) != new_cpu) {
if (p->sched_class->migrate_task_rq)
- p->sched_class->migrate_task_rq(p);
+ p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
rseq_migrate(p);
perf_event_task_migrate(p);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 997ea7b839fa..91e4202b0634 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1607,7 +1607,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
return cpu;
}
-static void migrate_task_rq_dl(struct task_struct *p)
+static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused)
{
struct rq *rq;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f808ddf2a868..7fc4a371bdd2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1392,6 +1392,17 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
int last_cpupid, this_cpupid;
this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
+ last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+
+ /*
+ * Allow first faults or private faults to migrate immediately early in
+ * the lifetime of a task. The magic number 4 is based on waiting for
+ * two full passes of the "multi-stage node selection" test that is
+ * executed below.
+ */
+ if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) &&
+ (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
+ return true;
/*
* Multi-stage node selection is used in conjunction with a periodic
@@ -1410,7 +1421,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
* This quadric squishes small probabilities, making it less likely we
* act on an unlikely task<->page relation.
*/
- last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
if (!cpupid_pid_unset(last_cpupid) &&
cpupid_to_nid(last_cpupid) != dst_nid)
return false;
@@ -1514,6 +1524,21 @@ struct task_numa_env {
static void task_numa_assign(struct task_numa_env *env,
struct task_struct *p, long imp)
{
+ struct rq *rq = cpu_rq(env->dst_cpu);
+
+ /* Bail out if run-queue part of active NUMA balance. */
+ if (xchg(&rq->numa_migrate_on, 1))
+ return;
+
+ /*
+ * Clear previous best_cpu/rq numa-migrate flag, since task now
+ * found a better CPU to move/swap.
+ */
+ if (env->best_cpu != -1) {
+ rq = cpu_rq(env->best_cpu);
+ WRITE_ONCE(rq->numa_migrate_on, 0);
+ }
+
if (env->best_task)
put_task_struct(env->best_task);
if (p)
@@ -1552,6 +1577,13 @@ static bool load_too_imbalanced(long src_load, long dst_load,
return (imb > old_imb);
}
+/*
+ * Maximum NUMA importance can be 1998 (2*999);
+ * SMALLIMP @ 30 would be close to 1998/64.
+ * Used to deter task migration.
+ */
+#define SMALLIMP 30
+
/*
* This checks if the overall compute and NUMA accesses of the system would
* be improved if the source tasks was migrated to the target dst_cpu taking
@@ -1569,6 +1601,9 @@ static void task_numa_compare(struct task_numa_env *env,
long moveimp = imp;
int dist = env->dist;
+ if (READ_ONCE(dst_rq->numa_migrate_on))
+ return;
+
rcu_read_lock();
cur = task_rcu_dereference(&dst_rq->curr);
if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
@@ -1582,7 +1617,7 @@ static void task_numa_compare(struct task_numa_env *env,
goto unlock;
if (!cur) {
- if (maymove || imp > env->best_imp)
+ if (maymove && moveimp >= env->best_imp)
goto assign;
else
goto unlock;
@@ -1625,15 +1660,21 @@ static void task_numa_compare(struct task_numa_env *env,
task_weight(cur, env->dst_nid, dist);
}
- if (imp <= env->best_imp)
- goto unlock;
-
if (maymove && moveimp > imp && moveimp > env->best_imp) {
- imp = moveimp - 1;
+ imp = moveimp;
cur = NULL;
goto assign;
}
+ /*
+ * If the NUMA importance is less than SMALLIMP,
+ * task migration might only result in ping pong
+ * of tasks and also hurt performance due to cache
+ * misses.
+ */
+ if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
+ goto unlock;
+
/*
* In the overloaded case, try and keep the load balanced.
*/
@@ -1710,6 +1751,7 @@ static int task_numa_migrate(struct task_struct *p)
.best_cpu = -1,
};
struct sched_domain *sd;
+ struct rq *best_rq;
unsigned long taskweight, groupweight;
int nid, ret, dist;
long taskimp, groupimp;
@@ -1805,20 +1847,17 @@ static int task_numa_migrate(struct task_struct *p)
if (env.best_cpu == -1)
return -EAGAIN;
- /*
- * Reset the scan period if the task is being rescheduled on an
- * alternative node to recheck if the tasks is now properly placed.
- */
- p->numa_scan_period = task_scan_start(p);
-
+ best_rq = cpu_rq(env.best_cpu);
if (env.best_task == NULL) {
ret = migrate_task_to(p, env.best_cpu);
+ WRITE_ONCE(best_rq->numa_migrate_on, 0);
if (ret != 0)
trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
return ret;
}
ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
+ WRITE_ONCE(best_rq->numa_migrate_on, 0);
if (ret != 0)
trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
@@ -2596,6 +2635,39 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
}
}
+static void update_scan_period(struct task_struct *p, int new_cpu)
+{
+ int src_nid = cpu_to_node(task_cpu(p));
+ int dst_nid = cpu_to_node(new_cpu);
+
+ if (!static_branch_likely(&sched_numa_balancing))
+ return;
+
+ if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
+ return;
+
+ if (src_nid == dst_nid)
+ return;
+
+ /*
+ * Allow resets if faults have been trapped before one scan
+ * has completed. This is most likely due to a new task that
+ * is pulled cross-node due to wakeups or load balancing.
+ */
+ if (p->numa_scan_seq) {
+ /*
+ * Avoid scan adjustments if moving to the preferred
+ * node or if the task was not previously running on
+ * the preferred node.
+ */
+ if (dst_nid == p->numa_preferred_nid ||
+ (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid))
+ return;
+ }
+
+ p->numa_scan_period = task_scan_start(p);
+}
+
#else
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
@@ -2609,6 +2681,10 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
}
+static inline void update_scan_period(struct task_struct *p, int new_cpu)
+{
+}
+
#endif /* CONFIG_NUMA_BALANCING */
static void
@@ -6275,7 +6351,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se);
* cfs_rq_of(p) references at time of call are still valid and identify the
* previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
*/
-static void migrate_task_rq_fair(struct task_struct *p)
+static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
{
/*
* As blocked tasks retain absolute vruntime the migration needs to
@@ -6328,6 +6404,8 @@ static void migrate_task_rq_fair(struct task_struct *p)
/* We have migrated, no longer consider this task hot */
p->se.exec_start = 0;
+
+ update_scan_period(p, new_cpu);
}
static void task_dead_fair(struct task_struct *p)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4a2e8cae63c4..455fa330de04 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -783,6 +783,7 @@ struct rq {
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
+ unsigned int numa_migrate_on;
#endif
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
@@ -1523,7 +1524,7 @@ struct sched_class {
#ifdef CONFIG_SMP
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
- void (*migrate_task_rq)(struct task_struct *p);
+ void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
void (*task_woken)(struct rq *this_rq, struct task_struct *task);
diff --git a/mm/migrate.c b/mm/migrate.c
index d6a2e89b086a..5e285c1249a0 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1855,46 +1855,6 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
return newpage;
}
-/*
- * page migration rate limiting control.
- * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
- * window of time. Default here says do not migrate more than 1280M per second.
- */
-static unsigned int migrate_interval_millisecs __read_mostly = 100;
-static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
-
-/* Returns true if the node is migrate rate-limited after the update */
-static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
- unsigned long nr_pages)
-{
- /*
- * Rate-limit the amount of data that is being migrated to a node.
- * Optimal placement is no good if the memory bus is saturated and
- * all the time is being spent migrating!
- */
- if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
- spin_lock(&pgdat->numabalancing_migrate_lock);
- pgdat->numabalancing_migrate_nr_pages = 0;
- pgdat->numabalancing_migrate_next_window = jiffies +
- msecs_to_jiffies(migrate_interval_millisecs);
- spin_unlock(&pgdat->numabalancing_migrate_lock);
- }
- if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) {
- trace_mm_numa_migrate_ratelimit(current, pgdat->node_id,
- nr_pages);
- return true;
- }
-
- /*
- * This is an unlocked non-atomic update so errors are possible.
- * The consequences are failing to migrate when we potentiall should
- * have which is not severe enough to warrant locking. If it is ever
- * a problem, it can be converted to a per-cpu counter.
- */
- pgdat->numabalancing_migrate_nr_pages += nr_pages;
- return false;
-}
-
static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
{
int page_lru;
@@ -1967,14 +1927,6 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
if (page_is_file_cache(page) && PageDirty(page))
goto out;
- /*
- * Rate-limit the amount of data that is being migrated to a node.
- * Optimal placement is no good if the memory bus is saturated and
- * all the time is being spent migrating!
- */
- if (numamigrate_update_ratelimit(pgdat, 1))
- goto out;
-
isolated = numamigrate_isolate_page(pgdat, page);
if (!isolated)
goto out;
@@ -2021,14 +1973,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
unsigned long mmun_start = address & HPAGE_PMD_MASK;
unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
- /*
- * Rate-limit the amount of data that is being migrated to a node.
- * Optimal placement is no good if the memory bus is saturated and
- * all the time is being spent migrating!
- */
- if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR))
- goto out_dropref;
-
new_page = alloc_pages_node(node,
(GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
HPAGE_PMD_ORDER);
@@ -2125,7 +2069,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
out_fail:
count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
-out_dropref:
ptl = pmd_lock(mm, pmd);
if (pmd_same(*pmd, entry)) {
entry = pmd_modify(entry, vma->vm_page_prot);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 89d2a2ab3fe6..706a738c0aee 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6197,8 +6197,6 @@ static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
static void pgdat_init_numabalancing(struct pglist_data *pgdat)
{
spin_lock_init(&pgdat->numabalancing_migrate_lock);
- pgdat->numabalancing_migrate_nr_pages = 0;
- pgdat->numabalancing_migrate_next_window = jiffies;
}
#else
static void pgdat_init_numabalancing(struct pglist_data *pgdat) {}
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2018-09-15 13:20 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2018-09-15 13:20 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 882a78a9f39f5535b209b4aa0a1741e35b8c67fb sched/fair: Fix kernel-doc notation warning
Misc fixes: various scheduler metrics corner case fixes, a sched_features deadlock fix, and a
topology fix for certain NUMA systems.
Thanks,
Ingo
------------------>
Jiada Wang (1):
sched/debug: Fix potential deadlock when writing to sched_features
Randy Dunlap (1):
sched/fair: Fix kernel-doc notation warning
Srikar Dronamraju (1):
sched/topology: Set correct NUMA topology type
Steve Muckle (1):
sched/fair: Fix vruntime_normalized() for remote non-migration wakeup
Vincent Guittot (3):
sched/pelt: Fix update_blocked_averages() for RT and DL classes
sched/fair: Fix scale_rt_capacity() for SMT
sched/fair: Fix load_balance redo for !imbalance
kernel/sched/debug.c | 6 ++++--
kernel/sched/fair.c | 26 +++++++++++++++++---------
kernel/sched/topology.c | 5 +----
3 files changed, 22 insertions(+), 15 deletions(-)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 60caf1fb94e0..6383aa6a60ca 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -89,12 +89,12 @@ struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
static void sched_feat_disable(int i)
{
- static_key_disable(&sched_feat_keys[i]);
+ static_key_disable_cpuslocked(&sched_feat_keys[i]);
}
static void sched_feat_enable(int i)
{
- static_key_enable(&sched_feat_keys[i]);
+ static_key_enable_cpuslocked(&sched_feat_keys[i]);
}
#else
static void sched_feat_disable(int i) { };
@@ -146,9 +146,11 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
/* Ensure the static_key remains in a consistent state */
inode = file_inode(filp);
+ cpus_read_lock();
inode_lock(inode);
ret = sched_feat_set(cmp);
inode_unlock(inode);
+ cpus_read_unlock();
if (ret < 0)
return ret;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b39fb596f6c1..f808ddf2a868 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3362,6 +3362,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
* attach_entity_load_avg - attach this entity to its cfs_rq load avg
* @cfs_rq: cfs_rq to attach to
* @se: sched_entity to attach
+ * @flags: migration hints
*
* Must call update_cfs_rq_load_avg() before this, since we rely on
* cfs_rq->avg.last_update_time being current.
@@ -7263,6 +7264,7 @@ static void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq, *pos;
+ const struct sched_class *curr_class;
struct rq_flags rf;
bool done = true;
@@ -7299,8 +7301,10 @@ static void update_blocked_averages(int cpu)
if (cfs_rq_has_blocked(cfs_rq))
done = false;
}
- update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
- update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
+
+ curr_class = rq->curr->sched_class;
+ update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
+ update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
update_irq_load_avg(rq, 0);
/* Don't need periodic decay once load/util_avg are null */
if (others_have_blocked(rq))
@@ -7365,13 +7369,16 @@ static inline void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq = &rq->cfs;
+ const struct sched_class *curr_class;
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
- update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
- update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
+
+ curr_class = rq->curr->sched_class;
+ update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
+ update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
update_irq_load_avg(rq, 0);
#ifdef CONFIG_NO_HZ_COMMON
rq->last_blocked_load_update_tick = jiffies;
@@ -7482,10 +7489,10 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
return load_idx;
}
-static unsigned long scale_rt_capacity(int cpu)
+static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long max = arch_scale_cpu_capacity(NULL, cpu);
+ unsigned long max = arch_scale_cpu_capacity(sd, cpu);
unsigned long used, free;
unsigned long irq;
@@ -7507,7 +7514,7 @@ static unsigned long scale_rt_capacity(int cpu)
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
- unsigned long capacity = scale_rt_capacity(cpu);
+ unsigned long capacity = scale_rt_capacity(sd, cpu);
struct sched_group *sdg = sd->groups;
cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu);
@@ -8269,7 +8276,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
force_balance:
/* Looks like there is an imbalance. Compute it */
calculate_imbalance(env, &sds);
- return sds.busiest;
+ return env->imbalance ? sds.busiest : NULL;
out_balanced:
env->imbalance = 0;
@@ -9638,7 +9645,8 @@ static inline bool vruntime_normalized(struct task_struct *p)
* - A task which has been woken up by try_to_wake_up() and
* waiting for actually being woken up by sched_ttwu_pending().
*/
- if (!se->sum_exec_runtime || p->state == TASK_WAKING)
+ if (!se->sum_exec_runtime ||
+ (p->state == TASK_WAKING && p->sched_remote_wakeup))
return true;
return false;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 56a0fed30c0a..505a41c42b96 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1295,7 +1295,7 @@ static void init_numa_topology_type(void)
n = sched_max_numa_distance;
- if (sched_domains_numa_levels <= 1) {
+ if (sched_domains_numa_levels <= 2) {
sched_numa_topology_type = NUMA_DIRECT;
return;
}
@@ -1380,9 +1380,6 @@ void sched_init_numa(void)
break;
}
- if (!level)
- return;
-
/*
* 'level' contains the number of unique distances
*
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2018-07-30 17:56 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2018-07-30 17:56 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: f3d133ee0a17d5694c6f21873eec9863e11fa423 sched/rt: Restore rt_runtime after disabling RT_RUNTIME_SHARE
Misc fixes:
- a deadline scheduler related bug fix which triggered a kernel warning
- an RT_RUNTIME_SHARE fix
- a stop_machine preemption fix
- a potential NULL dereference fix in sched_domain_debug_one()
Thanks,
Ingo
------------------>
Daniel Bristot de Oliveira (1):
sched/deadline: Update rq_clock of later_rq when pushing a task
Hailong Liu (1):
sched/rt: Restore rt_runtime after disabling RT_RUNTIME_SHARE
Isaac J. Manjarres (1):
stop_machine: Disable preemption after queueing stopper threads
Yi Wang (1):
sched/topology: Check variable group before dereferencing it
kernel/sched/deadline.c | 8 +++++++-
kernel/sched/rt.c | 2 ++
kernel/sched/topology.c | 2 +-
kernel/stop_machine.c | 10 +++++++++-
4 files changed, 19 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 10c7b51c0d1f..b5fbdde6afa9 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2090,8 +2090,14 @@ static int push_dl_task(struct rq *rq)
sub_rq_bw(&next_task->dl, &rq->dl);
set_task_cpu(next_task, later_rq->cpu);
add_rq_bw(&next_task->dl, &later_rq->dl);
+
+ /*
+ * Update the later_rq clock here, because the clock is used
+ * by the cpufreq_update_util() inside __add_running_bw().
+ */
+ update_rq_clock(later_rq);
add_running_bw(&next_task->dl, &later_rq->dl);
- activate_task(later_rq, next_task, 0);
+ activate_task(later_rq, next_task, ENQUEUE_NOCLOCK);
ret = 1;
resched_curr(later_rq);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 572567078b60..eaaec8364f96 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -836,6 +836,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
* can be time-consuming. Try to avoid it when possible.
*/
raw_spin_lock(&rt_rq->rt_runtime_lock);
+ if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF)
+ rt_rq->rt_runtime = rt_b->rt_runtime;
skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
raw_spin_unlock(&rt_rq->rt_runtime_lock);
if (skip)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 05a831427bc7..56a0fed30c0a 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -47,7 +47,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
}
- if (!cpumask_test_cpu(cpu, sched_group_span(group))) {
+ if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) {
printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
}
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 1ff523dae6e2..e190d1ef3a23 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -260,6 +260,15 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
err = 0;
__cpu_stop_queue_work(stopper1, work1, &wakeq);
__cpu_stop_queue_work(stopper2, work2, &wakeq);
+ /*
+ * The waking up of stopper threads has to happen
+ * in the same scheduling context as the queueing.
+ * Otherwise, there is a possibility of one of the
+ * above stoppers being woken up by another CPU,
+ * and preempting us. This will cause us to n ot
+ * wake up the other stopper forever.
+ */
+ preempt_disable();
unlock:
raw_spin_unlock(&stopper2->lock);
raw_spin_unlock_irq(&stopper1->lock);
@@ -271,7 +280,6 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
}
if (!err) {
- preempt_disable();
wake_up_q(&wakeq);
preempt_enable();
}
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2018-07-21 12:49 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2018-07-21 12:49 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: e117cb52bdb4d376b711bee34af6434c9e314b3b sched/deadline: Fix switched_from_dl() warning
Two fixes: a stop-machine preemption fix and a SCHED_DEADLINE fix.
Thanks,
Ingo
------------------>
Isaac J. Manjarres (1):
stop_machine: Disable preemption when waking two stopper threads
Juri Lelli (1):
sched/deadline: Fix switched_from_dl() warning
kernel/sched/deadline.c | 11 ++++++++++-
kernel/stop_machine.c | 6 +++++-
2 files changed, 15 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fbfc3f1d368a..10c7b51c0d1f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2290,8 +2290,17 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
if (task_on_rq_queued(p) && p->dl.dl_runtime)
task_non_contending(p);
- if (!task_on_rq_queued(p))
+ if (!task_on_rq_queued(p)) {
+ /*
+ * Inactive timer is armed. However, p is leaving DEADLINE and
+ * might migrate away from this rq while continuing to run on
+ * some other class. We need to remove its contribution from
+ * this rq running_bw now, or sub_rq_bw (below) will complain.
+ */
+ if (p->dl.dl_non_contending)
+ sub_running_bw(&p->dl, &rq->dl);
sub_rq_bw(&p->dl, &rq->dl);
+ }
/*
* We cannot use inactive_task_timer() to invoke sub_running_bw()
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index f89014a2c238..1ff523dae6e2 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -270,7 +270,11 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
goto retry;
}
- wake_up_q(&wakeq);
+ if (!err) {
+ preempt_disable();
+ wake_up_q(&wakeq);
+ preempt_enable();
+ }
return err;
}
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2018-03-25 8:57 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2018-03-25 8:57 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: e9ca267096674eadd1fd479279bcb58df1486049 sched/debug: Adjust newlines for better alignment
Two sched debug output related fixes: a console output fix and formatting fixes.
Thanks,
Ingo
------------------>
Joe Lawrence (2):
sched/debug: Fix per-task line continuation for console output
sched/debug: Adjust newlines for better alignment
kernel/sched/debug.c | 29 +++++++++++++++++------------
1 file changed, 17 insertions(+), 12 deletions(-)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 1ca0130ed4f9..72c401b3b15c 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -32,7 +32,7 @@ static DEFINE_SPINLOCK(sched_debug_lock);
if (m) \
seq_printf(m, x); \
else \
- printk(x); \
+ pr_cont(x); \
} while (0)
/*
@@ -501,12 +501,12 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
{
struct task_struct *g, *p;
- SEQ_printf(m,
- "\nrunnable tasks:\n"
- " S task PID tree-key switches prio"
- " wait-time sum-exec sum-sleep\n"
- "-------------------------------------------------------"
- "----------------------------------------------------\n");
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "runnable tasks:\n");
+ SEQ_printf(m, " S task PID tree-key switches prio"
+ " wait-time sum-exec sum-sleep\n");
+ SEQ_printf(m, "-------------------------------------------------------"
+ "----------------------------------------------------\n");
rcu_read_lock();
for_each_process_thread(g, p) {
@@ -527,9 +527,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
unsigned long flags;
#ifdef CONFIG_FAIR_GROUP_SCHED
- SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "cfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
#else
- SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "cfs_rq[%d]:\n", cpu);
#endif
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
SPLIT_NS(cfs_rq->exec_clock));
@@ -595,9 +597,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
{
#ifdef CONFIG_RT_GROUP_SCHED
- SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "rt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
#else
- SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "rt_rq[%d]:\n", cpu);
#endif
#define P(x) \
@@ -624,7 +628,8 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
{
struct dl_bw *dl_bw;
- SEQ_printf(m, "\ndl_rq[%d]:\n", cpu);
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "dl_rq[%d]:\n", cpu);
#define PU(x) \
SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(dl_rq->x))
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2018-02-15 1:00 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2018-02-15 1:00 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 43d1b29b27c76e7454cd6c85bec4d0e9cbb039f3 sched/cpufreq: Remove unused SUGOV_KTHREAD_PRIORITY macro
Misc fixes:
- fix rq->lock lockdep annotation bug
- fix/improve update_curr_rt() and update_curr_dl() accounting
- update documentation
- remove unused macro
Thanks,
Ingo
------------------>
Leo Yan (1):
sched/cpufreq: Remove unused SUGOV_KTHREAD_PRIORITY macro
Mathieu Desnoyers (1):
membarrier-sync-core: Document architecture support
Peter Zijlstra (1):
sched/core: Fix DEBUG_SPINLOCK annotation for rq->lock
Wen Yang (2):
sched/deadline: Make update_curr_dl() more accurate
sched/rt: Make update_curr_rt() more accurate
.../sched/membarrier-sync-core/arch-support.txt | 62 ++++++++++++++++++++++
kernel/sched/core.c | 27 ++++++----
kernel/sched/cpufreq_schedutil.c | 2 -
kernel/sched/deadline.c | 6 ++-
kernel/sched/rt.c | 3 +-
5 files changed, 84 insertions(+), 16 deletions(-)
create mode 100644 Documentation/features/sched/membarrier-sync-core/arch-support.txt
diff --git a/Documentation/features/sched/membarrier-sync-core/arch-support.txt b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
new file mode 100644
index 000000000000..2c815a7f1ba7
--- /dev/null
+++ b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
@@ -0,0 +1,62 @@
+#
+# Feature name: membarrier-sync-core
+# Kconfig: ARCH_HAS_MEMBARRIER_SYNC_CORE
+# description: arch supports core serializing membarrier
+#
+# Architecture requirements
+#
+# * arm64
+#
+# Rely on eret context synchronization when returning from IPI handler, and
+# when returning to user-space.
+#
+# * x86
+#
+# x86-32 uses IRET as return from interrupt, which takes care of the IPI.
+# However, it uses both IRET and SYSEXIT to go back to user-space. The IRET
+# instruction is core serializing, but not SYSEXIT.
+#
+# x86-64 uses IRET as return from interrupt, which takes care of the IPI.
+# However, it can return to user-space through either SYSRETL (compat code),
+# SYSRETQ, or IRET.
+#
+# Given that neither SYSRET{L,Q}, nor SYSEXIT, are core serializing, we rely
+# instead on write_cr3() performed by switch_mm() to provide core serialization
+# after changing the current mm, and deal with the special case of kthread ->
+# uthread (temporarily keeping current mm into active_mm) by issuing a
+# sync_core_before_usermode() in that specific case.
+#
+ -----------------------
+ | arch |status|
+ -----------------------
+ | alpha: | TODO |
+ | arc: | TODO |
+ | arm: | TODO |
+ | arm64: | ok |
+ | blackfin: | TODO |
+ | c6x: | TODO |
+ | cris: | TODO |
+ | frv: | TODO |
+ | h8300: | TODO |
+ | hexagon: | TODO |
+ | ia64: | TODO |
+ | m32r: | TODO |
+ | m68k: | TODO |
+ | metag: | TODO |
+ | microblaze: | TODO |
+ | mips: | TODO |
+ | mn10300: | TODO |
+ | nios2: | TODO |
+ | openrisc: | TODO |
+ | parisc: | TODO |
+ | powerpc: | TODO |
+ | s390: | TODO |
+ | score: | TODO |
+ | sh: | TODO |
+ | sparc: | TODO |
+ | tile: | TODO |
+ | um: | TODO |
+ | unicore32: | TODO |
+ | x86: | ok |
+ | xtensa: | TODO |
+ -----------------------
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bf724c1952ea..e7c535eee0a6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2601,19 +2601,31 @@ static inline void finish_task(struct task_struct *prev)
#endif
}
-static inline void finish_lock_switch(struct rq *rq)
+static inline void
+prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
{
+ /*
+ * Since the runqueue lock will be released by the next
+ * task (which is an invalid locking op but in the case
+ * of the scheduler it's an obvious special-case), so we
+ * do an early lockdep release here:
+ */
+ rq_unpin_lock(rq, rf);
+ spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#ifdef CONFIG_DEBUG_SPINLOCK
/* this is a valid case when another task releases the spinlock */
- rq->lock.owner = current;
+ rq->lock.owner = next;
#endif
+}
+
+static inline void finish_lock_switch(struct rq *rq)
+{
/*
* If we are tracking spinlock dependencies then we have to
* fix up the runqueue lock - which gets 'carried over' from
* prev into current:
*/
spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
-
raw_spin_unlock_irq(&rq->lock);
}
@@ -2844,14 +2856,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
- /*
- * Since the runqueue lock will be released by the next
- * task (which is an invalid locking op but in the case
- * of the scheduler it's an obvious special-case), so we
- * do an early lockdep release here:
- */
- rq_unpin_lock(rq, rf);
- spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+ prepare_lock_switch(rq, next, rf);
/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index dd062a1c8cf0..7936f548e071 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -19,8 +19,6 @@
#include "sched.h"
-#define SUGOV_KTHREAD_PRIORITY 50
-
struct sugov_tunables {
struct gov_attr_set attr_set;
unsigned int rate_limit_us;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 9bb0e0c412ec..9df09782025c 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1153,6 +1153,7 @@ static void update_curr_dl(struct rq *rq)
struct sched_dl_entity *dl_se = &curr->dl;
u64 delta_exec, scaled_delta_exec;
int cpu = cpu_of(rq);
+ u64 now;
if (!dl_task(curr) || !on_dl_rq(dl_se))
return;
@@ -1165,7 +1166,8 @@ static void update_curr_dl(struct rq *rq)
* natural solution, but the full ramifications of this
* approach need further study.
*/
- delta_exec = rq_clock_task(rq) - curr->se.exec_start;
+ now = rq_clock_task(rq);
+ delta_exec = now - curr->se.exec_start;
if (unlikely((s64)delta_exec <= 0)) {
if (unlikely(dl_se->dl_yielded))
goto throttle;
@@ -1178,7 +1180,7 @@ static void update_curr_dl(struct rq *rq)
curr->se.sum_exec_runtime += delta_exec;
account_group_exec_runtime(curr, delta_exec);
- curr->se.exec_start = rq_clock_task(rq);
+ curr->se.exec_start = now;
cgroup_account_cputime(curr, delta_exec);
sched_rt_avg_update(rq, delta_exec);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 663b2355a3aa..aad49451584e 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -950,12 +950,13 @@ static void update_curr_rt(struct rq *rq)
{
struct task_struct *curr = rq->curr;
struct sched_rt_entity *rt_se = &curr->rt;
- u64 now = rq_clock_task(rq);
u64 delta_exec;
+ u64 now;
if (curr->sched_class != &rt_sched_class)
return;
+ now = rq_clock_task(rq);
delta_exec = now - curr->se.exec_start;
if (unlikely((s64)delta_exec <= 0))
return;
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2018-01-12 13:48 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2018-01-12 13:48 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 541676078b52f365f53d46ee5517d305cd1b6350 membarrier: Disable preemption when calling smp_call_function_many()
A Kconfig fix, a build fix and a membarrier bug fix.
Thanks,
Ingo
------------------>
Geert Uytterhoeven (1):
sched/isolation: Make CONFIG_CPU_ISOLATION=y depend on SMP or COMPILE_TEST
Mathieu Desnoyers (1):
membarrier: Disable preemption when calling smp_call_function_many()
Valentin Ilie (1):
ia64, sched/cputime: Fix build error if CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y
arch/ia64/kernel/time.c | 2 +-
init/Kconfig | 1 +
kernel/sched/membarrier.c | 2 ++
3 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index c6ecb97151a2..9025699049ca 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -88,7 +88,7 @@ void vtime_flush(struct task_struct *tsk)
}
if (ti->softirq_time) {
- delta = cycle_to_nsec(ti->softirq_time));
+ delta = cycle_to_nsec(ti->softirq_time);
account_system_index_time(tsk, delta, CPUTIME_SOFTIRQ);
}
diff --git a/init/Kconfig b/init/Kconfig
index 690a381adee0..c1221332e128 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -461,6 +461,7 @@ endmenu # "CPU/Task time and stats accounting"
config CPU_ISOLATION
bool "CPU isolation"
+ depends on SMP || COMPILE_TEST
default y
help
Make sure that CPUs running critical tasks are not disturbed by
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index dd7908743dab..9bcbacba82a8 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -89,7 +89,9 @@ static int membarrier_private_expedited(void)
rcu_read_unlock();
}
if (!fallback) {
+ preempt_disable();
smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+ preempt_enable();
free_cpumask_var(tmpmask);
}
cpus_read_unlock();
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2017-12-15 15:35 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2017-12-15 15:35 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: f73c52a5bcd1710994e53fbccc378c42b97a06b6 sched/rt: Do not pull from current CPU if only one CPU to pull
Two fixes: a crash fix for an ARM SoC platform, and kernel-doc warnings fixes.
Thanks,
Ingo
------------------>
Randy Dunlap (1):
sched/core: Fix kernel-doc warnings after code movement
Steven Rostedt (1):
sched/rt: Do not pull from current CPU if only one CPU to pull
kernel/sched/core.c | 22 +++++++++++-----------
kernel/sched/rt.c | 8 +++++++-
2 files changed, 18 insertions(+), 12 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 75554f366fd3..644fa2e3d993 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5097,17 +5097,6 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
return ret;
}
-/**
- * sys_sched_rr_get_interval - return the default timeslice of a process.
- * @pid: pid of the process.
- * @interval: userspace pointer to the timeslice value.
- *
- * this syscall writes the default timeslice value of a given process
- * into the user-space timespec buffer. A value of '0' means infinity.
- *
- * Return: On success, 0 and the timeslice is in @interval. Otherwise,
- * an error code.
- */
static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
{
struct task_struct *p;
@@ -5144,6 +5133,17 @@ static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
return retval;
}
+/**
+ * sys_sched_rr_get_interval - return the default timeslice of a process.
+ * @pid: pid of the process.
+ * @interval: userspace pointer to the timeslice value.
+ *
+ * this syscall writes the default timeslice value of a given process
+ * into the user-space timespec buffer. A value of '0' means infinity.
+ *
+ * Return: On success, 0 and the timeslice is in @interval. Otherwise,
+ * an error code.
+ */
SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
struct timespec __user *, interval)
{
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 4056c19ca3f0..665ace2fc558 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2034,8 +2034,9 @@ static void pull_rt_task(struct rq *this_rq)
bool resched = false;
struct task_struct *p;
struct rq *src_rq;
+ int rt_overload_count = rt_overloaded(this_rq);
- if (likely(!rt_overloaded(this_rq)))
+ if (likely(!rt_overload_count))
return;
/*
@@ -2044,6 +2045,11 @@ static void pull_rt_task(struct rq *this_rq)
*/
smp_rmb();
+ /* If we are the only overloaded CPU do nothing */
+ if (rt_overload_count == 1 &&
+ cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
+ return;
+
#ifdef HAVE_RT_PUSH_IPI
if (sched_feat(RT_PUSH_IPI)) {
tell_cpu_to_push(this_rq);
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2017-12-06 22:21 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2017-12-06 22:21 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: a4c3c04974d648ee6e1a09ef4131eb32a02ab494 sched/fair: Update and fix the runnable propagation rule
This includes a fix for the add_wait_queue() queue ordering brown paperbag bug,
plus PELT accounting fixes for cgroups scheduling artifacts.
Thanks,
Ingo
------------------>
Omar Sandoval (1):
sched/wait: Fix add_wait_queue() behavioral change
Vincent Guittot (1):
sched/fair: Update and fix the runnable propagation rule
kernel/sched/fair.c | 102 +++++++++++++++++++++++++++++++++++++---------------
kernel/sched/wait.c | 2 +-
2 files changed, 74 insertions(+), 30 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4037e19bbca2..2fe3aa853e4d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3413,9 +3413,9 @@ void set_task_rq_fair(struct sched_entity *se,
* _IFF_ we look at the pure running and runnable sums. Because they
* represent the very same entity, just at different points in the hierarchy.
*
- *
- * Per the above update_tg_cfs_util() is trivial (and still 'wrong') and
- * simply copies the running sum over.
+ * Per the above update_tg_cfs_util() is trivial and simply copies the running
+ * sum over (but still wrong, because the group entity and group rq do not have
+ * their PELT windows aligned).
*
* However, update_tg_cfs_runnable() is more complex. So we have:
*
@@ -3424,11 +3424,11 @@ void set_task_rq_fair(struct sched_entity *se,
* And since, like util, the runnable part should be directly transferable,
* the following would _appear_ to be the straight forward approach:
*
- * grq->avg.load_avg = grq->load.weight * grq->avg.running_avg (3)
+ * grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3)
*
* And per (1) we have:
*
- * ge->avg.running_avg == grq->avg.running_avg
+ * ge->avg.runnable_avg == grq->avg.runnable_avg
*
* Which gives:
*
@@ -3447,27 +3447,28 @@ void set_task_rq_fair(struct sched_entity *se,
* to (shortly) return to us. This only works by keeping the weights as
* integral part of the sum. We therefore cannot decompose as per (3).
*
- * OK, so what then?
+ * Another reason this doesn't work is that runnable isn't a 0-sum entity.
+ * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
+ * rq itself is runnable anywhere between 2/3 and 1 depending on how the
+ * runnable section of these tasks overlap (or not). If they were to perfectly
+ * align the rq as a whole would be runnable 2/3 of the time. If however we
+ * always have at least 1 runnable task, the rq as a whole is always runnable.
*
+ * So we'll have to approximate.. :/
*
- * Another way to look at things is:
+ * Given the constraint:
*
- * grq->avg.load_avg = \Sum se->avg.load_avg
+ * ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
*
- * Therefore, per (2):
+ * We can construct a rule that adds runnable to a rq by assuming minimal
+ * overlap.
*
- * grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg
+ * On removal, we'll assume each task is equally runnable; which yields:
*
- * And the very thing we're propagating is a change in that sum (someone
- * joined/left). So we can easily know the runnable change, which would be, per
- * (2) the already tracked se->load_avg divided by the corresponding
- * se->weight.
+ * grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
*
- * Basically (4) but in differential form:
+ * XXX: only do this for the part of runnable > running ?
*
- * d(runnable_avg) += se->avg.load_avg / se->load.weight
- * (5)
- * ge->avg.load_avg += ge->load.weight * d(runnable_avg)
*/
static inline void
@@ -3479,6 +3480,14 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
if (!delta)
return;
+ /*
+ * The relation between sum and avg is:
+ *
+ * LOAD_AVG_MAX - 1024 + sa->period_contrib
+ *
+ * however, the PELT windows are not aligned between grq and gse.
+ */
+
/* Set new sched_entity's utilization */
se->avg.util_avg = gcfs_rq->avg.util_avg;
se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
@@ -3491,33 +3500,68 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
static inline void
update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- long runnable_sum = gcfs_rq->prop_runnable_sum;
- long runnable_load_avg, load_avg;
- s64 runnable_load_sum, load_sum;
+ long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
+ unsigned long runnable_load_avg, load_avg;
+ u64 runnable_load_sum, load_sum = 0;
+ s64 delta_sum;
if (!runnable_sum)
return;
gcfs_rq->prop_runnable_sum = 0;
+ if (runnable_sum >= 0) {
+ /*
+ * Add runnable; clip at LOAD_AVG_MAX. Reflects that until
+ * the CPU is saturated running == runnable.
+ */
+ runnable_sum += se->avg.load_sum;
+ runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
+ } else {
+ /*
+ * Estimate the new unweighted runnable_sum of the gcfs_rq by
+ * assuming all tasks are equally runnable.
+ */
+ if (scale_load_down(gcfs_rq->load.weight)) {
+ load_sum = div_s64(gcfs_rq->avg.load_sum,
+ scale_load_down(gcfs_rq->load.weight));
+ }
+
+ /* But make sure to not inflate se's runnable */
+ runnable_sum = min(se->avg.load_sum, load_sum);
+ }
+
+ /*
+ * runnable_sum can't be lower than running_sum
+ * As running sum is scale with cpu capacity wehreas the runnable sum
+ * is not we rescale running_sum 1st
+ */
+ running_sum = se->avg.util_sum /
+ arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
+ runnable_sum = max(runnable_sum, running_sum);
+
load_sum = (s64)se_weight(se) * runnable_sum;
load_avg = div_s64(load_sum, LOAD_AVG_MAX);
- add_positive(&se->avg.load_sum, runnable_sum);
- add_positive(&se->avg.load_avg, load_avg);
+ delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
+ delta_avg = load_avg - se->avg.load_avg;
- add_positive(&cfs_rq->avg.load_avg, load_avg);
- add_positive(&cfs_rq->avg.load_sum, load_sum);
+ se->avg.load_sum = runnable_sum;
+ se->avg.load_avg = load_avg;
+ add_positive(&cfs_rq->avg.load_avg, delta_avg);
+ add_positive(&cfs_rq->avg.load_sum, delta_sum);
runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
+ delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
+ delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
- add_positive(&se->avg.runnable_load_sum, runnable_sum);
- add_positive(&se->avg.runnable_load_avg, runnable_load_avg);
+ se->avg.runnable_load_sum = runnable_sum;
+ se->avg.runnable_load_avg = runnable_load_avg;
if (se->on_rq) {
- add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg);
- add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum);
+ add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
+ add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
}
}
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 98feab7933c7..929ecb7d6b78 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -27,7 +27,7 @@ void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq
wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE;
spin_lock_irqsave(&wq_head->lock, flags);
- __add_wait_queue_entry_tail(wq_head, wq_entry);
+ __add_wait_queue(wq_head, wq_entry);
spin_unlock_irqrestore(&wq_head->lock, flags);
}
EXPORT_SYMBOL(add_wait_queue);
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2017-11-26 12:43 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2017-11-26 12:43 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 3f5fe9fef5b2da06b6319fab8123056da5217c3f sched/debug: Fix task state recording/printout
Misc fixes: a documentation fix, a Sparse warning fix and a debugging fix.
Thanks,
Ingo
------------------>
Claudio Scordino (1):
sched/deadline: Fix the description of runtime accounting in the documentation
Dan Carpenter (1):
sched/deadline: Don't use dubious signed bitfields
Thomas Gleixner (1):
sched/debug: Fix task state recording/printout
Documentation/scheduler/sched-deadline.txt | 13 ++++++++++---
include/linux/sched.h | 8 ++++----
include/trace/events/sched.h | 6 +++---
3 files changed, 17 insertions(+), 10 deletions(-)
diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt
index e89e36ec15a5..8ce78f82ae23 100644
--- a/Documentation/scheduler/sched-deadline.txt
+++ b/Documentation/scheduler/sched-deadline.txt
@@ -204,10 +204,17 @@ CONTENTS
It does so by decrementing the runtime of the executing task Ti at a pace equal
to
- dq = -max{ Ui, (1 - Uinact) } dt
+ dq = -max{ Ui / Umax, (1 - Uinact - Uextra) } dt
- where Uinact is the inactive utilization, computed as (this_bq - running_bw),
- and Ui is the bandwidth of task Ti.
+ where:
+
+ - Ui is the bandwidth of task Ti;
+ - Umax is the maximum reclaimable utilization (subjected to RT throttling
+ limits);
+ - Uinact is the (per runqueue) inactive utilization, computed as
+ (this_bq - running_bw);
+ - Uextra is the (per runqueue) extra reclaimable utilization
+ (subjected to RT throttling limits).
Let's now see a trivial example of two deadline tasks with runtime equal
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a5dc7c98b0a2..21991d668d35 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -473,10 +473,10 @@ struct sched_dl_entity {
* conditions between the inactive timer handler and the wakeup
* code.
*/
- int dl_throttled : 1;
- int dl_boosted : 1;
- int dl_yielded : 1;
- int dl_non_contending : 1;
+ unsigned int dl_throttled : 1;
+ unsigned int dl_boosted : 1;
+ unsigned int dl_yielded : 1;
+ unsigned int dl_non_contending : 1;
/*
* Bandwidth enforcement timer. Each -deadline task has its
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 306b31de5194..bc01e06bc716 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -116,9 +116,9 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct *
* RUNNING (we will not have dequeued if state != RUNNING).
*/
if (preempt)
- return TASK_STATE_MAX;
+ return TASK_REPORT_MAX;
- return task_state_index(p);
+ return 1 << task_state_index(p);
}
#endif /* CREATE_TRACE_POINTS */
@@ -164,7 +164,7 @@ TRACE_EVENT(sched_switch,
{ 0x40, "P" }, { 0x80, "I" }) :
"R",
- __entry->prev_state & TASK_STATE_MAX ? "+" : "",
+ __entry->prev_state & TASK_REPORT_MAX ? "+" : "",
__entry->next_comm, __entry->next_pid, __entry->next_prio)
);
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2017-10-14 16:11 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2017-10-14 16:11 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Mike Galbraith, Thomas Gleixner,
Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 024c9d2faebdad3fb43fe49ad68e91a36190f1e2 sched/core: Ensure load_balance() respects the active_mask
Three fixes that address an SMP balancing performance regression.
Thanks,
Ingo
------------------>
Peter Zijlstra (3):
sched/core: Fix wake_affine() performance regression
sched/core: Address more wake_affine() regressions
sched/core: Ensure load_balance() respects the active_mask
include/linux/sched/topology.h | 8 ---
kernel/sched/fair.c | 140 ++++++++++++++---------------------------
kernel/sched/features.h | 3 +
3 files changed, 49 insertions(+), 102 deletions(-)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index d7b6dab956ec..7d065abc7a47 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -71,14 +71,6 @@ struct sched_domain_shared {
atomic_t ref;
atomic_t nr_busy_cpus;
int has_idle_cores;
-
- /*
- * Some variables from the most recent sd_lb_stats for this domain,
- * used by wake_affine().
- */
- unsigned long nr_running;
- unsigned long load;
- unsigned long capacity;
};
struct sched_domain {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 70ba32e08a23..d3f3094856fe 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5356,91 +5356,62 @@ static int wake_wide(struct task_struct *p)
return 1;
}
-struct llc_stats {
- unsigned long nr_running;
- unsigned long load;
- unsigned long capacity;
- int has_capacity;
-};
+/*
+ * The purpose of wake_affine() is to quickly determine on which CPU we can run
+ * soonest. For the purpose of speed we only consider the waking and previous
+ * CPU.
+ *
+ * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
+ * will be) idle.
+ *
+ * wake_affine_weight() - considers the weight to reflect the average
+ * scheduling latency of the CPUs. This seems to work
+ * for the overloaded case.
+ */
-static bool get_llc_stats(struct llc_stats *stats, int cpu)
+static bool
+wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
+ int this_cpu, int prev_cpu, int sync)
{
- struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-
- if (!sds)
- return false;
+ if (idle_cpu(this_cpu))
+ return true;
- stats->nr_running = READ_ONCE(sds->nr_running);
- stats->load = READ_ONCE(sds->load);
- stats->capacity = READ_ONCE(sds->capacity);
- stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu);
+ if (sync && cpu_rq(this_cpu)->nr_running == 1)
+ return true;
- return true;
+ return false;
}
-/*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
- *
- * Since we're running on 'stale' values, we might in fact create an imbalance
- * but recomputing these values is expensive, as that'd mean iteration 2 cache
- * domains worth of CPUs.
- */
static bool
-wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
- int this_cpu, int prev_cpu, int sync)
+wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
+ int this_cpu, int prev_cpu, int sync)
{
- struct llc_stats prev_stats, this_stats;
s64 this_eff_load, prev_eff_load;
unsigned long task_load;
- if (!get_llc_stats(&prev_stats, prev_cpu) ||
- !get_llc_stats(&this_stats, this_cpu))
- return false;
+ this_eff_load = target_load(this_cpu, sd->wake_idx);
+ prev_eff_load = source_load(prev_cpu, sd->wake_idx);
- /*
- * If sync wakeup then subtract the (maximum possible)
- * effect of the currently running task from the load
- * of the current LLC.
- */
if (sync) {
unsigned long current_load = task_h_load(current);
- /* in this case load hits 0 and this LLC is considered 'idle' */
- if (current_load > this_stats.load)
+ if (current_load > this_eff_load)
return true;
- this_stats.load -= current_load;
+ this_eff_load -= current_load;
}
- /*
- * The has_capacity stuff is not SMT aware, but by trying to balance
- * the nr_running on both ends we try and fill the domain at equal
- * rates, thereby first consuming cores before siblings.
- */
-
- /* if the old cache has capacity, stay there */
- if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
- return false;
-
- /* if this cache has capacity, come here */
- if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
- return true;
-
- /*
- * Check to see if we can move the load without causing too much
- * imbalance.
- */
task_load = task_h_load(p);
- this_eff_load = 100;
- this_eff_load *= prev_stats.capacity;
-
- prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
- prev_eff_load *= this_stats.capacity;
+ this_eff_load += task_load;
+ if (sched_feat(WA_BIAS))
+ this_eff_load *= 100;
+ this_eff_load *= capacity_of(prev_cpu);
- this_eff_load *= this_stats.load + task_load;
- prev_eff_load *= prev_stats.load - task_load;
+ prev_eff_load -= task_load;
+ if (sched_feat(WA_BIAS))
+ prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
+ prev_eff_load *= capacity_of(this_cpu);
return this_eff_load <= prev_eff_load;
}
@@ -5449,22 +5420,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
int prev_cpu, int sync)
{
int this_cpu = smp_processor_id();
- bool affine;
+ bool affine = false;
- /*
- * Default to no affine wakeups; wake_affine() should not effect a task
- * placement the load-balancer feels inclined to undo. The conservative
- * option is therefore to not move tasks when they wake up.
- */
- affine = false;
+ if (sched_feat(WA_IDLE) && !affine)
+ affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
- /*
- * If the wakeup is across cache domains, try to evaluate if movement
- * makes sense, otherwise rely on select_idle_siblings() to do
- * placement inside the cache domain.
- */
- if (!cpus_share_cache(prev_cpu, this_cpu))
- affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
+ if (sched_feat(WA_WEIGHT) && !affine)
+ affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
if (affine) {
@@ -7600,7 +7562,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
*/
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
- struct sched_domain_shared *shared = env->sd->shared;
struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
@@ -7672,22 +7633,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
if (env->dst_rq->rd->overload != overload)
env->dst_rq->rd->overload = overload;
}
-
- if (!shared)
- return;
-
- /*
- * Since these are sums over groups they can contain some CPUs
- * multiple times for the NUMA domains.
- *
- * Currently only wake_affine_llc() and find_busiest_group()
- * uses these numbers, only the last is affected by this problem.
- *
- * XXX fix that.
- */
- WRITE_ONCE(shared->nr_running, sds->total_running);
- WRITE_ONCE(shared->load, sds->total_load);
- WRITE_ONCE(shared->capacity, sds->total_capacity);
}
/**
@@ -8098,6 +8043,13 @@ static int should_we_balance(struct lb_env *env)
int cpu, balance_cpu = -1;
/*
+ * Ensure the balancing environment is consistent; can happen
+ * when the softirq triggers 'during' hotplug.
+ */
+ if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
+ return 0;
+
+ /*
* In the newly idle case, we will allow all the cpu's
* to do the newly idle load balance.
*/
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index d3fb15555291..319ed0e8a347 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -81,3 +81,6 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(LB_MIN, false)
SCHED_FEAT(ATTACH_AGE_LOAD, true)
+SCHED_FEAT(WA_IDLE, true)
+SCHED_FEAT(WA_WEIGHT, true)
+SCHED_FEAT(WA_BIAS, true)
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2017-09-13 17:57 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2017-09-13 17:57 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Thomas Gleixner, Peter Zijlstra, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 9469eb01db891b55367ee7539f1b9f7f6fd2819d sched/debug: Add debugfs knob for "sched_debug"
Three CPU hotplug related fixes and a debugging improvement.
Thanks,
Ingo
------------------>
Peter Zijlstra (4):
sched/fair: Avoid newidle balance for !active CPUs
sched/fair: Plug hole between hotplug and active_load_balance()
sched/core: WARN() when migrating to an offline CPU
sched/debug: Add debugfs knob for "sched_debug"
kernel/sched/core.c | 4 ++++
kernel/sched/debug.c | 5 +++++
kernel/sched/fair.c | 13 +++++++++++++
kernel/sched/sched.h | 2 ++
kernel/sched/topology.c | 4 +---
5 files changed, 25 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 136a76d80dbf..18a6966567da 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1173,6 +1173,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
lockdep_is_held(&task_rq(p)->lock)));
#endif
+ /*
+ * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
+ */
+ WARN_ON_ONCE(!cpu_online(new_cpu));
#endif
trace_sched_migrate_task(p, new_cpu);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4a23bbc3111b..b19d06ea6e10 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -181,11 +181,16 @@ static const struct file_operations sched_feat_fops = {
.release = single_release,
};
+__read_mostly bool sched_debug_enabled;
+
static __init int sched_init_debug(void)
{
debugfs_create_file("sched_features", 0644, NULL, NULL,
&sched_feat_fops);
+ debugfs_create_bool("sched_debug", 0644, NULL,
+ &sched_debug_enabled);
+
return 0;
}
late_initcall(sched_init_debug);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8415d1ec2b84..efeebed935ae 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8448,6 +8448,12 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
this_rq->idle_stamp = rq_clock(this_rq);
/*
+ * Do not pull tasks towards !active CPUs...
+ */
+ if (!cpu_active(this_cpu))
+ return 0;
+
+ /*
* This is OK, because current is on_cpu, which avoids it being picked
* for load-balance and preemption/IRQs are still disabled avoiding
* further scheduler activity on it and we're being very careful to
@@ -8554,6 +8560,13 @@ static int active_load_balance_cpu_stop(void *data)
struct rq_flags rf;
rq_lock_irq(busiest_rq, &rf);
+ /*
+ * Between queueing the stop-work and running it is a hole in which
+ * CPUs can become inactive. We should not move tasks from or to
+ * inactive CPUs.
+ */
+ if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
+ goto out_unlock;
/* make sure the requested cpu hasn't gone down in the meantime */
if (unlikely(busiest_cpu != smp_processor_id() ||
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ab1c7f5409a0..7ea2a0339771 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1954,6 +1954,8 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
#ifdef CONFIG_SCHED_DEBUG
+extern bool sched_debug_enabled;
+
extern void print_cfs_stats(struct seq_file *m, int cpu);
extern void print_rt_stats(struct seq_file *m, int cpu);
extern void print_dl_stats(struct seq_file *m, int cpu);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 6f7b43982f73..2ab2aa68c796 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -14,11 +14,9 @@ cpumask_var_t sched_domains_tmpmask2;
#ifdef CONFIG_SCHED_DEBUG
-static __read_mostly int sched_debug_enabled;
-
static int __init sched_debug_setup(char *str)
{
- sched_debug_enabled = 1;
+ sched_debug_enabled = true;
return 0;
}
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2017-09-12 15:35 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2017-09-12 15:35 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 46123355af729514e6fa8b8a9dd1e645e61a6466 sched/fair: Fix nuisance kernel-doc warning
Three fixes:
- fix a suspend/resume cpusets bug
- fix a !CONFIG_NUMA_BALANCING bug
- fix a kerneldoc warning
Thanks,
Ingo
------------------>
Peter Zijlstra (2):
sched/fair: Fix wake_affine_llc() balancing rules
sched/cpuset/pm: Fix cpuset vs. suspend-resume bugs
Randy Dunlap (1):
sched/fair: Fix nuisance kernel-doc warning
include/linux/cpuset.h | 6 ++++++
kernel/cgroup/cpuset.c | 16 +++++++++++++++-
kernel/power/process.c | 5 ++++-
kernel/sched/core.c | 7 +++----
kernel/sched/fair.c | 4 ++--
5 files changed, 30 insertions(+), 8 deletions(-)
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index e74655d941b7..a1e6a33a4b03 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -51,7 +51,9 @@ static inline void cpuset_dec(void)
extern int cpuset_init(void);
extern void cpuset_init_smp(void);
+extern void cpuset_force_rebuild(void);
extern void cpuset_update_active_cpus(void);
+extern void cpuset_wait_for_hotplug(void);
extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
extern void cpuset_cpus_allowed_fallback(struct task_struct *p);
extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
@@ -164,11 +166,15 @@ static inline bool cpusets_enabled(void) { return false; }
static inline int cpuset_init(void) { return 0; }
static inline void cpuset_init_smp(void) {}
+static inline void cpuset_force_rebuild(void) { }
+
static inline void cpuset_update_active_cpus(void)
{
partition_sched_domains(1, NULL, NULL);
}
+static inline void cpuset_wait_for_hotplug(void) { }
+
static inline void cpuset_cpus_allowed(struct task_struct *p,
struct cpumask *mask)
{
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 2f4039bafebb..0513ee39698b 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2267,6 +2267,13 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs)
mutex_unlock(&cpuset_mutex);
}
+static bool force_rebuild;
+
+void cpuset_force_rebuild(void)
+{
+ force_rebuild = true;
+}
+
/**
* cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
*
@@ -2341,8 +2348,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
}
/* rebuild sched domains if cpus_allowed has changed */
- if (cpus_updated)
+ if (cpus_updated || force_rebuild) {
+ force_rebuild = false;
rebuild_sched_domains();
+ }
}
void cpuset_update_active_cpus(void)
@@ -2355,6 +2364,11 @@ void cpuset_update_active_cpus(void)
schedule_work(&cpuset_hotplug_work);
}
+void cpuset_wait_for_hotplug(void)
+{
+ flush_work(&cpuset_hotplug_work);
+}
+
/*
* Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
* Call this routine anytime after node_states[N_MEMORY] changes.
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 78672d324a6e..50f25cb370c6 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -20,8 +20,9 @@
#include <linux/workqueue.h>
#include <linux/kmod.h>
#include <trace/events/power.h>
+#include <linux/cpuset.h>
-/*
+/*
* Timeout for stopping processes
*/
unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC;
@@ -202,6 +203,8 @@ void thaw_processes(void)
__usermodehelper_set_disable_depth(UMH_FREEZING);
thaw_workqueues();
+ cpuset_wait_for_hotplug();
+
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
/* No other threads should have PF_SUSPEND_TASK set */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6d2c7ff9ba98..136a76d80dbf 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5556,16 +5556,15 @@ static void cpuset_cpu_active(void)
* operation in the resume sequence, just build a single sched
* domain, ignoring cpusets.
*/
- num_cpus_frozen--;
- if (likely(num_cpus_frozen)) {
- partition_sched_domains(1, NULL, NULL);
+ partition_sched_domains(1, NULL, NULL);
+ if (--num_cpus_frozen)
return;
- }
/*
* This is the last CPU online operation. So fall through and
* restore the original sched domains by considering the
* cpuset configurations.
*/
+ cpuset_force_rebuild();
}
cpuset_update_active_cpus();
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8d5868771cb3..8415d1ec2b84 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5435,7 +5435,7 @@ wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
return false;
/* if this cache has capacity, come here */
- if (this_stats.has_capacity && this_stats.nr_running < prev_stats.nr_running+1)
+ if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
return true;
/*
@@ -7719,7 +7719,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
* number.
*
* Return: 1 when packing is required and a task should be moved to
- * this CPU. The amount of the imbalance is returned in *imbalance.
+ * this CPU. The amount of the imbalance is returned in env->imbalance.
*
* @env: The load balancing environment.
* @sds: Statistics of the sched_domain which is to be packed
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2017-07-21 10:18 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2017-07-21 10:18 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 193be41e33168a3a06eb9d356d9e39c69de161d2 sched/deadline: Fix confusing comments about selection of top pi-waiter
A cputime fix and code comments/organization fix to the deadline scheduler.
Thanks,
Ingo
------------------>
Joel Fernandes (1):
sched/deadline: Fix confusing comments about selection of top pi-waiter
Wanpeng Li (1):
sched/cputime: Don't use smp_processor_id() in preemptible context
kernel/sched/cputime.c | 6 +++---
kernel/sched/deadline.c | 14 ++++++++------
2 files changed, 11 insertions(+), 9 deletions(-)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 6e3ea4ac1bda..14d2dbf97c53 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -683,7 +683,7 @@ static u64 vtime_delta(struct vtime *vtime)
{
unsigned long long clock;
- clock = sched_clock_cpu(smp_processor_id());
+ clock = sched_clock();
if (clock < vtime->starttime)
return 0;
@@ -814,7 +814,7 @@ void arch_vtime_task_switch(struct task_struct *prev)
write_seqcount_begin(&vtime->seqcount);
vtime->state = VTIME_SYS;
- vtime->starttime = sched_clock_cpu(smp_processor_id());
+ vtime->starttime = sched_clock();
write_seqcount_end(&vtime->seqcount);
}
@@ -826,7 +826,7 @@ void vtime_init_idle(struct task_struct *t, int cpu)
local_irq_save(flags);
write_seqcount_begin(&vtime->seqcount);
vtime->state = VTIME_SYS;
- vtime->starttime = sched_clock_cpu(cpu);
+ vtime->starttime = sched_clock();
write_seqcount_end(&vtime->seqcount);
local_irq_restore(flags);
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index a84299f44b5d..755bd3f1a1a9 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1392,17 +1392,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
struct sched_dl_entity *pi_se = &p->dl;
/*
- * Use the scheduling parameters of the top pi-waiter
- * task if we have one and its (absolute) deadline is
- * smaller than our one... OTW we keep our runtime and
- * deadline.
+ * Use the scheduling parameters of the top pi-waiter task if:
+ * - we have a top pi-waiter which is a SCHED_DEADLINE task AND
+ * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is
+ * smaller than our deadline OR we are a !SCHED_DEADLINE task getting
+ * boosted due to a SCHED_DEADLINE pi-waiter).
+ * Otherwise we keep our runtime and deadline.
*/
- if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) {
+ if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) {
pi_se = &pi_task->dl;
} else if (!dl_prio(p->normal_prio)) {
/*
* Special case in which we have a !SCHED_DEADLINE task
- * that is going to be deboosted, but exceedes its
+ * that is going to be deboosted, but exceeds its
* runtime while doing so. No point in replenishing
* it, as it's going to return back to its original
* scheduling class after this.
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2017-03-07 20:33 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2017-03-07 20:33 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Mike Galbraith, Thomas Gleixner,
Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: f94c8d116997597fc00f0812b0ab9256e7b0c58f sched/clock, x86/tsc: Rework the x86 'unstable' sched_clock() interface
A fix for KVM's scheduler clock which (erroneously) was always marked unstable, a
fix for RT/DL load balancing, plus latency fixes.
Thanks,
Ingo
------------------>
Peter Zijlstra (3):
sched/fair: Make select_idle_cpu() more aggressive
sched/core: Fix pick_next_task() for RT,DL
sched/clock, x86/tsc: Rework the x86 'unstable' sched_clock() interface
arch/x86/kernel/cpu/amd.c | 4 ----
arch/x86/kernel/cpu/centaur.c | 2 --
arch/x86/kernel/cpu/common.c | 3 ---
arch/x86/kernel/cpu/cyrix.c | 1 -
arch/x86/kernel/cpu/intel.c | 4 ----
arch/x86/kernel/cpu/transmeta.c | 2 --
arch/x86/kernel/tsc.c | 35 +++++++++++++++++++++++------------
kernel/sched/core.c | 11 ++++++++---
kernel/sched/fair.c | 2 +-
kernel/sched/features.h | 5 +++++
10 files changed, 37 insertions(+), 32 deletions(-)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 4e95b2e0d95f..30d924ae5c34 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -555,10 +555,6 @@ static void early_init_amd(struct cpuinfo_x86 *c)
if (c->x86_power & (1 << 8)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
- if (check_tsc_unstable())
- clear_sched_clock_stable();
- } else {
- clear_sched_clock_stable();
}
/* Bit 12 of 8000_0007 edx is accumulated power mechanism. */
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 2c234a6d94c4..bad8ff078a21 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -104,8 +104,6 @@ static void early_init_centaur(struct cpuinfo_x86 *c)
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_SYSENTER32);
#endif
-
- clear_sched_clock_stable();
}
static void init_centaur(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c64ca5929cb5..9d98e2e15d54 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -86,7 +86,6 @@ static void default_init(struct cpuinfo_x86 *c)
strcpy(c->x86_model_id, "386");
}
#endif
- clear_sched_clock_stable();
}
static const struct cpu_dev default_cpu = {
@@ -1075,8 +1074,6 @@ static void identify_cpu(struct cpuinfo_x86 *c)
*/
if (this_cpu->c_init)
this_cpu->c_init(c);
- else
- clear_sched_clock_stable();
/* Disable the PN if appropriate */
squash_the_stupid_serial_number(c);
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 47416f959a48..31e679238e8d 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -184,7 +184,6 @@ static void early_init_cyrix(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
break;
}
- clear_sched_clock_stable();
}
static void init_cyrix(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 017ecd3bb553..2388bafe5c37 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -161,10 +161,6 @@ static void early_init_intel(struct cpuinfo_x86 *c)
if (c->x86_power & (1 << 8)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
- if (check_tsc_unstable())
- clear_sched_clock_stable();
- } else {
- clear_sched_clock_stable();
}
/* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index c1ea5b999839..6a9ad73a2c54 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -15,8 +15,6 @@ static void early_init_transmeta(struct cpuinfo_x86 *c)
if (xlvl >= 0x80860001)
c->x86_capability[CPUID_8086_0001_EDX] = cpuid_edx(0x80860001);
}
-
- clear_sched_clock_stable();
}
static void init_transmeta(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 2724dc82f992..911129fda2f9 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -326,9 +326,16 @@ unsigned long long sched_clock(void)
{
return paravirt_sched_clock();
}
+
+static inline bool using_native_sched_clock(void)
+{
+ return pv_time_ops.sched_clock == native_sched_clock;
+}
#else
unsigned long long
sched_clock(void) __attribute__((alias("native_sched_clock")));
+
+static inline bool using_native_sched_clock(void) { return true; }
#endif
int check_tsc_unstable(void)
@@ -1111,8 +1118,10 @@ static void tsc_cs_mark_unstable(struct clocksource *cs)
{
if (tsc_unstable)
return;
+
tsc_unstable = 1;
- clear_sched_clock_stable();
+ if (using_native_sched_clock())
+ clear_sched_clock_stable();
disable_sched_clock_irqtime();
pr_info("Marking TSC unstable due to clocksource watchdog\n");
}
@@ -1134,18 +1143,20 @@ static struct clocksource clocksource_tsc = {
void mark_tsc_unstable(char *reason)
{
- if (!tsc_unstable) {
- tsc_unstable = 1;
+ if (tsc_unstable)
+ return;
+
+ tsc_unstable = 1;
+ if (using_native_sched_clock())
clear_sched_clock_stable();
- disable_sched_clock_irqtime();
- pr_info("Marking TSC unstable due to %s\n", reason);
- /* Change only the rating, when not registered */
- if (clocksource_tsc.mult)
- clocksource_mark_unstable(&clocksource_tsc);
- else {
- clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE;
- clocksource_tsc.rating = 0;
- }
+ disable_sched_clock_irqtime();
+ pr_info("Marking TSC unstable due to %s\n", reason);
+ /* Change only the rating, when not registered */
+ if (clocksource_tsc.mult) {
+ clocksource_mark_unstable(&clocksource_tsc);
+ } else {
+ clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE;
+ clocksource_tsc.rating = 0;
}
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bbfb917a9b49..6699d43a8843 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3273,10 +3273,15 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
struct task_struct *p;
/*
- * Optimization: we know that if all tasks are in
- * the fair class we can call that function directly:
+ * Optimization: we know that if all tasks are in the fair class we can
+ * call that function directly, but only if the @prev task wasn't of a
+ * higher scheduling class, because otherwise those loose the
+ * opportunity to pull in more work from other CPUs.
*/
- if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
+ if (likely((prev->sched_class == &idle_sched_class ||
+ prev->sched_class == &fair_sched_class) &&
+ rq->nr_running == rq->cfs.h_nr_running)) {
+
p = fair_sched_class.pick_next_task(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
goto again;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 274c747a01ce..b3ee10dd3e85 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5797,7 +5797,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
* Due to large variance we need a large fuzz factor; hackbench in
* particularly is sensitive here.
*/
- if ((avg_idle / 512) < avg_cost)
+ if (sched_feat(SIS_AVG_CPU) && (avg_idle / 512) < avg_cost)
return -1;
time = local_clock();
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 69631fa46c2f..1b3c8189b286 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -51,6 +51,11 @@ SCHED_FEAT(NONTASK_CAPACITY, true)
*/
SCHED_FEAT(TTWU_QUEUE, true)
+/*
+ * When doing wakeups, attempt to limit superfluous scans of the LLC domain.
+ */
+SCHED_FEAT(SIS_AVG_CPU, false)
+
#ifdef HAVE_RT_PUSH_IPI
/*
* In order to avoid a thundering herd attack of CPUs that are
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2017-02-28 8:05 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2017-02-28 8:05 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Mike Galbraith, Thomas Gleixner,
Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 96b777452d8881480fd5be50112f791c17db4b6b sched/cgroup: Move sched_online_group() back into css_online() to fix crash
Two rq-clock warnings related fixes, plus a cgroups related crash fix.
Thanks,
Ingo
------------------>
Konstantin Khlebnikov (1):
sched/cgroup: Move sched_online_group() back into css_online() to fix crash
Peter Zijlstra (1):
sched/core: Fix update_rq_clock() splat on hotplug (and suspend/resume)
Wanpeng Li (1):
sched/fair: Update rq clock before changing a task's CPU affinity
kernel/sched/core.c | 29 +++++++++++++++++------------
1 file changed, 17 insertions(+), 12 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 34e2291a9a6c..e01bd807f0af 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1087,6 +1087,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
int ret = 0;
rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
if (p->flags & PF_KTHREAD) {
/*
@@ -5557,7 +5558,7 @@ static void migrate_tasks(struct rq *dead_rq)
{
struct rq *rq = dead_rq;
struct task_struct *next, *stop = rq->stop;
- struct rq_flags rf, old_rf;
+ struct rq_flags rf;
int dest_cpu;
/*
@@ -5576,7 +5577,9 @@ static void migrate_tasks(struct rq *dead_rq)
* class method both need to have an up-to-date
* value of rq->clock[_task]
*/
+ rq_pin_lock(rq, &rf);
update_rq_clock(rq);
+ rq_unpin_lock(rq, &rf);
for (;;) {
/*
@@ -5589,7 +5592,7 @@ static void migrate_tasks(struct rq *dead_rq)
/*
* pick_next_task() assumes pinned rq->lock:
*/
- rq_pin_lock(rq, &rf);
+ rq_repin_lock(rq, &rf);
next = pick_next_task(rq, &fake_task, &rf);
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
@@ -5618,13 +5621,6 @@ static void migrate_tasks(struct rq *dead_rq)
continue;
}
- /*
- * __migrate_task() may return with a different
- * rq->lock held and a new cookie in 'rf', but we need
- * to preserve rf::clock_update_flags for 'dead_rq'.
- */
- old_rf = rf;
-
/* Find suitable destination for @next, with force if needed. */
dest_cpu = select_fallback_rq(dead_rq->cpu, next);
@@ -5633,7 +5629,6 @@ static void migrate_tasks(struct rq *dead_rq)
raw_spin_unlock(&rq->lock);
rq = dead_rq;
raw_spin_lock(&rq->lock);
- rf = old_rf;
}
raw_spin_unlock(&next->pi_lock);
}
@@ -6816,11 +6811,20 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (IS_ERR(tg))
return ERR_PTR(-ENOMEM);
- sched_online_group(tg, parent);
-
return &tg->css;
}
+/* Expose task group only after completing cgroup initialization */
+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+ struct task_group *parent = css_tg(css->parent);
+
+ if (parent)
+ sched_online_group(tg, parent);
+ return 0;
+}
+
static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
@@ -7226,6 +7230,7 @@ static struct cftype cpu_files[] = {
struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
+ .css_online = cpu_cgroup_css_online,
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
.fork = cpu_cgroup_fork,
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2016-11-22 15:38 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2016-11-22 15:38 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 8e5bfa8c1f8471aa4a2d30be631ef2b50e10abaf sched/autogroup: Do not use autogroup->tg in zombie threads
Two fixes for autogroup scheduling, for races when turning the feature on/off via
/proc/sys/kernel/sched_autogroup_enabled.
Thanks,
Ingo
------------------>
Oleg Nesterov (2):
sched/autogroup: Fix autogroup_move_group() to never skip sched_move_task()
sched/autogroup: Do not use autogroup->tg in zombie threads
include/linux/sched.h | 2 ++
kernel/exit.c | 1 +
kernel/sched/auto_group.c | 36 ++++++++++++++++++++++++++++--------
3 files changed, 31 insertions(+), 8 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 348f51b0ec92..e9c009dc3a4a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2567,6 +2567,7 @@ extern void sched_autogroup_create_attach(struct task_struct *p);
extern void sched_autogroup_detach(struct task_struct *p);
extern void sched_autogroup_fork(struct signal_struct *sig);
extern void sched_autogroup_exit(struct signal_struct *sig);
+extern void sched_autogroup_exit_task(struct task_struct *p);
#ifdef CONFIG_PROC_FS
extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m);
extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice);
@@ -2576,6 +2577,7 @@ static inline void sched_autogroup_create_attach(struct task_struct *p) { }
static inline void sched_autogroup_detach(struct task_struct *p) { }
static inline void sched_autogroup_fork(struct signal_struct *sig) { }
static inline void sched_autogroup_exit(struct signal_struct *sig) { }
+static inline void sched_autogroup_exit_task(struct task_struct *p) { }
#endif
extern int yield_to(struct task_struct *p, bool preempt);
diff --git a/kernel/exit.c b/kernel/exit.c
index 9d68c45ebbe3..3076f3089919 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -836,6 +836,7 @@ void __noreturn do_exit(long code)
*/
perf_event_exit_task(tsk);
+ sched_autogroup_exit_task(tsk);
cgroup_exit(tsk);
/*
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index a5d966cb8891..f1c8fd566246 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -111,10 +111,13 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
{
if (tg != &root_task_group)
return false;
-
/*
- * We can only assume the task group can't go away on us if
- * autogroup_move_group() can see us on ->thread_group list.
+ * If we race with autogroup_move_group() the caller can use the old
+ * value of signal->autogroup but in this case sched_move_task() will
+ * be called again before autogroup_kref_put().
+ *
+ * However, there is no way sched_autogroup_exit_task() could tell us
+ * to avoid autogroup->tg, so we abuse PF_EXITING flag for this case.
*/
if (p->flags & PF_EXITING)
return false;
@@ -122,6 +125,16 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
return true;
}
+void sched_autogroup_exit_task(struct task_struct *p)
+{
+ /*
+ * We are going to call exit_notify() and autogroup_move_group() can't
+ * see this thread after that: we can no longer use signal->autogroup.
+ * See the PF_EXITING check in task_wants_autogroup().
+ */
+ sched_move_task(p);
+}
+
static void
autogroup_move_group(struct task_struct *p, struct autogroup *ag)
{
@@ -138,13 +151,20 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
}
p->signal->autogroup = autogroup_kref_get(ag);
-
- if (!READ_ONCE(sysctl_sched_autogroup_enabled))
- goto out;
-
+ /*
+ * We can't avoid sched_move_task() after we changed signal->autogroup,
+ * this process can already run with task_group() == prev->tg or we can
+ * race with cgroup code which can read autogroup = prev under rq->lock.
+ * In the latter case for_each_thread() can not miss a migrating thread,
+ * cpu_cgroup_attach() must not be possible after cgroup_exit() and it
+ * can't be removed from thread list, we hold ->siglock.
+ *
+ * If an exiting thread was already removed from thread list we rely on
+ * sched_autogroup_exit_task().
+ */
for_each_thread(p, t)
sched_move_task(t);
-out:
+
unlock_task_sighand(p, &flags);
autogroup_kref_put(prev);
}
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2016-08-18 20:43 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2016-08-18 20:43 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 03cbc732639ddcad15218c4b2046d255851ff1e3 sched/cputime: Resync steal time when guest & host lose sync
Two cputime fixes - hopefully the last ones.
Thanks,
Ingo
------------------>
Peter Zijlstra (1):
sched/cputime: Fix NO_HZ_FULL getrusage() monotonicity regression
Wanpeng Li (1):
sched/cputime: Resync steal time when guest & host lose sync
kernel/sched/cputime.c | 33 +++++++++++++++++++++++++--------
1 file changed, 25 insertions(+), 8 deletions(-)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 9858266fb0b3..a846cf89eb96 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -263,6 +263,11 @@ void account_idle_time(cputime_t cputime)
cpustat[CPUTIME_IDLE] += (__force u64) cputime;
}
+/*
+ * When a guest is interrupted for a longer amount of time, missed clock
+ * ticks are not redelivered later. Due to that, this function may on
+ * occasion account more time than the calling functions think elapsed.
+ */
static __always_inline cputime_t steal_account_process_time(cputime_t maxtime)
{
#ifdef CONFIG_PARAVIRT
@@ -371,7 +376,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
* idle, or potentially user or system time. Due to rounding,
* other time can exceed ticks occasionally.
*/
- other = account_other_time(cputime);
+ other = account_other_time(ULONG_MAX);
if (other >= cputime)
return;
cputime -= other;
@@ -486,7 +491,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
}
cputime = cputime_one_jiffy;
- steal = steal_account_process_time(cputime);
+ steal = steal_account_process_time(ULONG_MAX);
if (steal >= cputime)
return;
@@ -516,7 +521,7 @@ void account_idle_ticks(unsigned long ticks)
}
cputime = jiffies_to_cputime(ticks);
- steal = steal_account_process_time(cputime);
+ steal = steal_account_process_time(ULONG_MAX);
if (steal >= cputime)
return;
@@ -614,19 +619,25 @@ static void cputime_adjust(struct task_cputime *curr,
stime = curr->stime;
utime = curr->utime;
- if (utime == 0) {
- stime = rtime;
+ /*
+ * If either stime or both stime and utime are 0, assume all runtime is
+ * userspace. Once a task gets some ticks, the monotonicy code at
+ * 'update' will ensure things converge to the observed ratio.
+ */
+ if (stime == 0) {
+ utime = rtime;
goto update;
}
- if (stime == 0) {
- utime = rtime;
+ if (utime == 0) {
+ stime = rtime;
goto update;
}
stime = scale_stime((__force u64)stime, (__force u64)rtime,
(__force u64)(stime + utime));
+update:
/*
* Make sure stime doesn't go backwards; this preserves monotonicity
* for utime because rtime is monotonic.
@@ -649,7 +660,6 @@ static void cputime_adjust(struct task_cputime *curr,
stime = rtime - utime;
}
-update:
prev->stime = stime;
prev->utime = utime;
out:
@@ -694,6 +704,13 @@ static cputime_t get_vtime_delta(struct task_struct *tsk)
unsigned long now = READ_ONCE(jiffies);
cputime_t delta, other;
+ /*
+ * Unlike tick based timing, vtime based timing never has lost
+ * ticks, and no need for steal time accounting to make up for
+ * lost ticks. Vtime accounts a rounded version of actual
+ * elapsed time. Limit account_other_time to prevent rounding
+ * errors from causing elapsed vtime to go negative.
+ */
delta = jiffies_to_cputime(now - tsk->vtime_snap);
other = account_other_time(delta);
WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2016-08-12 19:39 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2016-08-12 19:39 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 26f2c75cd2cf10a6120ef02ca9a94db77cc9c8e0 sched/cputime: Fix omitted ticks passed in parameter
Misc fixes: cputime fixes, two deadline scheduler fixes and a cgroups scheduling
fix.
Thanks,
Ingo
------------------>
Frederic Weisbecker (1):
sched/cputime: Fix omitted ticks passed in parameter
Giovanni Gherdovich (1):
sched/cputime: Mitigate performance regression in times()/clock_gettime()
Tommaso Cucinotta (1):
sched/deadline: Fix wrap-around in DL heap
Wanpeng Li (2):
sched/deadline: Fix lock pinning warning during CPU hotplug
sched/cputime: Fix steal time accounting
Xunlei Pang (1):
sched/fair: Fix typo in sync_throttle()
kernel/sched/core.c | 19 +++++++++++++++++++
kernel/sched/cpudeadline.c | 2 +-
kernel/sched/cputime.c | 10 +++++++++-
kernel/sched/deadline.c | 5 ++++-
kernel/sched/fair.c | 2 +-
5 files changed, 34 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5c883fe8e440..2a906f20fba7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -74,6 +74,7 @@
#include <linux/context_tracking.h>
#include <linux/compiler.h>
#include <linux/frame.h>
+#include <linux/prefetch.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -2972,6 +2973,23 @@ EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
/*
+ * The function fair_sched_class.update_curr accesses the struct curr
+ * and its field curr->exec_start; when called from task_sched_runtime(),
+ * we observe a high rate of cache misses in practice.
+ * Prefetching this data results in improved performance.
+ */
+static inline void prefetch_curr_exec_start(struct task_struct *p)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct sched_entity *curr = (&p->se)->cfs_rq->curr;
+#else
+ struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
+#endif
+ prefetch(curr);
+ prefetch(&curr->exec_start);
+}
+
+/*
* Return accounted runtime for the task.
* In case the task is currently running, return the runtime plus current's
* pending runtime that have not been accounted yet.
@@ -3005,6 +3023,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
* thread, breaking clock_gettime().
*/
if (task_current(rq, p) && task_on_rq_queued(p)) {
+ prefetch_curr_exec_start(p);
update_rq_clock(rq);
p->sched_class->update_curr(rq);
}
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5be58820465c..d4184498c9f5 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -168,7 +168,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
if (old_idx == IDX_INVALID) {
cp->size++;
- cp->elements[cp->size - 1].dl = 0;
+ cp->elements[cp->size - 1].dl = dl;
cp->elements[cp->size - 1].cpu = cpu;
cp->elements[cpu].idx = cp->size - 1;
cpudl_change_key(cp, cp->size - 1, dl);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 1934f658c036..9858266fb0b3 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -508,13 +508,21 @@ void account_process_tick(struct task_struct *p, int user_tick)
*/
void account_idle_ticks(unsigned long ticks)
{
+ cputime_t cputime, steal;
if (sched_clock_irqtime) {
irqtime_account_idle_ticks(ticks);
return;
}
- account_idle_time(jiffies_to_cputime(ticks));
+ cputime = jiffies_to_cputime(ticks);
+ steal = steal_account_process_time(cputime);
+
+ if (steal >= cputime)
+ return;
+
+ cputime -= steal;
+ account_idle_time(cputime);
}
/*
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fcb7f0217ff4..1ce8867283dc 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -658,8 +658,11 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
*
* XXX figure out if select_task_rq_dl() deals with offline cpus.
*/
- if (unlikely(!rq->online))
+ if (unlikely(!rq->online)) {
+ lockdep_unpin_lock(&rq->lock, rf.cookie);
rq = dl_task_offline_migration(rq, p);
+ rf.cookie = lockdep_pin_lock(&rq->lock);
+ }
/*
* Queueing this task back might have overloaded rq, check if we need
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4088eedea763..039de34f1521 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4269,7 +4269,7 @@ static void sync_throttle(struct task_group *tg, int cpu)
pcfs_rq = tg->parent->cfs_rq[cpu];
cfs_rq->throttle_count = pcfs_rq->throttle_count;
- pcfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
+ cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
}
/* conditionally throttle active cfs_rq's from put_prev_entity() */
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2016-07-08 13:53 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2016-07-08 13:53 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: ea1dc6fc6242f991656e35e2ed3d90ec1cd13418 sched/fair: Fix calc_cfs_shares() fixed point arithmetics width confusion
Two load-balancing fixes for cgroups-intense workloads.
Thanks,
Ingo
------------------>
Peter Zijlstra (2):
sched/fair: Fix effective_load() to consistently use smoothed load
sched/fair: Fix calc_cfs_shares() fixed point arithmetics width confusion
kernel/sched/fair.c | 42 ++++++++++++++++++++----------------------
1 file changed, 20 insertions(+), 22 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bdcbeea90c95..c8c5d2d48424 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -735,8 +735,6 @@ void post_init_entity_util_avg(struct sched_entity *se)
}
}
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
-static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
#else
void init_entity_runnable_average(struct sched_entity *se)
{
@@ -2499,28 +2497,22 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
#ifdef CONFIG_FAIR_GROUP_SCHED
# ifdef CONFIG_SMP
-static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
+static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
- long tg_weight;
+ long tg_weight, load, shares;
/*
- * Use this CPU's real-time load instead of the last load contribution
- * as the updating of the contribution is delayed, and we will use the
- * the real-time load to calc the share. See update_tg_load_avg().
+ * This really should be: cfs_rq->avg.load_avg, but instead we use
+ * cfs_rq->load.weight, which is its upper bound. This helps ramp up
+ * the shares for small weight interactive tasks.
*/
- tg_weight = atomic_long_read(&tg->load_avg);
- tg_weight -= cfs_rq->tg_load_avg_contrib;
- tg_weight += cfs_rq->load.weight;
+ load = scale_load_down(cfs_rq->load.weight);
- return tg_weight;
-}
-
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
-{
- long tg_weight, load, shares;
+ tg_weight = atomic_long_read(&tg->load_avg);
- tg_weight = calc_tg_weight(tg, cfs_rq);
- load = cfs_rq->load.weight;
+ /* Ensure tg_weight >= load */
+ tg_weight -= cfs_rq->tg_load_avg_contrib;
+ tg_weight += load;
shares = (tg->shares * load);
if (tg_weight)
@@ -2539,6 +2531,7 @@ static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
return tg->shares;
}
# endif /* CONFIG_SMP */
+
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
@@ -4946,19 +4939,24 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
return wl;
for_each_sched_entity(se) {
- long w, W;
+ struct cfs_rq *cfs_rq = se->my_q;
+ long W, w = cfs_rq_load_avg(cfs_rq);
- tg = se->my_q->tg;
+ tg = cfs_rq->tg;
/*
* W = @wg + \Sum rw_j
*/
- W = wg + calc_tg_weight(tg, se->my_q);
+ W = wg + atomic_long_read(&tg->load_avg);
+
+ /* Ensure \Sum rw_j >= rw_i */
+ W -= cfs_rq->tg_load_avg_contrib;
+ W += w;
/*
* w = rw_i + @wl
*/
- w = cfs_rq_load_avg(se->my_q) + wl;
+ w += wl;
/*
* wl = S * s'_i; see (2)
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2016-06-10 12:56 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2016-06-10 12:56 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Mike Galbraith, Thomas Gleixner,
Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 4698f88c06b893f2acc0b443004a53bf490fde7c sched/debug: Fix 'schedstats=enable' cmdline option
Two scheduler debugging fixes.
Thanks,
Ingo
------------------>
Josh Poimboeuf (2):
sched/debug: Fix /proc/sched_debug regression
sched/debug: Fix 'schedstats=enable' cmdline option
kernel/sched/core.c | 26 +++++++++++++++++++++-----
kernel/sched/debug.c | 15 ++++-----------
kernel/sched/stats.h | 3 +++
3 files changed, 28 insertions(+), 16 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7f2cae4620c7..385c947482e1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2253,9 +2253,11 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
#endif
#endif
+#ifdef CONFIG_SCHEDSTATS
+
DEFINE_STATIC_KEY_FALSE(sched_schedstats);
+static bool __initdata __sched_schedstats = false;
-#ifdef CONFIG_SCHEDSTATS
static void set_schedstats(bool enabled)
{
if (enabled)
@@ -2278,11 +2280,16 @@ static int __init setup_schedstats(char *str)
if (!str)
goto out;
+ /*
+ * This code is called before jump labels have been set up, so we can't
+ * change the static branch directly just yet. Instead set a temporary
+ * variable so init_schedstats() can do it later.
+ */
if (!strcmp(str, "enable")) {
- set_schedstats(true);
+ __sched_schedstats = true;
ret = 1;
} else if (!strcmp(str, "disable")) {
- set_schedstats(false);
+ __sched_schedstats = false;
ret = 1;
}
out:
@@ -2293,6 +2300,11 @@ static int __init setup_schedstats(char *str)
}
__setup("schedstats=", setup_schedstats);
+static void __init init_schedstats(void)
+{
+ set_schedstats(__sched_schedstats);
+}
+
#ifdef CONFIG_PROC_SYSCTL
int sysctl_schedstats(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -2313,8 +2325,10 @@ int sysctl_schedstats(struct ctl_table *table, int write,
set_schedstats(state);
return err;
}
-#endif
-#endif
+#endif /* CONFIG_PROC_SYSCTL */
+#else /* !CONFIG_SCHEDSTATS */
+static inline void init_schedstats(void) {}
+#endif /* CONFIG_SCHEDSTATS */
/*
* fork()/clone()-time setup:
@@ -7487,6 +7501,8 @@ void __init sched_init(void)
#endif
init_sched_fair_class();
+ init_schedstats();
+
scheduler_running = 1;
}
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index cf905f655ba1..0368c393a336 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -427,19 +427,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
SPLIT_NS(p->se.vruntime),
(long long)(p->nvcsw + p->nivcsw),
p->prio);
-#ifdef CONFIG_SCHEDSTATS
- if (schedstat_enabled()) {
- SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
- SPLIT_NS(p->se.statistics.wait_sum),
- SPLIT_NS(p->se.sum_exec_runtime),
- SPLIT_NS(p->se.statistics.sum_sleep_runtime));
- }
-#else
+
SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
- 0LL, 0L,
+ SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)),
SPLIT_NS(p->se.sum_exec_runtime),
- 0LL, 0L);
-#endif
+ SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime)));
+
#ifdef CONFIG_NUMA_BALANCING
SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
#endif
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 70b3b6a20fb0..78955cbea31c 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -33,6 +33,8 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
# define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0)
# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0)
# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
+# define schedstat_val(rq, field) ((schedstat_enabled()) ? (rq)->field : 0)
+
#else /* !CONFIG_SCHEDSTATS */
static inline void
rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
@@ -47,6 +49,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
# define schedstat_inc(rq, field) do { } while (0)
# define schedstat_add(rq, field, amt) do { } while (0)
# define schedstat_set(var, val) do { } while (0)
+# define schedstat_val(rq, field) 0
#endif
#ifdef CONFIG_SCHED_INFO
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2016-05-25 21:58 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2016-05-25 21:58 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: b7e7ade34e6188bee2e3b0d42b51d25137d9e2a5 sched/core: Fix remote wakeups
Two fixes: one for a lost wakeup, the other to fix the compiler optimizing out
preempt operations on ARM64 (and possibly other non-x86 architectures).
Thanks,
Ingo
------------------>
Peter Zijlstra (2):
sched/preempt: Fix preempt_count manipulations
sched/core: Fix remote wakeups
include/asm-generic/preempt.h | 4 ++--
include/linux/sched.h | 1 +
kernel/sched/core.c | 18 +++++++++++-------
3 files changed, 14 insertions(+), 9 deletions(-)
diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h
index 5d8ffa3e6f8c..c1cde3577551 100644
--- a/include/asm-generic/preempt.h
+++ b/include/asm-generic/preempt.h
@@ -7,10 +7,10 @@
static __always_inline int preempt_count(void)
{
- return current_thread_info()->preempt_count;
+ return READ_ONCE(current_thread_info()->preempt_count);
}
-static __always_inline int *preempt_count_ptr(void)
+static __always_inline volatile int *preempt_count_ptr(void)
{
return ¤t_thread_info()->preempt_count;
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6cc0df970f1a..e053517a88b6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1533,6 +1533,7 @@ struct task_struct {
unsigned sched_reset_on_fork:1;
unsigned sched_contributes_to_load:1;
unsigned sched_migrated:1;
+ unsigned sched_remote_wakeup:1;
unsigned :0; /* force alignment to the next boundary */
/* unserialized, strictly 'current' */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 404c0784b1fc..7f2cae4620c7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1768,13 +1768,15 @@ void sched_ttwu_pending(void)
cookie = lockdep_pin_lock(&rq->lock);
while (llist) {
+ int wake_flags = 0;
+
p = llist_entry(llist, struct task_struct, wake_entry);
llist = llist_next(llist);
- /*
- * See ttwu_queue(); we only call ttwu_queue_remote() when
- * its a x-cpu wakeup.
- */
- ttwu_do_activate(rq, p, WF_MIGRATED, cookie);
+
+ if (p->sched_remote_wakeup)
+ wake_flags = WF_MIGRATED;
+
+ ttwu_do_activate(rq, p, wake_flags, cookie);
}
lockdep_unpin_lock(&rq->lock, cookie);
@@ -1819,10 +1821,12 @@ void scheduler_ipi(void)
irq_exit();
}
-static void ttwu_queue_remote(struct task_struct *p, int cpu)
+static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
{
struct rq *rq = cpu_rq(cpu);
+ p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
+
if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
if (!set_nr_if_polling(rq->idle))
smp_send_reschedule(cpu);
@@ -1869,7 +1873,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
#if defined(CONFIG_SMP)
if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
sched_clock_cpu(cpu); /* sync clocks x-cpu */
- ttwu_queue_remote(p, cpu);
+ ttwu_queue_remote(p, cpu, wake_flags);
return;
}
#endif
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2016-05-10 12:00 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2016-05-10 12:00 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 13b5ab02ae118fc8dfdc2b8597688ec4a11d5b53 sched/rt, sched/dl: Don't push if task's scheduling class was changed
A UP kernel cpufreq fix and a rt/dl scheduler corner case fix.
Thanks,
Ingo
------------------>
Rafael J. Wysocki (1):
sched/fair: Fix !CONFIG_SMP kernel cpufreq governor breakage
Xunlei Pang (1):
sched/rt, sched/dl: Don't push if task's scheduling class was changed
kernel/sched/deadline.c | 1 +
kernel/sched/fair.c | 9 ++++++++-
kernel/sched/rt.c | 1 +
3 files changed, 10 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index affd97ec9f65..686ec8adf952 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1394,6 +1394,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
!cpumask_test_cpu(later_rq->cpu,
&task->cpus_allowed) ||
task_running(rq, task) ||
+ !dl_task(task) ||
!task_on_rq_queued(task))) {
double_unlock_balance(rq, later_rq);
later_rq = NULL;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0fe30e66aff1..40748dc8ea3e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3030,7 +3030,14 @@ static int idle_balance(struct rq *this_rq);
#else /* CONFIG_SMP */
-static inline void update_load_avg(struct sched_entity *se, int update_tg) {}
+static inline void update_load_avg(struct sched_entity *se, int not_used)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ struct rq *rq = rq_of(cfs_rq);
+
+ cpufreq_trigger_update(rq_clock(rq));
+}
+
static inline void
enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index c41ea7ac1764..ec4f538d4396 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1729,6 +1729,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
!cpumask_test_cpu(lowest_rq->cpu,
tsk_cpus_allowed(task)) ||
task_running(rq, task) ||
+ !rt_task(task) ||
!task_on_rq_queued(task))) {
double_unlock_balance(rq, lowest_rq);
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2016-03-24 7:51 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2016-03-24 7:51 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 73e6aafd9ea81498d31361f01db84a0118da2d1c sched/cpuacct: Simplify the cpuacct code
Misc fixes: a cgroup fix, a fair-scheduler migration accounting fix, a cputime fix
and two cpuacct cleanups.
Thanks,
Ingo
------------------>
Dongsheng Yang (1):
sched/cpuacct: Rename parameter in cpuusage_write() for readability
Matt Fleming (1):
sched/fair: Add comments to explain select_idle_sibling()
Peter Zijlstra (2):
sched/cgroup: Fix/cleanup cgroup teardown/init
sched/fair: Fix fairness issue on migration
Thomas Gleixner (1):
sched/cputime: Fix steal time accounting vs. CPU hotplug
Zhao Lei (1):
sched/cpuacct: Simplify the cpuacct code
kernel/sched/core.c | 36 +++++++++++++++---------------------
kernel/sched/cpuacct.c | 35 ++++++++++-------------------------
kernel/sched/cpuacct.h | 4 ++--
kernel/sched/fair.c | 39 ++++++++++++++++++++++++++++++++-------
kernel/sched/sched.h | 13 +++++++++++++
5 files changed, 72 insertions(+), 55 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ea8f49ae0062..2a87bdde8d4e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5369,6 +5369,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
case CPU_UP_PREPARE:
rq->calc_load_update = calc_load_update;
+ account_reset_rq(rq);
break;
case CPU_ONLINE:
@@ -7535,7 +7536,7 @@ void set_curr_task(int cpu, struct task_struct *p)
/* task_group_lock serializes the addition/removal of task groups */
static DEFINE_SPINLOCK(task_group_lock);
-static void free_sched_group(struct task_group *tg)
+static void sched_free_group(struct task_group *tg)
{
free_fair_sched_group(tg);
free_rt_sched_group(tg);
@@ -7561,7 +7562,7 @@ struct task_group *sched_create_group(struct task_group *parent)
return tg;
err:
- free_sched_group(tg);
+ sched_free_group(tg);
return ERR_PTR(-ENOMEM);
}
@@ -7581,17 +7582,16 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
}
/* rcu callback to free various structures associated with a task group */
-static void free_sched_group_rcu(struct rcu_head *rhp)
+static void sched_free_group_rcu(struct rcu_head *rhp)
{
/* now it should be safe to free those cfs_rqs */
- free_sched_group(container_of(rhp, struct task_group, rcu));
+ sched_free_group(container_of(rhp, struct task_group, rcu));
}
-/* Destroy runqueue etc associated with a task group */
void sched_destroy_group(struct task_group *tg)
{
/* wait for possible concurrent references to cfs_rqs complete */
- call_rcu(&tg->rcu, free_sched_group_rcu);
+ call_rcu(&tg->rcu, sched_free_group_rcu);
}
void sched_offline_group(struct task_group *tg)
@@ -8050,31 +8050,26 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (IS_ERR(tg))
return ERR_PTR(-ENOMEM);
+ sched_online_group(tg, parent);
+
return &tg->css;
}
-static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
- struct task_group *parent = css_tg(css->parent);
- if (parent)
- sched_online_group(tg, parent);
- return 0;
+ sched_offline_group(tg);
}
static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
- sched_destroy_group(tg);
-}
-
-static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
-{
- struct task_group *tg = css_tg(css);
-
- sched_offline_group(tg);
+ /*
+ * Relies on the RCU grace period between css_released() and this.
+ */
+ sched_free_group(tg);
}
static void cpu_cgroup_fork(struct task_struct *task)
@@ -8434,9 +8429,8 @@ static struct cftype cpu_files[] = {
struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
+ .css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
- .css_online = cpu_cgroup_css_online,
- .css_offline = cpu_cgroup_css_offline,
.fork = cpu_cgroup_fork,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index dd7cbb55bbf2..434c2fa41352 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -145,13 +145,16 @@ static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
}
static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
- u64 reset)
+ u64 val)
{
struct cpuacct *ca = css_ca(css);
int err = 0;
int i;
- if (reset) {
+ /*
+ * Only allow '0' here to do a reset.
+ */
+ if (val) {
err = -EINVAL;
goto out;
}
@@ -235,23 +238,10 @@ static struct cftype files[] = {
void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
struct cpuacct *ca;
- int cpu;
-
- cpu = task_cpu(tsk);
rcu_read_lock();
-
- ca = task_ca(tsk);
-
- while (true) {
- u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
- *cpuusage += cputime;
-
- ca = parent_ca(ca);
- if (!ca)
- break;
- }
-
+ for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
+ *this_cpu_ptr(ca->cpuusage) += cputime;
rcu_read_unlock();
}
@@ -260,18 +250,13 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
*
* Note: it's the caller that updates the account of the root cgroup.
*/
-void cpuacct_account_field(struct task_struct *p, int index, u64 val)
+void cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
{
- struct kernel_cpustat *kcpustat;
struct cpuacct *ca;
rcu_read_lock();
- ca = task_ca(p);
- while (ca != &root_cpuacct) {
- kcpustat = this_cpu_ptr(ca->cpustat);
- kcpustat->cpustat[index] += val;
- ca = parent_ca(ca);
- }
+ for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca))
+ this_cpu_ptr(ca->cpustat)->cpustat[index] += val;
rcu_read_unlock();
}
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
index ed605624a5e7..ba72807c73d4 100644
--- a/kernel/sched/cpuacct.h
+++ b/kernel/sched/cpuacct.h
@@ -1,7 +1,7 @@
#ifdef CONFIG_CGROUP_CPUACCT
extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
-extern void cpuacct_account_field(struct task_struct *p, int index, u64 val);
+extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
#else
@@ -10,7 +10,7 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
}
static inline void
-cpuacct_account_field(struct task_struct *p, int index, u64 val)
+cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
{
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 33130529e9b5..303d6392b389 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3157,17 +3157,25 @@ static inline void check_schedstat_required(void)
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
+ bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING);
+ bool curr = cfs_rq->curr == se;
+
/*
- * Update the normalized vruntime before updating min_vruntime
- * through calling update_curr().
+ * If we're the current task, we must renormalise before calling
+ * update_curr().
*/
- if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
+ if (renorm && curr)
se->vruntime += cfs_rq->min_vruntime;
+ update_curr(cfs_rq);
+
/*
- * Update run-time statistics of the 'current'.
+ * Otherwise, renormalise after, such that we're placed at the current
+ * moment in time, instead of some random moment in the past.
*/
- update_curr(cfs_rq);
+ if (renorm && !curr)
+ se->vruntime += cfs_rq->min_vruntime;
+
enqueue_entity_load_avg(cfs_rq, se);
account_entity_enqueue(cfs_rq, se);
update_cfs_shares(cfs_rq);
@@ -3183,7 +3191,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_stats_enqueue(cfs_rq, se);
check_spread(cfs_rq, se);
}
- if (se != cfs_rq->curr)
+ if (!curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
@@ -5047,7 +5055,19 @@ static int select_idle_sibling(struct task_struct *p, int target)
return i;
/*
- * Otherwise, iterate the domains and find an elegible idle cpu.
+ * Otherwise, iterate the domains and find an eligible idle cpu.
+ *
+ * A completely idle sched group at higher domains is more
+ * desirable than an idle group at a lower level, because lower
+ * domains have smaller groups and usually share hardware
+ * resources which causes tasks to contend on them, e.g. x86
+ * hyperthread siblings in the lowest domain (SMT) can contend
+ * on the shared cpu pipeline.
+ *
+ * However, while we prefer idle groups at higher domains
+ * finding an idle cpu at the lowest domain is still better than
+ * returning 'target', which we've already established, isn't
+ * idle.
*/
sd = rcu_dereference(per_cpu(sd_llc, target));
for_each_lower_domain(sd) {
@@ -5057,11 +5077,16 @@ static int select_idle_sibling(struct task_struct *p, int target)
tsk_cpus_allowed(p)))
goto next;
+ /* Ensure the entire group is idle */
for_each_cpu(i, sched_group_cpus(sg)) {
if (i == target || !idle_cpu(i))
goto next;
}
+ /*
+ * It doesn't matter which cpu we pick, the
+ * whole group is idle.
+ */
target = cpumask_first_and(sched_group_cpus(sg),
tsk_cpus_allowed(p));
goto done;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b2ff5a2bd6df..e6d4a3fa3660 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1793,3 +1793,16 @@ static inline u64 irq_time_read(int cpu)
}
#endif /* CONFIG_64BIT */
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+static inline void account_reset_rq(struct rq *rq)
+{
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ rq->prev_irq_time = 0;
+#endif
+#ifdef CONFIG_PARAVIRT
+ rq->prev_steal_time = 0;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ rq->prev_steal_time_rq = 0;
+#endif
+}
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2016-01-08 12:50 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2016-01-08 12:50 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 093e5840ae76f1082633503964d035f40ed0216d sched/core: Reset task's lockless wake-queues on fork()
Misc scheduler fixes.
Thanks,
Ingo
------------------>
Andrey Ryabinin (1):
sched/fair: Fix multiplication overflow on 32-bit systems
Peter Zijlstra (1):
sched/core: Fix unserialized r-m-w scribbling stuff
Sebastian Andrzej Siewior (1):
sched/core: Reset task's lockless wake-queues on fork()
Sergey Senozhatsky (1):
sched/core: Check tgid in is_global_init()
include/linux/sched.h | 16 +++++++++-------
kernel/fork.c | 1 +
kernel/sched/fair.c | 2 +-
3 files changed, 11 insertions(+), 8 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index edad7a43edea..fa39434e3fdd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1455,14 +1455,15 @@ struct task_struct {
/* Used for emulating ABI behavior of previous Linux versions */
unsigned int personality;
- unsigned in_execve:1; /* Tell the LSMs that the process is doing an
- * execve */
- unsigned in_iowait:1;
-
- /* Revert to default priority/policy when forking */
+ /* scheduler bits, serialized by scheduler locks */
unsigned sched_reset_on_fork:1;
unsigned sched_contributes_to_load:1;
unsigned sched_migrated:1;
+ unsigned :0; /* force alignment to the next boundary */
+
+ /* unserialized, strictly 'current' */
+ unsigned in_execve:1; /* bit to tell LSMs we're in execve */
+ unsigned in_iowait:1;
#ifdef CONFIG_MEMCG
unsigned memcg_may_oom:1;
#endif
@@ -2002,7 +2003,8 @@ static inline int pid_alive(const struct task_struct *p)
}
/**
- * is_global_init - check if a task structure is init
+ * is_global_init - check if a task structure is init. Since init
+ * is free to have sub-threads we need to check tgid.
* @tsk: Task structure to be checked.
*
* Check if a task structure is the first user space task the kernel created.
@@ -2011,7 +2013,7 @@ static inline int pid_alive(const struct task_struct *p)
*/
static inline int is_global_init(struct task_struct *tsk)
{
- return tsk->pid == 1;
+ return task_tgid_nr(tsk) == 1;
}
extern struct pid *cad_pid;
diff --git a/kernel/fork.c b/kernel/fork.c
index fce002ee3ddf..1155eac61687 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -380,6 +380,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
#endif
tsk->splice_pipe = NULL;
tsk->task_frag.page = NULL;
+ tsk->wake_q.next = NULL;
account_kernel_stack(ti, 1);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 90e26b11deaa..cfdc0e61066c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2689,7 +2689,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
int decayed, removed = 0;
if (atomic_long_read(&cfs_rq->removed_load_avg)) {
- long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
+ s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
sa->load_avg = max_t(long, sa->load_avg - r, 0);
sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
removed = 1;
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2015-10-23 11:38 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2015-10-23 11:38 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Mike Galbraith,
Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 0aaafaabfcba8aa991913cd3280a5dbf7f111a2a sched/core: Add missing lockdep_unpin() annotations
Misc fixes all around the map: an instrumentation fix, a nohz usability fix, a
lockdep annotation fix and two task group scheduling fixes.
Thanks,
Ingo
------------------>
Daniel Bristot de Oliveira (1):
sched, tracing: Stop/start critical timings around the idle=poll idle loop
Frederic Weisbecker (1):
nohz: Revert "nohz: Set isolcpus when nohz_full is set"
Luca Abeni (1):
sched/deadline: Fix migration of SCHED_DEADLINE tasks
Peter Zijlstra (1):
sched/core: Add missing lockdep_unpin() annotations
Yuyang Du (2):
sched/fair: Fix overly small weight for interactive group entities
sched/fair: Update task group's load_avg after task migration
kernel/sched/core.c | 12 ++++++++----
kernel/sched/deadline.c | 17 +++++++++++++----
kernel/sched/fair.c | 9 +++++----
kernel/sched/idle.c | 2 ++
4 files changed, 28 insertions(+), 12 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 10a8faa1b0d4..bcd214e4b4d6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2366,8 +2366,15 @@ void wake_up_new_task(struct task_struct *p)
trace_sched_wakeup_new(p);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
- if (p->sched_class->task_woken)
+ if (p->sched_class->task_woken) {
+ /*
+ * Nothing relies on rq->lock after this, so its fine to
+ * drop it.
+ */
+ lockdep_unpin_lock(&rq->lock);
p->sched_class->task_woken(rq, p);
+ lockdep_pin_lock(&rq->lock);
+ }
#endif
task_rq_unlock(rq, p, &flags);
}
@@ -7238,9 +7245,6 @@ void __init sched_init_smp(void)
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
- /* nohz_full won't take effect without isolating the cpus. */
- tick_nohz_full_add_cpus_to(cpu_isolated_map);
-
sched_init_numa();
/*
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fc8f01083527..8b0a15e285f9 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -668,8 +668,15 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
* Queueing this task back might have overloaded rq, check if we need
* to kick someone away.
*/
- if (has_pushable_dl_tasks(rq))
+ if (has_pushable_dl_tasks(rq)) {
+ /*
+ * Nothing relies on rq->lock after this, so its safe to drop
+ * rq->lock.
+ */
+ lockdep_unpin_lock(&rq->lock);
push_dl_task(rq);
+ lockdep_pin_lock(&rq->lock);
+ }
#endif
unlock:
@@ -1066,8 +1073,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
int target = find_later_rq(p);
if (target != -1 &&
- dl_time_before(p->dl.deadline,
- cpu_rq(target)->dl.earliest_dl.curr))
+ (dl_time_before(p->dl.deadline,
+ cpu_rq(target)->dl.earliest_dl.curr) ||
+ (cpu_rq(target)->dl.dl_nr_running == 0)))
cpu = target;
}
rcu_read_unlock();
@@ -1417,7 +1425,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
later_rq = cpu_rq(cpu);
- if (!dl_time_before(task->dl.deadline,
+ if (later_rq->dl.dl_nr_running &&
+ !dl_time_before(task->dl.deadline,
later_rq->dl.earliest_dl.curr)) {
/*
* Target rq has tasks of equal or earlier deadline,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6e2e3483b1ec..9a5e60fe721a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2363,7 +2363,7 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
*/
tg_weight = atomic_long_read(&tg->load_avg);
tg_weight -= cfs_rq->tg_load_avg_contrib;
- tg_weight += cfs_rq_load_avg(cfs_rq);
+ tg_weight += cfs_rq->load.weight;
return tg_weight;
}
@@ -2373,7 +2373,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
long tg_weight, load, shares;
tg_weight = calc_tg_weight(tg, cfs_rq);
- load = cfs_rq_load_avg(cfs_rq);
+ load = cfs_rq->load.weight;
shares = (tg->shares * load);
if (tg_weight)
@@ -2664,13 +2664,14 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
- int decayed;
struct sched_avg *sa = &cfs_rq->avg;
+ int decayed, removed = 0;
if (atomic_long_read(&cfs_rq->removed_load_avg)) {
long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
sa->load_avg = max_t(long, sa->load_avg - r, 0);
sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
+ removed = 1;
}
if (atomic_long_read(&cfs_rq->removed_util_avg)) {
@@ -2688,7 +2689,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif
- return decayed;
+ return decayed || removed;
}
/* Update task and its cfs_rq load average */
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 8f177c73ae19..4a2ef5a02fd3 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -57,9 +57,11 @@ static inline int cpu_idle_poll(void)
rcu_idle_enter();
trace_cpu_idle_rcuidle(0, smp_processor_id());
local_irq_enable();
+ stop_critical_timings();
while (!tif_need_resched() &&
(cpu_idle_force_poll || tick_check_broadcast_expired()))
cpu_relax();
+ start_critical_timings();
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
rcu_idle_exit();
return 1;
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2015-09-17 8:06 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2015-09-17 8:06 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Mike Galbraith, Thomas Gleixner,
Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 5473e0cc37c03c576adbda7591a6cc8e37c1bb7f sched: 'Annotate' migrate_tasks()
A migrate_tasks() locking fix, and a late-coming nohz change plus a nohz debug
check.
Thanks,
Ingo
------------------>
Frederic Weisbecker (1):
nohz: Assert existing housekeepers when nohz full enabled
Vatika Harlalka (1):
nohz: Affine unpinned timers to housekeepers
Wanpeng Li (1):
sched: 'Annotate' migrate_tasks()
include/linux/tick.h | 9 +++++++++
kernel/sched/core.c | 36 +++++++++++++++++++++++++++++++-----
kernel/time/tick-sched.c | 15 +++++++++++----
3 files changed, 51 insertions(+), 9 deletions(-)
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 48d901f83f92..e312219ff823 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -147,11 +147,20 @@ static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask)
cpumask_or(mask, mask, tick_nohz_full_mask);
}
+static inline int housekeeping_any_cpu(void)
+{
+ return cpumask_any_and(housekeeping_mask, cpu_online_mask);
+}
+
extern void tick_nohz_full_kick(void);
extern void tick_nohz_full_kick_cpu(int cpu);
extern void tick_nohz_full_kick_all(void);
extern void __tick_nohz_task_switch(void);
#else
+static inline int housekeeping_any_cpu(void)
+{
+ return smp_processor_id();
+}
static inline bool tick_nohz_full_enabled(void) { return false; }
static inline bool tick_nohz_full_cpu(int cpu) { return false; }
static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { }
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8b864ecee0e1..9b786704d34b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -623,18 +623,21 @@ int get_nohz_timer_target(void)
int i, cpu = smp_processor_id();
struct sched_domain *sd;
- if (!idle_cpu(cpu))
+ if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
return cpu;
rcu_read_lock();
for_each_domain(cpu, sd) {
for_each_cpu(i, sched_domain_span(sd)) {
- if (!idle_cpu(i)) {
+ if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) {
cpu = i;
goto unlock;
}
}
}
+
+ if (!is_housekeeping_cpu(cpu))
+ cpu = housekeeping_any_cpu();
unlock:
rcu_read_unlock();
return cpu;
@@ -5180,24 +5183,47 @@ static void migrate_tasks(struct rq *dead_rq)
break;
/*
- * Ensure rq->lock covers the entire task selection
- * until the migration.
+ * pick_next_task assumes pinned rq->lock.
*/
lockdep_pin_lock(&rq->lock);
next = pick_next_task(rq, &fake_task);
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
+ /*
+ * Rules for changing task_struct::cpus_allowed are holding
+ * both pi_lock and rq->lock, such that holding either
+ * stabilizes the mask.
+ *
+ * Drop rq->lock is not quite as disastrous as it usually is
+ * because !cpu_active at this point, which means load-balance
+ * will not interfere. Also, stop-machine.
+ */
+ lockdep_unpin_lock(&rq->lock);
+ raw_spin_unlock(&rq->lock);
+ raw_spin_lock(&next->pi_lock);
+ raw_spin_lock(&rq->lock);
+
+ /*
+ * Since we're inside stop-machine, _nothing_ should have
+ * changed the task, WARN if weird stuff happened, because in
+ * that case the above rq->lock drop is a fail too.
+ */
+ if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
+ raw_spin_unlock(&next->pi_lock);
+ continue;
+ }
+
/* Find suitable destination for @next, with force if needed. */
dest_cpu = select_fallback_rq(dead_rq->cpu, next);
- lockdep_unpin_lock(&rq->lock);
rq = __migrate_task(rq, next, dest_cpu);
if (rq != dead_rq) {
raw_spin_unlock(&rq->lock);
rq = dead_rq;
raw_spin_lock(&rq->lock);
}
+ raw_spin_unlock(&next->pi_lock);
}
rq->stop = stop;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3319e16f31e5..7c7ec4515983 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -290,16 +290,17 @@ static int __init tick_nohz_full_setup(char *str)
__setup("nohz_full=", tick_nohz_full_setup);
static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+ unsigned long action,
+ void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_DOWN_PREPARE:
/*
- * If we handle the timekeeping duty for full dynticks CPUs,
- * we can't safely shutdown that CPU.
+ * The boot CPU handles housekeeping duty (unbound timers,
+ * workqueues, timekeeping, ...) on behalf of full dynticks
+ * CPUs. It must remain online when nohz full is enabled.
*/
if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
return NOTIFY_BAD;
@@ -370,6 +371,12 @@ void __init tick_nohz_init(void)
cpu_notifier(tick_nohz_cpu_down_callback, 0);
pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
cpumask_pr_args(tick_nohz_full_mask));
+
+ /*
+ * We need at least one CPU to handle housekeeping work such
+ * as timekeeping, unbound timers, workqueues, ...
+ */
+ WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
}
#endif
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2015-07-04 11:27 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2015-07-04 11:27 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 397f2378f136128623fc237746157aa2564d1082 sched/numa: Fix numa balancing stats in /proc/pid/sched
Debug info and other statistics fixes and related enhancements.
Thanks,
Ingo
------------------>
Naveen N. Rao (2):
sched/stat: Simplify the sched_info accounting dependency
sched/stat: Expose /proc/pid/schedstat if CONFIG_SCHED_INFO=y
Srikar Dronamraju (3):
sched/debug: Move print_cfs_rq() declaration to kernel/sched/sched.h
sched/numa: Show numa_group ID in /proc/sched_debug task listings
sched/numa: Fix numa balancing stats in /proc/pid/sched
fs/proc/base.c | 11 +++++++----
include/linux/sched.h | 8 +++-----
init/Kconfig | 1 +
kernel/sched/core.c | 2 +-
kernel/sched/debug.c | 40 ++++++++++++++++++----------------------
kernel/sched/fair.c | 22 +++++++++++++++++++++-
kernel/sched/sched.h | 13 +++++++++++++
kernel/sched/stats.h | 4 ++--
lib/Kconfig.debug | 5 +++++
9 files changed, 71 insertions(+), 35 deletions(-)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 286a422f440e..ad63fa3306df 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -304,14 +304,17 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
}
#endif
-#ifdef CONFIG_SCHEDSTATS
+#ifdef CONFIG_SCHED_INFO
/*
* Provides /proc/PID/schedstat
*/
static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
- seq_printf(m, "%llu %llu %lu\n",
+ if (unlikely(!sched_info_on()))
+ seq_printf(m, "0 0 0\n");
+ else
+ seq_printf(m, "%llu %llu %lu\n",
(unsigned long long)task->se.sum_exec_runtime,
(unsigned long long)task->sched_info.run_delay,
task->sched_info.pcount);
@@ -2600,7 +2603,7 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_STACKTRACE
ONE("stack", S_IRUSR, proc_pid_stack),
#endif
-#ifdef CONFIG_SCHEDSTATS
+#ifdef CONFIG_SCHED_INFO
ONE("schedstat", S_IRUGO, proc_pid_schedstat),
#endif
#ifdef CONFIG_LATENCYTOP
@@ -2948,7 +2951,7 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_STACKTRACE
ONE("stack", S_IRUSR, proc_pid_stack),
#endif
-#ifdef CONFIG_SCHEDSTATS
+#ifdef CONFIG_SCHED_INFO
ONE("schedstat", S_IRUGO, proc_pid_schedstat),
#endif
#ifdef CONFIG_LATENCYTOP
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6633e83e608a..3505352dd296 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -191,8 +191,6 @@ struct task_group;
#ifdef CONFIG_SCHED_DEBUG
extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
extern void proc_sched_set_task(struct task_struct *p);
-extern void
-print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
#endif
/*
@@ -849,7 +847,7 @@ extern struct user_struct root_user;
struct backing_dev_info;
struct reclaim_state;
-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+#ifdef CONFIG_SCHED_INFO
struct sched_info {
/* cumulative counters */
unsigned long pcount; /* # of times run on this cpu */
@@ -859,7 +857,7 @@ struct sched_info {
unsigned long long last_arrival,/* when we last ran on a cpu */
last_queued; /* when we were last queued to run */
};
-#endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */
+#endif /* CONFIG_SCHED_INFO */
#ifdef CONFIG_TASK_DELAY_ACCT
struct task_delay_info {
@@ -1408,7 +1406,7 @@ struct task_struct {
int rcu_tasks_idle_cpu;
#endif /* #ifdef CONFIG_TASKS_RCU */
-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+#ifdef CONFIG_SCHED_INFO
struct sched_info sched_info;
#endif
diff --git a/init/Kconfig b/init/Kconfig
index b999fa381bf9..12cb556ad160 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -435,6 +435,7 @@ config TASKSTATS
config TASK_DELAY_ACCT
bool "Enable per-task delay accounting"
depends on TASKSTATS
+ select SCHED_INFO
help
Collect information on time spent by a task waiting for system
resources like cpu, synchronous block I/O completion and swapping
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c86935a7f1f8..abb8785ed1ba 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1975,7 +1975,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
set_task_cpu(p, cpu);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+#ifdef CONFIG_SCHED_INFO
if (likely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 315c68e015d9..4222ec50ab88 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -142,7 +142,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
0LL, 0L);
#endif
#ifdef CONFIG_NUMA_BALANCING
- SEQ_printf(m, " %d", task_node(p));
+ SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
#endif
#ifdef CONFIG_CGROUP_SCHED
SEQ_printf(m, " %s", task_group_path(task_group(p)));
@@ -517,11 +517,21 @@ __initcall(init_sched_debug_procfs);
SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+#ifdef CONFIG_NUMA_BALANCING
+void print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
+ unsigned long tpf, unsigned long gsf, unsigned long gpf)
+{
+ SEQ_printf(m, "numa_faults node=%d ", node);
+ SEQ_printf(m, "task_private=%lu task_shared=%lu ", tsf, tpf);
+ SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gsf, gpf);
+}
+#endif
+
+
static void sched_show_numa(struct task_struct *p, struct seq_file *m)
{
#ifdef CONFIG_NUMA_BALANCING
struct mempolicy *pol;
- int node, i;
if (p->mm)
P(mm->numa_scan_seq);
@@ -533,26 +543,12 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
mpol_get(pol);
task_unlock(p);
- SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0));
-
- for_each_online_node(node) {
- for (i = 0; i < 2; i++) {
- unsigned long nr_faults = -1;
- int cpu_current, home_node;
-
- if (p->numa_faults)
- nr_faults = p->numa_faults[2*node + i];
-
- cpu_current = !i ? (task_node(p) == node) :
- (pol && node_isset(node, pol->v.nodes));
-
- home_node = (p->numa_preferred_nid == node);
-
- SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n",
- i, node, cpu_current, home_node, nr_faults);
- }
- }
-
+ P(numa_pages_migrated);
+ P(numa_preferred_nid);
+ P(total_numa_faults);
+ SEQ_printf(m, "current_node=%d, numa_group_id=%d\n",
+ task_node(p), task_numa_group_id(p));
+ show_numa_stats(p, m);
mpol_put(pol);
#endif
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 40a7fcbf491e..7245039fc67b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8468,7 +8468,27 @@ void print_cfs_stats(struct seq_file *m, int cpu)
print_cfs_rq(m, cpu, cfs_rq);
rcu_read_unlock();
}
-#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+void show_numa_stats(struct task_struct *p, struct seq_file *m)
+{
+ int node;
+ unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
+
+ for_each_online_node(node) {
+ if (p->numa_faults) {
+ tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
+ tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
+ }
+ if (p->numa_group) {
+ gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
+ gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
+ }
+ print_numa_stats(m, node, tsf, tpf, gsf, gpf);
+ }
+}
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_SCHED_DEBUG */
__init void init_sched_fair_class(void)
{
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index aea7c1f393cb..7ef596837dac 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1668,9 +1668,22 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
+
+#ifdef CONFIG_SCHED_DEBUG
extern void print_cfs_stats(struct seq_file *m, int cpu);
extern void print_rt_stats(struct seq_file *m, int cpu);
extern void print_dl_stats(struct seq_file *m, int cpu);
+extern void
+print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
+
+#ifdef CONFIG_NUMA_BALANCING
+extern void
+show_numa_stats(struct task_struct *p, struct seq_file *m);
+extern void
+print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
+ unsigned long tpf, unsigned long gsf, unsigned long gpf);
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_SCHED_DEBUG */
extern void init_cfs_rq(struct cfs_rq *cfs_rq);
extern void init_rt_rq(struct rt_rq *rt_rq);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 077ebbd5e10f..b0fbc7632de5 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -47,7 +47,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
# define schedstat_set(var, val) do { } while (0)
#endif
-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+#ifdef CONFIG_SCHED_INFO
static inline void sched_info_reset_dequeued(struct task_struct *t)
{
t->sched_info.last_queued = 0;
@@ -156,7 +156,7 @@ sched_info_switch(struct rq *rq,
#define sched_info_depart(rq, t) do { } while (0)
#define sched_info_arrive(rq, next) do { } while (0)
#define sched_info_switch(rq, t, next) do { } while (0)
-#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
+#endif /* CONFIG_SCHED_INFO */
/*
* The following are functions that support scheduler-internal time accounting.
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index b908048f8d6a..e2894b23efb6 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -841,9 +841,14 @@ config SCHED_DEBUG
that can help debug the scheduler. The runtime overhead of this
option is minimal.
+config SCHED_INFO
+ bool
+ default n
+
config SCHEDSTATS
bool "Collect scheduler statistics"
depends on DEBUG_KERNEL && PROC_FS
+ select SCHED_INFO
help
If you say Y here, additional code will be inserted into the
scheduler and related routines to collect statistics about
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2015-05-15 7:19 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2015-05-15 7:19 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 533445c6e53368569e50ab3fb712230c03d523f3 sched/core: Fix regression in cpuset_cpu_inactive() for suspend
Two fixes: a suspend/resume related regression fix, and an RT priority
boosting fix.
Thanks,
Ingo
------------------>
Omar Sandoval (1):
sched/core: Fix regression in cpuset_cpu_inactive() for suspend
Thomas Gleixner (1):
sched: Handle priority boosted tasks proper in setscheduler()
include/linux/sched/rt.h | 7 ++++---
kernel/locking/rtmutex.c | 12 ++++++-----
kernel/sched/core.c | 54 +++++++++++++++++++++++-------------------------
3 files changed, 37 insertions(+), 36 deletions(-)
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index 6341f5be6e24..a30b172df6e1 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -18,7 +18,7 @@ static inline int rt_task(struct task_struct *p)
#ifdef CONFIG_RT_MUTEXES
extern int rt_mutex_getprio(struct task_struct *p);
extern void rt_mutex_setprio(struct task_struct *p, int prio);
-extern int rt_mutex_check_prio(struct task_struct *task, int newprio);
+extern int rt_mutex_get_effective_prio(struct task_struct *task, int newprio);
extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task);
extern void rt_mutex_adjust_pi(struct task_struct *p);
static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
@@ -31,9 +31,10 @@ static inline int rt_mutex_getprio(struct task_struct *p)
return p->normal_prio;
}
-static inline int rt_mutex_check_prio(struct task_struct *task, int newprio)
+static inline int rt_mutex_get_effective_prio(struct task_struct *task,
+ int newprio)
{
- return 0;
+ return newprio;
}
static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index b73279367087..b025295f4966 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -265,15 +265,17 @@ struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
}
/*
- * Called by sched_setscheduler() to check whether the priority change
- * is overruled by a possible priority boosting.
+ * Called by sched_setscheduler() to get the priority which will be
+ * effective after the change.
*/
-int rt_mutex_check_prio(struct task_struct *task, int newprio)
+int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
{
if (!task_has_pi_waiters(task))
- return 0;
+ return newprio;
- return task_top_pi_waiter(task)->task->prio <= newprio;
+ if (task_top_pi_waiter(task)->task->prio <= newprio)
+ return task_top_pi_waiter(task)->task->prio;
+ return newprio;
}
/*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fe22f7510bce..57bd333bc4ab 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3300,15 +3300,18 @@ static void __setscheduler_params(struct task_struct *p,
/* Actually do priority change: must hold pi & rq lock. */
static void __setscheduler(struct rq *rq, struct task_struct *p,
- const struct sched_attr *attr)
+ const struct sched_attr *attr, bool keep_boost)
{
__setscheduler_params(p, attr);
/*
- * If we get here, there was no pi waiters boosting the
- * task. It is safe to use the normal prio.
+ * Keep a potential priority boosting if called from
+ * sched_setscheduler().
*/
- p->prio = normal_prio(p);
+ if (keep_boost)
+ p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
+ else
+ p->prio = normal_prio(p);
if (dl_prio(p->prio))
p->sched_class = &dl_sched_class;
@@ -3408,7 +3411,7 @@ static int __sched_setscheduler(struct task_struct *p,
int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
MAX_RT_PRIO - 1 - attr->sched_priority;
int retval, oldprio, oldpolicy = -1, queued, running;
- int policy = attr->sched_policy;
+ int new_effective_prio, policy = attr->sched_policy;
unsigned long flags;
const struct sched_class *prev_class;
struct rq *rq;
@@ -3590,15 +3593,14 @@ static int __sched_setscheduler(struct task_struct *p,
oldprio = p->prio;
/*
- * Special case for priority boosted tasks.
- *
- * If the new priority is lower or equal (user space view)
- * than the current (boosted) priority, we just store the new
+ * Take priority boosted tasks into account. If the new
+ * effective priority is unchanged, we just store the new
* normal parameters and do not touch the scheduler class and
* the runqueue. This will be done when the task deboost
* itself.
*/
- if (rt_mutex_check_prio(p, newprio)) {
+ new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+ if (new_effective_prio == oldprio) {
__setscheduler_params(p, attr);
task_rq_unlock(rq, p, &flags);
return 0;
@@ -3612,7 +3614,7 @@ static int __sched_setscheduler(struct task_struct *p,
put_prev_task(rq, p);
prev_class = p->sched_class;
- __setscheduler(rq, p, attr);
+ __setscheduler(rq, p, attr, true);
if (running)
p->sched_class->set_curr_task(rq);
@@ -6997,27 +6999,23 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
unsigned long flags;
long cpu = (long)hcpu;
struct dl_bw *dl_b;
+ bool overflow;
+ int cpus;
- switch (action & ~CPU_TASKS_FROZEN) {
+ switch (action) {
case CPU_DOWN_PREPARE:
- /* explicitly allow suspend */
- if (!(action & CPU_TASKS_FROZEN)) {
- bool overflow;
- int cpus;
-
- rcu_read_lock_sched();
- dl_b = dl_bw_of(cpu);
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(cpu);
- raw_spin_lock_irqsave(&dl_b->lock, flags);
- cpus = dl_bw_cpus(cpu);
- overflow = __dl_overflow(dl_b, cpus, 0, 0);
- raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ cpus = dl_bw_cpus(cpu);
+ overflow = __dl_overflow(dl_b, cpus, 0, 0);
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
- rcu_read_unlock_sched();
+ rcu_read_unlock_sched();
- if (overflow)
- return notifier_from_errno(-EBUSY);
- }
+ if (overflow)
+ return notifier_from_errno(-EBUSY);
cpuset_update_active_cpus(false);
break;
case CPU_DOWN_PREPARE_FROZEN:
@@ -7346,7 +7344,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
queued = task_on_rq_queued(p);
if (queued)
dequeue_task(rq, p, 0);
- __setscheduler(rq, p, &attr);
+ __setscheduler(rq, p, &attr, false);
if (queued) {
enqueue_task(rq, p, 0);
resched_curr(rq);
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2015-02-20 13:42 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2015-02-20 13:42 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 2636ed5f8d15ff9395731593537b4b3fdf2af24d sched/rt: Avoid obvious configuration fail
This tree contains misc fixes: preempt_schedule_common()
and io_schedule() recursion fixes, sched/dl fixes, a
completion_done() revert, two sched/rt fixes and a comment
update patch.
Thanks,
Ingo
------------------>
Frederic Weisbecker (1):
sched: Fix preempt_schedule_common() triggering tracing recursion
Kirill Tkhai (2):
sched/dl: Prevent enqueue of a sleeping task in dl_task_timer()
sched/dl: Do update_rq_clock() in yield_task_dl()
NeilBrown (1):
sched: Prevent recursion in io_schedule()
Oleg Nesterov (1):
sched/completion: Serialize completion_done() with complete()
Peter Zijlstra (4):
sched: Clarify ordering between task_rq_lock() and move_queued_task()
sched: Make dl_task_time() use task_rq_lock()
sched/autogroup: Fix failure to set cpu.rt_runtime_us
sched/rt: Avoid obvious configuration fail
include/linux/sched.h | 10 ++--
kernel/sched/auto_group.c | 6 +--
kernel/sched/completion.c | 19 +++++++-
kernel/sched/core.c | 113 ++++++++++++----------------------------------
kernel/sched/deadline.c | 33 ++++++++++----
kernel/sched/sched.h | 76 +++++++++++++++++++++++++++++++
6 files changed, 155 insertions(+), 102 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8db31ef98d2f..cb5cdc777c8a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -363,9 +363,6 @@ extern void show_regs(struct pt_regs *);
*/
extern void show_stack(struct task_struct *task, unsigned long *sp);
-void io_schedule(void);
-long io_schedule_timeout(long timeout);
-
extern void cpu_init (void);
extern void trap_init(void);
extern void update_process_times(int user);
@@ -422,6 +419,13 @@ extern signed long schedule_timeout_uninterruptible(signed long timeout);
asmlinkage void schedule(void);
extern void schedule_preempt_disabled(void);
+extern long io_schedule_timeout(long timeout);
+
+static inline void io_schedule(void)
+{
+ io_schedule_timeout(MAX_SCHEDULE_TIMEOUT);
+}
+
struct nsproxy;
struct user_namespace;
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 8a2e230fb86a..eae160dd669d 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -87,8 +87,7 @@ static inline struct autogroup *autogroup_create(void)
* so we don't have to move tasks around upon policy change,
* or flail around trying to allocate bandwidth on the fly.
* A bandwidth exception in __sched_setscheduler() allows
- * the policy change to proceed. Thereafter, task_group()
- * returns &root_task_group, so zero bandwidth is required.
+ * the policy change to proceed.
*/
free_rt_sched_group(tg);
tg->rt_se = root_task_group.rt_se;
@@ -115,9 +114,6 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
if (tg != &root_task_group)
return false;
- if (p->sched_class != &fair_sched_class)
- return false;
-
/*
* We can only assume the task group can't go away on us if
* autogroup_move_group() can see us on ->thread_group list.
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 7052d3fd4e7b..8d0f35debf35 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -274,7 +274,7 @@ bool try_wait_for_completion(struct completion *x)
* first without taking the lock so we can
* return early in the blocking case.
*/
- if (!ACCESS_ONCE(x->done))
+ if (!READ_ONCE(x->done))
return 0;
spin_lock_irqsave(&x->wait.lock, flags);
@@ -297,6 +297,21 @@ EXPORT_SYMBOL(try_wait_for_completion);
*/
bool completion_done(struct completion *x)
{
- return !!ACCESS_ONCE(x->done);
+ if (!READ_ONCE(x->done))
+ return false;
+
+ /*
+ * If ->done, we need to wait for complete() to release ->wait.lock
+ * otherwise we can end up freeing the completion before complete()
+ * is done referencing it.
+ *
+ * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders
+ * the loads of ->done and ->wait.lock such that we cannot observe
+ * the lock before complete() acquires it while observing the ->done
+ * after it's acquired the lock.
+ */
+ smp_rmb();
+ spin_unlock_wait(&x->wait.lock);
+ return true;
}
EXPORT_SYMBOL(completion_done);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1f37fe7f77a4..a4869bd426ca 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -307,66 +307,6 @@ __read_mostly int scheduler_running;
int sysctl_sched_rt_runtime = 950000;
/*
- * __task_rq_lock - lock the rq @p resides on.
- */
-static inline struct rq *__task_rq_lock(struct task_struct *p)
- __acquires(rq->lock)
-{
- struct rq *rq;
-
- lockdep_assert_held(&p->pi_lock);
-
- for (;;) {
- rq = task_rq(p);
- raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
- return rq;
- raw_spin_unlock(&rq->lock);
-
- while (unlikely(task_on_rq_migrating(p)))
- cpu_relax();
- }
-}
-
-/*
- * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
- */
-static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
- __acquires(p->pi_lock)
- __acquires(rq->lock)
-{
- struct rq *rq;
-
- for (;;) {
- raw_spin_lock_irqsave(&p->pi_lock, *flags);
- rq = task_rq(p);
- raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
- return rq;
- raw_spin_unlock(&rq->lock);
- raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
-
- while (unlikely(task_on_rq_migrating(p)))
- cpu_relax();
- }
-}
-
-static void __task_rq_unlock(struct rq *rq)
- __releases(rq->lock)
-{
- raw_spin_unlock(&rq->lock);
-}
-
-static inline void
-task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
- __releases(rq->lock)
- __releases(p->pi_lock)
-{
- raw_spin_unlock(&rq->lock);
- raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
-}
-
-/*
* this_rq_lock - lock this runqueue and disable interrupts.
*/
static struct rq *this_rq_lock(void)
@@ -2899,7 +2839,7 @@ void __sched schedule_preempt_disabled(void)
preempt_disable();
}
-static void preempt_schedule_common(void)
+static void __sched notrace preempt_schedule_common(void)
{
do {
__preempt_count_add(PREEMPT_ACTIVE);
@@ -4418,36 +4358,29 @@ EXPORT_SYMBOL_GPL(yield_to);
* This task is about to go to sleep on IO. Increment rq->nr_iowait so
* that process accounting knows that this is a task in IO wait state.
*/
-void __sched io_schedule(void)
-{
- struct rq *rq = raw_rq();
-
- delayacct_blkio_start();
- atomic_inc(&rq->nr_iowait);
- blk_flush_plug(current);
- current->in_iowait = 1;
- schedule();
- current->in_iowait = 0;
- atomic_dec(&rq->nr_iowait);
- delayacct_blkio_end();
-}
-EXPORT_SYMBOL(io_schedule);
-
long __sched io_schedule_timeout(long timeout)
{
- struct rq *rq = raw_rq();
+ int old_iowait = current->in_iowait;
+ struct rq *rq;
long ret;
+ current->in_iowait = 1;
+ if (old_iowait)
+ blk_schedule_flush_plug(current);
+ else
+ blk_flush_plug(current);
+
delayacct_blkio_start();
+ rq = raw_rq();
atomic_inc(&rq->nr_iowait);
- blk_flush_plug(current);
- current->in_iowait = 1;
ret = schedule_timeout(timeout);
- current->in_iowait = 0;
+ current->in_iowait = old_iowait;
atomic_dec(&rq->nr_iowait);
delayacct_blkio_end();
+
return ret;
}
+EXPORT_SYMBOL(io_schedule_timeout);
/**
* sys_sched_get_priority_max - return maximum RT priority.
@@ -7644,6 +7577,12 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
{
struct task_struct *g, *p;
+ /*
+ * Autogroups do not have RT tasks; see autogroup_create().
+ */
+ if (task_group_is_autogroup(tg))
+ return 0;
+
for_each_process_thread(g, p) {
if (rt_task(p) && task_group(p) == tg)
return 1;
@@ -7736,6 +7675,17 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
{
int i, err = 0;
+ /*
+ * Disallowing the root group RT runtime is BAD, it would disallow the
+ * kernel creating (and or operating) RT threads.
+ */
+ if (tg == &root_task_group && rt_runtime == 0)
+ return -EINVAL;
+
+ /* No period doesn't make any sense. */
+ if (rt_period == 0)
+ return -EINVAL;
+
mutex_lock(&rt_constraints_mutex);
read_lock(&tasklist_lock);
err = __rt_schedulable(tg, rt_period, rt_runtime);
@@ -7792,9 +7742,6 @@ static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
rt_period = (u64)rt_period_us * NSEC_PER_USEC;
rt_runtime = tg->rt_bandwidth.rt_runtime;
- if (rt_period == 0)
- return -EINVAL;
-
return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index a027799ae130..3fa8fa6d9403 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -511,16 +511,10 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
struct sched_dl_entity,
dl_timer);
struct task_struct *p = dl_task_of(dl_se);
+ unsigned long flags;
struct rq *rq;
-again:
- rq = task_rq(p);
- raw_spin_lock(&rq->lock);
- if (rq != task_rq(p)) {
- /* Task was moved, retrying. */
- raw_spin_unlock(&rq->lock);
- goto again;
- }
+ rq = task_rq_lock(current, &flags);
/*
* We need to take care of several possible races here:
@@ -541,6 +535,26 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
sched_clock_tick();
update_rq_clock(rq);
+
+ /*
+ * If the throttle happened during sched-out; like:
+ *
+ * schedule()
+ * deactivate_task()
+ * dequeue_task_dl()
+ * update_curr_dl()
+ * start_dl_timer()
+ * __dequeue_task_dl()
+ * prev->on_rq = 0;
+ *
+ * We can be both throttled and !queued. Replenish the counter
+ * but do not enqueue -- wait for our wakeup to do that.
+ */
+ if (!task_on_rq_queued(p)) {
+ replenish_dl_entity(dl_se, dl_se);
+ goto unlock;
+ }
+
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
if (dl_task(rq->curr))
check_preempt_curr_dl(rq, p, 0);
@@ -555,7 +569,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
push_dl_task(rq);
#endif
unlock:
- raw_spin_unlock(&rq->lock);
+ task_rq_unlock(rq, current, &flags);
return HRTIMER_NORESTART;
}
@@ -898,6 +912,7 @@ static void yield_task_dl(struct rq *rq)
rq->curr->dl.dl_yielded = 1;
p->dl.runtime = 0;
}
+ update_rq_clock(rq);
update_curr_dl(rq);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0870db23d79c..dc0f435a2779 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1380,6 +1380,82 @@ static inline void sched_avg_update(struct rq *rq) { }
extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
+/*
+ * __task_rq_lock - lock the rq @p resides on.
+ */
+static inline struct rq *__task_rq_lock(struct task_struct *p)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ lockdep_assert_held(&p->pi_lock);
+
+ for (;;) {
+ rq = task_rq(p);
+ raw_spin_lock(&rq->lock);
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+ return rq;
+ raw_spin_unlock(&rq->lock);
+
+ while (unlikely(task_on_rq_migrating(p)))
+ cpu_relax();
+ }
+}
+
+/*
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
+ */
+static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+ __acquires(p->pi_lock)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ for (;;) {
+ raw_spin_lock_irqsave(&p->pi_lock, *flags);
+ rq = task_rq(p);
+ raw_spin_lock(&rq->lock);
+ /*
+ * move_queued_task() task_rq_lock()
+ *
+ * ACQUIRE (rq->lock)
+ * [S] ->on_rq = MIGRATING [L] rq = task_rq()
+ * WMB (__set_task_cpu()) ACQUIRE (rq->lock);
+ * [S] ->cpu = new_cpu [L] task_rq()
+ * [L] ->on_rq
+ * RELEASE (rq->lock)
+ *
+ * If we observe the old cpu in task_rq_lock, the acquire of
+ * the old rq->lock will fully serialize against the stores.
+ *
+ * If we observe the new cpu in task_rq_lock, the acquire will
+ * pair with the WMB to ensure we must then also see migrating.
+ */
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+ return rq;
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+
+ while (unlikely(task_on_rq_migrating(p)))
+ cpu_relax();
+ }
+}
+
+static inline void __task_rq_unlock(struct rq *rq)
+ __releases(rq->lock)
+{
+ raw_spin_unlock(&rq->lock);
+}
+
+static inline void
+task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
+ __releases(rq->lock)
+ __releases(p->pi_lock)
+{
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+}
+
#ifdef CONFIG_SMP
#ifdef CONFIG_PREEMPT
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2015-02-06 18:31 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2015-02-06 18:31 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 40767b0dc768060266d261b4a330164b4be53f7c sched/deadline: Fix deadline parameter modification handling
Misc fixes.
Thanks,
Ingo
------------------>
Jan Beulich (1):
sched/fair: Avoid using uninitialized variable in preferred_group_nid()
Mike Galbraith (1):
sched: Fix crash if cpuset_cpumask_can_shrink() is passed an empty cpumask
Mikulas Patocka (1):
sched/wait: Remove might_sleep() from wait_event_cmd()
Peter Zijlstra (1):
sched/deadline: Fix deadline parameter modification handling
include/linux/wait.h | 1 -
kernel/sched/core.c | 36 +++++++++++++++++++++++++++++++-----
kernel/sched/deadline.c | 3 ++-
kernel/sched/fair.c | 2 +-
4 files changed, 34 insertions(+), 8 deletions(-)
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 2232ed16635a..37423e0e1379 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -363,7 +363,6 @@ do { \
*/
#define wait_event_cmd(wq, condition, cmd1, cmd2) \
do { \
- might_sleep(); \
if (condition) \
break; \
__wait_event_cmd(wq, condition, cmd1, cmd2); \
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c0accc00566e..9e838095beb8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1814,6 +1814,10 @@ void __dl_clear_params(struct task_struct *p)
dl_se->dl_period = 0;
dl_se->flags = 0;
dl_se->dl_bw = 0;
+
+ dl_se->dl_throttled = 0;
+ dl_se->dl_new = 1;
+ dl_se->dl_yielded = 0;
}
/*
@@ -1839,7 +1843,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
#endif
RB_CLEAR_NODE(&p->dl.rb_node);
- hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ init_dl_task_timer(&p->dl);
__dl_clear_params(p);
INIT_LIST_HEAD(&p->rt.run_list);
@@ -2049,6 +2053,9 @@ static inline int dl_bw_cpus(int i)
* allocated bandwidth to reflect the new situation.
*
* This function is called while holding p's rq->lock.
+ *
+ * XXX we should delay bw change until the task's 0-lag point, see
+ * __setparam_dl().
*/
static int dl_overflow(struct task_struct *p, int policy,
const struct sched_attr *attr)
@@ -3251,15 +3258,31 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
{
struct sched_dl_entity *dl_se = &p->dl;
- init_dl_task_timer(dl_se);
dl_se->dl_runtime = attr->sched_runtime;
dl_se->dl_deadline = attr->sched_deadline;
dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
dl_se->flags = attr->sched_flags;
dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
- dl_se->dl_throttled = 0;
- dl_se->dl_new = 1;
- dl_se->dl_yielded = 0;
+
+ /*
+ * Changing the parameters of a task is 'tricky' and we're not doing
+ * the correct thing -- also see task_dead_dl() and switched_from_dl().
+ *
+ * What we SHOULD do is delay the bandwidth release until the 0-lag
+ * point. This would include retaining the task_struct until that time
+ * and change dl_overflow() to not immediately decrement the current
+ * amount.
+ *
+ * Instead we retain the current runtime/deadline and let the new
+ * parameters take effect after the current reservation period lapses.
+ * This is safe (albeit pessimistic) because the 0-lag point is always
+ * before the current scheduling deadline.
+ *
+ * We can still have temporary overloads because we do not delay the
+ * change in bandwidth until that time; so admission control is
+ * not on the safe side. It does however guarantee tasks will never
+ * consume more than promised.
+ */
}
/*
@@ -4642,6 +4665,9 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur,
struct dl_bw *cur_dl_b;
unsigned long flags;
+ if (!cpumask_weight(cur))
+ return ret;
+
rcu_read_lock_sched();
cur_dl_b = dl_bw_of(cpumask_any(cur));
trial_cpus = cpumask_weight(trial);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b52092f2636d..726470d47f87 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1094,6 +1094,7 @@ static void task_dead_dl(struct task_struct *p)
* Since we are TASK_DEAD we won't slip out of the domain!
*/
raw_spin_lock_irq(&dl_b->lock);
+ /* XXX we should retain the bw until 0-lag */
dl_b->total_bw -= p->dl.dl_bw;
raw_spin_unlock_irq(&dl_b->lock);
@@ -1614,8 +1615,8 @@ static void cancel_dl_timer(struct rq *rq, struct task_struct *p)
static void switched_from_dl(struct rq *rq, struct task_struct *p)
{
+ /* XXX we should retain the bw until 0-lag */
cancel_dl_timer(rq, p);
-
__dl_clear_params(p);
/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 40667cbf371b..fe331fc391f5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1730,7 +1730,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
nodes = node_online_map;
for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
unsigned long max_faults = 0;
- nodemask_t max_group;
+ nodemask_t max_group = NODE_MASK_NONE;
int a, b;
/* Are there nodes at this distance from each other? */
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2015-01-11 8:47 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2015-01-11 8:47 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton,
Alexander Viro
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 7f1a169b88f513e32a432ca0f85bfd282d117bd6 sched/fair: Fix RCU stall upon -ENOMEM in sched_create_group()
[ Note: the fs/notify/fanotify/fanotify_user.c fix is an out of
tree fix, found by nested sleep debugging - I hope it's fine to
merge it this way, Al Cc:-ed. ]
Misc fixes: group scheduling corner case fix, two deadline
scheduler fixes, effective_load() overflow fix, nested sleep fix,
6144 CPUs system fix.
Thanks,
Ingo
------------------>
Alex Thorlton (1):
sched: Fix KMALLOC_MAX_SIZE overflow during cpumask allocation
Luca Abeni (2):
sched/deadline: Fix migration of SCHED_DEADLINE tasks
sched/deadline: Avoid double-accounting in case of missed deadlines
Peter Zijlstra (1):
sched, fanotify: Deal with nested sleeps
Tetsuo Handa (1):
sched/fair: Fix RCU stall upon -ENOMEM in sched_create_group()
Yuyang Du (1):
sched: Fix odd values in effective_load() calculations
fs/notify/fanotify/fanotify_user.c | 10 +++++-----
kernel/sched/core.c | 13 +++++--------
kernel/sched/deadline.c | 25 ++++---------------------
kernel/sched/fair.c | 6 +++++-
4 files changed, 19 insertions(+), 35 deletions(-)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index c991616acca9..bff8567aa42d 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -259,16 +259,15 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
struct fsnotify_event *kevent;
char __user *start;
int ret;
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
start = buf;
group = file->private_data;
pr_debug("%s: group=%p\n", __func__, group);
+ add_wait_queue(&group->notification_waitq, &wait);
while (1) {
- prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE);
-
mutex_lock(&group->notification_mutex);
kevent = get_one_event(group, count);
mutex_unlock(&group->notification_mutex);
@@ -289,7 +288,8 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
if (start != buf)
break;
- schedule();
+
+ wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
continue;
}
@@ -318,8 +318,8 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
buf += ret;
count -= ret;
}
+ remove_wait_queue(&group->notification_waitq, &wait);
- finish_wait(&group->notification_waitq, &wait);
if (start != buf && ret != -EFAULT)
ret = buf - start;
return ret;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b5797b78add6..c0accc00566e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7113,9 +7113,6 @@ void __init sched_init(void)
#ifdef CONFIG_RT_GROUP_SCHED
alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
-#ifdef CONFIG_CPUMASK_OFFSTACK
- alloc_size += num_possible_cpus() * cpumask_size();
-#endif
if (alloc_size) {
ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
@@ -7135,13 +7132,13 @@ void __init sched_init(void)
ptr += nr_cpu_ids * sizeof(void **);
#endif /* CONFIG_RT_GROUP_SCHED */
+ }
#ifdef CONFIG_CPUMASK_OFFSTACK
- for_each_possible_cpu(i) {
- per_cpu(load_balance_mask, i) = (void *)ptr;
- ptr += cpumask_size();
- }
-#endif /* CONFIG_CPUMASK_OFFSTACK */
+ for_each_possible_cpu(i) {
+ per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
+ cpumask_size(), GFP_KERNEL, cpu_to_node(i));
}
+#endif /* CONFIG_CPUMASK_OFFSTACK */
init_rt_bandwidth(&def_rt_bandwidth,
global_rt_period(), global_rt_runtime());
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index e5db8c6feebd..b52092f2636d 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -570,24 +570,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
static
int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
{
- int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq));
- int rorun = dl_se->runtime <= 0;
-
- if (!rorun && !dmiss)
- return 0;
-
- /*
- * If we are beyond our current deadline and we are still
- * executing, then we have already used some of the runtime of
- * the next instance. Thus, if we do not account that, we are
- * stealing bandwidth from the system at each deadline miss!
- */
- if (dmiss) {
- dl_se->runtime = rorun ? dl_se->runtime : 0;
- dl_se->runtime -= rq_clock(rq) - dl_se->deadline;
- }
-
- return 1;
+ return (dl_se->runtime <= 0);
}
extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
@@ -826,10 +809,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
* parameters of the task might need updating. Otherwise,
* we want a replenishment of its runtime.
*/
- if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH)
- replenish_dl_entity(dl_se, pi_se);
- else
+ if (dl_se->dl_new || flags & ENQUEUE_WAKEUP)
update_dl_entity(dl_se, pi_se);
+ else if (flags & ENQUEUE_REPLENISH)
+ replenish_dl_entity(dl_se, pi_se);
__enqueue_dl_entity(dl_se);
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index df2cdf77f899..40667cbf371b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4005,6 +4005,10 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
+ /* init_cfs_bandwidth() was not called */
+ if (!cfs_b->throttled_cfs_rq.next)
+ return;
+
hrtimer_cancel(&cfs_b->period_timer);
hrtimer_cancel(&cfs_b->slack_timer);
}
@@ -4424,7 +4428,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
* wl = S * s'_i; see (2)
*/
if (W > 0 && w < W)
- wl = (w * tg->shares) / W;
+ wl = (w * (long)tg->shares) / W;
else
wl = tg->shares;
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2014-11-20 7:57 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2014-11-20 7:57 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 6e998916dfe327e785e7c2447959b2c1a3ea4930 sched/cputime: Fix clock_nanosleep()/clock_gettime() inconsistency
Misc fixes: two NUMA fixes, two cputime fixes and an RCU/lockdep
fix.
Thanks,
Ingo
------------------>
Andrey Ryabinin (1):
sched/numa: Fix out of bounds read in sched_init_numa()
Kirill Tkhai (1):
sched: Remove lockdep check in sched_move_task()
Peter Zijlstra (2):
sched/numa: Avoid selecting oneself as swap target
sched/cputime: Fix cpu_timer_sample_group() double accounting
Stanislaw Gruszka (1):
sched/cputime: Fix clock_nanosleep()/clock_gettime() inconsistency
include/linux/kernel_stat.h | 5 ----
kernel/sched/core.c | 63 ++++++++++++++----------------------------
kernel/sched/deadline.c | 2 ++
kernel/sched/fair.c | 14 ++++++++++
kernel/sched/rt.c | 2 ++
kernel/sched/sched.h | 2 ++
kernel/time/posix-cpu-timers.c | 2 +-
7 files changed, 42 insertions(+), 48 deletions(-)
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 8422b4ed6882..b9376cd5a187 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -77,11 +77,6 @@ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu)
return kstat_cpu(cpu).irqs_sum;
}
-/*
- * Lock/unlock the current runqueue - to extract task statistics:
- */
-extern unsigned long long task_delta_exec(struct task_struct *);
-
extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t);
extern void account_steal_time(cputime_t);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 240157c13ddc..24beb9bb4c3e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2475,44 +2475,6 @@ EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
/*
- * Return any ns on the sched_clock that have not yet been accounted in
- * @p in case that task is currently running.
- *
- * Called with task_rq_lock() held on @rq.
- */
-static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
-{
- u64 ns = 0;
-
- /*
- * Must be ->curr _and_ ->on_rq. If dequeued, we would
- * project cycles that may never be accounted to this
- * thread, breaking clock_gettime().
- */
- if (task_current(rq, p) && task_on_rq_queued(p)) {
- update_rq_clock(rq);
- ns = rq_clock_task(rq) - p->se.exec_start;
- if ((s64)ns < 0)
- ns = 0;
- }
-
- return ns;
-}
-
-unsigned long long task_delta_exec(struct task_struct *p)
-{
- unsigned long flags;
- struct rq *rq;
- u64 ns = 0;
-
- rq = task_rq_lock(p, &flags);
- ns = do_task_delta_exec(p, rq);
- task_rq_unlock(rq, p, &flags);
-
- return ns;
-}
-
-/*
* Return accounted runtime for the task.
* In case the task is currently running, return the runtime plus current's
* pending runtime that have not been accounted yet.
@@ -2521,7 +2483,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
{
unsigned long flags;
struct rq *rq;
- u64 ns = 0;
+ u64 ns;
#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
/*
@@ -2540,7 +2502,16 @@ unsigned long long task_sched_runtime(struct task_struct *p)
#endif
rq = task_rq_lock(p, &flags);
- ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
+ /*
+ * Must be ->curr _and_ ->on_rq. If dequeued, we would
+ * project cycles that may never be accounted to this
+ * thread, breaking clock_gettime().
+ */
+ if (task_current(rq, p) && task_on_rq_queued(p)) {
+ update_rq_clock(rq);
+ p->sched_class->update_curr(rq);
+ }
+ ns = p->se.sum_exec_runtime;
task_rq_unlock(rq, p, &flags);
return ns;
@@ -6368,6 +6339,10 @@ static void sched_init_numa(void)
if (!sched_debug())
break;
}
+
+ if (!level)
+ return;
+
/*
* 'level' contains the number of unique distances, excluding the
* identity distance node_distance(i,i).
@@ -7444,8 +7419,12 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running))
put_prev_task(rq, tsk);
- tg = container_of(task_css_check(tsk, cpu_cgrp_id,
- lockdep_is_held(&tsk->sighand->siglock)),
+ /*
+ * All callers are synchronized by task_rq_lock(); we do not use RCU
+ * which is pointless here. Thus, we pass "true" to task_css_check()
+ * to prevent lockdep warnings.
+ */
+ tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
struct task_group, css);
tg = autogroup_task_group(tsk, tg);
tsk->sched_task_group = tg;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 5285332392d5..28fa9d9e9201 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1701,4 +1701,6 @@ const struct sched_class dl_sched_class = {
.prio_changed = prio_changed_dl,
.switched_from = switched_from_dl,
.switched_to = switched_to_dl,
+
+ .update_curr = update_curr_dl,
};
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 34baa60f8a7b..ef2b104b254c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -726,6 +726,11 @@ static void update_curr(struct cfs_rq *cfs_rq)
account_cfs_rq_runtime(cfs_rq, delta_exec);
}
+static void update_curr_fair(struct rq *rq)
+{
+ update_curr(cfs_rq_of(&rq->curr->se));
+}
+
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -1180,6 +1185,13 @@ static void task_numa_compare(struct task_numa_env *env,
raw_spin_unlock_irq(&dst_rq->lock);
/*
+ * Because we have preemption enabled we can get migrated around and
+ * end try selecting ourselves (current == env->p) as a swap candidate.
+ */
+ if (cur == env->p)
+ goto unlock;
+
+ /*
* "imp" is the fault differential for the source task between the
* source and destination node. Calculate the total differential for
* the source task and potential destination task. The more negative
@@ -7949,6 +7961,8 @@ const struct sched_class fair_sched_class = {
.get_rr_interval = get_rr_interval_fair,
+ .update_curr = update_curr_fair,
+
#ifdef CONFIG_FAIR_GROUP_SCHED
.task_move_group = task_move_group_fair,
#endif
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d024e6ce30ba..20bca398084a 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2128,6 +2128,8 @@ const struct sched_class rt_sched_class = {
.prio_changed = prio_changed_rt,
.switched_to = switched_to_rt,
+
+ .update_curr = update_curr_rt,
};
#ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 24156c8434d1..2df8ef067cc5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1135,6 +1135,8 @@ struct sched_class {
unsigned int (*get_rr_interval) (struct rq *rq,
struct task_struct *task);
+ void (*update_curr) (struct rq *rq);
+
#ifdef CONFIG_FAIR_GROUP_SCHED
void (*task_move_group) (struct task_struct *p, int on_rq);
#endif
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 492b986195d5..a16b67859e2a 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -553,7 +553,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
*sample = cputime_to_expires(cputime.utime);
break;
case CPUCLOCK_SCHED:
- *sample = cputime.sum_exec_runtime + task_delta_exec(p);
+ *sample = cputime.sum_exec_runtime;
break;
}
return 0;
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2014-10-31 11:17 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2014-10-31 11:17 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: f3a7e1a9c464a32ee186ab91388313c82e7ce018 sched/dl: Fix preemption checks
Various scheduler fixes all over the place: three SCHED_DL fixes,
three sched/numa fixes, two generic race fixes and a comment fix.
Thanks,
Ingo
------------------>
Chen Hanxiao (1):
sched: Update comments for CLONE_NEWNS
Juri Lelli (2):
sched/deadline: Don't replenish from a !SCHED_DEADLINE entity
sched/deadline: Fix races between rt_mutex_setprio() and dl_task_timer()
Kirill Tkhai (4):
sched: Fix race between task_group and sched_task_group
sched/numa: Fix unsafe get_task_struct() in task_numa_assign()
sched/fair: Fix division by zero sysctl_numa_balancing_scan_size
sched/dl: Fix preemption checks
Oleg Nesterov (1):
sched: stop the unbound recursion in preempt_schedule_context()
Yasuaki Ishimatsu (1):
sched/fair: Care divide error in update_task_scan_period()
arch/x86/include/asm/preempt.h | 1 +
include/uapi/linux/sched.h | 2 +-
kernel/context_tracking.c | 40 -----------------------------------
kernel/sched/core.c | 47 ++++++++++++++++++++++++++++++++++++++++++
kernel/sched/deadline.c | 41 ++++++++++++++++++++++++++++--------
kernel/sched/fair.c | 21 ++++++++++++++-----
kernel/sysctl.c | 3 ++-
7 files changed, 99 insertions(+), 56 deletions(-)
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 7024c12f7bfe..400873450e33 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -105,6 +105,7 @@ static __always_inline bool should_resched(void)
# ifdef CONFIG_CONTEXT_TRACKING
extern asmlinkage void ___preempt_schedule_context(void);
# define __preempt_schedule_context() asm ("call ___preempt_schedule_context")
+ extern asmlinkage void preempt_schedule_context(void);
# endif
#endif
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 34f9d7387d13..b932be9f5c5b 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -13,7 +13,7 @@
#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
#define CLONE_THREAD 0x00010000 /* Same thread group? */
-#define CLONE_NEWNS 0x00020000 /* New namespace group? */
+#define CLONE_NEWNS 0x00020000 /* New mount namespace group */
#define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */
#define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */
#define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 5664985c46a0..937ecdfdf258 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -107,46 +107,6 @@ void context_tracking_user_enter(void)
}
NOKPROBE_SYMBOL(context_tracking_user_enter);
-#ifdef CONFIG_PREEMPT
-/**
- * preempt_schedule_context - preempt_schedule called by tracing
- *
- * The tracing infrastructure uses preempt_enable_notrace to prevent
- * recursion and tracing preempt enabling caused by the tracing
- * infrastructure itself. But as tracing can happen in areas coming
- * from userspace or just about to enter userspace, a preempt enable
- * can occur before user_exit() is called. This will cause the scheduler
- * to be called when the system is still in usermode.
- *
- * To prevent this, the preempt_enable_notrace will use this function
- * instead of preempt_schedule() to exit user context if needed before
- * calling the scheduler.
- */
-asmlinkage __visible void __sched notrace preempt_schedule_context(void)
-{
- enum ctx_state prev_ctx;
-
- if (likely(!preemptible()))
- return;
-
- /*
- * Need to disable preemption in case user_exit() is traced
- * and the tracer calls preempt_enable_notrace() causing
- * an infinite recursion.
- */
- preempt_disable_notrace();
- prev_ctx = exception_enter();
- preempt_enable_no_resched_notrace();
-
- preempt_schedule();
-
- preempt_disable_notrace();
- exception_exit(prev_ctx);
- preempt_enable_notrace();
-}
-EXPORT_SYMBOL_GPL(preempt_schedule_context);
-#endif /* CONFIG_PREEMPT */
-
/**
* context_tracking_user_exit - Inform the context tracking that the CPU is
* exiting userspace mode and entering the kernel.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 44999505e1bf..240157c13ddc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2951,6 +2951,47 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
}
NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
+
+#ifdef CONFIG_CONTEXT_TRACKING
+/**
+ * preempt_schedule_context - preempt_schedule called by tracing
+ *
+ * The tracing infrastructure uses preempt_enable_notrace to prevent
+ * recursion and tracing preempt enabling caused by the tracing
+ * infrastructure itself. But as tracing can happen in areas coming
+ * from userspace or just about to enter userspace, a preempt enable
+ * can occur before user_exit() is called. This will cause the scheduler
+ * to be called when the system is still in usermode.
+ *
+ * To prevent this, the preempt_enable_notrace will use this function
+ * instead of preempt_schedule() to exit user context if needed before
+ * calling the scheduler.
+ */
+asmlinkage __visible void __sched notrace preempt_schedule_context(void)
+{
+ enum ctx_state prev_ctx;
+
+ if (likely(!preemptible()))
+ return;
+
+ do {
+ __preempt_count_add(PREEMPT_ACTIVE);
+ /*
+ * Needs preempt disabled in case user_exit() is traced
+ * and the tracer calls preempt_enable_notrace() causing
+ * an infinite recursion.
+ */
+ prev_ctx = exception_enter();
+ __schedule();
+ exception_exit(prev_ctx);
+
+ __preempt_count_sub(PREEMPT_ACTIVE);
+ barrier();
+ } while (need_resched());
+}
+EXPORT_SYMBOL_GPL(preempt_schedule_context);
+#endif /* CONFIG_CONTEXT_TRACKING */
+
#endif /* CONFIG_PREEMPT */
/*
@@ -7833,6 +7874,11 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
sched_offline_group(tg);
}
+static void cpu_cgroup_fork(struct task_struct *task)
+{
+ sched_move_task(task);
+}
+
static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
@@ -8205,6 +8251,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.css_free = cpu_cgroup_css_free,
.css_online = cpu_cgroup_css_online,
.css_offline = cpu_cgroup_css_offline,
+ .fork = cpu_cgroup_fork,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
.exit = cpu_cgroup_exit,
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 256e577faf1b..5285332392d5 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -518,12 +518,20 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
}
/*
- * We need to take care of a possible races here. In fact, the
- * task might have changed its scheduling policy to something
- * different from SCHED_DEADLINE or changed its reservation
- * parameters (through sched_setattr()).
+ * We need to take care of several possible races here:
+ *
+ * - the task might have changed its scheduling policy
+ * to something different than SCHED_DEADLINE
+ * - the task might have changed its reservation parameters
+ * (through sched_setattr())
+ * - the task might have been boosted by someone else and
+ * might be in the boosting/deboosting path
+ *
+ * In all this cases we bail out, as the task is already
+ * in the runqueue or is going to be enqueued back anyway.
*/
- if (!dl_task(p) || dl_se->dl_new)
+ if (!dl_task(p) || dl_se->dl_new ||
+ dl_se->dl_boosted || !dl_se->dl_throttled)
goto unlock;
sched_clock_tick();
@@ -532,7 +540,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
dl_se->dl_yielded = 0;
if (task_on_rq_queued(p)) {
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
- if (task_has_dl_policy(rq->curr))
+ if (dl_task(rq->curr))
check_preempt_curr_dl(rq, p, 0);
else
resched_curr(rq);
@@ -847,8 +855,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* smaller than our one... OTW we keep our runtime and
* deadline.
*/
- if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio))
+ if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) {
pi_se = &pi_task->dl;
+ } else if (!dl_prio(p->normal_prio)) {
+ /*
+ * Special case in which we have a !SCHED_DEADLINE task
+ * that is going to be deboosted, but exceedes its
+ * runtime while doing so. No point in replenishing
+ * it, as it's going to return back to its original
+ * scheduling class after this.
+ */
+ BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
+ return;
+ }
/*
* If p is throttled, we do nothing. In fact, if it exhausted
@@ -1607,8 +1626,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
/* Only reschedule if pushing failed */
check_resched = 0;
#endif /* CONFIG_SMP */
- if (check_resched && task_has_dl_policy(rq->curr))
- check_preempt_curr_dl(rq, p, 0);
+ if (check_resched) {
+ if (dl_task(rq->curr))
+ check_preempt_curr_dl(rq, p, 0);
+ else
+ resched_curr(rq);
+ }
}
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0b069bf3e708..34baa60f8a7b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -828,11 +828,12 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
static unsigned int task_scan_min(struct task_struct *p)
{
+ unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
unsigned int scan, floor;
unsigned int windows = 1;
- if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
- windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
+ if (scan_size < MAX_SCAN_WINDOW)
+ windows = MAX_SCAN_WINDOW / scan_size;
floor = 1000 / windows;
scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
@@ -1164,9 +1165,19 @@ static void task_numa_compare(struct task_numa_env *env,
long moveimp = imp;
rcu_read_lock();
- cur = ACCESS_ONCE(dst_rq->curr);
- if (cur->pid == 0) /* idle */
+
+ raw_spin_lock_irq(&dst_rq->lock);
+ cur = dst_rq->curr;
+ /*
+ * No need to move the exiting task, and this ensures that ->curr
+ * wasn't reaped and thus get_task_struct() in task_numa_assign()
+ * is safe under RCU read lock.
+ * Note that rcu_read_lock() itself can't protect from the final
+ * put_task_struct() after the last schedule().
+ */
+ if ((cur->flags & PF_EXITING) || is_idle_task(cur))
cur = NULL;
+ raw_spin_unlock_irq(&dst_rq->lock);
/*
* "imp" is the fault differential for the source task between the
@@ -1520,7 +1531,7 @@ static void update_task_scan_period(struct task_struct *p,
* scanning faster if shared accesses dominate as it may
* simply bounce migrations uselessly
*/
- ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
+ ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4aada6d9fe74..15f2511a1b7c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -387,7 +387,8 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_numa_balancing_scan_size,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
},
{
.procname = "numa_balancing",
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2014-09-26 11:32 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2014-09-26 11:32 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 03bd4e1f7265548832a76e7919a81f3137c44fd1 sched: Fix unreleased llc_shared_mask bit during CPU hotplug
A CONFIG_STACK_GROWSUP=y fix, and a hotplug llc CPU mask fix.
Thanks,
Ingo
------------------>
Chuck Ebbert (1):
sched: Fix end_of_stack() and location of stack canary for architectures using CONFIG_STACK_GROWSUP
Wanpeng Li (1):
sched: Fix unreleased llc_shared_mask bit during CPU hotplug
arch/x86/kernel/smpboot.c | 3 +++
include/linux/sched.h | 13 +++++++++++++
2 files changed, 16 insertions(+)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 2d872e08fab9..42a2dca984b3 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1284,6 +1284,9 @@ static void remove_siblinginfo(int cpu)
for_each_cpu(sibling, cpu_sibling_mask(cpu))
cpumask_clear_cpu(cpu, cpu_sibling_mask(sibling));
+ for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
+ cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling));
+ cpumask_clear(cpu_llc_shared_mask(cpu));
cpumask_clear(cpu_sibling_mask(cpu));
cpumask_clear(cpu_core_mask(cpu));
c->phys_proc_id = 0;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5c2c885ee52b..1f07040d28e3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2608,9 +2608,22 @@ static inline void setup_thread_stack(struct task_struct *p, struct task_struct
task_thread_info(p)->task = p;
}
+/*
+ * Return the address of the last usable long on the stack.
+ *
+ * When the stack grows down, this is just above the thread
+ * info struct. Going any lower will corrupt the threadinfo.
+ *
+ * When the stack grows up, this is the highest address.
+ * Beyond that position, we corrupt data on the next page.
+ */
static inline unsigned long *end_of_stack(struct task_struct *p)
{
+#ifdef CONFIG_STACK_GROWSUP
+ return (unsigned long *)((unsigned long)task_thread_info(p) + THREAD_SIZE) - 1;
+#else
return (unsigned long *)(task_thread_info(p) + 1);
+#endif
}
#endif
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2014-07-16 11:18 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2014-07-16 11:18 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: b6220ad66bcd4a50737eb3c08e9466aa44f3bc98 sched: Fix compiler warnings
A cpufreq lockup fix and a compiler warning fix.
Thanks,
Ingo
------------------>
Guenter Roeck (1):
sched: Fix compiler warnings
Peter Zijlstra (1):
x86, tsc: Fix cpufreq lockup
arch/arm/kernel/topology.c | 2 +-
arch/powerpc/kernel/smp.c | 2 +-
arch/x86/kernel/tsc.c | 4 ++--
include/linux/sched.h | 8 ++++----
4 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 9d85318..e35d880 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -275,7 +275,7 @@ void store_cpu_topology(unsigned int cpuid)
cpu_topology[cpuid].socket_id, mpidr);
}
-static inline const int cpu_corepower_flags(void)
+static inline int cpu_corepower_flags(void)
{
return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN;
}
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 51a3ff7..1007fb8 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -747,7 +747,7 @@ int setup_profiling_timer(unsigned int multiplier)
#ifdef CONFIG_SCHED_SMT
/* cpumask of CPUs with asymetric SMT dependancy */
-static const int powerpc_smt_flags(void)
+static int powerpc_smt_flags(void)
{
int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 57e5ce1..ea03031 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -920,9 +920,9 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
if (!(freq->flags & CPUFREQ_CONST_LOOPS))
mark_tsc_unstable("cpufreq changes");
- }
- set_cyc2ns_scale(tsc_khz, freq->cpu);
+ set_cyc2ns_scale(tsc_khz, freq->cpu);
+ }
return 0;
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 306f4f0..0376b05 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -872,21 +872,21 @@ enum cpu_idle_type {
#define SD_NUMA 0x4000 /* cross-node balancing */
#ifdef CONFIG_SCHED_SMT
-static inline const int cpu_smt_flags(void)
+static inline int cpu_smt_flags(void)
{
return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
}
#endif
#ifdef CONFIG_SCHED_MC
-static inline const int cpu_core_flags(void)
+static inline int cpu_core_flags(void)
{
return SD_SHARE_PKG_RESOURCES;
}
#endif
#ifdef CONFIG_NUMA
-static inline const int cpu_numa_flags(void)
+static inline int cpu_numa_flags(void)
{
return SD_NUMA;
}
@@ -999,7 +999,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
bool cpus_share_cache(int this_cpu, int that_cpu);
typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
-typedef const int (*sched_domain_flags_f)(void);
+typedef int (*sched_domain_flags_f)(void);
#define SDTL_OVERLAP 0x01
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2014-06-01 8:17 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2014-06-01 8:17 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 6acbfb96976fc3350e30d964acb1dbbdf876d55e sched: Fix hotplug vs. set_cpus_allowed_ptr()
Various fixlets, mostly related to the (root-only) SCHED_DEADLINE
policy, but also a hotplug bug fix and a fix for a NR_CPUS related
overallocation bug causing a suspend/resume regression.
Thanks,
Ingo
------------------>
Juri Lelli (1):
sched/deadline: Restrict user params max value to 2^63 ns
Lai Jiangshan (1):
sched: Fix hotplug vs. set_cpus_allowed_ptr()
Michael Kerrisk (1):
sched: Make sched_setattr() correctly return -EFBIG
Peter Zijlstra (4):
sched: Disallow sched_attr::sched_policy < 0
sched/deadline: Change sched_getparam() behaviour vs SCHED_DEADLINE
sched/deadline: Replace NR_CPUS arrays
sched/cpupri: Replace NR_CPUS arrays
kernel/cpu.c | 6 +++--
kernel/sched/core.c | 55 ++++++++++++++++++++++++++++++++--------------
kernel/sched/cpudeadline.c | 33 ++++++++++++++++++++--------
kernel/sched/cpudeadline.h | 6 ++---
kernel/sched/cpupri.c | 7 ++++++
kernel/sched/cpupri.h | 2 +-
6 files changed, 78 insertions(+), 31 deletions(-)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a9e710e..247979a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -726,10 +726,12 @@ void set_cpu_present(unsigned int cpu, bool present)
void set_cpu_online(unsigned int cpu, bool online)
{
- if (online)
+ if (online) {
cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
- else
+ cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
+ } else {
cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
+ }
}
void set_cpu_active(unsigned int cpu, bool active)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 13584f1..86f3890 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3195,17 +3195,40 @@ __getparam_dl(struct task_struct *p, struct sched_attr *attr)
* We ask for the deadline not being zero, and greater or equal
* than the runtime, as well as the period of being zero or
* greater than deadline. Furthermore, we have to be sure that
- * user parameters are above the internal resolution (1us); we
- * check sched_runtime only since it is always the smaller one.
+ * user parameters are above the internal resolution of 1us (we
+ * check sched_runtime only since it is always the smaller one) and
+ * below 2^63 ns (we have to check both sched_deadline and
+ * sched_period, as the latter can be zero).
*/
static bool
__checkparam_dl(const struct sched_attr *attr)
{
- return attr && attr->sched_deadline != 0 &&
- (attr->sched_period == 0 ||
- (s64)(attr->sched_period - attr->sched_deadline) >= 0) &&
- (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 &&
- attr->sched_runtime >= (2 << (DL_SCALE - 1));
+ /* deadline != 0 */
+ if (attr->sched_deadline == 0)
+ return false;
+
+ /*
+ * Since we truncate DL_SCALE bits, make sure we're at least
+ * that big.
+ */
+ if (attr->sched_runtime < (1ULL << DL_SCALE))
+ return false;
+
+ /*
+ * Since we use the MSB for wrap-around and sign issues, make
+ * sure it's not set (mind that period can be equal to zero).
+ */
+ if (attr->sched_deadline & (1ULL << 63) ||
+ attr->sched_period & (1ULL << 63))
+ return false;
+
+ /* runtime <= deadline <= period (if period != 0) */
+ if ((attr->sched_period != 0 &&
+ attr->sched_period < attr->sched_deadline) ||
+ attr->sched_deadline < attr->sched_runtime)
+ return false;
+
+ return true;
}
/*
@@ -3658,8 +3681,12 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
if (!uattr || pid < 0 || flags)
return -EINVAL;
- if (sched_copy_attr(uattr, &attr))
- return -EFAULT;
+ retval = sched_copy_attr(uattr, &attr);
+ if (retval)
+ return retval;
+
+ if (attr.sched_policy < 0)
+ return -EINVAL;
rcu_read_lock();
retval = -ESRCH;
@@ -3709,7 +3736,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
*/
SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
{
- struct sched_param lp;
+ struct sched_param lp = { .sched_priority = 0 };
struct task_struct *p;
int retval;
@@ -3726,11 +3753,8 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
if (retval)
goto out_unlock;
- if (task_has_dl_policy(p)) {
- retval = -EINVAL;
- goto out_unlock;
- }
- lp.sched_priority = p->rt_priority;
+ if (task_has_rt_policy(p))
+ lp.sched_priority = p->rt_priority;
rcu_read_unlock();
/*
@@ -5052,7 +5076,6 @@ static int sched_cpu_active(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_STARTING:
case CPU_DOWN_FAILED:
set_cpu_active((long)hcpu, true);
return NOTIFY_OK;
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index ab001b5..bd95963 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -13,6 +13,7 @@
#include <linux/gfp.h>
#include <linux/kernel.h>
+#include <linux/slab.h>
#include "cpudeadline.h"
static inline int parent(int i)
@@ -39,8 +40,10 @@ static void cpudl_exchange(struct cpudl *cp, int a, int b)
{
int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
- swap(cp->elements[a], cp->elements[b]);
- swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]);
+ swap(cp->elements[a].cpu, cp->elements[b].cpu);
+ swap(cp->elements[a].dl , cp->elements[b].dl );
+
+ swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx);
}
static void cpudl_heapify(struct cpudl *cp, int idx)
@@ -140,7 +143,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
WARN_ON(!cpu_present(cpu));
raw_spin_lock_irqsave(&cp->lock, flags);
- old_idx = cp->cpu_to_idx[cpu];
+ old_idx = cp->elements[cpu].idx;
if (!is_valid) {
/* remove item */
if (old_idx == IDX_INVALID) {
@@ -155,8 +158,8 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
cp->elements[old_idx].cpu = new_cpu;
cp->size--;
- cp->cpu_to_idx[new_cpu] = old_idx;
- cp->cpu_to_idx[cpu] = IDX_INVALID;
+ cp->elements[new_cpu].idx = old_idx;
+ cp->elements[cpu].idx = IDX_INVALID;
while (old_idx > 0 && dl_time_before(
cp->elements[parent(old_idx)].dl,
cp->elements[old_idx].dl)) {
@@ -173,7 +176,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
cp->size++;
cp->elements[cp->size - 1].dl = 0;
cp->elements[cp->size - 1].cpu = cpu;
- cp->cpu_to_idx[cpu] = cp->size - 1;
+ cp->elements[cpu].idx = cp->size - 1;
cpudl_change_key(cp, cp->size - 1, dl);
cpumask_clear_cpu(cpu, cp->free_cpus);
} else {
@@ -195,10 +198,21 @@ int cpudl_init(struct cpudl *cp)
memset(cp, 0, sizeof(*cp));
raw_spin_lock_init(&cp->lock);
cp->size = 0;
- for (i = 0; i < NR_CPUS; i++)
- cp->cpu_to_idx[i] = IDX_INVALID;
- if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL))
+
+ cp->elements = kcalloc(nr_cpu_ids,
+ sizeof(struct cpudl_item),
+ GFP_KERNEL);
+ if (!cp->elements)
+ return -ENOMEM;
+
+ if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
+ kfree(cp->elements);
return -ENOMEM;
+ }
+
+ for_each_possible_cpu(i)
+ cp->elements[i].idx = IDX_INVALID;
+
cpumask_setall(cp->free_cpus);
return 0;
@@ -211,4 +225,5 @@ int cpudl_init(struct cpudl *cp)
void cpudl_cleanup(struct cpudl *cp)
{
free_cpumask_var(cp->free_cpus);
+ kfree(cp->elements);
}
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index a202789..538c979 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -5,17 +5,17 @@
#define IDX_INVALID -1
-struct array_item {
+struct cpudl_item {
u64 dl;
int cpu;
+ int idx;
};
struct cpudl {
raw_spinlock_t lock;
int size;
- int cpu_to_idx[NR_CPUS];
- struct array_item elements[NR_CPUS];
cpumask_var_t free_cpus;
+ struct cpudl_item *elements;
};
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 3031bac..8834243 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -30,6 +30,7 @@
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/sched/rt.h>
+#include <linux/slab.h>
#include "cpupri.h"
/* Convert between a 140 based task->prio, and our 102 based cpupri */
@@ -218,8 +219,13 @@ int cpupri_init(struct cpupri *cp)
goto cleanup;
}
+ cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL);
+ if (!cp->cpu_to_pri)
+ goto cleanup;
+
for_each_possible_cpu(i)
cp->cpu_to_pri[i] = CPUPRI_INVALID;
+
return 0;
cleanup:
@@ -236,6 +242,7 @@ void cpupri_cleanup(struct cpupri *cp)
{
int i;
+ kfree(cp->cpu_to_pri);
for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
free_cpumask_var(cp->pri_to_cpu[i].mask);
}
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index f6d7561..6b03334 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -17,7 +17,7 @@ struct cpupri_vec {
struct cpupri {
struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
- int cpu_to_pri[NR_CPUS];
+ int *cpu_to_pri;
};
#ifdef CONFIG_SMP
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2014-05-22 8:10 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2014-05-22 8:10 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 2b4cfe64dee0d84506b951d81bf55d9891744d25 sched/numa: Initialize newidle balance stats in sd_numa_init()
The biggest commit is an irqtime accounting loop latency fix, the rest
are misc fixes all over the place: deadline scheduling, docs, numa,
balancer and a bad to-idle latency fix.
Thanks,
Ingo
------------------>
Jason Low (2):
sched: Fix updating rq->max_idle_balance_cost and rq->next_balance in idle_balance()
sched/numa: Initialize newidle balance stats in sd_numa_init()
Juri Lelli (1):
sched/deadline: Fix sched_yield() behavior
Li Zefan (1):
sched/deadline: Fix memory leak
Masanari Iida (1):
sched/docbook: Fix 'make htmldocs' warnings caused by missing description
Peter Zijlstra (1):
sched: Skip double execution of pick_next_task_fair()
Steven Rostedt (Red Hat) (1):
sched: Use CPUPRI_NR_PRIORITIES instead of MAX_RT_PRIO in cpupri check
Thomas Gleixner (1):
sched: Sanitize irq accounting madness
include/linux/sched.h | 7 +++++--
kernel/sched/core.c | 15 +++++++++++++--
kernel/sched/cpudeadline.c | 4 +---
kernel/sched/cpupri.c | 3 +--
kernel/sched/cputime.c | 32 ++++++++++++++++----------------
kernel/sched/deadline.c | 5 +++--
kernel/sched/fair.c | 16 ++++++++--------
7 files changed, 47 insertions(+), 35 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 25f54c7..2a4298f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1153,9 +1153,12 @@ struct sched_dl_entity {
*
* @dl_boosted tells if we are boosted due to DI. If so we are
* outside bandwidth enforcement mechanism (but only until we
- * exit the critical section).
+ * exit the critical section);
+ *
+ * @dl_yielded tells if task gave up the cpu before consuming
+ * all its available runtime during the last job.
*/
- int dl_throttled, dl_new, dl_boosted;
+ int dl_throttled, dl_new, dl_boosted, dl_yielded;
/*
* Bandwidth enforcement timer. Each -deadline task has its
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 268a45e..13584f1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2592,8 +2592,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
if (likely(prev->sched_class == class &&
rq->nr_running == rq->cfs.h_nr_running)) {
p = fair_sched_class.pick_next_task(rq, prev);
- if (likely(p && p != RETRY_TASK))
- return p;
+ if (unlikely(p == RETRY_TASK))
+ goto again;
+
+ /* assumes fair_sched_class->next == idle_sched_class */
+ if (unlikely(!p))
+ p = idle_sched_class.pick_next_task(rq, prev);
+
+ return p;
}
again:
@@ -3124,6 +3130,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
dl_se->dl_throttled = 0;
dl_se->dl_new = 1;
+ dl_se->dl_yielded = 0;
}
static void __setscheduler_params(struct task_struct *p,
@@ -3639,6 +3646,7 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
* sys_sched_setattr - same as above, but with extended sched_attr
* @pid: the pid in question.
* @uattr: structure containing the extended parameters.
+ * @flags: for future extension.
*/
SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
unsigned int, flags)
@@ -3783,6 +3791,7 @@ err_size:
* @pid: the pid in question.
* @uattr: structure containing the extended parameters.
* @size: sizeof(attr) for fwd/bwd comp.
+ * @flags: for future extension.
*/
SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
unsigned int, size, unsigned int, flags)
@@ -6017,6 +6026,8 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
,
.last_balance = jiffies,
.balance_interval = sd_weight,
+ .max_newidle_lb_cost = 0,
+ .next_decay_max_lb_cost = jiffies,
};
SD_INIT_NAME(sd, NUMA);
sd->private = &tl->data;
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5b9bb42..ab001b5 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -210,7 +210,5 @@ int cpudl_init(struct cpudl *cp)
*/
void cpudl_cleanup(struct cpudl *cp)
{
- /*
- * nothing to do for the moment
- */
+ free_cpumask_var(cp->free_cpus);
}
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 8b836b3..3031bac 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -70,8 +70,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
int idx = 0;
int task_pri = convert_prio(p->prio);
- if (task_pri >= MAX_RT_PRIO)
- return 0;
+ BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
for (idx = 0; idx < task_pri; idx++) {
struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a95097c..72fdf06 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -332,50 +332,50 @@ out:
* softirq as those do not count in task exec_runtime any more.
*/
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq)
+ struct rq *rq, int ticks)
{
- cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+ cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
+ u64 cputime = (__force u64) cputime_one_jiffy;
u64 *cpustat = kcpustat_this_cpu->cpustat;
if (steal_account_process_tick())
return;
+ cputime *= ticks;
+ scaled *= ticks;
+
if (irqtime_account_hi_update()) {
- cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
+ cpustat[CPUTIME_IRQ] += cputime;
} else if (irqtime_account_si_update()) {
- cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
+ cpustat[CPUTIME_SOFTIRQ] += cputime;
} else if (this_cpu_ksoftirqd() == p) {
/*
* ksoftirqd time do not get accounted in cpu_softirq_time.
* So, we have to handle it separately here.
* Also, p->stime needs to be updated for ksoftirqd.
*/
- __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
- CPUTIME_SOFTIRQ);
+ __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ);
} else if (user_tick) {
- account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ account_user_time(p, cputime, scaled);
} else if (p == rq->idle) {
- account_idle_time(cputime_one_jiffy);
+ account_idle_time(cputime);
} else if (p->flags & PF_VCPU) { /* System time or guest time */
- account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ account_guest_time(p, cputime, scaled);
} else {
- __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
- CPUTIME_SYSTEM);
+ __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM);
}
}
static void irqtime_account_idle_ticks(int ticks)
{
- int i;
struct rq *rq = this_rq();
- for (i = 0; i < ticks; i++)
- irqtime_account_process_tick(current, 0, rq);
+ irqtime_account_process_tick(current, 0, rq, ticks);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
static inline void irqtime_account_idle_ticks(int ticks) {}
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq) {}
+ struct rq *rq, int nr_ticks) {}
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
/*
@@ -464,7 +464,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
return;
if (sched_clock_irqtime) {
- irqtime_account_process_tick(p, user_tick, rq);
+ irqtime_account_process_tick(p, user_tick, rq, 1);
return;
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b080957..800e99b 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -528,6 +528,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
sched_clock_tick();
update_rq_clock(rq);
dl_se->dl_throttled = 0;
+ dl_se->dl_yielded = 0;
if (p->on_rq) {
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
if (task_has_dl_policy(rq->curr))
@@ -893,10 +894,10 @@ static void yield_task_dl(struct rq *rq)
* We make the task go to sleep until its current deadline by
* forcing its runtime to zero. This way, update_curr_dl() stops
* it and the bandwidth timer will wake it up and will give it
- * new scheduling parameters (thanks to dl_new=1).
+ * new scheduling parameters (thanks to dl_yielded=1).
*/
if (p->dl.runtime > 0) {
- rq->curr->dl.dl_new = 1;
+ rq->curr->dl.dl_yielded = 1;
p->dl.runtime = 0;
}
update_curr_dl(rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7570dd9..0fdb96d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6653,6 +6653,7 @@ static int idle_balance(struct rq *this_rq)
int this_cpu = this_rq->cpu;
idle_enter_fair(this_rq);
+
/*
* We must set idle_stamp _before_ calling idle_balance(), such that we
* measure the duration of idle_balance() as idle time.
@@ -6705,14 +6706,16 @@ static int idle_balance(struct rq *this_rq)
raw_spin_lock(&this_rq->lock);
+ if (curr_cost > this_rq->max_idle_balance_cost)
+ this_rq->max_idle_balance_cost = curr_cost;
+
/*
- * While browsing the domains, we released the rq lock.
- * A task could have be enqueued in the meantime
+ * While browsing the domains, we released the rq lock, a task could
+ * have been enqueued in the meantime. Since we're not going idle,
+ * pretend we pulled a task.
*/
- if (this_rq->cfs.h_nr_running && !pulled_task) {
+ if (this_rq->cfs.h_nr_running && !pulled_task)
pulled_task = 1;
- goto out;
- }
if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
/*
@@ -6722,9 +6725,6 @@ static int idle_balance(struct rq *this_rq)
this_rq->next_balance = next_balance;
}
- if (curr_cost > this_rq->max_idle_balance_cost)
- this_rq->max_idle_balance_cost = curr_cost;
-
out:
/* Is there a task of a high priority class? */
if (this_rq->nr_running != this_rq->cfs.h_nr_running &&
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2014-04-19 10:55 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2014-04-19 10:55 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Mike Galbraith, Thomas Gleixner,
Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: a1d9a3231eac4117cadaf4b6bba5b2902c15a33e sched: Check for stop task appearance when balancing happens
Two fixes:
- a SCHED_DEADLINE task selection fix
- a sched/numa related lockdep splat fix
Thanks,
Ingo
------------------>
Kirill Tkhai (1):
sched: Check for stop task appearance when balancing happens
Mike Galbraith (1):
sched/numa: Fix task_numa_free() lockdep splat
kernel/sched/deadline.c | 11 ++++++++++-
kernel/sched/fair.c | 16 +++++++++-------
kernel/sched/rt.c | 7 ++++---
kernel/sched/sched.h | 9 +++++++++
4 files changed, 32 insertions(+), 11 deletions(-)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 27ef409..b080957 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1021,8 +1021,17 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
dl_rq = &rq->dl;
- if (need_pull_dl_task(rq, prev))
+ if (need_pull_dl_task(rq, prev)) {
pull_dl_task(rq);
+ /*
+ * pull_rt_task() can drop (and re-acquire) rq->lock; this
+ * means a stop task can slip in, in which case we need to
+ * re-start task selection.
+ */
+ if (rq->stop && rq->stop->on_rq)
+ return RETRY_TASK;
+ }
+
/*
* When prev is DL, we may throttle it in put_prev_task().
* So, we update time before we check for dl_nr_running.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7e9bd0b..7570dd9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1497,7 +1497,7 @@ static void task_numa_placement(struct task_struct *p)
/* If the task is part of a group prevent parallel updates to group stats */
if (p->numa_group) {
group_lock = &p->numa_group->lock;
- spin_lock(group_lock);
+ spin_lock_irq(group_lock);
}
/* Find the node with the highest number of faults */
@@ -1572,7 +1572,7 @@ static void task_numa_placement(struct task_struct *p)
}
}
- spin_unlock(group_lock);
+ spin_unlock_irq(group_lock);
}
/* Preferred node as the node with the most faults */
@@ -1677,7 +1677,8 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
if (!join)
return;
- double_lock(&my_grp->lock, &grp->lock);
+ BUG_ON(irqs_disabled());
+ double_lock_irq(&my_grp->lock, &grp->lock);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
my_grp->faults[i] -= p->numa_faults_memory[i];
@@ -1691,7 +1692,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
grp->nr_tasks++;
spin_unlock(&my_grp->lock);
- spin_unlock(&grp->lock);
+ spin_unlock_irq(&grp->lock);
rcu_assign_pointer(p->numa_group, grp);
@@ -1710,14 +1711,14 @@ void task_numa_free(struct task_struct *p)
void *numa_faults = p->numa_faults_memory;
if (grp) {
- spin_lock(&grp->lock);
+ spin_lock_irq(&grp->lock);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
grp->faults[i] -= p->numa_faults_memory[i];
grp->total_faults -= p->total_numa_faults;
list_del(&p->numa_entry);
grp->nr_tasks--;
- spin_unlock(&grp->lock);
+ spin_unlock_irq(&grp->lock);
rcu_assign_pointer(p->numa_group, NULL);
put_numa_group(grp);
}
@@ -6727,7 +6728,8 @@ static int idle_balance(struct rq *this_rq)
out:
/* Is there a task of a high priority class? */
if (this_rq->nr_running != this_rq->cfs.h_nr_running &&
- (this_rq->dl.dl_nr_running ||
+ ((this_rq->stop && this_rq->stop->on_rq) ||
+ this_rq->dl.dl_nr_running ||
(this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))
pulled_task = -1;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d8cdf16..bd2267a 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1362,10 +1362,11 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
pull_rt_task(rq);
/*
* pull_rt_task() can drop (and re-acquire) rq->lock; this
- * means a dl task can slip in, in which case we need to
- * re-start task selection.
+ * means a dl or stop task can slip in, in which case we need
+ * to re-start task selection.
*/
- if (unlikely(rq->dl.dl_nr_running))
+ if (unlikely((rq->stop && rq->stop->on_rq) ||
+ rq->dl.dl_nr_running))
return RETRY_TASK;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c9007f2..456e492 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1385,6 +1385,15 @@ static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
}
+static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2)
+{
+ if (l1 > l2)
+ swap(l1, l2);
+
+ spin_lock_irq(l1);
+ spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
+}
+
static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
{
if (l1 > l2)
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2014-03-16 16:36 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2014-03-16 16:36 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 96b3d28bf4b00f62fc8386ff5d487d1830793a3d sched/clock: Prevent tracing recursion in sched_clock_cpu()
Three small fixes.
Thanks,
Ingo
------------------>
Fernando Luis Vazquez Cao (1):
sched/clock: Prevent tracing recursion in sched_clock_cpu()
Juri Lelli (1):
sched/deadline: Deny unprivileged users to set/change SCHED_DEADLINE policy
Peter Zijlstra (1):
stop_machine: Fix^2 race between stop_two_cpus() and stop_cpus()
kernel/sched/clock.c | 4 ++--
kernel/sched/core.c | 9 +++++++++
kernel/stop_machine.c | 2 +-
3 files changed, 12 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 43c2bcc..b30a292 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -301,14 +301,14 @@ u64 sched_clock_cpu(int cpu)
if (unlikely(!sched_clock_running))
return 0ull;
- preempt_disable();
+ preempt_disable_notrace();
scd = cpu_sdc(cpu);
if (cpu != smp_processor_id())
clock = sched_clock_remote(scd);
else
clock = sched_clock_local(scd);
- preempt_enable();
+ preempt_enable_notrace();
return clock;
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6edbef2..f5c6635 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3338,6 +3338,15 @@ recheck:
return -EPERM;
}
+ /*
+ * Can't set/change SCHED_DEADLINE policy at all for now
+ * (safest behavior); in the future we would like to allow
+ * unprivileged DL tasks to increase their relative deadline
+ * or reduce their runtime (both ways reducing utilization)
+ */
+ if (dl_policy(policy))
+ return -EPERM;
+
/*
* Treat SCHED_IDLE as nice 20. Only allow a switch to
* SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 84571e0..01fbae5 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -293,7 +293,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
*/
smp_call_function_single(min(cpu1, cpu2),
&irq_cpu_stop_queue_work,
- &call_args, 0);
+ &call_args, 1);
lg_local_unlock(&stop_cpus_lock);
preempt_enable();
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2014-03-03 16:29 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2014-03-03 16:29 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: faa5993736d9b44b508cab4f1f3a77d66641c6f4 sched/deadline: Prevent rt_time growth to infinity
Misc fixes, most of them SCHED_DEADLINE fallout.
Thanks,
Ingo
------------------>
George McCollister (1):
sched: Fix double normalization of vruntime
Juri Lelli (2):
sched/deadline: Switch CPU's presence test order
sched/deadline: Prevent rt_time growth to infinity
Kirill Tkhai (1):
sched/deadline: Cleanup RT leftovers from {inc/dec}_dl_migration
kernel/sched/cpudeadline.c | 4 ++--
kernel/sched/deadline.c | 10 ++++++----
kernel/sched/fair.c | 8 ++++----
kernel/sched/rt.c | 8 ++++++++
4 files changed, 20 insertions(+), 10 deletions(-)
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5b8838b..5b9bb42 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -70,7 +70,7 @@ static void cpudl_heapify(struct cpudl *cp, int idx)
static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl)
{
- WARN_ON(!cpu_present(idx) || idx == IDX_INVALID);
+ WARN_ON(idx == IDX_INVALID || !cpu_present(idx));
if (dl_time_before(new_dl, cp->elements[idx].dl)) {
cp->elements[idx].dl = new_dl;
@@ -117,7 +117,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
}
out:
- WARN_ON(!cpu_present(best_cpu) && best_cpu != -1);
+ WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
return best_cpu;
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 15cbc17..6e79b3f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -135,7 +135,6 @@ static void update_dl_migration(struct dl_rq *dl_rq)
static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
struct task_struct *p = dl_task_of(dl_se);
- dl_rq = &rq_of_dl_rq(dl_rq)->dl;
if (p->nr_cpus_allowed > 1)
dl_rq->dl_nr_migratory++;
@@ -146,7 +145,6 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
struct task_struct *p = dl_task_of(dl_se);
- dl_rq = &rq_of_dl_rq(dl_rq)->dl;
if (p->nr_cpus_allowed > 1)
dl_rq->dl_nr_migratory--;
@@ -564,6 +562,8 @@ int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
return 1;
}
+extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
+
/*
* Update the current task's runtime statistics (provided it is still
* a -deadline task and has not been removed from the dl_rq).
@@ -627,11 +627,13 @@ static void update_curr_dl(struct rq *rq)
struct rt_rq *rt_rq = &rq->rt;
raw_spin_lock(&rt_rq->rt_runtime_lock);
- rt_rq->rt_time += delta_exec;
/*
* We'll let actual RT tasks worry about the overflow here, we
- * have our own CBS to keep us inline -- see above.
+ * have our own CBS to keep us inline; only account when RT
+ * bandwidth is relevant.
*/
+ if (sched_rt_bandwidth_account(rt_rq))
+ rt_rq->rt_time += delta_exec;
raw_spin_unlock(&rt_rq->rt_runtime_lock);
}
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7815709..9b4c4f3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7001,15 +7001,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
/*
- * Ensure the task's vruntime is normalized, so that when its
+ * Ensure the task's vruntime is normalized, so that when it's
* switched back to the fair class the enqueue_entity(.flags=0) will
* do the right thing.
*
- * If it was on_rq, then the dequeue_entity(.flags=0) will already
- * have normalized the vruntime, if it was !on_rq, then only when
+ * If it's on_rq, then the dequeue_entity(.flags=0) will already
+ * have normalized the vruntime, if it's !on_rq, then only when
* the task is sleeping will it still have non-normalized vruntime.
*/
- if (!se->on_rq && p->state != TASK_RUNNING) {
+ if (!p->on_rq && p->state != TASK_RUNNING) {
/*
* Fix up our vruntime so that the current sleep doesn't
* cause 'unlimited' sleep bonus.
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a2740b7..1999021 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -538,6 +538,14 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
#endif /* CONFIG_RT_GROUP_SCHED */
+bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
+{
+ struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+
+ return (hrtimer_active(&rt_b->rt_period_timer) ||
+ rt_rq->rt_time < rt_b->rt_runtime);
+}
+
#ifdef CONFIG_SMP
/*
* We ran out of runtime, see if we can borrow some from our neighbours.
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2014-02-22 19:20 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2014-02-22 19:20 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 995b9ea440862def83e8fcb1b498e68f93d4af59 sched/deadline: Remove useless dl_nr_total
Misc fixlets: a fair number of them resulting from the new
SCHED_DEADLINE code.
Sorry about the weekend pull request ... :-/
Thanks,
Ingo
------------------>
Boris Ostrovsky (1):
sched/deadline: Test for CPU's presence explicitly
Juri Lelli (3):
sched/deadline: Fix bad accounting of nr_running
sched/core: Fix sched_rt_global_validate
sched/core: Make dl_b->lock IRQ safe
Kirill Tkhai (1):
sched/deadline: Remove useless dl_nr_total
Peter Zijlstra (1):
sched: Add 'flags' argument to sched_{set,get}attr() syscalls
Rik van Riel (1):
sched,numa: add cond_resched to task_numa_work
Steven Rostedt (1):
sched/deadline: Fix overflow to handle period==0 and deadline!=0
Vegard Nossum (1):
sched: Fix information leak in sys_sched_getattr()
include/linux/syscalls.h | 6 ++++--
kernel/sched/core.c | 28 ++++++++++++++++------------
kernel/sched/cpudeadline.c | 6 +++---
kernel/sched/deadline.c | 10 +++-------
kernel/sched/fair.c | 2 ++
kernel/sched/sched.h | 1 -
6 files changed, 28 insertions(+), 25 deletions(-)
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 40ed9e9..a747a77 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -281,13 +281,15 @@ asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
asmlinkage long sys_sched_setparam(pid_t pid,
struct sched_param __user *param);
asmlinkage long sys_sched_setattr(pid_t pid,
- struct sched_attr __user *attr);
+ struct sched_attr __user *attr,
+ unsigned int flags);
asmlinkage long sys_sched_getscheduler(pid_t pid);
asmlinkage long sys_sched_getparam(pid_t pid,
struct sched_param __user *param);
asmlinkage long sys_sched_getattr(pid_t pid,
struct sched_attr __user *attr,
- unsigned int size);
+ unsigned int size,
+ unsigned int flags);
asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
unsigned long __user *user_mask_ptr);
asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b46131e..6edbef2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1952,7 +1952,7 @@ static int dl_overflow(struct task_struct *p, int policy,
{
struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
- u64 period = attr->sched_period;
+ u64 period = attr->sched_period ?: attr->sched_deadline;
u64 runtime = attr->sched_runtime;
u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
int cpus, err = -1;
@@ -3661,13 +3661,14 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
* @pid: the pid in question.
* @uattr: structure containing the extended parameters.
*/
-SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr)
+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
+ unsigned int, flags)
{
struct sched_attr attr;
struct task_struct *p;
int retval;
- if (!uattr || pid < 0)
+ if (!uattr || pid < 0 || flags)
return -EINVAL;
if (sched_copy_attr(uattr, &attr))
@@ -3786,7 +3787,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
attr->size = usize;
}
- ret = copy_to_user(uattr, attr, usize);
+ ret = copy_to_user(uattr, attr, attr->size);
if (ret)
return -EFAULT;
@@ -3804,8 +3805,8 @@ err_size:
* @uattr: structure containing the extended parameters.
* @size: sizeof(attr) for fwd/bwd comp.
*/
-SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
- unsigned int, size)
+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
+ unsigned int, size, unsigned int, flags)
{
struct sched_attr attr = {
.size = sizeof(struct sched_attr),
@@ -3814,7 +3815,7 @@ SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
int retval;
if (!uattr || pid < 0 || size > PAGE_SIZE ||
- size < SCHED_ATTR_SIZE_VER0)
+ size < SCHED_ATTR_SIZE_VER0 || flags)
return -EINVAL;
rcu_read_lock();
@@ -7422,6 +7423,7 @@ static int sched_dl_global_constraints(void)
u64 period = global_rt_period();
u64 new_bw = to_ratio(period, runtime);
int cpu, ret = 0;
+ unsigned long flags;
/*
* Here we want to check the bandwidth not being set to some
@@ -7435,10 +7437,10 @@ static int sched_dl_global_constraints(void)
for_each_possible_cpu(cpu) {
struct dl_bw *dl_b = dl_bw_of(cpu);
- raw_spin_lock(&dl_b->lock);
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
if (new_bw < dl_b->total_bw)
ret = -EBUSY;
- raw_spin_unlock(&dl_b->lock);
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
if (ret)
break;
@@ -7451,6 +7453,7 @@ static void sched_dl_do_global(void)
{
u64 new_bw = -1;
int cpu;
+ unsigned long flags;
def_dl_bandwidth.dl_period = global_rt_period();
def_dl_bandwidth.dl_runtime = global_rt_runtime();
@@ -7464,9 +7467,9 @@ static void sched_dl_do_global(void)
for_each_possible_cpu(cpu) {
struct dl_bw *dl_b = dl_bw_of(cpu);
- raw_spin_lock(&dl_b->lock);
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
dl_b->bw = new_bw;
- raw_spin_unlock(&dl_b->lock);
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
}
}
@@ -7475,7 +7478,8 @@ static int sched_rt_global_validate(void)
if (sysctl_sched_rt_period <= 0)
return -EINVAL;
- if (sysctl_sched_rt_runtime > sysctl_sched_rt_period)
+ if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
+ (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
return -EINVAL;
return 0;
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 045fc74..5b8838b 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -70,7 +70,7 @@ static void cpudl_heapify(struct cpudl *cp, int idx)
static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl)
{
- WARN_ON(idx > num_present_cpus() || idx == IDX_INVALID);
+ WARN_ON(!cpu_present(idx) || idx == IDX_INVALID);
if (dl_time_before(new_dl, cp->elements[idx].dl)) {
cp->elements[idx].dl = new_dl;
@@ -117,7 +117,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
}
out:
- WARN_ON(best_cpu > num_present_cpus() && best_cpu != -1);
+ WARN_ON(!cpu_present(best_cpu) && best_cpu != -1);
return best_cpu;
}
@@ -137,7 +137,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
int old_idx, new_cpu;
unsigned long flags;
- WARN_ON(cpu > num_present_cpus());
+ WARN_ON(!cpu_present(cpu));
raw_spin_lock_irqsave(&cp->lock, flags);
old_idx = cp->cpu_to_idx[cpu];
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0dd5e09..15cbc17 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -121,7 +121,7 @@ static inline void dl_clear_overload(struct rq *rq)
static void update_dl_migration(struct dl_rq *dl_rq)
{
- if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_total > 1) {
+ if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) {
if (!dl_rq->overloaded) {
dl_set_overload(rq_of_dl_rq(dl_rq));
dl_rq->overloaded = 1;
@@ -137,7 +137,6 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
struct task_struct *p = dl_task_of(dl_se);
dl_rq = &rq_of_dl_rq(dl_rq)->dl;
- dl_rq->dl_nr_total++;
if (p->nr_cpus_allowed > 1)
dl_rq->dl_nr_migratory++;
@@ -149,7 +148,6 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
struct task_struct *p = dl_task_of(dl_se);
dl_rq = &rq_of_dl_rq(dl_rq)->dl;
- dl_rq->dl_nr_total--;
if (p->nr_cpus_allowed > 1)
dl_rq->dl_nr_migratory--;
@@ -717,6 +715,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
WARN_ON(!dl_prio(prio));
dl_rq->dl_nr_running++;
+ inc_nr_running(rq_of_dl_rq(dl_rq));
inc_dl_deadline(dl_rq, deadline);
inc_dl_migration(dl_se, dl_rq);
@@ -730,6 +729,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
WARN_ON(!dl_prio(prio));
WARN_ON(!dl_rq->dl_nr_running);
dl_rq->dl_nr_running--;
+ dec_nr_running(rq_of_dl_rq(dl_rq));
dec_dl_deadline(dl_rq, dl_se->deadline);
dec_dl_migration(dl_se, dl_rq);
@@ -836,8 +836,6 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
-
- inc_nr_running(rq);
}
static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
@@ -850,8 +848,6 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
update_curr_dl(rq);
__dequeue_task_dl(rq, p, flags);
-
- dec_nr_running(rq);
}
/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 966cc2b..7815709 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1757,6 +1757,8 @@ void task_numa_work(struct callback_head *work)
start = end;
if (pages <= 0)
goto out;
+
+ cond_resched();
} while (end != vma->vm_end);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c2119fd..f964add 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -462,7 +462,6 @@ struct dl_rq {
} earliest_dl;
unsigned long dl_nr_migratory;
- unsigned long dl_nr_total;
int overloaded;
/*
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2014-01-31 8:17 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2014-01-31 8:17 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: a57beec5d427086cdc8d75fd51164577193fa7f4 sched: Make sched_class::get_rr_interval() optional
A crash fix and documentation updates.
Thanks,
Ingo
------------------>
Dario Faggioli (1):
sched/deadline: Add sched_dl documentation
Masanari Iida (1):
sched: Fix docbook parameter annotation error in wait.h
Peter Zijlstra (1):
sched: Make sched_class::get_rr_interval() optional
Documentation/scheduler/00-INDEX | 2 +
Documentation/scheduler/sched-deadline.txt | 281 +++++++++++++++++++++++++++++
include/linux/wait.h | 4 +-
kernel/sched/core.c | 4 +-
kernel/sched/deadline.c | 3 +-
5 files changed, 290 insertions(+), 4 deletions(-)
create mode 100644 Documentation/scheduler/sched-deadline.txt
diff --git a/Documentation/scheduler/00-INDEX b/Documentation/scheduler/00-INDEX
index d2651c4..46702e4 100644
--- a/Documentation/scheduler/00-INDEX
+++ b/Documentation/scheduler/00-INDEX
@@ -10,5 +10,7 @@ sched-nice-design.txt
- How and why the scheduler's nice levels are implemented.
sched-rt-group.txt
- real-time group scheduling.
+sched-deadline.txt
+ - deadline scheduling.
sched-stats.txt
- information on schedstats (Linux Scheduler Statistics).
diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt
new file mode 100644
index 0000000..18adc92
--- /dev/null
+++ b/Documentation/scheduler/sched-deadline.txt
@@ -0,0 +1,281 @@
+ Deadline Task Scheduling
+ ------------------------
+
+CONTENTS
+========
+
+ 0. WARNING
+ 1. Overview
+ 2. Scheduling algorithm
+ 3. Scheduling Real-Time Tasks
+ 4. Bandwidth management
+ 4.1 System-wide settings
+ 4.2 Task interface
+ 4.3 Default behavior
+ 5. Tasks CPU affinity
+ 5.1 SCHED_DEADLINE and cpusets HOWTO
+ 6. Future plans
+
+
+0. WARNING
+==========
+
+ Fiddling with these settings can result in an unpredictable or even unstable
+ system behavior. As for -rt (group) scheduling, it is assumed that root users
+ know what they're doing.
+
+
+1. Overview
+===========
+
+ The SCHED_DEADLINE policy contained inside the sched_dl scheduling class is
+ basically an implementation of the Earliest Deadline First (EDF) scheduling
+ algorithm, augmented with a mechanism (called Constant Bandwidth Server, CBS)
+ that makes it possible to isolate the behavior of tasks between each other.
+
+
+2. Scheduling algorithm
+==================
+
+ SCHED_DEADLINE uses three parameters, named "runtime", "period", and
+ "deadline" to schedule tasks. A SCHED_DEADLINE task is guaranteed to receive
+ "runtime" microseconds of execution time every "period" microseconds, and
+ these "runtime" microseconds are available within "deadline" microseconds
+ from the beginning of the period. In order to implement this behaviour,
+ every time the task wakes up, the scheduler computes a "scheduling deadline"
+ consistent with the guarantee (using the CBS[2,3] algorithm). Tasks are then
+ scheduled using EDF[1] on these scheduling deadlines (the task with the
+ smallest scheduling deadline is selected for execution). Notice that this
+ guaranteed is respected if a proper "admission control" strategy (see Section
+ "4. Bandwidth management") is used.
+
+ Summing up, the CBS[2,3] algorithms assigns scheduling deadlines to tasks so
+ that each task runs for at most its runtime every period, avoiding any
+ interference between different tasks (bandwidth isolation), while the EDF[1]
+ algorithm selects the task with the smallest scheduling deadline as the one
+ to be executed first. Thanks to this feature, also tasks that do not
+ strictly comply with the "traditional" real-time task model (see Section 3)
+ can effectively use the new policy.
+
+ In more details, the CBS algorithm assigns scheduling deadlines to
+ tasks in the following way:
+
+ - Each SCHED_DEADLINE task is characterised by the "runtime",
+ "deadline", and "period" parameters;
+
+ - The state of the task is described by a "scheduling deadline", and
+ a "current runtime". These two parameters are initially set to 0;
+
+ - When a SCHED_DEADLINE task wakes up (becomes ready for execution),
+ the scheduler checks if
+
+ current runtime runtime
+ ---------------------------------- > ----------------
+ scheduling deadline - current time period
+
+ then, if the scheduling deadline is smaller than the current time, or
+ this condition is verified, the scheduling deadline and the
+ current budget are re-initialised as
+
+ scheduling deadline = current time + deadline
+ current runtime = runtime
+
+ otherwise, the scheduling deadline and the current runtime are
+ left unchanged;
+
+ - When a SCHED_DEADLINE task executes for an amount of time t, its
+ current runtime is decreased as
+
+ current runtime = current runtime - t
+
+ (technically, the runtime is decreased at every tick, or when the
+ task is descheduled / preempted);
+
+ - When the current runtime becomes less or equal than 0, the task is
+ said to be "throttled" (also known as "depleted" in real-time literature)
+ and cannot be scheduled until its scheduling deadline. The "replenishment
+ time" for this task (see next item) is set to be equal to the current
+ value of the scheduling deadline;
+
+ - When the current time is equal to the replenishment time of a
+ throttled task, the scheduling deadline and the current runtime are
+ updated as
+
+ scheduling deadline = scheduling deadline + period
+ current runtime = current runtime + runtime
+
+
+3. Scheduling Real-Time Tasks
+=============================
+
+ * BIG FAT WARNING ******************************************************
+ *
+ * This section contains a (not-thorough) summary on classical deadline
+ * scheduling theory, and how it applies to SCHED_DEADLINE.
+ * The reader can "safely" skip to Section 4 if only interested in seeing
+ * how the scheduling policy can be used. Anyway, we strongly recommend
+ * to come back here and continue reading (once the urge for testing is
+ * satisfied :P) to be sure of fully understanding all technical details.
+ ************************************************************************
+
+ There are no limitations on what kind of task can exploit this new
+ scheduling discipline, even if it must be said that it is particularly
+ suited for periodic or sporadic real-time tasks that need guarantees on their
+ timing behavior, e.g., multimedia, streaming, control applications, etc.
+
+ A typical real-time task is composed of a repetition of computation phases
+ (task instances, or jobs) which are activated on a periodic or sporadic
+ fashion.
+ Each job J_j (where J_j is the j^th job of the task) is characterised by an
+ arrival time r_j (the time when the job starts), an amount of computation
+ time c_j needed to finish the job, and a job absolute deadline d_j, which
+ is the time within which the job should be finished. The maximum execution
+ time max_j{c_j} is called "Worst Case Execution Time" (WCET) for the task.
+ A real-time task can be periodic with period P if r_{j+1} = r_j + P, or
+ sporadic with minimum inter-arrival time P is r_{j+1} >= r_j + P. Finally,
+ d_j = r_j + D, where D is the task's relative deadline.
+
+ SCHED_DEADLINE can be used to schedule real-time tasks guaranteeing that
+ the jobs' deadlines of a task are respected. In order to do this, a task
+ must be scheduled by setting:
+
+ - runtime >= WCET
+ - deadline = D
+ - period <= P
+
+ IOW, if runtime >= WCET and if period is >= P, then the scheduling deadlines
+ and the absolute deadlines (d_j) coincide, so a proper admission control
+ allows to respect the jobs' absolute deadlines for this task (this is what is
+ called "hard schedulability property" and is an extension of Lemma 1 of [2]).
+
+ References:
+ 1 - C. L. Liu and J. W. Layland. Scheduling algorithms for multiprogram-
+ ming in a hard-real-time environment. Journal of the Association for
+ Computing Machinery, 20(1), 1973.
+ 2 - L. Abeni , G. Buttazzo. Integrating Multimedia Applications in Hard
+ Real-Time Systems. Proceedings of the 19th IEEE Real-time Systems
+ Symposium, 1998. http://retis.sssup.it/~giorgio/paps/1998/rtss98-cbs.pdf
+ 3 - L. Abeni. Server Mechanisms for Multimedia Applications. ReTiS Lab
+ Technical Report. http://xoomer.virgilio.it/lucabe72/pubs/tr-98-01.ps
+
+4. Bandwidth management
+=======================
+
+ In order for the -deadline scheduling to be effective and useful, it is
+ important to have some method to keep the allocation of the available CPU
+ bandwidth to the tasks under control.
+ This is usually called "admission control" and if it is not performed at all,
+ no guarantee can be given on the actual scheduling of the -deadline tasks.
+
+ Since when RT-throttling has been introduced each task group has a bandwidth
+ associated, calculated as a certain amount of runtime over a period.
+ Moreover, to make it possible to manipulate such bandwidth, readable/writable
+ controls have been added to both procfs (for system wide settings) and cgroupfs
+ (for per-group settings).
+ Therefore, the same interface is being used for controlling the bandwidth
+ distrubution to -deadline tasks.
+
+ However, more discussion is needed in order to figure out how we want to manage
+ SCHED_DEADLINE bandwidth at the task group level. Therefore, SCHED_DEADLINE
+ uses (for now) a less sophisticated, but actually very sensible, mechanism to
+ ensure that a certain utilization cap is not overcome per each root_domain.
+
+ Another main difference between deadline bandwidth management and RT-throttling
+ is that -deadline tasks have bandwidth on their own (while -rt ones don't!),
+ and thus we don't need an higher level throttling mechanism to enforce the
+ desired bandwidth.
+
+4.1 System wide settings
+------------------------
+
+ The system wide settings are configured under the /proc virtual file system.
+
+ For now the -rt knobs are used for dl admission control and the -deadline
+ runtime is accounted against the -rt runtime. We realise that this isn't
+ entirely desirable; however, it is better to have a small interface for now,
+ and be able to change it easily later. The ideal situation (see 5.) is to run
+ -rt tasks from a -deadline server; in which case the -rt bandwidth is a direct
+ subset of dl_bw.
+
+ This means that, for a root_domain comprising M CPUs, -deadline tasks
+ can be created while the sum of their bandwidths stays below:
+
+ M * (sched_rt_runtime_us / sched_rt_period_us)
+
+ It is also possible to disable this bandwidth management logic, and
+ be thus free of oversubscribing the system up to any arbitrary level.
+ This is done by writing -1 in /proc/sys/kernel/sched_rt_runtime_us.
+
+
+4.2 Task interface
+------------------
+
+ Specifying a periodic/sporadic task that executes for a given amount of
+ runtime at each instance, and that is scheduled according to the urgency of
+ its own timing constraints needs, in general, a way of declaring:
+ - a (maximum/typical) instance execution time,
+ - a minimum interval between consecutive instances,
+ - a time constraint by which each instance must be completed.
+
+ Therefore:
+ * a new struct sched_attr, containing all the necessary fields is
+ provided;
+ * the new scheduling related syscalls that manipulate it, i.e.,
+ sched_setattr() and sched_getattr() are implemented.
+
+
+4.3 Default behavior
+---------------------
+
+ The default value for SCHED_DEADLINE bandwidth is to have rt_runtime equal to
+ 950000. With rt_period equal to 1000000, by default, it means that -deadline
+ tasks can use at most 95%, multiplied by the number of CPUs that compose the
+ root_domain, for each root_domain.
+
+ A -deadline task cannot fork.
+
+5. Tasks CPU affinity
+=====================
+
+ -deadline tasks cannot have an affinity mask smaller that the entire
+ root_domain they are created on. However, affinities can be specified
+ through the cpuset facility (Documentation/cgroups/cpusets.txt).
+
+5.1 SCHED_DEADLINE and cpusets HOWTO
+------------------------------------
+
+ An example of a simple configuration (pin a -deadline task to CPU0)
+ follows (rt-app is used to create a -deadline task).
+
+ mkdir /dev/cpuset
+ mount -t cgroup -o cpuset cpuset /dev/cpuset
+ cd /dev/cpuset
+ mkdir cpu0
+ echo 0 > cpu0/cpuset.cpus
+ echo 0 > cpu0/cpuset.mems
+ echo 1 > cpuset.cpu_exclusive
+ echo 0 > cpuset.sched_load_balance
+ echo 1 > cpu0/cpuset.cpu_exclusive
+ echo 1 > cpu0/cpuset.mem_exclusive
+ echo $$ > cpu0/tasks
+ rt-app -t 100000:10000:d:0 -D5 (it is now actually superfluous to specify
+ task affinity)
+
+6. Future plans
+===============
+
+ Still missing:
+
+ - refinements to deadline inheritance, especially regarding the possibility
+ of retaining bandwidth isolation among non-interacting tasks. This is
+ being studied from both theoretical and practical points of view, and
+ hopefully we should be able to produce some demonstrative code soon;
+ - (c)group based bandwidth management, and maybe scheduling;
+ - access control for non-root users (and related security concerns to
+ address), which is the best way to allow unprivileged use of the mechanisms
+ and how to prevent non-root users "cheat" the system?
+
+ As already discussed, we are planning also to merge this work with the EDF
+ throttling patches [https://lkml.org/lkml/2010/2/23/239] but we still are in
+ the preliminary phases of the merge and we really seek feedback that would
+ help us decide on the direction it should take.
diff --git a/include/linux/wait.h b/include/linux/wait.h
index eaa00b1..559044c 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -286,8 +286,8 @@ do { \
* wait_event_cmd - sleep until a condition gets true
* @wq: the waitqueue to wait on
* @condition: a C expression for the event to wait for
- * cmd1: the command will be executed before sleep
- * cmd2: the command will be executed after sleep
+ * @cmd1: the command will be executed before sleep
+ * @cmd2: the command will be executed after sleep
*
* The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
* @condition evaluates to true. The @condition is checked each time
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 36c951b..81343d6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4324,7 +4324,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
goto out_unlock;
rq = task_rq_lock(p, &flags);
- time_slice = p->sched_class->get_rr_interval(rq, p);
+ time_slice = 0;
+ if (p->sched_class->get_rr_interval)
+ time_slice = p->sched_class->get_rr_interval(rq, p);
task_rq_unlock(rq, p, &flags);
rcu_read_unlock();
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0de2482..0dd5e09 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -351,7 +351,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
* disrupting the schedulability of the system. Otherwise, we should
* refill the runtime and set the deadline a period in the future,
* because keeping the current (absolute) deadline of the task would
- * result in breaking guarantees promised to other tasks.
+ * result in breaking guarantees promised to other tasks (refer to
+ * Documentation/scheduler/sched-deadline.txt for more informations).
*
* This function returns true if:
*
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2014-01-25 7:26 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2014-01-25 7:26 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 5e3c1afd4587e70c201bf7224b51f747c9a3dfa8 sched/x86/tsc: Initialize multiplier to 0
A couple of regression fixes mostly hitting virtualized setups, but
also some bare metal systems.
Thanks,
Ingo
------------------>
Peter Zijlstra (3):
sched/preempt/x86: Fix voluntary preempt for x86
sched/clock: Fixup early initialization
sched/x86/tsc: Initialize multiplier to 0
Vincent Guittot (1):
Revert "sched: Fix sleep time double accounting in enqueue entity"
arch/x86/kernel/tsc.c | 2 +-
include/linux/preempt.h | 5 -----
kernel/sched/clock.c | 53 ++++++++++++++++++++++++++++++++++++++-----------
kernel/sched/fair.c | 8 +-------
4 files changed, 43 insertions(+), 25 deletions(-)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index a3acbac..19e5adb 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -180,7 +180,7 @@ static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
static void cyc2ns_data_init(struct cyc2ns_data *data)
{
- data->cyc2ns_mul = 1U << CYC2NS_SCALE_FACTOR;
+ data->cyc2ns_mul = 0;
data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
data->cyc2ns_offset = 0;
data->__count = 0;
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 59749fc..de83b4e 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -134,7 +134,6 @@ do { \
#undef preempt_check_resched
#endif
-#ifdef CONFIG_PREEMPT
#define preempt_set_need_resched() \
do { \
set_preempt_need_resched(); \
@@ -144,10 +143,6 @@ do { \
if (tif_need_resched()) \
set_preempt_need_resched(); \
} while (0)
-#else
-#define preempt_set_need_resched() do { } while (0)
-#define preempt_fold_need_resched() do { } while (0)
-#endif
#ifdef CONFIG_PREEMPT_NOTIFIERS
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 6bd6a67..43c2bcc 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -77,35 +77,50 @@ __read_mostly int sched_clock_running;
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
static struct static_key __sched_clock_stable = STATIC_KEY_INIT;
+static int __sched_clock_stable_early;
int sched_clock_stable(void)
{
- if (static_key_false(&__sched_clock_stable))
- return false;
- return true;
+ return static_key_false(&__sched_clock_stable);
}
-void set_sched_clock_stable(void)
+static void __set_sched_clock_stable(void)
{
if (!sched_clock_stable())
- static_key_slow_dec(&__sched_clock_stable);
+ static_key_slow_inc(&__sched_clock_stable);
+}
+
+void set_sched_clock_stable(void)
+{
+ __sched_clock_stable_early = 1;
+
+ smp_mb(); /* matches sched_clock_init() */
+
+ if (!sched_clock_running)
+ return;
+
+ __set_sched_clock_stable();
}
static void __clear_sched_clock_stable(struct work_struct *work)
{
/* XXX worry about clock continuity */
if (sched_clock_stable())
- static_key_slow_inc(&__sched_clock_stable);
+ static_key_slow_dec(&__sched_clock_stable);
}
static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable);
void clear_sched_clock_stable(void)
{
- if (keventd_up())
- schedule_work(&sched_clock_work);
- else
- __clear_sched_clock_stable(&sched_clock_work);
+ __sched_clock_stable_early = 0;
+
+ smp_mb(); /* matches sched_clock_init() */
+
+ if (!sched_clock_running)
+ return;
+
+ schedule_work(&sched_clock_work);
}
struct sched_clock_data {
@@ -140,6 +155,20 @@ void sched_clock_init(void)
}
sched_clock_running = 1;
+
+ /*
+ * Ensure that it is impossible to not do a static_key update.
+ *
+ * Either {set,clear}_sched_clock_stable() must see sched_clock_running
+ * and do the update, or we must see their __sched_clock_stable_early
+ * and do the update, or both.
+ */
+ smp_mb(); /* matches {set,clear}_sched_clock_stable() */
+
+ if (__sched_clock_stable_early)
+ __set_sched_clock_stable();
+ else
+ __clear_sched_clock_stable(NULL);
}
/*
@@ -340,7 +369,7 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
*/
u64 cpu_clock(int cpu)
{
- if (static_key_false(&__sched_clock_stable))
+ if (!sched_clock_stable())
return sched_clock_cpu(cpu);
return sched_clock();
@@ -355,7 +384,7 @@ u64 cpu_clock(int cpu)
*/
u64 local_clock(void)
{
- if (static_key_false(&__sched_clock_stable))
+ if (!sched_clock_stable())
return sched_clock_cpu(raw_smp_processor_id());
return sched_clock();
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b24b6cf..efe6457 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2356,13 +2356,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
}
wakeup = 0;
} else {
- /*
- * Task re-woke on same cpu (or else migrate_task_rq_fair()
- * would have made count negative); we must be careful to avoid
- * double-accounting blocked time after synchronizing decays.
- */
- se->avg.last_runnable_update += __synchronize_entity_decay(se)
- << 20;
+ __synchronize_entity_decay(se);
}
/* migrated tasks did not contribute to our blocked load */
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2013-12-19 16:55 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2013-12-19 16:55 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, H. Peter Anvin,
Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 757dfcaa41844595964f1220f1d33182dae49976 sched/rt: Fix rq's cpupri leak while enqueue/dequeue child RT entities
An RT group-scheduling fix and the sched-domains topology setup fix
from Mel.
Thanks,
Ingo
------------------>
Kirill Tkhai (1):
sched/rt: Fix rq's cpupri leak while enqueue/dequeue child RT entities
Mel Gorman (1):
sched: Assign correct scheduling domain to 'sd_llc'
kernel/sched/core.c | 5 +++--
kernel/sched/rt.c | 14 ++++++++++++++
2 files changed, 17 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 19af58f..a88f4a4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4902,6 +4902,7 @@ DEFINE_PER_CPU(struct sched_domain *, sd_asym);
static void update_top_cache_domain(int cpu)
{
struct sched_domain *sd;
+ struct sched_domain *busy_sd = NULL;
int id = cpu;
int size = 1;
@@ -4909,9 +4910,9 @@ static void update_top_cache_domain(int cpu)
if (sd) {
id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
- sd = sd->parent; /* sd_busy */
+ busy_sd = sd->parent; /* sd_busy */
}
- rcu_assign_pointer(per_cpu(sd_busy, cpu), sd);
+ rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_size, cpu) = size;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7d57275..1c40655 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -901,6 +901,13 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
struct rq *rq = rq_of_rt_rq(rt_rq);
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * Change rq's cpupri only if rt_rq is the top queue.
+ */
+ if (&rq->rt != rt_rq)
+ return;
+#endif
if (rq->online && prio < prev_prio)
cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
}
@@ -910,6 +917,13 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
struct rq *rq = rq_of_rt_rq(rt_rq);
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * Change rq's cpupri only if rt_rq is the top queue.
+ */
+ if (&rq->rt != rt_rq)
+ return;
+#endif
if (rq->online && rt_rq->highest_prio.curr != prev_prio)
cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
}
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2013-12-17 13:40 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2013-12-17 13:40 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, H. Peter Anvin,
Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 9dbdb155532395ba000c5d5d187658b0e17e529f sched/fair: Rework sched_fair time accounting
Three fixes for scheduler crashes, each triggers in relatively rare,
hardware environment dependent situations.
Thanks,
Ingo
------------------>
Peter Zijlstra (4):
sched: Initialize power_orig for overlapping groups
sched: Remove PREEMPT_NEED_RESCHED from generic code
math64: Add mul_u64_u32_shr()
sched/fair: Rework sched_fair time accounting
arch/x86/Kconfig | 1 +
arch/x86/include/asm/preempt.h | 11 ++++
include/asm-generic/preempt.h | 35 ++++------
include/linux/math64.h | 30 +++++++++
include/linux/sched.h | 5 +-
init/Kconfig | 6 ++
kernel/sched/core.c | 1 +
kernel/sched/fair.c | 144 ++++++++++++++++++-----------------------
8 files changed, 126 insertions(+), 107 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e903c71..0952ecd 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -26,6 +26,7 @@ config X86
select HAVE_AOUT if X86_32
select HAVE_UNSTABLE_SCHED_CLOCK
select ARCH_SUPPORTS_NUMA_BALANCING
+ select ARCH_SUPPORTS_INT128 if X86_64
select ARCH_WANTS_PROT_NUMA_PROT_NONE
select HAVE_IDE
select HAVE_OPROFILE
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 8729723..c8b0519 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -8,6 +8,12 @@
DECLARE_PER_CPU(int, __preempt_count);
/*
+ * We use the PREEMPT_NEED_RESCHED bit as an inverted NEED_RESCHED such
+ * that a decrement hitting 0 means we can and should reschedule.
+ */
+#define PREEMPT_ENABLED (0 + PREEMPT_NEED_RESCHED)
+
+/*
* We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users
* that think a non-zero value indicates we cannot preempt.
*/
@@ -74,6 +80,11 @@ static __always_inline void __preempt_count_sub(int val)
__this_cpu_add_4(__preempt_count, -val);
}
+/*
+ * Because we keep PREEMPT_NEED_RESCHED set when we do _not_ need to reschedule
+ * a decrement which hits zero means we have no preempt_count and should
+ * reschedule.
+ */
static __always_inline bool __preempt_count_dec_and_test(void)
{
GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), "e");
diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h
index ddf2b42..1cd3f5d 100644
--- a/include/asm-generic/preempt.h
+++ b/include/asm-generic/preempt.h
@@ -3,13 +3,11 @@
#include <linux/thread_info.h>
-/*
- * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users
- * that think a non-zero value indicates we cannot preempt.
- */
+#define PREEMPT_ENABLED (0)
+
static __always_inline int preempt_count(void)
{
- return current_thread_info()->preempt_count & ~PREEMPT_NEED_RESCHED;
+ return current_thread_info()->preempt_count;
}
static __always_inline int *preempt_count_ptr(void)
@@ -17,11 +15,6 @@ static __always_inline int *preempt_count_ptr(void)
return ¤t_thread_info()->preempt_count;
}
-/*
- * We now loose PREEMPT_NEED_RESCHED and cause an extra reschedule; however the
- * alternative is loosing a reschedule. Better schedule too often -- also this
- * should be a very rare operation.
- */
static __always_inline void preempt_count_set(int pc)
{
*preempt_count_ptr() = pc;
@@ -41,28 +34,17 @@ static __always_inline void preempt_count_set(int pc)
task_thread_info(p)->preempt_count = PREEMPT_ENABLED; \
} while (0)
-/*
- * We fold the NEED_RESCHED bit into the preempt count such that
- * preempt_enable() can decrement and test for needing to reschedule with a
- * single instruction.
- *
- * We invert the actual bit, so that when the decrement hits 0 we know we both
- * need to resched (the bit is cleared) and can resched (no preempt count).
- */
-
static __always_inline void set_preempt_need_resched(void)
{
- *preempt_count_ptr() &= ~PREEMPT_NEED_RESCHED;
}
static __always_inline void clear_preempt_need_resched(void)
{
- *preempt_count_ptr() |= PREEMPT_NEED_RESCHED;
}
static __always_inline bool test_preempt_need_resched(void)
{
- return !(*preempt_count_ptr() & PREEMPT_NEED_RESCHED);
+ return false;
}
/*
@@ -81,7 +63,12 @@ static __always_inline void __preempt_count_sub(int val)
static __always_inline bool __preempt_count_dec_and_test(void)
{
- return !--*preempt_count_ptr();
+ /*
+ * Because of load-store architectures cannot do per-cpu atomic
+ * operations; we cannot use PREEMPT_NEED_RESCHED because it might get
+ * lost.
+ */
+ return !--*preempt_count_ptr() && tif_need_resched();
}
/*
@@ -89,7 +76,7 @@ static __always_inline bool __preempt_count_dec_and_test(void)
*/
static __always_inline bool should_resched(void)
{
- return unlikely(!*preempt_count_ptr());
+ return unlikely(!preempt_count() && tif_need_resched());
}
#ifdef CONFIG_PREEMPT
diff --git a/include/linux/math64.h b/include/linux/math64.h
index 69ed5f5..c45c089 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -133,4 +133,34 @@ __iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
return ret;
}
+#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
+
+#ifndef mul_u64_u32_shr
+static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
+{
+ return (u64)(((unsigned __int128)a * mul) >> shift);
+}
+#endif /* mul_u64_u32_shr */
+
+#else
+
+#ifndef mul_u64_u32_shr
+static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
+{
+ u32 ah, al;
+ u64 ret;
+
+ al = a;
+ ah = a >> 32;
+
+ ret = ((u64)al * mul) >> shift;
+ if (ah)
+ ret += ((u64)ah * mul) << (32 - shift);
+
+ return ret;
+}
+#endif /* mul_u64_u32_shr */
+
+#endif
+
#endif /* _LINUX_MATH64_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 768b037..53f97eb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -440,8 +440,6 @@ struct task_cputime {
.sum_exec_runtime = 0, \
}
-#define PREEMPT_ENABLED (PREEMPT_NEED_RESCHED)
-
#ifdef CONFIG_PREEMPT_COUNT
#define PREEMPT_DISABLED (1 + PREEMPT_ENABLED)
#else
@@ -932,7 +930,8 @@ struct pipe_inode_info;
struct uts_namespace;
struct load_weight {
- unsigned long weight, inv_weight;
+ unsigned long weight;
+ u32 inv_weight;
};
struct sched_avg {
diff --git a/init/Kconfig b/init/Kconfig
index 79383d3..4e5d96a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -809,6 +809,12 @@ config GENERIC_SCHED_CLOCK
config ARCH_SUPPORTS_NUMA_BALANCING
bool
+#
+# For architectures that know their GCC __int128 support is sound
+#
+config ARCH_SUPPORTS_INT128
+ bool
+
# For architectures that (ab)use NUMA to represent different memory regions
# all cpu-local but of different latencies, such as SuperH.
#
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e85cda2..19af58f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5112,6 +5112,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
* die on a /0 trap.
*/
sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
+ sg->sgp->power_orig = sg->sgp->power;
/*
* Make sure the first group of this domain contains the
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fd773ad..9030da7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -178,59 +178,61 @@ void sched_init_granularity(void)
update_sysctl();
}
-#if BITS_PER_LONG == 32
-# define WMULT_CONST (~0UL)
-#else
-# define WMULT_CONST (1UL << 32)
-#endif
-
+#define WMULT_CONST (~0U)
#define WMULT_SHIFT 32
-/*
- * Shift right and round:
- */
-#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
+static void __update_inv_weight(struct load_weight *lw)
+{
+ unsigned long w;
+
+ if (likely(lw->inv_weight))
+ return;
+
+ w = scale_load_down(lw->weight);
+
+ if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
+ lw->inv_weight = 1;
+ else if (unlikely(!w))
+ lw->inv_weight = WMULT_CONST;
+ else
+ lw->inv_weight = WMULT_CONST / w;
+}
/*
- * delta *= weight / lw
+ * delta_exec * weight / lw.weight
+ * OR
+ * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
+ *
+ * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
+ * we're guaranteed shift stays positive because inv_weight is guaranteed to
+ * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
+ *
+ * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
+ * weight/lw.weight <= 1, and therefore our shift will also be positive.
*/
-static unsigned long
-calc_delta_mine(unsigned long delta_exec, unsigned long weight,
- struct load_weight *lw)
+static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
- u64 tmp;
+ u64 fact = scale_load_down(weight);
+ int shift = WMULT_SHIFT;
- /*
- * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
- * entities since MIN_SHARES = 2. Treat weight as 1 if less than
- * 2^SCHED_LOAD_RESOLUTION.
- */
- if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
- tmp = (u64)delta_exec * scale_load_down(weight);
- else
- tmp = (u64)delta_exec;
+ __update_inv_weight(lw);
- if (!lw->inv_weight) {
- unsigned long w = scale_load_down(lw->weight);
-
- if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
- lw->inv_weight = 1;
- else if (unlikely(!w))
- lw->inv_weight = WMULT_CONST;
- else
- lw->inv_weight = WMULT_CONST / w;
+ if (unlikely(fact >> 32)) {
+ while (fact >> 32) {
+ fact >>= 1;
+ shift--;
+ }
}
- /*
- * Check whether we'd overflow the 64-bit multiplication:
- */
- if (unlikely(tmp > WMULT_CONST))
- tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
- WMULT_SHIFT/2);
- else
- tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
+ /* hint to use a 32x32->64 mul */
+ fact = (u64)(u32)fact * lw->inv_weight;
+
+ while (fact >> 32) {
+ fact >>= 1;
+ shift--;
+ }
- return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
+ return mul_u64_u32_shr(delta_exec, fact, shift);
}
@@ -443,7 +445,7 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
#endif /* CONFIG_FAIR_GROUP_SCHED */
static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
/**************************************************************
* Scheduling class tree data structure manipulation methods:
@@ -612,11 +614,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
/*
* delta /= w
*/
-static inline unsigned long
-calc_delta_fair(unsigned long delta, struct sched_entity *se)
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
if (unlikely(se->load.weight != NICE_0_LOAD))
- delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
+ delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
return delta;
}
@@ -665,7 +666,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_load_add(&lw, se->load.weight);
load = &lw;
}
- slice = calc_delta_mine(slice, se->load.weight, load);
+ slice = __calc_delta(slice, se->load.weight, load);
}
return slice;
}
@@ -703,47 +704,32 @@ void init_task_runnable_average(struct task_struct *p)
#endif
/*
- * Update the current task's runtime statistics. Skip current tasks that
- * are not in our scheduling class.
+ * Update the current task's runtime statistics.
*/
-static inline void
-__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
- unsigned long delta_exec)
-{
- unsigned long delta_exec_weighted;
-
- schedstat_set(curr->statistics.exec_max,
- max((u64)delta_exec, curr->statistics.exec_max));
-
- curr->sum_exec_runtime += delta_exec;
- schedstat_add(cfs_rq, exec_clock, delta_exec);
- delta_exec_weighted = calc_delta_fair(delta_exec, curr);
-
- curr->vruntime += delta_exec_weighted;
- update_min_vruntime(cfs_rq);
-}
-
static void update_curr(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
u64 now = rq_clock_task(rq_of(cfs_rq));
- unsigned long delta_exec;
+ u64 delta_exec;
if (unlikely(!curr))
return;
- /*
- * Get the amount of time the current task was running
- * since the last time we changed load (this cannot
- * overflow on 32 bits):
- */
- delta_exec = (unsigned long)(now - curr->exec_start);
- if (!delta_exec)
+ delta_exec = now - curr->exec_start;
+ if (unlikely((s64)delta_exec <= 0))
return;
- __update_curr(cfs_rq, curr, delta_exec);
curr->exec_start = now;
+ schedstat_set(curr->statistics.exec_max,
+ max(delta_exec, curr->statistics.exec_max));
+
+ curr->sum_exec_runtime += delta_exec;
+ schedstat_add(cfs_rq, exec_clock, delta_exec);
+
+ curr->vruntime += calc_delta_fair(delta_exec, curr);
+ update_min_vruntime(cfs_rq);
+
if (entity_is_task(curr)) {
struct task_struct *curtask = task_of(curr);
@@ -3015,8 +3001,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
}
}
-static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
- unsigned long delta_exec)
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
/* dock delta_exec before expiring quota (as it could span periods) */
cfs_rq->runtime_remaining -= delta_exec;
@@ -3034,7 +3019,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
}
static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
return;
@@ -3574,8 +3559,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
return rq_clock_task(rq_of(cfs_rq));
}
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
- unsigned long delta_exec) {}
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2013-12-02 14:43 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2013-12-02 14:43 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, H. Peter Anvin,
Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 96739d6e548e16d76de39d059e1e39e70c187fff sched/doc: Fix generation of device-drivers
Various smaller fixlets, all over the place.
Thanks,
Ingo
------------------>
Alex Shi (1):
sched: Remove unused variable in 'struct sched_domain'
Joe Perches (1):
MAINTAINERS: Update file patterns in the lockdep and scheduler entries
Nicolas Dichtel (1):
sched/doc: Fix generation of device-drivers
Peter Zijlstra (1):
sched: Avoid NULL dereference on sd_busy
Shigeru Yoshida (1):
sched: Fix a trivial typo in comments
Srikar Dronamraju (1):
sched: Check sched_domain before computing group power
Thomas Gleixner (1):
sched: Expose preempt_schedule_irq()
Documentation/DocBook/device-drivers.tmpl | 2 +-
MAINTAINERS | 3 +--
include/linux/sched.h | 2 --
kernel/sched/core.c | 8 ++++----
kernel/sched/fair.c | 27 ++++++++++++++++++++++++---
5 files changed, 30 insertions(+), 12 deletions(-)
diff --git a/Documentation/DocBook/device-drivers.tmpl b/Documentation/DocBook/device-drivers.tmpl
index 6c9d9d3..f517008 100644
--- a/Documentation/DocBook/device-drivers.tmpl
+++ b/Documentation/DocBook/device-drivers.tmpl
@@ -58,7 +58,7 @@
</sect1>
<sect1><title>Wait queues and Wake events</title>
!Iinclude/linux/wait.h
-!Ekernel/wait.c
+!Ekernel/sched/wait.c
</sect1>
<sect1><title>High-resolution timers</title>
!Iinclude/linux/ktime.h
diff --git a/MAINTAINERS b/MAINTAINERS
index f3ef1d1..3f85561 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5236,7 +5236,7 @@ S: Maintained
F: Documentation/lockdep*.txt
F: Documentation/lockstat.txt
F: include/linux/lockdep.h
-F: kernel/lockdep*
+F: kernel/locking/
LOGICAL DISK MANAGER SUPPORT (LDM, Windows 2000/XP/Vista Dynamic Disks)
M: "Richard Russon (FlatCap)" <ldm@flatcap.org>
@@ -7361,7 +7361,6 @@ S: Maintained
F: kernel/sched/
F: include/linux/sched.h
F: include/uapi/linux/sched.h
-F: kernel/wait.c
F: include/linux/wait.h
SCORE ARCHITECTURE
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f7efc86..b122395 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -823,8 +823,6 @@ struct sched_domain {
unsigned int balance_interval; /* initialise to 1. units in ms. */
unsigned int nr_balance_failed; /* initialise to 0 */
- u64 last_update;
-
/* idle_balance() stats */
u64 max_newidle_lb_cost;
unsigned long next_decay_max_lb_cost;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c180860..e85cda2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2660,6 +2660,7 @@ asmlinkage void __sched notrace preempt_schedule(void)
} while (need_resched());
}
EXPORT_SYMBOL(preempt_schedule);
+#endif /* CONFIG_PREEMPT */
/*
* this is the entry point to schedule() from kernel preemption
@@ -2693,8 +2694,6 @@ asmlinkage void __sched preempt_schedule_irq(void)
exception_exit(prev_state);
}
-#endif /* CONFIG_PREEMPT */
-
int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
void *key)
{
@@ -4762,7 +4761,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
cpumask_clear_cpu(rq->cpu, old_rd->span);
/*
- * If we dont want to free the old_rt yet then
+ * If we dont want to free the old_rd yet then
* set old_rd to NULL to skip the freeing later
* in this function:
*/
@@ -4910,8 +4909,9 @@ static void update_top_cache_domain(int cpu)
if (sd) {
id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
- rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent);
+ sd = sd->parent; /* sd_busy */
}
+ rcu_assign_pointer(per_cpu(sd_busy, cpu), sd);
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_size, cpu) = size;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e8b652e..fd773ad 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5379,10 +5379,31 @@ void update_group_power(struct sched_domain *sd, int cpu)
*/
for_each_cpu(cpu, sched_group_cpus(sdg)) {
- struct sched_group *sg = cpu_rq(cpu)->sd->groups;
+ struct sched_group_power *sgp;
+ struct rq *rq = cpu_rq(cpu);
- power_orig += sg->sgp->power_orig;
- power += sg->sgp->power;
+ /*
+ * build_sched_domains() -> init_sched_groups_power()
+ * gets here before we've attached the domains to the
+ * runqueues.
+ *
+ * Use power_of(), which is set irrespective of domains
+ * in update_cpu_power().
+ *
+ * This avoids power/power_orig from being 0 and
+ * causing divide-by-zero issues on boot.
+ *
+ * Runtime updates will correct power_orig.
+ */
+ if (unlikely(!rq->sd)) {
+ power_orig += power_of(cpu);
+ power += power_of(cpu);
+ continue;
+ }
+
+ sgp = rq->sd->groups->sgp;
+ power_orig += sgp->power_orig;
+ power += sgp->power;
}
} else {
/*
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2013-11-13 20:14 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2013-11-13 20:14 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, H. Peter Anvin,
Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
# HEAD: 85b088e934b9943322bfe37077289ae60f1b3414 sched/fair: Avoid integer overflow
Four bugfixes and one performance fix.
Thanks,
Ingo
------------------>
Michael wang (1):
sched: Fix endless sync_sched/rcu() loop inside _cpu_down()
Michal Nazarewicz (1):
sched/fair: Avoid integer overflow
Peter Zijlstra (2):
sched/numa: Cure update_numa_stats() vs. hotplug
sched: Optimize task_sched_runtime()
Rik van Riel (1):
sched/numa: Fix NULL pointer dereference in task_numa_migrate()
kernel/cpu.c | 5 ++++-
kernel/sched/core.c | 14 ++++++++++++++
kernel/sched/fair.c | 31 ++++++++++++++++++++++++++++---
3 files changed, 46 insertions(+), 4 deletions(-)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 63aa50d..2227b58 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -306,7 +306,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
__func__, cpu);
goto out_release;
}
- smpboot_park_threads(cpu);
/*
* By now we've cleared cpu_active_mask, wait for all preempt-disabled
@@ -315,12 +314,16 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
*
* For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
* not imply sync_sched(), so explicitly call both.
+ *
+ * Do sync before park smpboot threads to take care the rcu boost case.
*/
#ifdef CONFIG_PREEMPT
synchronize_sched();
#endif
synchronize_rcu();
+ smpboot_park_threads(cpu);
+
/*
* So now all preempt/rcu users must observe !cpu_active().
*/
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1deccd7..c180860 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2253,6 +2253,20 @@ unsigned long long task_sched_runtime(struct task_struct *p)
struct rq *rq;
u64 ns = 0;
+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
+ /*
+ * 64-bit doesn't need locks to atomically read a 64bit value.
+ * So we have a optimization chance when the task's delta_exec is 0.
+ * Reading ->on_cpu is racy, but this is ok.
+ *
+ * If we race with it leaving cpu, we'll take a lock. So we're correct.
+ * If we race with it entering cpu, unaccounted time is 0. This is
+ * indistinguishable from the read occurring a few cycles earlier.
+ */
+ if (!p->on_cpu)
+ return p->se.sum_exec_runtime;
+#endif
+
rq = task_rq_lock(p, &flags);
ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
task_rq_unlock(rq, p, &flags);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index df77c60..e8b652e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1000,7 +1000,7 @@ struct numa_stats {
*/
static void update_numa_stats(struct numa_stats *ns, int nid)
{
- int cpu;
+ int cpu, cpus = 0;
memset(ns, 0, sizeof(*ns));
for_each_cpu(cpu, cpumask_of_node(nid)) {
@@ -1009,8 +1009,21 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
ns->nr_running += rq->nr_running;
ns->load += weighted_cpuload(cpu);
ns->power += power_of(cpu);
+
+ cpus++;
}
+ /*
+ * If we raced with hotplug and there are no CPUs left in our mask
+ * the @ns structure is NULL'ed and task_numa_compare() will
+ * not find this node attractive.
+ *
+ * We'll either bail at !has_capacity, or we'll detect a huge imbalance
+ * and bail there.
+ */
+ if (!cpus)
+ return;
+
ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
ns->has_capacity = (ns->nr_running < ns->capacity);
@@ -1201,9 +1214,21 @@ static int task_numa_migrate(struct task_struct *p)
*/
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
- env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+ if (sd)
+ env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
rcu_read_unlock();
+ /*
+ * Cpusets can break the scheduler domain tree into smaller
+ * balance domains, some of which do not cross NUMA boundaries.
+ * Tasks that are "trapped" in such domains cannot be migrated
+ * elsewhere, so there is no point in (re)trying.
+ */
+ if (unlikely(!sd)) {
+ p->numa_preferred_nid = cpu_to_node(task_cpu(p));
+ return -EINVAL;
+ }
+
taskweight = task_weight(p, env.src_nid);
groupweight = group_weight(p, env.src_nid);
update_numa_stats(&env.src_stats, env.src_nid);
@@ -2153,7 +2178,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa,
long contrib;
/* The fraction of a cpu used by this cfs_rq */
- contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
+ contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
sa->runnable_avg_period + 1);
contrib -= cfs_rq->tg_runnable_contrib;
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2013-09-25 18:03 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2013-09-25 18:03 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
HEAD: 7e3115ef5149fc502e3a2e80719dba54a8e7409d sched/balancing: Fix cfs_rq->task_h_load calculation
Three small fixes.
Thanks,
Ingo
------------------>
Vladimir Davydov (3):
sched/balancing: Fix 'local->avg_load > sds->avg_load' case in calculate_imbalance()
sched/balancing: Fix 'local->avg_load > busiest->avg_load' case in fix_small_imbalance()
sched/balancing: Fix cfs_rq->task_h_load calculation
kernel/sched/fair.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 11cd136..7c70201 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4242,7 +4242,7 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
}
if (!se) {
- cfs_rq->h_load = rq->avg.load_avg_contrib;
+ cfs_rq->h_load = cfs_rq->runnable_load_avg;
cfs_rq->last_h_load_update = now;
}
@@ -4823,8 +4823,8 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
(busiest->load_per_task * SCHED_POWER_SCALE) /
busiest->group_power;
- if (busiest->avg_load - local->avg_load + scaled_busy_load_per_task >=
- (scaled_busy_load_per_task * imbn)) {
+ if (busiest->avg_load + scaled_busy_load_per_task >=
+ local->avg_load + (scaled_busy_load_per_task * imbn)) {
env->imbalance = busiest->load_per_task;
return;
}
@@ -4896,7 +4896,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* max load less than avg load(as we skip the groups at or below
* its cpu_power, while calculating max_load..)
*/
- if (busiest->avg_load < sds->avg_load) {
+ if (busiest->avg_load <= sds->avg_load ||
+ local->avg_load >= sds->avg_load) {
env->imbalance = 0;
return fix_small_imbalance(env, sds);
}
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2013-09-18 16:19 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2013-09-18 16:19 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
HEAD: 13b62e46d5407c7d619aea1dc9c3e0991b631b57 sched: Fix comment for sched_info_depart
Misc fixes.
Thanks,
Ingo
------------------>
Daisuke Nishimura (1):
sched/fair: Fix small race where child->se.parent,cfs_rq might point to invalid ones
Li Bin (1):
sched/Documentation: Update sched-design-CFS.txt documentation
Michael S. Tsirkin (1):
sched: Fix comment for sched_info_depart
Peter Zijlstra (1):
sched/debug: Take PID namespace into account
Documentation/scheduler/sched-design-CFS.txt | 4 +---
kernel/sched/debug.c | 6 +++---
kernel/sched/fair.c | 14 +++++++++-----
kernel/sched/stats.h | 5 +++--
4 files changed, 16 insertions(+), 13 deletions(-)
diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt
index d529e02d..f14f493 100644
--- a/Documentation/scheduler/sched-design-CFS.txt
+++ b/Documentation/scheduler/sched-design-CFS.txt
@@ -66,9 +66,7 @@ rq->cfs.load value, which is the sum of the weights of the tasks queued on the
runqueue.
CFS maintains a time-ordered rbtree, where all runnable tasks are sorted by the
-p->se.vruntime key (there is a subtraction using rq->cfs.min_vruntime to
-account for possible wraparounds). CFS picks the "leftmost" task from this
-tree and sticks to it.
+p->se.vruntime key. CFS picks the "leftmost" task from this tree and sticks to it.
As the system progresses forwards, the executed tasks are put into the tree
more and more to the right --- slowly but surely giving a chance for every task
to become the "leftmost task" and thus get on the CPU within a deterministic
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index e076bdd..1965599 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -124,7 +124,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
SEQ_printf(m, " ");
SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
- p->comm, p->pid,
+ p->comm, task_pid_nr(p),
SPLIT_NS(p->se.vruntime),
(long long)(p->nvcsw + p->nivcsw),
p->prio);
@@ -289,7 +289,7 @@ do { \
P(nr_load_updates);
P(nr_uninterruptible);
PN(next_balance);
- P(curr->pid);
+ SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
PN(clock);
P(cpu_load[0]);
P(cpu_load[1]);
@@ -492,7 +492,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
{
unsigned long nr_switches;
- SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid,
+ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p),
get_nr_threads(p));
SEQ_printf(m,
"---------------------------------------------------------"
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9b3fe1c..11cd136 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5928,11 +5928,15 @@ static void task_fork_fair(struct task_struct *p)
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
- if (unlikely(task_cpu(p) != this_cpu)) {
- rcu_read_lock();
- __set_task_cpu(p, this_cpu);
- rcu_read_unlock();
- }
+ /*
+ * Not only the cpu but also the task_group of the parent might have
+ * been changed after parent->se.parent,cfs_rq were copied to
+ * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
+ * of child point to valid ones.
+ */
+ rcu_read_lock();
+ __set_task_cpu(p, this_cpu);
+ rcu_read_unlock();
update_curr(cfs_rq);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 5aef494..c7edee7 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -104,8 +104,9 @@ static inline void sched_info_queued(struct task_struct *t)
}
/*
- * Called when a process ceases being the active-running process, either
- * voluntarily or involuntarily. Now we can calculate how long we ran.
+ * Called when a process ceases being the active-running process involuntarily
+ * due, typically, to expiring its time slice (this may also be called when
+ * switching to the idle task). Now we can calculate how long we ran.
* Also, if the process is still in the TASK_RUNNING state, call
* sched_info_queued() to mark that it has now again started waiting on
* the runqueue.
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2013-08-13 16:58 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2013-08-13 16:58 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
HEAD: bf0bd948d1682e3996adc093b43021ed391983e6 sched: Ensure update_cfs_shares() is called for parents of continuously-running tasks
Docbook fixes that make 99% of the diffstat, plus a oneliner fix.
Thanks,
Ingo
------------------>
Peter Zijlstra (1):
sched: Ensure update_cfs_shares() is called for parents of continuously-running tasks
Yacine Belkadi (1):
sched: Fix some kernel-doc warnings
include/linux/sched.h | 6 ++++
kernel/sched/core.c | 82 ++++++++++++++++++++++++++++++++++++++-------------
kernel/sched/cpupri.c | 4 +--
kernel/sched/fair.c | 10 +++++--
4 files changed, 77 insertions(+), 25 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 50d04b9..8230024 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1532,6 +1532,8 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk)
* Test if a process is not yet dead (at most zombie state)
* If pid_alive fails, then pointers within the task structure
* can be stale and must not be dereferenced.
+ *
+ * Return: 1 if the process is alive. 0 otherwise.
*/
static inline int pid_alive(struct task_struct *p)
{
@@ -1543,6 +1545,8 @@ static inline int pid_alive(struct task_struct *p)
* @tsk: Task structure to be checked.
*
* Check if a task structure is the first user space task the kernel created.
+ *
+ * Return: 1 if the task structure is init. 0 otherwise.
*/
static inline int is_global_init(struct task_struct *tsk)
{
@@ -1893,6 +1897,8 @@ extern struct task_struct *idle_task(int cpu);
/**
* is_idle_task - is the specified task an idle task?
* @p: the task in question.
+ *
+ * Return: 1 if @p is an idle task. 0 otherwise.
*/
static inline bool is_idle_task(const struct task_struct *p)
{
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0d8eb45..4c3967f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -933,6 +933,8 @@ static int effective_prio(struct task_struct *p)
/**
* task_curr - is this task currently executing on a CPU?
* @p: the task in question.
+ *
+ * Return: 1 if the task is currently executing. 0 otherwise.
*/
inline int task_curr(const struct task_struct *p)
{
@@ -1482,7 +1484,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
* the simpler "current->state = TASK_RUNNING" to mark yourself
* runnable without the overhead of this.
*
- * Returns %true if @p was woken up, %false if it was already running
+ * Return: %true if @p was woken up, %false if it was already running.
* or @state didn't match @p's state.
*/
static int
@@ -1577,8 +1579,9 @@ out:
* @p: The process to be woken up.
*
* Attempt to wake up the nominated process and move it to the set of runnable
- * processes. Returns 1 if the process was woken up, 0 if it was already
- * running.
+ * processes.
+ *
+ * Return: 1 if the process was woken up, 0 if it was already running.
*
* It may be assumed that this function implies a write memory barrier before
* changing the task state if and only if any tasks are woken up.
@@ -2191,6 +2194,8 @@ void scheduler_tick(void)
* This makes sure that uptime, CFS vruntime, load
* balancing, etc... continue to move forward, even
* with a very low granularity.
+ *
+ * Return: Maximum deferment in nanoseconds.
*/
u64 scheduler_tick_max_deferment(void)
{
@@ -2796,8 +2801,8 @@ EXPORT_SYMBOL(wait_for_completion);
* specified timeout to expire. The timeout is in jiffies. It is not
* interruptible.
*
- * The return value is 0 if timed out, and positive (at least 1, or number of
- * jiffies left till timeout) if completed.
+ * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
+ * till timeout) if completed.
*/
unsigned long __sched
wait_for_completion_timeout(struct completion *x, unsigned long timeout)
@@ -2829,8 +2834,8 @@ EXPORT_SYMBOL(wait_for_completion_io);
* specified timeout to expire. The timeout is in jiffies. It is not
* interruptible. The caller is accounted as waiting for IO.
*
- * The return value is 0 if timed out, and positive (at least 1, or number of
- * jiffies left till timeout) if completed.
+ * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
+ * till timeout) if completed.
*/
unsigned long __sched
wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
@@ -2846,7 +2851,7 @@ EXPORT_SYMBOL(wait_for_completion_io_timeout);
* This waits for completion of a specific task to be signaled. It is
* interruptible.
*
- * The return value is -ERESTARTSYS if interrupted, 0 if completed.
+ * Return: -ERESTARTSYS if interrupted, 0 if completed.
*/
int __sched wait_for_completion_interruptible(struct completion *x)
{
@@ -2865,8 +2870,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
* This waits for either a completion of a specific task to be signaled or for a
* specified timeout to expire. It is interruptible. The timeout is in jiffies.
*
- * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
- * positive (at least 1, or number of jiffies left till timeout) if completed.
+ * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
+ * or number of jiffies left till timeout) if completed.
*/
long __sched
wait_for_completion_interruptible_timeout(struct completion *x,
@@ -2883,7 +2888,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
* This waits to be signaled for completion of a specific task. It can be
* interrupted by a kill signal.
*
- * The return value is -ERESTARTSYS if interrupted, 0 if completed.
+ * Return: -ERESTARTSYS if interrupted, 0 if completed.
*/
int __sched wait_for_completion_killable(struct completion *x)
{
@@ -2903,8 +2908,8 @@ EXPORT_SYMBOL(wait_for_completion_killable);
* signaled or for a specified timeout to expire. It can be
* interrupted by a kill signal. The timeout is in jiffies.
*
- * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
- * positive (at least 1, or number of jiffies left till timeout) if completed.
+ * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
+ * or number of jiffies left till timeout) if completed.
*/
long __sched
wait_for_completion_killable_timeout(struct completion *x,
@@ -2918,7 +2923,7 @@ EXPORT_SYMBOL(wait_for_completion_killable_timeout);
* try_wait_for_completion - try to decrement a completion without blocking
* @x: completion structure
*
- * Returns: 0 if a decrement cannot be done without blocking
+ * Return: 0 if a decrement cannot be done without blocking
* 1 if a decrement succeeded.
*
* If a completion is being used as a counting completion,
@@ -2945,7 +2950,7 @@ EXPORT_SYMBOL(try_wait_for_completion);
* completion_done - Test to see if a completion has any waiters
* @x: completion structure
*
- * Returns: 0 if there are waiters (wait_for_completion() in progress)
+ * Return: 0 if there are waiters (wait_for_completion() in progress)
* 1 if there are no waiters.
*
*/
@@ -3182,7 +3187,7 @@ SYSCALL_DEFINE1(nice, int, increment)
* task_prio - return the priority value of a given task.
* @p: the task in question.
*
- * This is the priority value as seen by users in /proc.
+ * Return: The priority value as seen by users in /proc.
* RT tasks are offset by -200. Normal tasks are centered
* around 0, value goes from -16 to +15.
*/
@@ -3194,6 +3199,8 @@ int task_prio(const struct task_struct *p)
/**
* task_nice - return the nice value of a given task.
* @p: the task in question.
+ *
+ * Return: The nice value [ -20 ... 0 ... 19 ].
*/
int task_nice(const struct task_struct *p)
{
@@ -3204,6 +3211,8 @@ EXPORT_SYMBOL(task_nice);
/**
* idle_cpu - is a given cpu idle currently?
* @cpu: the processor in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
*/
int idle_cpu(int cpu)
{
@@ -3226,6 +3235,8 @@ int idle_cpu(int cpu)
/**
* idle_task - return the idle task for a given cpu.
* @cpu: the processor in question.
+ *
+ * Return: The idle task for the cpu @cpu.
*/
struct task_struct *idle_task(int cpu)
{
@@ -3235,6 +3246,8 @@ struct task_struct *idle_task(int cpu)
/**
* find_process_by_pid - find a process with a matching PID value.
* @pid: the pid in question.
+ *
+ * The task of @pid, if found. %NULL otherwise.
*/
static struct task_struct *find_process_by_pid(pid_t pid)
{
@@ -3432,6 +3445,8 @@ recheck:
* @policy: new policy.
* @param: structure containing the new RT priority.
*
+ * Return: 0 on success. An error code otherwise.
+ *
* NOTE that the task may be already dead.
*/
int sched_setscheduler(struct task_struct *p, int policy,
@@ -3451,6 +3466,8 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
* current context has permission. For example, this is needed in
* stop_machine(): we create temporary high priority worker threads,
* but our caller might not have that capability.
+ *
+ * Return: 0 on success. An error code otherwise.
*/
int sched_setscheduler_nocheck(struct task_struct *p, int policy,
const struct sched_param *param)
@@ -3485,6 +3502,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
* @pid: the pid in question.
* @policy: new policy.
* @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
*/
SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
struct sched_param __user *, param)
@@ -3500,6 +3519,8 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
* sys_sched_setparam - set/change the RT priority of a thread
* @pid: the pid in question.
* @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
*/
SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
{
@@ -3509,6 +3530,9 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
/**
* sys_sched_getscheduler - get the policy (scheduling class) of a thread
* @pid: the pid in question.
+ *
+ * Return: On success, the policy of the thread. Otherwise, a negative error
+ * code.
*/
SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
{
@@ -3535,6 +3559,9 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
* sys_sched_getparam - get the RT priority of a thread
* @pid: the pid in question.
* @param: structure containing the RT priority.
+ *
+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
+ * code.
*/
SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
{
@@ -3659,6 +3686,8 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
* @pid: pid of the process
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
* @user_mask_ptr: user-space pointer to the new cpu mask
+ *
+ * Return: 0 on success. An error code otherwise.
*/
SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
unsigned long __user *, user_mask_ptr)
@@ -3710,6 +3739,8 @@ out_unlock:
* @pid: pid of the process
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
* @user_mask_ptr: user-space pointer to hold the current cpu mask
+ *
+ * Return: 0 on success. An error code otherwise.
*/
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
unsigned long __user *, user_mask_ptr)
@@ -3744,6 +3775,8 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
*
* This function yields the current CPU to other tasks. If there are no
* other threads running on this CPU then this function will return.
+ *
+ * Return: 0.
*/
SYSCALL_DEFINE0(sched_yield)
{
@@ -3869,7 +3902,7 @@ EXPORT_SYMBOL(yield);
* It's the caller's job to ensure that the target task struct
* can't go away on us before we can do any checks.
*
- * Returns:
+ * Return:
* true (>0) if we indeed boosted the target task.
* false (0) if we failed to boost the target.
* -ESRCH if there's no task to yield to.
@@ -3972,8 +4005,9 @@ long __sched io_schedule_timeout(long timeout)
* sys_sched_get_priority_max - return maximum RT priority.
* @policy: scheduling class.
*
- * this syscall returns the maximum rt_priority that can be used
- * by a given scheduling class.
+ * Return: On success, this syscall returns the maximum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
*/
SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
{
@@ -3997,8 +4031,9 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
* sys_sched_get_priority_min - return minimum RT priority.
* @policy: scheduling class.
*
- * this syscall returns the minimum rt_priority that can be used
- * by a given scheduling class.
+ * Return: On success, this syscall returns the minimum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
*/
SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
{
@@ -4024,6 +4059,9 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
*
* this syscall writes the default timeslice value of a given process
* into the user-space timespec buffer. A value of '0' means infinity.
+ *
+ * Return: On success, 0 and the timeslice is in @interval. Otherwise,
+ * an error code.
*/
SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
struct timespec __user *, interval)
@@ -6632,6 +6670,8 @@ void normalize_rt_tasks(void)
* @cpu: the processor in question.
*
* ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ *
+ * Return: The current task for @cpu.
*/
struct task_struct *curr_task(int cpu)
{
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 1095e87..8b836b3 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -62,7 +62,7 @@ static int convert_prio(int prio)
* any discrepancies created by racing against the uncertainty of the current
* priority configuration.
*
- * Returns: (int)bool - CPUs were found
+ * Return: (int)bool - CPUs were found
*/
int cpupri_find(struct cpupri *cp, struct task_struct *p,
struct cpumask *lowest_mask)
@@ -203,7 +203,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
* cpupri_init - initialize the cpupri structure
* @cp: The cpupri context
*
- * Returns: -ENOMEM if memory fails.
+ * Return: -ENOMEM on memory allocation failure.
*/
int cpupri_init(struct cpupri *cp)
{
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f77f9c5..06db94b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2032,6 +2032,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
*/
update_entity_load_avg(curr, 1);
update_cfs_rq_blocked_load(cfs_rq, 1);
+ update_cfs_shares(cfs_rq);
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -4280,6 +4281,8 @@ struct sg_lb_stats {
* get_sd_load_idx - Obtain the load index for a given sched domain.
* @sd: The sched_domain whose load_idx is to be obtained.
* @idle: The Idle status of the CPU for whose sd load_icx is obtained.
+ *
+ * Return: The load index.
*/
static inline int get_sd_load_idx(struct sched_domain *sd,
enum cpu_idle_type idle)
@@ -4574,6 +4577,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
*
* Determine if @sg is a busier group than the previously selected
* busiest group.
+ *
+ * Return: %true if @sg is a busier group than the previously selected
+ * busiest group. %false otherwise.
*/
static bool update_sd_pick_busiest(struct lb_env *env,
struct sd_lb_stats *sds,
@@ -4691,7 +4697,7 @@ static inline void update_sd_lb_stats(struct lb_env *env,
* assuming lower CPU number will be equivalent to lower a SMT thread
* number.
*
- * Returns 1 when packing is required and a task should be moved to
+ * Return: 1 when packing is required and a task should be moved to
* this CPU. The amount of the imbalance is returned in *imbalance.
*
* @env: The load balancing environment.
@@ -4869,7 +4875,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* @balance: Pointer to a variable indicating if this_cpu
* is the appropriate cpu to perform load balancing at this_level.
*
- * Returns: - the busiest group if imbalance exists.
+ * Return: - The busiest group if imbalance exists.
* - If no imbalance and user has opted for power-savings balance,
* return the least loaded group whose CPUs can be
* put to idle by rebalancing its tasks onto our group.
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2013-06-20 9:06 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2013-06-20 9:06 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Frédéric Weisbecker,
Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
HEAD: 29bb9e5a75684106a37593ad75ec75ff8312731b tracing/context-tracking: Add preempt_schedule_context() for tracing
Two smaller fixes - plus a context tracking tracing fix that is a bit
bigger.
Thanks,
Ingo
------------------>
Andrew Jones (1):
sched/x86: Construct all sibling maps if smt
Steven Rostedt (1):
tracing/context-tracking: Add preempt_schedule_context() for tracing
Vincent Guittot (1):
sched: Fix clear NOHZ_BALANCE_KICK
arch/x86/kernel/smpboot.c | 8 ++++----
include/linux/preempt.h | 18 +++++++++++++++++-
kernel/context_tracking.c | 40 ++++++++++++++++++++++++++++++++++++++++
kernel/sched/core.c | 21 +++++++++++++++++----
4 files changed, 78 insertions(+), 9 deletions(-)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9c73b51..bfd348e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -372,15 +372,15 @@ static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
void __cpuinit set_cpu_sibling_map(int cpu)
{
- bool has_mc = boot_cpu_data.x86_max_cores > 1;
bool has_smt = smp_num_siblings > 1;
+ bool has_mp = has_smt || boot_cpu_data.x86_max_cores > 1;
struct cpuinfo_x86 *c = &cpu_data(cpu);
struct cpuinfo_x86 *o;
int i;
cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
- if (!has_smt && !has_mc) {
+ if (!has_mp) {
cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
cpumask_set_cpu(cpu, cpu_core_mask(cpu));
@@ -394,7 +394,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
if ((i == cpu) || (has_smt && match_smt(c, o)))
link_mask(sibling, cpu, i);
- if ((i == cpu) || (has_mc && match_llc(c, o)))
+ if ((i == cpu) || (has_mp && match_llc(c, o)))
link_mask(llc_shared, cpu, i);
}
@@ -406,7 +406,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
for_each_cpu(i, cpu_sibling_setup_mask) {
o = &cpu_data(i);
- if ((i == cpu) || (has_mc && match_mc(c, o))) {
+ if ((i == cpu) || (has_mp && match_mc(c, o))) {
link_mask(core, cpu, i);
/*
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 87a03c7..f5d4723 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -33,9 +33,25 @@ do { \
preempt_schedule(); \
} while (0)
+#ifdef CONFIG_CONTEXT_TRACKING
+
+void preempt_schedule_context(void);
+
+#define preempt_check_resched_context() \
+do { \
+ if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
+ preempt_schedule_context(); \
+} while (0)
+#else
+
+#define preempt_check_resched_context() preempt_check_resched()
+
+#endif /* CONFIG_CONTEXT_TRACKING */
+
#else /* !CONFIG_PREEMPT */
#define preempt_check_resched() do { } while (0)
+#define preempt_check_resched_context() do { } while (0)
#endif /* CONFIG_PREEMPT */
@@ -88,7 +104,7 @@ do { \
do { \
preempt_enable_no_resched_notrace(); \
barrier(); \
- preempt_check_resched(); \
+ preempt_check_resched_context(); \
} while (0)
#else /* !CONFIG_PREEMPT_COUNT */
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 65349f0..6667700 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -71,6 +71,46 @@ void user_enter(void)
local_irq_restore(flags);
}
+#ifdef CONFIG_PREEMPT
+/**
+ * preempt_schedule_context - preempt_schedule called by tracing
+ *
+ * The tracing infrastructure uses preempt_enable_notrace to prevent
+ * recursion and tracing preempt enabling caused by the tracing
+ * infrastructure itself. But as tracing can happen in areas coming
+ * from userspace or just about to enter userspace, a preempt enable
+ * can occur before user_exit() is called. This will cause the scheduler
+ * to be called when the system is still in usermode.
+ *
+ * To prevent this, the preempt_enable_notrace will use this function
+ * instead of preempt_schedule() to exit user context if needed before
+ * calling the scheduler.
+ */
+void __sched notrace preempt_schedule_context(void)
+{
+ struct thread_info *ti = current_thread_info();
+ enum ctx_state prev_ctx;
+
+ if (likely(ti->preempt_count || irqs_disabled()))
+ return;
+
+ /*
+ * Need to disable preemption in case user_exit() is traced
+ * and the tracer calls preempt_enable_notrace() causing
+ * an infinite recursion.
+ */
+ preempt_disable_notrace();
+ prev_ctx = exception_enter();
+ preempt_enable_no_resched_notrace();
+
+ preempt_schedule();
+
+ preempt_disable_notrace();
+ exception_exit(prev_ctx);
+ preempt_enable_notrace();
+}
+EXPORT_SYMBOL_GPL(preempt_schedule_context);
+#endif /* CONFIG_PREEMPT */
/**
* user_exit - Inform the context tracking that the CPU is
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 58453b8..919bee6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -633,7 +633,19 @@ void wake_up_nohz_cpu(int cpu)
static inline bool got_nohz_idle_kick(void)
{
int cpu = smp_processor_id();
- return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
+
+ if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
+ return false;
+
+ if (idle_cpu(cpu) && !need_resched())
+ return true;
+
+ /*
+ * We can't run Idle Load Balance on this CPU for this time so we
+ * cancel it and clear NOHZ_BALANCE_KICK
+ */
+ clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
+ return false;
}
#else /* CONFIG_NO_HZ_COMMON */
@@ -1393,8 +1405,9 @@ static void sched_ttwu_pending(void)
void scheduler_ipi(void)
{
- if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()
- && !tick_nohz_full_cpu(smp_processor_id()))
+ if (llist_empty(&this_rq()->wake_list)
+ && !tick_nohz_full_cpu(smp_processor_id())
+ && !got_nohz_idle_kick())
return;
/*
@@ -1417,7 +1430,7 @@ void scheduler_ipi(void)
/*
* Check if someone kicked us for doing the nohz idle load balance.
*/
- if (unlikely(got_nohz_idle_kick() && !need_resched())) {
+ if (unlikely(got_nohz_idle_kick())) {
this_rq()->idle_balance = 1;
raise_softirq_irqoff(SCHED_SOFTIRQ);
}
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2013-05-02 9:06 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2013-05-02 9:06 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton,
Stanislaw Gruszka
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
HEAD: f3002134158092178be81339ec5a22ff80e6c308 Revert "math64: New div64_u64_rem helper"
This fixes the cputime scaling overflow problems for good without having
bad 32-bit overhead, and gets rid of the div64_u64_rem() helper as well.
Ingo
------------------>
Stanislaw Gruszka (4):
sched: Avoid cputime scaling overflow
sched: Do not account bogus utime
sched: Avoid prev->stime underflow
Revert "math64: New div64_u64_rem helper"
include/linux/math64.h | 19 +-----------
kernel/sched/cputime.c | 80 ++++++++++++++++++++++++++++++++------------------
lib/div64.c | 19 ++++--------
3 files changed, 58 insertions(+), 60 deletions(-)
diff --git a/include/linux/math64.h b/include/linux/math64.h
index 931a619..b8ba855 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -30,15 +30,6 @@ static inline s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder)
}
/**
- * div64_u64_rem - unsigned 64bit divide with 64bit divisor
- */
-static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
-{
- *remainder = dividend % divisor;
- return dividend / divisor;
-}
-
-/**
* div64_u64 - unsigned 64bit divide with 64bit divisor
*/
static inline u64 div64_u64(u64 dividend, u64 divisor)
@@ -70,16 +61,8 @@ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
extern s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder);
#endif
-#ifndef div64_u64_rem
-extern u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder);
-#endif
-
#ifndef div64_u64
-static inline u64 div64_u64(u64 dividend, u64 divisor)
-{
- u64 remainder;
- return div64_u64_rem(dividend, divisor, &remainder);
-}
+extern u64 div64_u64(u64 dividend, u64 divisor);
#endif
#ifndef div64_s64
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 33508dc..337a367 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -506,34 +506,47 @@ void account_idle_ticks(unsigned long ticks)
}
/*
- * Perform (stime * rtime) / total with reduced chances
- * of multiplication overflows by using smaller factors
- * like quotient and remainders of divisions between
- * rtime and total.
+ * Perform (stime * rtime) / total, but avoid multiplication overflow by
+ * loosing precision when the numbers are big.
*/
static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
{
- u64 rem, res, scaled;
+ u64 scaled;
- if (rtime >= total) {
- /*
- * Scale up to rtime / total then add
- * the remainder scaled to stime / total.
- */
- res = div64_u64_rem(rtime, total, &rem);
- scaled = stime * res;
- scaled += div64_u64(stime * rem, total);
- } else {
- /*
- * Same in reverse: scale down to total / rtime
- * then substract that result scaled to
- * to the remaining part.
- */
- res = div64_u64_rem(total, rtime, &rem);
- scaled = div64_u64(stime, res);
- scaled -= div64_u64(scaled * rem, total);
+ for (;;) {
+ /* Make sure "rtime" is the bigger of stime/rtime */
+ if (stime > rtime) {
+ u64 tmp = rtime; rtime = stime; stime = tmp;
+ }
+
+ /* Make sure 'total' fits in 32 bits */
+ if (total >> 32)
+ goto drop_precision;
+
+ /* Does rtime (and thus stime) fit in 32 bits? */
+ if (!(rtime >> 32))
+ break;
+
+ /* Can we just balance rtime/stime rather than dropping bits? */
+ if (stime >> 31)
+ goto drop_precision;
+
+ /* We can grow stime and shrink rtime and try to make them both fit */
+ stime <<= 1;
+ rtime >>= 1;
+ continue;
+
+drop_precision:
+ /* We drop from rtime, it has more bits than stime */
+ rtime >>= 1;
+ total >>= 1;
}
+ /*
+ * Make sure gcc understands that this is a 32x32->64 multiply,
+ * followed by a 64/32->64 divide.
+ */
+ scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
return (__force cputime_t) scaled;
}
@@ -545,7 +558,7 @@ static void cputime_adjust(struct task_cputime *curr,
struct cputime *prev,
cputime_t *ut, cputime_t *st)
{
- cputime_t rtime, stime, total;
+ cputime_t rtime, stime, utime, total;
if (vtime_accounting_enabled()) {
*ut = curr->utime;
@@ -568,13 +581,21 @@ static void cputime_adjust(struct task_cputime *curr,
*/
rtime = nsecs_to_cputime(curr->sum_exec_runtime);
- if (!rtime) {
- stime = 0;
- } else if (!total) {
- stime = rtime;
- } else {
+ /*
+ * Update userspace visible utime/stime values only if actual execution
+ * time is bigger than already exported. Note that can happen, that we
+ * provided bigger values due to scaling inaccuracy on big numbers.
+ */
+ if (prev->stime + prev->utime >= rtime)
+ goto out;
+
+ if (total) {
stime = scale_stime((__force u64)stime,
(__force u64)rtime, (__force u64)total);
+ utime = rtime - stime;
+ } else {
+ stime = rtime;
+ utime = 0;
}
/*
@@ -583,8 +604,9 @@ static void cputime_adjust(struct task_cputime *curr,
* Let's enforce monotonicity.
*/
prev->stime = max(prev->stime, stime);
- prev->utime = max(prev->utime, rtime - prev->stime);
+ prev->utime = max(prev->utime, utime);
+out:
*ut = prev->utime;
*st = prev->stime;
}
diff --git a/lib/div64.c b/lib/div64.c
index 3af5728..a163b6c 100644
--- a/lib/div64.c
+++ b/lib/div64.c
@@ -79,10 +79,9 @@ EXPORT_SYMBOL(div_s64_rem);
#endif
/**
- * div64_u64_rem - unsigned 64bit divide with 64bit divisor and 64bit remainder
+ * div64_u64 - unsigned 64bit divide with 64bit divisor
* @dividend: 64bit dividend
* @divisor: 64bit divisor
- * @remainder: 64bit remainder
*
* This implementation is a modified version of the algorithm proposed
* by the book 'Hacker's Delight'. The original source and full proof
@@ -90,33 +89,27 @@ EXPORT_SYMBOL(div_s64_rem);
*
* 'http://www.hackersdelight.org/HDcode/newCode/divDouble.c.txt'
*/
-#ifndef div64_u64_rem
-u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
+#ifndef div64_u64
+u64 div64_u64(u64 dividend, u64 divisor)
{
u32 high = divisor >> 32;
u64 quot;
if (high == 0) {
- u32 rem32;
- quot = div_u64_rem(dividend, divisor, &rem32);
- *remainder = rem32;
+ quot = div_u64(dividend, divisor);
} else {
int n = 1 + fls(high);
quot = div_u64(dividend >> n, divisor >> n);
if (quot != 0)
quot--;
-
- *remainder = dividend - quot * divisor;
- if (*remainder >= divisor) {
+ if ((dividend - quot * divisor) >= divisor)
quot++;
- *remainder -= divisor;
- }
}
return quot;
}
-EXPORT_SYMBOL(div64_u64_rem);
+EXPORT_SYMBOL(div64_u64);
#endif
/**
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2013-04-14 15:51 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2013-04-14 15:51 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
HEAD: e614b3332a4f3f264a26da28e5a1f4cc3aea3974 sched/cputime: Fix accounting on multi-threaded processes
Misc fixlets.
Thanks,
Ingo
------------------>
Stanislaw Gruszka (1):
sched/cputime: Fix accounting on multi-threaded processes
Tejun Heo (1):
sched: Convert BUG_ON()s in try_to_wake_up_local() to WARN_ON_ONCE()s
Thomas Gleixner (1):
sched_clock: Prevent 64bit inatomicity on 32bit systems
libin (1):
sched/debug: Fix sd->*_idx limit range avoiding overflow
kernel/sched/clock.c | 26 ++++++++++++++++++++++++++
kernel/sched/core.c | 8 +++++---
kernel/sched/cputime.c | 2 +-
3 files changed, 32 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c685e31..c3ae144 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -176,10 +176,36 @@ static u64 sched_clock_remote(struct sched_clock_data *scd)
u64 this_clock, remote_clock;
u64 *ptr, old_val, val;
+#if BITS_PER_LONG != 64
+again:
+ /*
+ * Careful here: The local and the remote clock values need to
+ * be read out atomic as we need to compare the values and
+ * then update either the local or the remote side. So the
+ * cmpxchg64 below only protects one readout.
+ *
+ * We must reread via sched_clock_local() in the retry case on
+ * 32bit as an NMI could use sched_clock_local() via the
+ * tracer and hit between the readout of
+ * the low32bit and the high 32bit portion.
+ */
+ this_clock = sched_clock_local(my_scd);
+ /*
+ * We must enforce atomic readout on 32bit, otherwise the
+ * update on the remote cpu can hit inbetween the readout of
+ * the low32bit and the high 32bit portion.
+ */
+ remote_clock = cmpxchg64(&scd->clock, 0, 0);
+#else
+ /*
+ * On 64bit the read of [my]scd->clock is atomic versus the
+ * update, so we can avoid the above 32bit dance.
+ */
sched_clock_local(my_scd);
again:
this_clock = my_scd->clock;
remote_clock = scd->clock;
+#endif
/*
* Use the opportunity that we have both locks
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b7b03cd..fa07792 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1488,8 +1488,10 @@ static void try_to_wake_up_local(struct task_struct *p)
{
struct rq *rq = task_rq(p);
- BUG_ON(rq != this_rq());
- BUG_ON(p == current);
+ if (WARN_ON_ONCE(rq != this_rq()) ||
+ WARN_ON_ONCE(p == current))
+ return;
+
lockdep_assert_held(&rq->lock);
if (!raw_spin_trylock(&p->pi_lock)) {
@@ -4931,7 +4933,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
}
static int min_load_idx = 0;
-static int max_load_idx = CPU_LOAD_IDX_MAX;
+static int max_load_idx = CPU_LOAD_IDX_MAX-1;
static void
set_table_entry(struct ctl_table *entry,
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index ed12cbb..e93cca9 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -310,7 +310,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
t = tsk;
do {
- task_cputime(tsk, &utime, &stime);
+ task_cputime(t, &utime, &stime);
times->utime += utime;
times->stime += stime;
times->sum_exec_runtime += task_sched_runtime(t);
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2013-02-26 7:29 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2013-02-26 7:29 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
HEAD: 7f6575f1fb963d5231afbceecd3feadb6ab58cd3 cputime: Use local_clock() for full dynticks cputime accounting
Thanks,
Ingo
------------------>
Clark Williams (1):
sched: Move RR_TIMESLICE from sysctl.h to rt.h
Frederic Weisbecker (1):
cputime: Use local_clock() for full dynticks cputime accounting
Li Zhong (1):
cputime: Constify timeval_to_cputime(timeval) argument
Nathan Zimmer (2):
sched: Fix /proc/sched_stat failure on very very large systems
sched: Fix /proc/sched_debug failure on very very large systems
Sha Zhengju (1):
sched/core: Remove the obsolete and unused nr_uninterruptible() function
include/asm-generic/cputime_nsecs.h | 2 +-
include/linux/sched.h | 1 -
include/linux/sched/rt.h | 6 +++
include/linux/sched/sysctl.h | 6 ---
kernel/sched/core.c | 22 +--------
kernel/sched/cputime.c | 2 +-
kernel/sched/debug.c | 90 ++++++++++++++++++++++++++++++++-----
kernel/sched/stats.c | 79 +++++++++++++++++++++++---------
8 files changed, 148 insertions(+), 60 deletions(-)
diff --git a/include/asm-generic/cputime_nsecs.h b/include/asm-generic/cputime_nsecs.h
index b6485ca..a8ece9a 100644
--- a/include/asm-generic/cputime_nsecs.h
+++ b/include/asm-generic/cputime_nsecs.h
@@ -76,7 +76,7 @@ static inline void cputime_to_timespec(const cputime_t ct, struct timespec *val)
/*
* Convert cputime <-> timeval (msec)
*/
-static inline cputime_t timeval_to_cputime(struct timeval *val)
+static inline cputime_t timeval_to_cputime(const struct timeval *val)
{
u64 ret = val->tv_sec * NSEC_PER_SEC + val->tv_usec * NSEC_PER_USEC;
return (__force cputime_t) ret;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 33cc421..f9ca237d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -98,7 +98,6 @@ extern int nr_threads;
DECLARE_PER_CPU(unsigned long, process_counts);
extern int nr_processes(void);
extern unsigned long nr_running(void);
-extern unsigned long nr_uninterruptible(void);
extern unsigned long nr_iowait(void);
extern unsigned long nr_iowait_cpu(int cpu);
extern unsigned long this_cpu_load(void);
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index 94e19ea..440434d 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -55,4 +55,10 @@ static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
extern void normalize_rt_tasks(void);
+/*
+ * default timeslice is 100 msecs (used only for SCHED_RR tasks).
+ * Timeslices get refilled after they expire.
+ */
+#define RR_TIMESLICE (100 * HZ / 1000)
+
#endif /* _SCHED_RT_H */
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index d2bb0ae..bf8086b 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -91,12 +91,6 @@ extern unsigned int sysctl_sched_cfs_bandwidth_slice;
extern unsigned int sysctl_sched_autogroup_enabled;
#endif
-/*
- * default timeslice is 100 msecs (used only for SCHED_RR tasks).
- * Timeslices get refilled after they expire.
- */
-#define RR_TIMESLICE (100 * HZ / 1000)
-
extern int sched_rr_timeslice;
extern int sched_rr_handler(struct ctl_table *table, int write,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 03d7784..b7b03cd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1969,11 +1969,10 @@ context_switch(struct rq *rq, struct task_struct *prev,
}
/*
- * nr_running, nr_uninterruptible and nr_context_switches:
+ * nr_running and nr_context_switches:
*
* externally visible scheduler statistics: current number of runnable
- * threads, current number of uninterruptible-sleeping threads, total
- * number of context switches performed since bootup.
+ * threads, total number of context switches performed since bootup.
*/
unsigned long nr_running(void)
{
@@ -1985,23 +1984,6 @@ unsigned long nr_running(void)
return sum;
}
-unsigned long nr_uninterruptible(void)
-{
- unsigned long i, sum = 0;
-
- for_each_possible_cpu(i)
- sum += cpu_rq(i)->nr_uninterruptible;
-
- /*
- * Since we read the counters lockless, it might be slightly
- * inaccurate. Do not allow it to go below zero though:
- */
- if (unlikely((long)sum < 0))
- sum = 0;
-
- return sum;
-}
-
unsigned long long nr_context_switches(void)
{
int i;
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 9857329..ed12cbb 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -604,7 +604,7 @@ static unsigned long long vtime_delta(struct task_struct *tsk)
{
unsigned long long clock;
- clock = sched_clock();
+ clock = local_clock();
if (clock < tsk->vtime_snap)
return 0;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 7ae4c4c..c496eb3 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -269,11 +269,11 @@ static void print_cpu(struct seq_file *m, int cpu)
{
unsigned int freq = cpu_khz ? : 1;
- SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n",
+ SEQ_printf(m, "cpu#%d, %u.%03u MHz\n",
cpu, freq / 1000, (freq % 1000));
}
#else
- SEQ_printf(m, "\ncpu#%d\n", cpu);
+ SEQ_printf(m, "cpu#%d\n", cpu);
#endif
#define P(x) \
@@ -330,6 +330,7 @@ do { \
print_rq(m, rq, cpu);
rcu_read_unlock();
spin_unlock_irqrestore(&sched_debug_lock, flags);
+ SEQ_printf(m, "\n");
}
static const char *sched_tunable_scaling_names[] = {
@@ -338,11 +339,10 @@ static const char *sched_tunable_scaling_names[] = {
"linear"
};
-static int sched_debug_show(struct seq_file *m, void *v)
+static void sched_debug_header(struct seq_file *m)
{
u64 ktime, sched_clk, cpu_clk;
unsigned long flags;
- int cpu;
local_irq_save(flags);
ktime = ktime_to_ns(ktime_get());
@@ -384,33 +384,101 @@ static int sched_debug_show(struct seq_file *m, void *v)
#undef PN
#undef P
- SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
+ SEQ_printf(m, " .%-40s: %d (%s)\n",
+ "sysctl_sched_tunable_scaling",
sysctl_sched_tunable_scaling,
sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
+ SEQ_printf(m, "\n");
+}
- for_each_online_cpu(cpu)
- print_cpu(m, cpu);
+static int sched_debug_show(struct seq_file *m, void *v)
+{
+ int cpu = (unsigned long)(v - 2);
- SEQ_printf(m, "\n");
+ if (cpu != -1)
+ print_cpu(m, cpu);
+ else
+ sched_debug_header(m);
return 0;
}
void sysrq_sched_debug_show(void)
{
- sched_debug_show(NULL, NULL);
+ int cpu;
+
+ sched_debug_header(NULL);
+ for_each_online_cpu(cpu)
+ print_cpu(NULL, cpu);
+
+}
+
+/*
+ * This itererator needs some explanation.
+ * It returns 1 for the header position.
+ * This means 2 is cpu 0.
+ * In a hotplugged system some cpus, including cpu 0, may be missing so we have
+ * to use cpumask_* to iterate over the cpus.
+ */
+static void *sched_debug_start(struct seq_file *file, loff_t *offset)
+{
+ unsigned long n = *offset;
+
+ if (n == 0)
+ return (void *) 1;
+
+ n--;
+
+ if (n > 0)
+ n = cpumask_next(n - 1, cpu_online_mask);
+ else
+ n = cpumask_first(cpu_online_mask);
+
+ *offset = n + 1;
+
+ if (n < nr_cpu_ids)
+ return (void *)(unsigned long)(n + 2);
+ return NULL;
+}
+
+static void *sched_debug_next(struct seq_file *file, void *data, loff_t *offset)
+{
+ (*offset)++;
+ return sched_debug_start(file, offset);
+}
+
+static void sched_debug_stop(struct seq_file *file, void *data)
+{
+}
+
+static const struct seq_operations sched_debug_sops = {
+ .start = sched_debug_start,
+ .next = sched_debug_next,
+ .stop = sched_debug_stop,
+ .show = sched_debug_show,
+};
+
+static int sched_debug_release(struct inode *inode, struct file *file)
+{
+ seq_release(inode, file);
+
+ return 0;
}
static int sched_debug_open(struct inode *inode, struct file *filp)
{
- return single_open(filp, sched_debug_show, NULL);
+ int ret = 0;
+
+ ret = seq_open(filp, &sched_debug_sops);
+
+ return ret;
}
static const struct file_operations sched_debug_fops = {
.open = sched_debug_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = sched_debug_release,
};
static int __init init_sched_debug_procfs(void)
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 903ffa9e..e036eda 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -21,14 +21,17 @@ static int show_schedstat(struct seq_file *seq, void *v)
if (mask_str == NULL)
return -ENOMEM;
- seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
- seq_printf(seq, "timestamp %lu\n", jiffies);
- for_each_online_cpu(cpu) {
- struct rq *rq = cpu_rq(cpu);
+ if (v == (void *)1) {
+ seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
+ seq_printf(seq, "timestamp %lu\n", jiffies);
+ } else {
+ struct rq *rq;
#ifdef CONFIG_SMP
struct sched_domain *sd;
int dcount = 0;
#endif
+ cpu = (unsigned long)(v - 2);
+ rq = cpu_rq(cpu);
/* runqueue-specific stats */
seq_printf(seq,
@@ -77,30 +80,66 @@ static int show_schedstat(struct seq_file *seq, void *v)
return 0;
}
-static int schedstat_open(struct inode *inode, struct file *file)
+/*
+ * This itererator needs some explanation.
+ * It returns 1 for the header position.
+ * This means 2 is cpu 0.
+ * In a hotplugged system some cpus, including cpu 0, may be missing so we have
+ * to use cpumask_* to iterate over the cpus.
+ */
+static void *schedstat_start(struct seq_file *file, loff_t *offset)
{
- unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
- char *buf = kmalloc(size, GFP_KERNEL);
- struct seq_file *m;
- int res;
+ unsigned long n = *offset;
- if (!buf)
- return -ENOMEM;
- res = single_open(file, show_schedstat, NULL);
- if (!res) {
- m = file->private_data;
- m->buf = buf;
- m->size = size;
- } else
- kfree(buf);
- return res;
+ if (n == 0)
+ return (void *) 1;
+
+ n--;
+
+ if (n > 0)
+ n = cpumask_next(n - 1, cpu_online_mask);
+ else
+ n = cpumask_first(cpu_online_mask);
+
+ *offset = n + 1;
+
+ if (n < nr_cpu_ids)
+ return (void *)(unsigned long)(n + 2);
+ return NULL;
+}
+
+static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset)
+{
+ (*offset)++;
+ return schedstat_start(file, offset);
+}
+
+static void schedstat_stop(struct seq_file *file, void *data)
+{
+}
+
+static const struct seq_operations schedstat_sops = {
+ .start = schedstat_start,
+ .next = schedstat_next,
+ .stop = schedstat_stop,
+ .show = show_schedstat,
+};
+
+static int schedstat_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &schedstat_sops);
}
+static int schedstat_release(struct inode *inode, struct file *file)
+{
+ return 0;
+};
+
static const struct file_operations proc_schedstat_operations = {
.open = schedstat_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = schedstat_release,
};
static int __init proc_schedstat_init(void)
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2013-02-04 18:26 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2013-02-04 18:26 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
HEAD: cff3c124a7e82ca0ea1d6864b27ef18c403c0773 sched/debug: Fix format string for 32-bit platforms
Three small fixlets.
Thanks,
Ingo
------------------>
Arnd Bergmann (2):
sched: Fix warning in kernel/sched/fair.c
sched/debug: Fix format string for 32-bit platforms
Shawn Bohrer (1):
sched/rt: Use root_domain of rt_rq not current processor
kernel/sched/debug.c | 4 ++--
kernel/sched/fair.c | 2 +-
kernel/sched/rt.c | 2 +-
3 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2cd3c1b..7ae4c4c 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -222,8 +222,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->runnable_load_avg);
SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg",
cfs_rq->blocked_load_avg);
- SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
- atomic64_read(&cfs_rq->tg->load_avg));
+ SEQ_printf(m, " .%-30s: %lld\n", "tg_load_avg",
+ (unsigned long long)atomic64_read(&cfs_rq->tg->load_avg));
SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib",
cfs_rq->tg_load_contrib);
SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5eea870..81fa536 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2663,7 +2663,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
hrtimer_cancel(&cfs_b->slack_timer);
}
-static void unthrottle_offline_cfs_rqs(struct rq *rq)
+static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
{
struct cfs_rq *cfs_rq;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 418feb0..4f02b28 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -566,7 +566,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
static int do_balance_runtime(struct rt_rq *rt_rq)
{
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
- struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+ struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
int i, weight, more = 0;
u64 rt_period;
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2012-10-12 9:11 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2012-10-12 9:11 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Frédéric Weisbecker,
Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
HEAD: 301a5cba2887d1f640e6d5184b05a6d7132017d5 sched: Update sched_domains_numa_masks[][] when new cpus are onlined
A CPU hotplug related crash fix and a nohz accounting fixlet.
Thanks,
Ingo
------------------>
Frederic Weisbecker (1):
nohz: Fix one jiffy count too far in idle cputime
Tang Chen (2):
sched: Ensure 'sched_domains_numa_levels' is safe to use in other functions
sched: Update sched_domains_numa_masks[][] when new cpus are onlined
kernel/sched/core.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++
kernel/time/tick-sched.c | 2 +-
2 files changed, 70 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c177472..8322d73 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6122,6 +6122,17 @@ static void sched_init_numa(void)
* numbers.
*/
+ /*
+ * Here, we should temporarily reset sched_domains_numa_levels to 0.
+ * If it fails to allocate memory for array sched_domains_numa_masks[][],
+ * the array will contain less then 'level' members. This could be
+ * dangerous when we use it to iterate array sched_domains_numa_masks[][]
+ * in other functions.
+ *
+ * We reset it to 'level' at the end of this function.
+ */
+ sched_domains_numa_levels = 0;
+
sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
if (!sched_domains_numa_masks)
return;
@@ -6176,11 +6187,68 @@ static void sched_init_numa(void)
}
sched_domain_topology = tl;
+
+ sched_domains_numa_levels = level;
+}
+
+static void sched_domains_numa_masks_set(int cpu)
+{
+ int i, j;
+ int node = cpu_to_node(cpu);
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (j = 0; j < nr_node_ids; j++) {
+ if (node_distance(j, node) <= sched_domains_numa_distance[i])
+ cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
+ }
+ }
+}
+
+static void sched_domains_numa_masks_clear(int cpu)
+{
+ int i, j;
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (j = 0; j < nr_node_ids; j++)
+ cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
+ }
+}
+
+/*
+ * Update sched_domains_numa_masks[level][node] array when new cpus
+ * are onlined.
+ */
+static int sched_domains_numa_masks_update(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ int cpu = (long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_ONLINE:
+ sched_domains_numa_masks_set(cpu);
+ break;
+
+ case CPU_DEAD:
+ sched_domains_numa_masks_clear(cpu);
+ break;
+
+ default:
+ return NOTIFY_DONE;
+ }
+
+ return NOTIFY_OK;
}
#else
static inline void sched_init_numa(void)
{
}
+
+static int sched_domains_numa_masks_update(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ return 0;
+}
#endif /* CONFIG_NUMA */
static int __sdt_alloc(const struct cpumask *cpu_map)
@@ -6629,6 +6697,7 @@ void __init sched_init_smp(void)
mutex_unlock(&sched_domains_mutex);
put_online_cpus();
+ hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f423bdd..a402608 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -835,7 +835,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
*/
if (ts->tick_stopped) {
touch_softlockup_watchdog();
- if (idle_cpu(cpu))
+ if (is_idle_task(current))
ts->idle_jiffies++;
}
update_process_times(user_mode(regs));
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2012-09-13 14:43 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2012-09-13 14:43 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
HEAD: 9450d57eab5cad36774c297da123062744472588 sched: Fix kernel-doc warnings in kernel/sched/fair.c
Smaller fixlets.
Thanks,
Ingo
------------------>
Charles Wang (1):
sched: Add missing call to calc_load_exit_idle()
Peter Boonstoppel (1):
sched: Unthrottle rt runqueues in __disable_runtime()
Peter Zijlstra (1):
sched: Fix load avg vs cpu-hotplug
Randy Dunlap (1):
sched: Fix kernel-doc warnings in kernel/sched/fair.c
kernel/sched/core.c | 34 ++++++++++------------------------
kernel/sched/fair.c | 9 +++++----
kernel/sched/rt.c | 1 +
kernel/sched/sched.h | 1 -
kernel/time/tick-sched.c | 1 +
5 files changed, 17 insertions(+), 29 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fbf1fd0..a4ea245 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5304,27 +5304,17 @@ void idle_task_exit(void)
}
/*
- * While a dead CPU has no uninterruptible tasks queued at this point,
- * it might still have a nonzero ->nr_uninterruptible counter, because
- * for performance reasons the counter is not stricly tracking tasks to
- * their home CPUs. So we just add the counter to another CPU's counter,
- * to keep the global sum constant after CPU-down:
- */
-static void migrate_nr_uninterruptible(struct rq *rq_src)
-{
- struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
-
- rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
- rq_src->nr_uninterruptible = 0;
-}
-
-/*
- * remove the tasks which were accounted by rq from calc_load_tasks.
+ * Since this CPU is going 'away' for a while, fold any nr_active delta
+ * we might have. Assumes we're called after migrate_tasks() so that the
+ * nr_active count is stable.
+ *
+ * Also see the comment "Global load-average calculations".
*/
-static void calc_global_load_remove(struct rq *rq)
+static void calc_load_migrate(struct rq *rq)
{
- atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
- rq->calc_load_active = 0;
+ long delta = calc_load_fold_active(rq);
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
}
/*
@@ -5352,9 +5342,6 @@ static void migrate_tasks(unsigned int dead_cpu)
*/
rq->stop = NULL;
- /* Ensure any throttled groups are reachable by pick_next_task */
- unthrottle_offline_cfs_rqs(rq);
-
for ( ; ; ) {
/*
* There's this thread running, bail when that's the only
@@ -5618,8 +5605,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
BUG_ON(rq->nr_running != 1); /* the migration thread */
raw_spin_unlock_irqrestore(&rq->lock, flags);
- migrate_nr_uninterruptible(rq);
- calc_global_load_remove(rq);
+ calc_load_migrate(rq);
break;
#endif
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c219bf8..42d9df6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2052,7 +2052,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
hrtimer_cancel(&cfs_b->slack_timer);
}
-void unthrottle_offline_cfs_rqs(struct rq *rq)
+static void unthrottle_offline_cfs_rqs(struct rq *rq)
{
struct cfs_rq *cfs_rq;
@@ -2106,7 +2106,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
return NULL;
}
static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
-void unthrottle_offline_cfs_rqs(struct rq *rq) {}
+static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
#endif /* CONFIG_CFS_BANDWIDTH */
@@ -3658,7 +3658,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
* @group: sched_group whose statistics are to be updated.
* @load_idx: Load index of sched_domain of this_cpu for load calc.
* @local_group: Does group contain this_cpu.
- * @cpus: Set of cpus considered for load balancing.
* @balance: Should we balance.
* @sgs: variable to hold the statistics for this group.
*/
@@ -3805,7 +3804,6 @@ static bool update_sd_pick_busiest(struct lb_env *env,
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
- * @cpus: Set of cpus considered for load balancing.
* @balance: Should we balance.
* @sds: variable to hold the statistics for this sched_domain.
*/
@@ -4956,6 +4954,9 @@ static void rq_online_fair(struct rq *rq)
static void rq_offline_fair(struct rq *rq)
{
update_sysctl();
+
+ /* Ensure any throttled groups are reachable by pick_next_task */
+ unthrottle_offline_cfs_rqs(rq);
}
#endif /* CONFIG_SMP */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 944cb68..e0b7ba9 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -691,6 +691,7 @@ balanced:
* runtime - in which case borrowing doesn't make sense.
*/
rt_rq->rt_runtime = RUNTIME_INF;
+ rt_rq->rt_throttled = 0;
raw_spin_unlock(&rt_rq->rt_runtime_lock);
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f6714d0..0848fa3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1144,7 +1144,6 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
extern void init_cfs_rq(struct cfs_rq *cfs_rq);
extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
-extern void unthrottle_offline_cfs_rqs(struct rq *rq);
extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 024540f..3a9e5d5 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -573,6 +573,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
tick_do_update_jiffies64(now);
update_cpu_load_nohz();
+ calc_load_exit_idle();
touch_softlockup_watchdog();
/*
* Cancel the scheduled timer and restore the tick
^ permalink raw reply related [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2012-08-20 17:35 ` Linus Torvalds
@ 2012-08-21 7:56 ` Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2012-08-21 7:56 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
* Linus Torvalds <torvalds@linux-foundation.org> wrote:
> On Mon, Aug 20, 2012 at 2:12 AM, Ingo Molnar <mingo@kernel.org> wrote:
> >
> > Please pull the latest sched-urgent-for-linus git tree from:
>
> Ingo, *please* get into the habit of writing a short explanation for
> your pull request so that my merge messages can be more relevant, ok?
Yeah, sorry:
Small assorted fixlets, plus a lock contention fix which we felt
rises to the level of needing a fix in -rc.
Thanks,
Ingo
^ permalink raw reply [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2012-08-20 9:12 Ingo Molnar
@ 2012-08-20 17:35 ` Linus Torvalds
2012-08-21 7:56 ` Ingo Molnar
0 siblings, 1 reply; 384+ messages in thread
From: Linus Torvalds @ 2012-08-20 17:35 UTC (permalink / raw)
To: Ingo Molnar; +Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
On Mon, Aug 20, 2012 at 2:12 AM, Ingo Molnar <mingo@kernel.org> wrote:
>
> Please pull the latest sched-urgent-for-linus git tree from:
Ingo, *please* get into the habit of writing a short explanation for
your pull request so that my merge messages can be more relevant, ok?
Linus
^ permalink raw reply [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2012-08-20 9:12 Ingo Molnar
2012-08-20 17:35 ` Linus Torvalds
0 siblings, 1 reply; 384+ messages in thread
From: Ingo Molnar @ 2012-08-20 9:12 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
HEAD: 8f6189684eb4e85e6c593cd710693f09c944450a sched: Fix migration thread runtime bogosity
Thanks,
Ingo
------------------>
Mike Galbraith (3):
sched,cgroup: Fix up task_groups list
sched,rt: fix isolated CPUs leaving root_task_group indefinitely throttled
sched: Fix migration thread runtime bogosity
Peter Zijlstra (1):
sched, cgroup: Reduce rq->lock hold times for large cgroup hierarchies
Stanislaw Gruszka (1):
sched: fix divide by zero at {thread_group,task}_times
kernel/sched/core.c | 35 +++++++++++++++++++++--------------
kernel/sched/fair.c | 11 +++++++++--
kernel/sched/rt.c | 13 +++++++++++++
kernel/sched/sched.h | 8 ++++++--
kernel/sched/stop_task.c | 22 +++++++++++++++++++++-
5 files changed, 70 insertions(+), 19 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2cb4e77..6aa212f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3142,6 +3142,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
#endif
+static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
+{
+ u64 temp = (__force u64) rtime;
+
+ temp *= (__force u64) utime;
+
+ if (sizeof(cputime_t) == 4)
+ temp = div_u64(temp, (__force u32) total);
+ else
+ temp = div64_u64(temp, (__force u64) total);
+
+ return (__force cputime_t) temp;
+}
+
void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
cputime_t rtime, utime = p->utime, total = utime + p->stime;
@@ -3151,13 +3165,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
*/
rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
- if (total) {
- u64 temp = (__force u64) rtime;
-
- temp *= (__force u64) utime;
- do_div(temp, (__force u32) total);
- utime = (__force cputime_t) temp;
- } else
+ if (total)
+ utime = scale_utime(utime, rtime, total);
+ else
utime = rtime;
/*
@@ -3184,13 +3194,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
total = cputime.utime + cputime.stime;
rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
- if (total) {
- u64 temp = (__force u64) rtime;
-
- temp *= (__force u64) cputime.utime;
- do_div(temp, (__force u32) total);
- utime = (__force cputime_t) temp;
- } else
+ if (total)
+ utime = scale_utime(cputime.utime, rtime, total);
+ else
utime = rtime;
sig->prev_utime = max(sig->prev_utime, utime);
@@ -7246,6 +7252,7 @@ int in_sched_functions(unsigned long addr)
#ifdef CONFIG_CGROUP_SCHED
struct task_group root_task_group;
+LIST_HEAD(task_groups);
#endif
DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d0cc03b..c219bf8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3387,6 +3387,14 @@ static int tg_load_down(struct task_group *tg, void *data)
static void update_h_load(long cpu)
{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long now = jiffies;
+
+ if (rq->h_load_throttle == now)
+ return;
+
+ rq->h_load_throttle = now;
+
rcu_read_lock();
walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
rcu_read_unlock();
@@ -4293,11 +4301,10 @@ redo:
env.src_rq = busiest;
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
+ update_h_load(env.src_cpu);
more_balance:
local_irq_save(flags);
double_rq_lock(this_rq, busiest);
- if (!env.loop)
- update_h_load(env.src_cpu);
/*
* cur_ld_moved - load moved in current iteration
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 573e1ca..944cb68 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -788,6 +788,19 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
const struct cpumask *span;
span = sched_rt_period_mask();
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * FIXME: isolated CPUs should really leave the root task group,
+ * whether they are isolcpus or were isolated via cpusets, lest
+ * the timer run on a CPU which does not service all runqueues,
+ * potentially leaving other CPUs indefinitely throttled. If
+ * isolation is really required, the user will turn the throttle
+ * off to kill the perturbations it causes anyway. Meanwhile,
+ * this maintains functionality for boot and/or troubleshooting.
+ */
+ if (rt_b == &root_task_group.rt_bandwidth)
+ span = cpu_online_mask;
+#endif
for_each_cpu(i, span) {
int enqueue = 0;
struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c35a1a7..f6714d0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -80,7 +80,7 @@ extern struct mutex sched_domains_mutex;
struct cfs_rq;
struct rt_rq;
-static LIST_HEAD(task_groups);
+extern struct list_head task_groups;
struct cfs_bandwidth {
#ifdef CONFIG_CFS_BANDWIDTH
@@ -374,7 +374,11 @@ struct rq {
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
-#endif
+#ifdef CONFIG_SMP
+ unsigned long h_load_throttle;
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
#ifdef CONFIG_RT_GROUP_SCHED
struct list_head leaf_rt_rq_list;
#endif
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 7b386e8..da5eb5b 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -27,8 +27,10 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
{
struct task_struct *stop = rq->stop;
- if (stop && stop->on_rq)
+ if (stop && stop->on_rq) {
+ stop->se.exec_start = rq->clock_task;
return stop;
+ }
return NULL;
}
@@ -52,6 +54,21 @@ static void yield_task_stop(struct rq *rq)
static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
{
+ struct task_struct *curr = rq->curr;
+ u64 delta_exec;
+
+ delta_exec = rq->clock_task - curr->se.exec_start;
+ if (unlikely((s64)delta_exec < 0))
+ delta_exec = 0;
+
+ schedstat_set(curr->se.statistics.exec_max,
+ max(curr->se.statistics.exec_max, delta_exec));
+
+ curr->se.sum_exec_runtime += delta_exec;
+ account_group_exec_runtime(curr, delta_exec);
+
+ curr->se.exec_start = rq->clock_task;
+ cpuacct_charge(curr, delta_exec);
}
static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
@@ -60,6 +77,9 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
static void set_curr_task_stop(struct rq *rq)
{
+ struct task_struct *stop = rq->stop;
+
+ stop->se.exec_start = rq->clock_task;
}
static void switched_to_stop(struct rq *rq, struct task_struct *p)
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2012-08-03 16:43 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2012-08-03 16:43 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
HEAD: b9403130a5350fca59a50ed11c198cb8c7e54119 sched/cleanups: Add load balance cpumask pointer to 'struct lb_env'
Fixes and two late cleanups.
Thanks,
Ingo
------------------>
Alex Shi (1):
sched/numa: Add SD_PERFER_SIBLING to CPU domain
Michael Wang (1):
sched/cleanups: Add load balance cpumask pointer to 'struct lb_env'
Namhyung Kim (1):
sched: Use task_rq_unlock() in __sched_setscheduler()
Srivatsa S. Bhat (1):
sched: Fix comment about PREEMPT_ACTIVE bit location
Ying Xue (1):
sched: Fix minor code style issues
include/linux/hardirq.h | 2 +-
include/linux/topology.h | 1 +
kernel/sched/core.c | 4 +---
kernel/sched/cpupri.c | 10 +++++-----
kernel/sched/fair.c | 29 ++++++++++++++---------------
5 files changed, 22 insertions(+), 24 deletions(-)
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index bb7f309..305f23c 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -22,7 +22,7 @@
*
* - bits 16-25 are the hardirq count (max # of nested hardirqs: 1024)
* - bit 26 is the NMI_MASK
- * - bit 28 is the PREEMPT_ACTIVE flag
+ * - bit 27 is the PREEMPT_ACTIVE flag
*
* PREEMPT_MASK: 0x000000ff
* SOFTIRQ_MASK: 0x0000ff00
diff --git a/include/linux/topology.h b/include/linux/topology.h
index e91cd43..fec12d6 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -164,6 +164,7 @@ int arch_update_cpu_topology(void);
| 0*SD_SHARE_CPUPOWER \
| 0*SD_SHARE_PKG_RESOURCES \
| 0*SD_SERIALIZE \
+ | 1*SD_PREFER_SIBLING \
, \
.last_balance = jiffies, \
.balance_interval = 1, \
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5d011ef..2cb4e77 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4340,9 +4340,7 @@ recheck:
*/
if (unlikely(policy == p->policy && (!rt_policy(policy) ||
param->sched_priority == p->rt_priority))) {
-
- __task_rq_unlock(rq);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_rq_unlock(rq, p, &flags);
return 0;
}
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index d72586f..23aa789 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -65,8 +65,8 @@ static int convert_prio(int prio)
int cpupri_find(struct cpupri *cp, struct task_struct *p,
struct cpumask *lowest_mask)
{
- int idx = 0;
- int task_pri = convert_prio(p->prio);
+ int idx = 0;
+ int task_pri = convert_prio(p->prio);
if (task_pri >= MAX_RT_PRIO)
return 0;
@@ -137,9 +137,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
*/
void cpupri_set(struct cpupri *cp, int cpu, int newpri)
{
- int *currpri = &cp->cpu_to_pri[cpu];
- int oldpri = *currpri;
- int do_mb = 0;
+ int *currpri = &cp->cpu_to_pri[cpu];
+ int oldpri = *currpri;
+ int do_mb = 0;
newpri = convert_prio(newpri);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 22321db..d0cc03b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3069,6 +3069,9 @@ struct lb_env {
int new_dst_cpu;
enum cpu_idle_type idle;
long imbalance;
+ /* The set of CPUs under consideration for load-balancing */
+ struct cpumask *cpus;
+
unsigned int flags;
unsigned int loop;
@@ -3653,8 +3656,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
*/
static inline void update_sg_lb_stats(struct lb_env *env,
struct sched_group *group, int load_idx,
- int local_group, const struct cpumask *cpus,
- int *balance, struct sg_lb_stats *sgs)
+ int local_group, int *balance, struct sg_lb_stats *sgs)
{
unsigned long nr_running, max_nr_running, min_nr_running;
unsigned long load, max_cpu_load, min_cpu_load;
@@ -3671,7 +3673,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
max_nr_running = 0;
min_nr_running = ~0UL;
- for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+ for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
struct rq *rq = cpu_rq(i);
nr_running = rq->nr_running;
@@ -3800,8 +3802,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
* @sds: variable to hold the statistics for this sched_domain.
*/
static inline void update_sd_lb_stats(struct lb_env *env,
- const struct cpumask *cpus,
- int *balance, struct sd_lb_stats *sds)
+ int *balance, struct sd_lb_stats *sds)
{
struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
@@ -3818,8 +3819,7 @@ static inline void update_sd_lb_stats(struct lb_env *env,
local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
memset(&sgs, 0, sizeof(sgs));
- update_sg_lb_stats(env, sg, load_idx, local_group,
- cpus, balance, &sgs);
+ update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
if (local_group && !(*balance))
return;
@@ -4055,7 +4055,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* to restore balance.
*
* @env: The load balancing environment.
- * @cpus: The set of CPUs under consideration for load-balancing.
* @balance: Pointer to a variable indicating if this_cpu
* is the appropriate cpu to perform load balancing at this_level.
*
@@ -4065,7 +4064,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* put to idle by rebalancing its tasks onto our group.
*/
static struct sched_group *
-find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance)
+find_busiest_group(struct lb_env *env, int *balance)
{
struct sd_lb_stats sds;
@@ -4075,7 +4074,7 @@ find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance)
* Compute the various statistics relavent for load balancing at
* this level.
*/
- update_sd_lb_stats(env, cpus, balance, &sds);
+ update_sd_lb_stats(env, balance, &sds);
/*
* this_cpu is not the appropriate cpu to perform load balancing at
@@ -4155,8 +4154,7 @@ ret:
* find_busiest_queue - find the busiest runqueue among the cpus in group.
*/
static struct rq *find_busiest_queue(struct lb_env *env,
- struct sched_group *group,
- const struct cpumask *cpus)
+ struct sched_group *group)
{
struct rq *busiest = NULL, *rq;
unsigned long max_load = 0;
@@ -4171,7 +4169,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
if (!capacity)
capacity = fix_small_capacity(env->sd, group);
- if (!cpumask_test_cpu(i, cpus))
+ if (!cpumask_test_cpu(i, env->cpus))
continue;
rq = cpu_rq(i);
@@ -4252,6 +4250,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.dst_grpmask = sched_group_cpus(sd->groups),
.idle = idle,
.loop_break = sched_nr_migrate_break,
+ .cpus = cpus,
};
cpumask_copy(cpus, cpu_active_mask);
@@ -4260,7 +4259,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
schedstat_inc(sd, lb_count[idle]);
redo:
- group = find_busiest_group(&env, cpus, balance);
+ group = find_busiest_group(&env, balance);
if (*balance == 0)
goto out_balanced;
@@ -4270,7 +4269,7 @@ redo:
goto out_balanced;
}
- busiest = find_busiest_queue(&env, group, cpus);
+ busiest = find_busiest_queue(&env, group);
if (!busiest) {
schedstat_inc(sd, lb_nobusyq[idle]);
goto out_balanced;
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2012-07-14 7:57 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2012-07-14 7:57 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
HEAD: 95c0d71dcb853c2eca5f2231ebbd4c1d3af775b7 MAINTAINERS/sched: Update scheduler file pattern
Note that this commit:
164c33c6adee sched: Fix fork() error path to not crash
Is correct but not complete (some architectures like Tile are
not covered yet) - the resulting additional fixes are still WIP
and I did not want to delay these pending fixes. See this thread
on lkml:
[PATCH] fork: fix error handling in dup_task()
Thanks,
Ingo
------------------>
Namhyung Kim (1):
MAINTAINERS/sched: Update scheduler file pattern
Peter Zijlstra (1):
sched/nohz: Rewrite and fix load-avg computation -- again
Salman Qazi (1):
sched: Fix fork() error path to not crash
MAINTAINERS | 2 +-
include/linux/sched.h | 8 ++
kernel/fork.c | 11 +-
kernel/sched/core.c | 275 ++++++++++++++++++++++++++++++++++------------
kernel/sched/idle_task.c | 1 -
kernel/sched/sched.h | 2 -
kernel/time/tick-sched.c | 2 +
7 files changed, 222 insertions(+), 79 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index 03df1d1..214d754 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5909,7 +5909,7 @@ M: Ingo Molnar <mingo@redhat.com>
M: Peter Zijlstra <peterz@infradead.org>
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core
S: Maintained
-F: kernel/sched*
+F: kernel/sched/
F: include/linux/sched.h
SCORE ARCHITECTURE
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4059c0f..20cb749 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1909,6 +1909,14 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
}
#endif
+#ifdef CONFIG_NO_HZ
+void calc_load_enter_idle(void);
+void calc_load_exit_idle(void);
+#else
+static inline void calc_load_enter_idle(void) { }
+static inline void calc_load_exit_idle(void) { }
+#endif /* CONFIG_NO_HZ */
+
#ifndef CONFIG_CPUMASK_OFFSTACK
static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
{
diff --git a/kernel/fork.c b/kernel/fork.c
index ab5211b..f00e319 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -304,12 +304,17 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
}
err = arch_dup_task_struct(tsk, orig);
- if (err)
- goto out;
+ /*
+ * We defer looking at err, because we will need this setup
+ * for the clean up path to work correctly.
+ */
tsk->stack = ti;
-
setup_thread_stack(tsk, orig);
+
+ if (err)
+ goto out;
+
clear_user_return_notifier(tsk);
clear_tsk_need_resched(tsk);
stackend = end_of_stack(tsk);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d5594a4..bb84040 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2161,11 +2161,73 @@ unsigned long this_cpu_load(void)
}
+/*
+ * Global load-average calculations
+ *
+ * We take a distributed and async approach to calculating the global load-avg
+ * in order to minimize overhead.
+ *
+ * The global load average is an exponentially decaying average of nr_running +
+ * nr_uninterruptible.
+ *
+ * Once every LOAD_FREQ:
+ *
+ * nr_active = 0;
+ * for_each_possible_cpu(cpu)
+ * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
+ *
+ * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
+ *
+ * Due to a number of reasons the above turns in the mess below:
+ *
+ * - for_each_possible_cpu() is prohibitively expensive on machines with
+ * serious number of cpus, therefore we need to take a distributed approach
+ * to calculating nr_active.
+ *
+ * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
+ * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
+ *
+ * So assuming nr_active := 0 when we start out -- true per definition, we
+ * can simply take per-cpu deltas and fold those into a global accumulate
+ * to obtain the same result. See calc_load_fold_active().
+ *
+ * Furthermore, in order to avoid synchronizing all per-cpu delta folding
+ * across the machine, we assume 10 ticks is sufficient time for every
+ * cpu to have completed this task.
+ *
+ * This places an upper-bound on the IRQ-off latency of the machine. Then
+ * again, being late doesn't loose the delta, just wrecks the sample.
+ *
+ * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
+ * this would add another cross-cpu cacheline miss and atomic operation
+ * to the wakeup path. Instead we increment on whatever cpu the task ran
+ * when it went into uninterruptible state and decrement on whatever cpu
+ * did the wakeup. This means that only the sum of nr_uninterruptible over
+ * all cpus yields the correct result.
+ *
+ * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
+ */
+
/* Variables and functions for calc_load */
static atomic_long_t calc_load_tasks;
static unsigned long calc_load_update;
unsigned long avenrun[3];
-EXPORT_SYMBOL(avenrun);
+EXPORT_SYMBOL(avenrun); /* should be removed */
+
+/**
+ * get_avenrun - get the load average array
+ * @loads: pointer to dest load array
+ * @offset: offset to add
+ * @shift: shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+ loads[0] = (avenrun[0] + offset) << shift;
+ loads[1] = (avenrun[1] + offset) << shift;
+ loads[2] = (avenrun[2] + offset) << shift;
+}
static long calc_load_fold_active(struct rq *this_rq)
{
@@ -2182,6 +2244,9 @@ static long calc_load_fold_active(struct rq *this_rq)
return delta;
}
+/*
+ * a1 = a0 * e + a * (1 - e)
+ */
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
@@ -2193,30 +2258,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
#ifdef CONFIG_NO_HZ
/*
- * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
+ * Handle NO_HZ for the global load-average.
+ *
+ * Since the above described distributed algorithm to compute the global
+ * load-average relies on per-cpu sampling from the tick, it is affected by
+ * NO_HZ.
+ *
+ * The basic idea is to fold the nr_active delta into a global idle-delta upon
+ * entering NO_HZ state such that we can include this as an 'extra' cpu delta
+ * when we read the global state.
+ *
+ * Obviously reality has to ruin such a delightfully simple scheme:
+ *
+ * - When we go NO_HZ idle during the window, we can negate our sample
+ * contribution, causing under-accounting.
+ *
+ * We avoid this by keeping two idle-delta counters and flipping them
+ * when the window starts, thus separating old and new NO_HZ load.
+ *
+ * The only trick is the slight shift in index flip for read vs write.
+ *
+ * 0s 5s 10s 15s
+ * +10 +10 +10 +10
+ * |-|-----------|-|-----------|-|-----------|-|
+ * r:0 0 1 1 0 0 1 1 0
+ * w:0 1 1 0 0 1 1 0 0
+ *
+ * This ensures we'll fold the old idle contribution in this window while
+ * accumlating the new one.
+ *
+ * - When we wake up from NO_HZ idle during the window, we push up our
+ * contribution, since we effectively move our sample point to a known
+ * busy state.
+ *
+ * This is solved by pushing the window forward, and thus skipping the
+ * sample, for this cpu (effectively using the idle-delta for this cpu which
+ * was in effect at the time the window opened). This also solves the issue
+ * of having to deal with a cpu having been in NOHZ idle for multiple
+ * LOAD_FREQ intervals.
*
* When making the ILB scale, we should try to pull this in as well.
*/
-static atomic_long_t calc_load_tasks_idle;
+static atomic_long_t calc_load_idle[2];
+static int calc_load_idx;
-void calc_load_account_idle(struct rq *this_rq)
+static inline int calc_load_write_idx(void)
{
+ int idx = calc_load_idx;
+
+ /*
+ * See calc_global_nohz(), if we observe the new index, we also
+ * need to observe the new update time.
+ */
+ smp_rmb();
+
+ /*
+ * If the folding window started, make sure we start writing in the
+ * next idle-delta.
+ */
+ if (!time_before(jiffies, calc_load_update))
+ idx++;
+
+ return idx & 1;
+}
+
+static inline int calc_load_read_idx(void)
+{
+ return calc_load_idx & 1;
+}
+
+void calc_load_enter_idle(void)
+{
+ struct rq *this_rq = this_rq();
long delta;
+ /*
+ * We're going into NOHZ mode, if there's any pending delta, fold it
+ * into the pending idle delta.
+ */
delta = calc_load_fold_active(this_rq);
- if (delta)
- atomic_long_add(delta, &calc_load_tasks_idle);
+ if (delta) {
+ int idx = calc_load_write_idx();
+ atomic_long_add(delta, &calc_load_idle[idx]);
+ }
}
-static long calc_load_fold_idle(void)
+void calc_load_exit_idle(void)
{
- long delta = 0;
+ struct rq *this_rq = this_rq();
+
+ /*
+ * If we're still before the sample window, we're done.
+ */
+ if (time_before(jiffies, this_rq->calc_load_update))
+ return;
/*
- * Its got a race, we don't care...
+ * We woke inside or after the sample window, this means we're already
+ * accounted through the nohz accounting, so skip the entire deal and
+ * sync up for the next window.
*/
- if (atomic_long_read(&calc_load_tasks_idle))
- delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
+ this_rq->calc_load_update = calc_load_update;
+ if (time_before(jiffies, this_rq->calc_load_update + 10))
+ this_rq->calc_load_update += LOAD_FREQ;
+}
+
+static long calc_load_fold_idle(void)
+{
+ int idx = calc_load_read_idx();
+ long delta = 0;
+
+ if (atomic_long_read(&calc_load_idle[idx]))
+ delta = atomic_long_xchg(&calc_load_idle[idx], 0);
return delta;
}
@@ -2302,66 +2455,39 @@ static void calc_global_nohz(void)
{
long delta, active, n;
- /*
- * If we crossed a calc_load_update boundary, make sure to fold
- * any pending idle changes, the respective CPUs might have
- * missed the tick driven calc_load_account_active() update
- * due to NO_HZ.
- */
- delta = calc_load_fold_idle();
- if (delta)
- atomic_long_add(delta, &calc_load_tasks);
-
- /*
- * It could be the one fold was all it took, we done!
- */
- if (time_before(jiffies, calc_load_update + 10))
- return;
-
- /*
- * Catch-up, fold however many we are behind still
- */
- delta = jiffies - calc_load_update - 10;
- n = 1 + (delta / LOAD_FREQ);
+ if (!time_before(jiffies, calc_load_update + 10)) {
+ /*
+ * Catch-up, fold however many we are behind still
+ */
+ delta = jiffies - calc_load_update - 10;
+ n = 1 + (delta / LOAD_FREQ);
- active = atomic_long_read(&calc_load_tasks);
- active = active > 0 ? active * FIXED_1 : 0;
+ active = atomic_long_read(&calc_load_tasks);
+ active = active > 0 ? active * FIXED_1 : 0;
- avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
- avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
- avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+ avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+ avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+ avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
- calc_load_update += n * LOAD_FREQ;
-}
-#else
-void calc_load_account_idle(struct rq *this_rq)
-{
-}
+ calc_load_update += n * LOAD_FREQ;
+ }
-static inline long calc_load_fold_idle(void)
-{
- return 0;
+ /*
+ * Flip the idle index...
+ *
+ * Make sure we first write the new time then flip the index, so that
+ * calc_load_write_idx() will see the new time when it reads the new
+ * index, this avoids a double flip messing things up.
+ */
+ smp_wmb();
+ calc_load_idx++;
}
+#else /* !CONFIG_NO_HZ */
-static void calc_global_nohz(void)
-{
-}
-#endif
+static inline long calc_load_fold_idle(void) { return 0; }
+static inline void calc_global_nohz(void) { }
-/**
- * get_avenrun - get the load average array
- * @loads: pointer to dest load array
- * @offset: offset to add
- * @shift: shift count to shift the result left
- *
- * These values are estimates at best, so no need for locking.
- */
-void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
-{
- loads[0] = (avenrun[0] + offset) << shift;
- loads[1] = (avenrun[1] + offset) << shift;
- loads[2] = (avenrun[2] + offset) << shift;
-}
+#endif /* CONFIG_NO_HZ */
/*
* calc_load - update the avenrun load estimates 10 ticks after the
@@ -2369,11 +2495,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
*/
void calc_global_load(unsigned long ticks)
{
- long active;
+ long active, delta;
if (time_before(jiffies, calc_load_update + 10))
return;
+ /*
+ * Fold the 'old' idle-delta to include all NO_HZ cpus.
+ */
+ delta = calc_load_fold_idle();
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
+
active = atomic_long_read(&calc_load_tasks);
active = active > 0 ? active * FIXED_1 : 0;
@@ -2384,12 +2517,7 @@ void calc_global_load(unsigned long ticks)
calc_load_update += LOAD_FREQ;
/*
- * Account one period with whatever state we found before
- * folding in the nohz state and ageing the entire idle period.
- *
- * This avoids loosing a sample when we go idle between
- * calc_load_account_active() (10 ticks ago) and now and thus
- * under-accounting.
+ * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
*/
calc_global_nohz();
}
@@ -2406,7 +2534,6 @@ static void calc_load_account_active(struct rq *this_rq)
return;
delta = calc_load_fold_active(this_rq);
- delta += calc_load_fold_idle();
if (delta)
atomic_long_add(delta, &calc_load_tasks);
@@ -2414,6 +2541,10 @@ static void calc_load_account_active(struct rq *this_rq)
}
/*
+ * End of global load-average stuff
+ */
+
+/*
* The exact cpuload at various idx values, calculated at every tick would be
* load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
*
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b44d604..b6baf37 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -25,7 +25,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
static struct task_struct *pick_next_task_idle(struct rq *rq)
{
schedstat_inc(rq, sched_goidle);
- calc_load_account_idle(rq);
return rq->idle;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6d52cea..55844f2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -942,8 +942,6 @@ static inline u64 sched_avg_period(void)
return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
}
-void calc_load_account_idle(struct rq *this_rq);
-
#ifdef CONFIG_SCHED_HRTICK
/*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 8699978..4a08472 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -406,6 +406,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
*/
if (!ts->tick_stopped) {
select_nohz_load_balancer(1);
+ calc_load_enter_idle();
ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
ts->tick_stopped = 1;
@@ -597,6 +598,7 @@ void tick_nohz_idle_exit(void)
account_idle_ticks(ticks);
#endif
+ calc_load_exit_idle();
touch_softlockup_watchdog();
/*
* Cancel the scheduled timer and restore the tick
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2012-06-08 20:29 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2012-06-08 20:29 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
HEAD: a841f8cef4bb124f0f5563314d0beaf2e1249d72 sched: Fix the relax_domain_level boot parameter
Thanks,
Ingo
------------------>
Alex Shi (1):
sched/numa: Load balance between remote nodes
Dimitri Sivanich (1):
sched: Fix the relax_domain_level boot parameter
Kamalesh Babulal (1):
sched/x86: Calculate booted cores after construction of sibling_mask
Peter Zijlstra (4):
sched/rt: Fix lockdep annotation within find_lock_lowest_rq()
sched: Fix domain iteration
sched: Always initialize cpu-power
sched: Validate assumptions in sched_init_numa()
arch/x86/kernel/smpboot.c | 9 +++
include/linux/sched.h | 11 +++
kernel/sched/core.c | 187 +++++++++++++++++++++++++++++++++++++---------
kernel/sched/fair.c | 7 +-
kernel/sched/rt.c | 2 +-
kernel/sched/sched.h | 2 +
6 files changed, 179 insertions(+), 39 deletions(-)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index fd019d7..3fab55b 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -382,6 +382,15 @@ void __cpuinit set_cpu_sibling_map(int cpu)
if ((i == cpu) || (has_mc && match_llc(c, o)))
link_mask(llc_shared, cpu, i);
+ }
+
+ /*
+ * This needs a separate iteration over the cpus because we rely on all
+ * cpu_sibling_mask links to be set-up.
+ */
+ for_each_cpu(i, cpu_sibling_setup_mask) {
+ o = &cpu_data(i);
+
if ((i == cpu) || (has_mc && match_mc(c, o))) {
link_mask(core, cpu, i);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6029d8c..ac321d7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -876,6 +876,8 @@ struct sched_group_power {
* Number of busy cpus in this group.
*/
atomic_t nr_busy_cpus;
+
+ unsigned long cpumask[0]; /* iteration mask */
};
struct sched_group {
@@ -900,6 +902,15 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
return to_cpumask(sg->cpumask);
}
+/*
+ * cpumask masking which cpus in the group are allowed to iterate up the domain
+ * tree.
+ */
+static inline struct cpumask *sched_group_mask(struct sched_group *sg)
+{
+ return to_cpumask(sg->sgp->cpumask);
+}
+
/**
* group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
* @group: The group whose first cpu is to be returned.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c46958e..d5594a4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5556,15 +5556,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
#ifdef CONFIG_SCHED_DEBUG
-static __read_mostly int sched_domain_debug_enabled;
+static __read_mostly int sched_debug_enabled;
-static int __init sched_domain_debug_setup(char *str)
+static int __init sched_debug_setup(char *str)
{
- sched_domain_debug_enabled = 1;
+ sched_debug_enabled = 1;
return 0;
}
-early_param("sched_debug", sched_domain_debug_setup);
+early_param("sched_debug", sched_debug_setup);
+
+static inline bool sched_debug(void)
+{
+ return sched_debug_enabled;
+}
static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
struct cpumask *groupmask)
@@ -5604,7 +5609,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- if (!group->sgp->power) {
+ /*
+ * Even though we initialize ->power to something semi-sane,
+ * we leave power_orig unset. This allows us to detect if
+ * domain iteration is still funny without causing /0 traps.
+ */
+ if (!group->sgp->power_orig) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: domain->cpu_power not "
"set\n");
@@ -5652,7 +5662,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
{
int level = 0;
- if (!sched_domain_debug_enabled)
+ if (!sched_debug_enabled)
return;
if (!sd) {
@@ -5673,6 +5683,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
}
#else /* !CONFIG_SCHED_DEBUG */
# define sched_domain_debug(sd, cpu) do { } while (0)
+static inline bool sched_debug(void)
+{
+ return false;
+}
#endif /* CONFIG_SCHED_DEBUG */
static int sd_degenerate(struct sched_domain *sd)
@@ -5994,6 +6008,44 @@ struct sched_domain_topology_level {
struct sd_data data;
};
+/*
+ * Build an iteration mask that can exclude certain CPUs from the upwards
+ * domain traversal.
+ *
+ * Asymmetric node setups can result in situations where the domain tree is of
+ * unequal depth, make sure to skip domains that already cover the entire
+ * range.
+ *
+ * In that case build_sched_domains() will have terminated the iteration early
+ * and our sibling sd spans will be empty. Domains should always include the
+ * cpu they're built on, so check that.
+ *
+ */
+static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
+{
+ const struct cpumask *span = sched_domain_span(sd);
+ struct sd_data *sdd = sd->private;
+ struct sched_domain *sibling;
+ int i;
+
+ for_each_cpu(i, span) {
+ sibling = *per_cpu_ptr(sdd->sd, i);
+ if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+ continue;
+
+ cpumask_set_cpu(i, sched_group_mask(sg));
+ }
+}
+
+/*
+ * Return the canonical balance cpu for this group, this is the first cpu
+ * of this group that's also in the iteration mask.
+ */
+int group_balance_cpu(struct sched_group *sg)
+{
+ return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
+}
+
static int
build_overlap_sched_groups(struct sched_domain *sd, int cpu)
{
@@ -6012,6 +6064,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
if (cpumask_test_cpu(i, covered))
continue;
+ child = *per_cpu_ptr(sdd->sd, i);
+
+ /* See the comment near build_group_mask(). */
+ if (!cpumask_test_cpu(i, sched_domain_span(child)))
+ continue;
+
sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
GFP_KERNEL, cpu_to_node(cpu));
@@ -6019,8 +6077,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
goto fail;
sg_span = sched_group_cpus(sg);
-
- child = *per_cpu_ptr(sdd->sd, i);
if (child->child) {
child = child->child;
cpumask_copy(sg_span, sched_domain_span(child));
@@ -6030,13 +6086,24 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
cpumask_or(covered, covered, sg_span);
sg->sgp = *per_cpu_ptr(sdd->sgp, i);
- atomic_inc(&sg->sgp->ref);
+ if (atomic_inc_return(&sg->sgp->ref) == 1)
+ build_group_mask(sd, sg);
+ /*
+ * Initialize sgp->power such that even if we mess up the
+ * domains and no possible iteration will get us here, we won't
+ * die on a /0 trap.
+ */
+ sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
+
+ /*
+ * Make sure the first group of this domain contains the
+ * canonical balance cpu. Otherwise the sched_domain iteration
+ * breaks. See update_sg_lb_stats().
+ */
if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
- cpumask_first(sg_span) == cpu) {
- WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span));
+ group_balance_cpu(sg) == cpu)
groups = sg;
- }
if (!first)
first = sg;
@@ -6109,6 +6176,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
cpumask_clear(sched_group_cpus(sg));
sg->sgp->power = 0;
+ cpumask_setall(sched_group_mask(sg));
for_each_cpu(j, span) {
if (get_group(j, sdd, NULL) != group)
@@ -6150,7 +6218,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
sg = sg->next;
} while (sg != sd->groups);
- if (cpu != group_first_cpu(sg))
+ if (cpu != group_balance_cpu(sg))
return;
update_group_power(sd, cpu);
@@ -6200,11 +6268,8 @@ int sched_domain_level_max;
static int __init setup_relax_domain_level(char *str)
{
- unsigned long val;
-
- val = simple_strtoul(str, NULL, 0);
- if (val < sched_domain_level_max)
- default_relax_domain_level = val;
+ if (kstrtoint(str, 0, &default_relax_domain_level))
+ pr_warn("Unable to set relax_domain_level\n");
return 1;
}
@@ -6314,14 +6379,13 @@ static struct sched_domain_topology_level *sched_domain_topology = default_topol
#ifdef CONFIG_NUMA
static int sched_domains_numa_levels;
-static int sched_domains_numa_scale;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
static int sched_domains_curr_level;
static inline int sd_local_flags(int level)
{
- if (sched_domains_numa_distance[level] > REMOTE_DISTANCE)
+ if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
return 0;
return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
@@ -6379,6 +6443,42 @@ static const struct cpumask *sd_numa_mask(int cpu)
return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
}
+static void sched_numa_warn(const char *str)
+{
+ static int done = false;
+ int i,j;
+
+ if (done)
+ return;
+
+ done = true;
+
+ printk(KERN_WARNING "ERROR: %s\n\n", str);
+
+ for (i = 0; i < nr_node_ids; i++) {
+ printk(KERN_WARNING " ");
+ for (j = 0; j < nr_node_ids; j++)
+ printk(KERN_CONT "%02d ", node_distance(i,j));
+ printk(KERN_CONT "\n");
+ }
+ printk(KERN_WARNING "\n");
+}
+
+static bool find_numa_distance(int distance)
+{
+ int i;
+
+ if (distance == node_distance(0, 0))
+ return true;
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ if (sched_domains_numa_distance[i] == distance)
+ return true;
+ }
+
+ return false;
+}
+
static void sched_init_numa(void)
{
int next_distance, curr_distance = node_distance(0, 0);
@@ -6386,7 +6486,6 @@ static void sched_init_numa(void)
int level = 0;
int i, j, k;
- sched_domains_numa_scale = curr_distance;
sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
if (!sched_domains_numa_distance)
return;
@@ -6397,23 +6496,41 @@ static void sched_init_numa(void)
*
* Assumes node_distance(0,j) includes all distances in
* node_distance(i,j) in order to avoid cubic time.
- *
- * XXX: could be optimized to O(n log n) by using sort()
*/
next_distance = curr_distance;
for (i = 0; i < nr_node_ids; i++) {
for (j = 0; j < nr_node_ids; j++) {
- int distance = node_distance(0, j);
- if (distance > curr_distance &&
- (distance < next_distance ||
- next_distance == curr_distance))
- next_distance = distance;
+ for (k = 0; k < nr_node_ids; k++) {
+ int distance = node_distance(i, k);
+
+ if (distance > curr_distance &&
+ (distance < next_distance ||
+ next_distance == curr_distance))
+ next_distance = distance;
+
+ /*
+ * While not a strong assumption it would be nice to know
+ * about cases where if node A is connected to B, B is not
+ * equally connected to A.
+ */
+ if (sched_debug() && node_distance(k, i) != distance)
+ sched_numa_warn("Node-distance not symmetric");
+
+ if (sched_debug() && i && !find_numa_distance(distance))
+ sched_numa_warn("Node-0 not representative");
+ }
+ if (next_distance != curr_distance) {
+ sched_domains_numa_distance[level++] = next_distance;
+ sched_domains_numa_levels = level;
+ curr_distance = next_distance;
+ } else break;
}
- if (next_distance != curr_distance) {
- sched_domains_numa_distance[level++] = next_distance;
- sched_domains_numa_levels = level;
- curr_distance = next_distance;
- } else break;
+
+ /*
+ * In case of sched_debug() we verify the above assumption.
+ */
+ if (!sched_debug())
+ break;
}
/*
* 'level' contains the number of unique distances, excluding the
@@ -6525,7 +6642,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
*per_cpu_ptr(sdd->sg, j) = sg;
- sgp = kzalloc_node(sizeof(struct sched_group_power),
+ sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
if (!sgp)
return -ENOMEM;
@@ -6578,7 +6695,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
if (!sd)
return child;
- set_domain_attribute(sd, attr);
cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
if (child) {
sd->level = child->level + 1;
@@ -6586,6 +6702,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
child->parent = sd;
}
sd->child = child;
+ set_domain_attribute(sd, attr);
return sd;
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b2a2d23..c9fd6d6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3602,7 +3602,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
} while (group != child->groups);
}
- sdg->sgp->power = power;
+ sdg->sgp->power_orig = sdg->sgp->power = power;
}
/*
@@ -3652,7 +3652,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
int i;
if (local_group)
- balance_cpu = group_first_cpu(group);
+ balance_cpu = group_balance_cpu(group);
/* Tally up the load of all CPUs in the group */
max_cpu_load = 0;
@@ -3667,7 +3667,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
/* Bias balancing toward cpus of our domain */
if (local_group) {
- if (idle_cpu(i) && !first_idle_cpu) {
+ if (idle_cpu(i) && !first_idle_cpu &&
+ cpumask_test_cpu(i, sched_group_mask(group))) {
first_idle_cpu = 1;
balance_cpu = i;
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 2a4e8df..573e1ca 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1562,7 +1562,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
task_running(rq, task) ||
!task->on_rq)) {
- raw_spin_unlock(&lowest_rq->lock);
+ double_unlock_balance(rq, lowest_rq);
lowest_rq = NULL;
break;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ba9dccf..6d52cea 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -526,6 +526,8 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
DECLARE_PER_CPU(struct sched_domain *, sd_llc);
DECLARE_PER_CPU(int, sd_llc_id);
+extern int group_balance_cpu(struct sched_group *sg);
+
#endif /* CONFIG_SMP */
#include "stats.h"
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2012-06-05 9:21 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2012-06-05 9:21 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
HEAD: 6a4c96eef42f835734a82c6b512abf9881b7c55d sched: Remove NULL assignment of dattr_cur
Thanks,
Ingo
------------------>
Colin Cross (1):
sched/rt: Fix SCHED_RR across cgroups
Hiroshi Shimamoto (2):
sched: Make sched_feat_names const
sched: Remove the last NULL entry from sched_feat_names
Kamalesh Babulal (1):
sched: Remove NULL assignment of dattr_cur
Peter Zijlstra (6):
sched/x86: Use cpu_llc_shared_mask(cpu) for coregroup_mask
sched/nohz: Fix rq->cpu_load calculations some more
sched: Don't try allocating memory from offline nodes
sched: Fix SD_OVERLAP
sched: Make sure to not re-read variables after validation
sched: Move nr_cpus_allowed out of 'struct sched_rt_entity'
arch/blackfin/kernel/process.c | 2 +-
arch/x86/kernel/smpboot.c | 10 +------
include/linux/init_task.h | 2 +-
include/linux/sched.h | 3 +-
kernel/sched/core.c | 68 +++++++++++++++++++++++++++++++-----------
kernel/sched/fair.c | 42 +++++++++++++++++++-------
kernel/sched/rt.c | 51 ++++++++++++++++++-------------
kernel/time/tick-sched.c | 1 +
8 files changed, 120 insertions(+), 59 deletions(-)
diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c
index 2e3994b..62bcea7 100644
--- a/arch/blackfin/kernel/process.c
+++ b/arch/blackfin/kernel/process.c
@@ -173,7 +173,7 @@ asmlinkage int bfin_clone(struct pt_regs *regs)
unsigned long newsp;
#ifdef __ARCH_SYNC_CORE_DCACHE
- if (current->rt.nr_cpus_allowed == num_possible_cpus())
+ if (current->nr_cpus_allowed == num_possible_cpus())
set_cpus_allowed_ptr(current, cpumask_of(smp_processor_id()));
#endif
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index f56f96d..fd019d7 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -410,15 +410,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
/* maps the cpu to the sched domain representing multi-core */
const struct cpumask *cpu_coregroup_mask(int cpu)
{
- struct cpuinfo_x86 *c = &cpu_data(cpu);
- /*
- * For perf, we return last level cache shared map.
- * And for power savings, we return cpu_core_map
- */
- if (!(cpu_has(c, X86_FEATURE_AMD_DCM)))
- return cpu_core_mask(cpu);
- else
- return cpu_llc_shared_mask(cpu);
+ return cpu_llc_shared_mask(cpu);
}
static void impress_friends(void)
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index e4baff5..9e65eff 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -149,6 +149,7 @@ extern struct cred init_cred;
.normal_prio = MAX_PRIO-20, \
.policy = SCHED_NORMAL, \
.cpus_allowed = CPU_MASK_ALL, \
+ .nr_cpus_allowed= NR_CPUS, \
.mm = NULL, \
.active_mm = &init_mm, \
.se = { \
@@ -157,7 +158,6 @@ extern struct cred init_cred;
.rt = { \
.run_list = LIST_HEAD_INIT(tsk.rt.run_list), \
.time_slice = RR_TIMESLICE, \
- .nr_cpus_allowed = NR_CPUS, \
}, \
.tasks = LIST_HEAD_INIT(tsk.tasks), \
INIT_PUSHABLE_TASKS(tsk) \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f45c0b2..0f50e78 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -145,6 +145,7 @@ extern unsigned long this_cpu_load(void);
extern void calc_global_load(unsigned long ticks);
+extern void update_cpu_load_nohz(void);
extern unsigned long get_parent_ip(unsigned long addr);
@@ -1187,7 +1188,6 @@ struct sched_rt_entity {
struct list_head run_list;
unsigned long timeout;
unsigned int time_slice;
- int nr_cpus_allowed;
struct sched_rt_entity *back;
#ifdef CONFIG_RT_GROUP_SCHED
@@ -1252,6 +1252,7 @@ struct task_struct {
#endif
unsigned int policy;
+ int nr_cpus_allowed;
cpumask_t cpus_allowed;
#ifdef CONFIG_PREEMPT_RCU
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 39eb601..c46958e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -142,9 +142,8 @@ const_debug unsigned int sysctl_sched_features =
#define SCHED_FEAT(name, enabled) \
#name ,
-static __read_mostly char *sched_feat_names[] = {
+static const char * const sched_feat_names[] = {
#include "features.h"
- NULL
};
#undef SCHED_FEAT
@@ -2517,25 +2516,32 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
sched_avg_update(this_rq);
}
+#ifdef CONFIG_NO_HZ
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we cannot use the delta approach from the regular tick since that
+ * would seriously skew the load calculation. However we'll make do for those
+ * updates happening while idle (nohz_idle_balance) or coming out of idle
+ * (tick_nohz_idle_exit).
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
/*
* Called from nohz_idle_balance() to update the load ratings before doing the
* idle balance.
*/
void update_idle_cpu_load(struct rq *this_rq)
{
- unsigned long curr_jiffies = jiffies;
+ unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
unsigned long load = this_rq->load.weight;
unsigned long pending_updates;
/*
- * Bloody broken means of dealing with nohz, but better than nothing..
- * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
- * update and see 0 difference the one time and 2 the next, even though
- * we ticked at roughtly the same rate.
- *
- * Hence we only use this from nohz_idle_balance() and skip this
- * nonsense when called from the scheduler_tick() since that's
- * guaranteed a stable rate.
+ * bail if there's load or we're actually up-to-date.
*/
if (load || curr_jiffies == this_rq->last_load_update_tick)
return;
@@ -2547,12 +2553,38 @@ void update_idle_cpu_load(struct rq *this_rq)
}
/*
+ * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
+ */
+void update_cpu_load_nohz(void)
+{
+ struct rq *this_rq = this_rq();
+ unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+ unsigned long pending_updates;
+
+ if (curr_jiffies == this_rq->last_load_update_tick)
+ return;
+
+ raw_spin_lock(&this_rq->lock);
+ pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+ if (pending_updates) {
+ this_rq->last_load_update_tick = curr_jiffies;
+ /*
+ * We were idle, this means load 0, the current load might be
+ * !0 due to remote wakeups and the sort.
+ */
+ __update_cpu_load(this_rq, 0, pending_updates);
+ }
+ raw_spin_unlock(&this_rq->lock);
+}
+#endif /* CONFIG_NO_HZ */
+
+/*
* Called from scheduler_tick()
*/
static void update_cpu_load_active(struct rq *this_rq)
{
/*
- * See the mess in update_idle_cpu_load().
+ * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
*/
this_rq->last_load_update_tick = jiffies;
__update_cpu_load(this_rq, this_rq->load.weight, 1);
@@ -4982,7 +5014,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
p->sched_class->set_cpus_allowed(p, new_mask);
cpumask_copy(&p->cpus_allowed, new_mask);
- p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
+ p->nr_cpus_allowed = cpumask_weight(new_mask);
}
/*
@@ -5997,11 +6029,14 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
cpumask_or(covered, covered, sg_span);
- sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
+ sg->sgp = *per_cpu_ptr(sdd->sgp, i);
atomic_inc(&sg->sgp->ref);
- if (cpumask_test_cpu(cpu, sg_span))
+ if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
+ cpumask_first(sg_span) == cpu) {
+ WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span));
groups = sg;
+ }
if (!first)
first = sg;
@@ -6403,7 +6438,7 @@ static void sched_init_numa(void)
return;
for (j = 0; j < nr_node_ids; j++) {
- struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j);
+ struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
if (!mask)
return;
@@ -6691,7 +6726,6 @@ static int init_sched_domains(const struct cpumask *cpu_map)
if (!doms_cur)
doms_cur = &fallback_doms;
cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
- dattr_cur = NULL;
err = build_sched_domains(doms_cur[0], NULL);
register_sched_domain_sysctl();
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 940e6d1..b2a2d23 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2703,7 +2703,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
int want_sd = 1;
int sync = wake_flags & WF_SYNC;
- if (p->rt.nr_cpus_allowed == 1)
+ if (p->nr_cpus_allowed == 1)
return prev_cpu;
if (sd_flag & SD_BALANCE_WAKE) {
@@ -3503,15 +3503,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
unsigned long scale_rt_power(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- u64 total, available;
+ u64 total, available, age_stamp, avg;
- total = sched_avg_period() + (rq->clock - rq->age_stamp);
+ /*
+ * Since we're reading these variables without serialization make sure
+ * we read them once before doing sanity checks on them.
+ */
+ age_stamp = ACCESS_ONCE(rq->age_stamp);
+ avg = ACCESS_ONCE(rq->rt_avg);
+
+ total = sched_avg_period() + (rq->clock - age_stamp);
- if (unlikely(total < rq->rt_avg)) {
+ if (unlikely(total < avg)) {
/* Ensures that power won't end up being negative */
available = 0;
} else {
- available = total - rq->rt_avg;
+ available = total - avg;
}
if (unlikely((s64)total < SCHED_POWER_SCALE))
@@ -3574,11 +3581,26 @@ void update_group_power(struct sched_domain *sd, int cpu)
power = 0;
- group = child->groups;
- do {
- power += group->sgp->power;
- group = group->next;
- } while (group != child->groups);
+ if (child->flags & SD_OVERLAP) {
+ /*
+ * SD_OVERLAP domains cannot assume that child groups
+ * span the current group.
+ */
+
+ for_each_cpu(cpu, sched_group_cpus(sdg))
+ power += power_of(cpu);
+ } else {
+ /*
+ * !SD_OVERLAP domains can assume that child groups
+ * span the current group.
+ */
+
+ group = child->groups;
+ do {
+ power += group->sgp->power;
+ group = group->next;
+ } while (group != child->groups);
+ }
sdg->sgp->power = power;
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index c5565c3..2a4e8df 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -274,13 +274,16 @@ static void update_rt_migration(struct rt_rq *rt_rq)
static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
+ struct task_struct *p;
+
if (!rt_entity_is_task(rt_se))
return;
+ p = rt_task_of(rt_se);
rt_rq = &rq_of_rt_rq(rt_rq)->rt;
rt_rq->rt_nr_total++;
- if (rt_se->nr_cpus_allowed > 1)
+ if (p->nr_cpus_allowed > 1)
rt_rq->rt_nr_migratory++;
update_rt_migration(rt_rq);
@@ -288,13 +291,16 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
+ struct task_struct *p;
+
if (!rt_entity_is_task(rt_se))
return;
+ p = rt_task_of(rt_se);
rt_rq = &rq_of_rt_rq(rt_rq)->rt;
rt_rq->rt_nr_total--;
- if (rt_se->nr_cpus_allowed > 1)
+ if (p->nr_cpus_allowed > 1)
rt_rq->rt_nr_migratory--;
update_rt_migration(rt_rq);
@@ -1161,7 +1167,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
- if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
+ if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
inc_nr_running(rq);
@@ -1225,7 +1231,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
cpu = task_cpu(p);
- if (p->rt.nr_cpus_allowed == 1)
+ if (p->nr_cpus_allowed == 1)
goto out;
/* For anything but wake ups, just return the task_cpu */
@@ -1260,9 +1266,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
* will have to sort it out.
*/
if (curr && unlikely(rt_task(curr)) &&
- (curr->rt.nr_cpus_allowed < 2 ||
+ (curr->nr_cpus_allowed < 2 ||
curr->prio <= p->prio) &&
- (p->rt.nr_cpus_allowed > 1)) {
+ (p->nr_cpus_allowed > 1)) {
int target = find_lowest_rq(p);
if (target != -1)
@@ -1276,10 +1282,10 @@ out:
static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
- if (rq->curr->rt.nr_cpus_allowed == 1)
+ if (rq->curr->nr_cpus_allowed == 1)
return;
- if (p->rt.nr_cpus_allowed != 1
+ if (p->nr_cpus_allowed != 1
&& cpupri_find(&rq->rd->cpupri, p, NULL))
return;
@@ -1395,7 +1401,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
* The previous task needs to be made eligible for pushing
* if it is still active
*/
- if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1)
+ if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
}
@@ -1408,7 +1414,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
(cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
- (p->rt.nr_cpus_allowed > 1))
+ (p->nr_cpus_allowed > 1))
return 1;
return 0;
}
@@ -1464,7 +1470,7 @@ static int find_lowest_rq(struct task_struct *task)
if (unlikely(!lowest_mask))
return -1;
- if (task->rt.nr_cpus_allowed == 1)
+ if (task->nr_cpus_allowed == 1)
return -1; /* No other targets possible */
if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
@@ -1586,7 +1592,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
BUG_ON(rq->cpu != task_cpu(p));
BUG_ON(task_current(rq, p));
- BUG_ON(p->rt.nr_cpus_allowed <= 1);
+ BUG_ON(p->nr_cpus_allowed <= 1);
BUG_ON(!p->on_rq);
BUG_ON(!rt_task(p));
@@ -1793,9 +1799,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
if (!task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
has_pushable_tasks(rq) &&
- p->rt.nr_cpus_allowed > 1 &&
+ p->nr_cpus_allowed > 1 &&
rt_task(rq->curr) &&
- (rq->curr->rt.nr_cpus_allowed < 2 ||
+ (rq->curr->nr_cpus_allowed < 2 ||
rq->curr->prio <= p->prio))
push_rt_tasks(rq);
}
@@ -1817,7 +1823,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
* Only update if the process changes its state from whether it
* can migrate or not.
*/
- if ((p->rt.nr_cpus_allowed > 1) == (weight > 1))
+ if ((p->nr_cpus_allowed > 1) == (weight > 1))
return;
rq = task_rq(p);
@@ -1979,6 +1985,8 @@ static void watchdog(struct rq *rq, struct task_struct *p)
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
+ struct sched_rt_entity *rt_se = &p->rt;
+
update_curr_rt(rq);
watchdog(rq, p);
@@ -1996,12 +2004,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
p->rt.time_slice = RR_TIMESLICE;
/*
- * Requeue to the end of queue if we are not the only element
- * on the queue:
+ * Requeue to the end of queue if we (and all of our ancestors) are the
+ * only element on the queue
*/
- if (p->rt.run_list.prev != p->rt.run_list.next) {
- requeue_task_rt(rq, p, 0);
- set_tsk_need_resched(p);
+ for_each_sched_rt_entity(rt_se) {
+ if (rt_se->run_list.prev != rt_se->run_list.next) {
+ requeue_task_rt(rq, p, 0);
+ set_tsk_need_resched(p);
+ return;
+ }
}
}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 6a3a5b9..0c927cd 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -576,6 +576,7 @@ void tick_nohz_idle_exit(void)
/* Update jiffies first */
select_nohz_load_balancer(0);
tick_do_update_jiffies64(now);
+ update_cpu_load_nohz();
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
/*
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2012-04-27 6:39 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2012-04-27 6:39 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
HEAD: fb2cf2c660971bea0ad86a9a5c19ad39eab61344 sched: Fix OOPS when build_sched_domains() percpu allocation fails
Thanks,
Ingo
------------------>
Peter Zijlstra (1):
sched: Fix more load-balancing fallout
he, bo (1):
sched: Fix OOPS when build_sched_domains() percpu allocation fails
kernel/sched/core.c | 22 ++++++++++++++++------
kernel/sched/fair.c | 18 ++++++++++--------
kernel/sched/features.h | 1 +
3 files changed, 27 insertions(+), 14 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4603b9d..0533a68 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6405,16 +6405,26 @@ static void __sdt_free(const struct cpumask *cpu_map)
struct sd_data *sdd = &tl->data;
for_each_cpu(j, cpu_map) {
- struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
- if (sd && (sd->flags & SD_OVERLAP))
- free_sched_groups(sd->groups, 0);
- kfree(*per_cpu_ptr(sdd->sd, j));
- kfree(*per_cpu_ptr(sdd->sg, j));
- kfree(*per_cpu_ptr(sdd->sgp, j));
+ struct sched_domain *sd;
+
+ if (sdd->sd) {
+ sd = *per_cpu_ptr(sdd->sd, j);
+ if (sd && (sd->flags & SD_OVERLAP))
+ free_sched_groups(sd->groups, 0);
+ kfree(*per_cpu_ptr(sdd->sd, j));
+ }
+
+ if (sdd->sg)
+ kfree(*per_cpu_ptr(sdd->sg, j));
+ if (sdd->sgp)
+ kfree(*per_cpu_ptr(sdd->sgp, j));
}
free_percpu(sdd->sd);
+ sdd->sd = NULL;
free_percpu(sdd->sg);
+ sdd->sg = NULL;
free_percpu(sdd->sgp);
+ sdd->sgp = NULL;
}
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0d97ebd..e955364 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -784,7 +784,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
#ifdef CONFIG_SMP
if (entity_is_task(se))
- list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
+ list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
#endif
cfs_rq->nr_running++;
}
@@ -3215,6 +3215,8 @@ static int move_one_task(struct lb_env *env)
static unsigned long task_h_load(struct task_struct *p);
+static const unsigned int sched_nr_migrate_break = 32;
+
/*
* move_tasks tries to move up to load_move weighted load from busiest to
* this_rq, as part of a balancing operation within domain "sd".
@@ -3242,7 +3244,7 @@ static int move_tasks(struct lb_env *env)
/* take a breather every nr_migrate tasks */
if (env->loop > env->loop_break) {
- env->loop_break += sysctl_sched_nr_migrate;
+ env->loop_break += sched_nr_migrate_break;
env->flags |= LBF_NEED_BREAK;
break;
}
@@ -3252,7 +3254,7 @@ static int move_tasks(struct lb_env *env)
load = task_h_load(p);
- if (load < 16 && !env->sd->nr_balance_failed)
+ if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
goto next;
if ((load / 2) > env->load_move)
@@ -4407,7 +4409,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.dst_cpu = this_cpu,
.dst_rq = this_rq,
.idle = idle,
- .loop_break = sysctl_sched_nr_migrate,
+ .loop_break = sched_nr_migrate_break,
};
cpumask_copy(cpus, cpu_active_mask);
@@ -4445,10 +4447,10 @@ redo:
* correctly treated as an imbalance.
*/
env.flags |= LBF_ALL_PINNED;
- env.load_move = imbalance;
- env.src_cpu = busiest->cpu;
- env.src_rq = busiest;
- env.loop_max = busiest->nr_running;
+ env.load_move = imbalance;
+ env.src_cpu = busiest->cpu;
+ env.src_rq = busiest;
+ env.loop_max = min_t(unsigned long, sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
local_irq_save(flags);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index e61fd73..de00a48 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -68,3 +68,4 @@ SCHED_FEAT(TTWU_QUEUE, true)
SCHED_FEAT(FORCE_SD_OVERLAP, false)
SCHED_FEAT(RT_RUNTIME_SHARE, true)
+SCHED_FEAT(LB_MIN, false)
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2012-03-31 17:07 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2012-03-31 17:07 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
HEAD: e3831edd59edf57ca11fc289f08961b20baf5146 sched: Fix incorrect usage of for_each_cpu_mask() in select_fallback_rq()
Thanks,
Ingo
------------------>
Catalin Marinas (1):
sched/arch: Introduce the finish_arch_post_lock_switch() scheduler callback
Srivatsa S. Bhat (1):
sched: Fix incorrect usage of for_each_cpu_mask() in select_fallback_rq()
Stephen Boyd (1):
sched: Fix __schedule_bug() output when called from an interrupt
kernel/sched/core.c | 13 ++++---------
kernel/sched/sched.h | 3 +++
2 files changed, 7 insertions(+), 9 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9c1629c..8773176 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1268,7 +1268,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
int dest_cpu;
/* Look for allowed, online CPU in same node. */
- for_each_cpu_mask(dest_cpu, *nodemask) {
+ for_each_cpu(dest_cpu, nodemask) {
if (!cpu_online(dest_cpu))
continue;
if (!cpu_active(dest_cpu))
@@ -1279,7 +1279,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
for (;;) {
/* Any allowed, online CPU? */
- for_each_cpu_mask(dest_cpu, *tsk_cpus_allowed(p)) {
+ for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
if (!cpu_online(dest_cpu))
continue;
if (!cpu_active(dest_cpu))
@@ -1962,6 +1962,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
local_irq_enable();
#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
finish_lock_switch(rq, prev);
+ finish_arch_post_lock_switch();
fire_sched_in_preempt_notifiers(current);
if (mm)
@@ -3099,8 +3100,6 @@ EXPORT_SYMBOL(sub_preempt_count);
*/
static noinline void __schedule_bug(struct task_struct *prev)
{
- struct pt_regs *regs = get_irq_regs();
-
if (oops_in_progress)
return;
@@ -3111,11 +3110,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
print_modules();
if (irqs_disabled())
print_irqtrace_events(prev);
-
- if (regs)
- show_regs(regs);
- else
- dump_stack();
+ dump_stack();
}
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 753bdd5..2f7a723 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -681,6 +681,9 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
#ifndef finish_arch_switch
# define finish_arch_switch(prev) do { } while (0)
#endif
+#ifndef finish_arch_post_lock_switch
+# define finish_arch_post_lock_switch() do { } while (0)
+#endif
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2012-03-29 10:50 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2012-03-29 10:50 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
HEAD: 160594e99dbbb0a5600ad922c630952c7c1c14bf cpusets: Remove an unused variable
Thanks,
Ingo
------------------>
Dan Carpenter (1):
cpusets: Remove an unused variable
Ingo Molnar (1):
MAINTAINERS: Update email address for SCHEDULER and PERF EVENTS
Michael J Wang (1):
sched/rt: Improve pick_next_highest_task_rt()
Peter Zijlstra (3):
sched: Fix compiler warning about declared inline after use
sched/x86/smp: Do not enable IRQs over calibrate_delay()
sched: Fix select_fallback_rq() vs cpu_active/cpu_online
MAINTAINERS | 4 +-
arch/x86/kernel/smpboot.c | 5 ---
include/linux/cpuset.h | 6 +---
kernel/cpuset.c | 21 +++------------
kernel/sched/core.c | 62 +++++++++++++++++++++++++++++++++-----------
kernel/sched/fair.c | 16 ++++++------
kernel/sched/rt.c | 2 +-
7 files changed, 63 insertions(+), 53 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index 3321d75..1ad6a06 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5112,7 +5112,7 @@ F: kernel/delayacct.c
PERFORMANCE EVENTS SUBSYSTEM
M: Peter Zijlstra <a.p.zijlstra@chello.nl>
M: Paul Mackerras <paulus@samba.org>
-M: Ingo Molnar <mingo@elte.hu>
+M: Ingo Molnar <mingo@redhat.com>
M: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git perf/core
S: Supported
@@ -5736,7 +5736,7 @@ S: Maintained
F: drivers/watchdog/sc1200wdt.c
SCHEDULER
-M: Ingo Molnar <mingo@elte.hu>
+M: Ingo Molnar <mingo@redhat.com>
M: Peter Zijlstra <peterz@infradead.org>
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core
S: Maintained
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 58f7816..89571a0 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -219,14 +219,9 @@ static void __cpuinit smp_callin(void)
* Update loops_per_jiffy in cpu_data. Previous call to
* smp_store_cpu_info() stored a value that is close but not as
* accurate as the value just calculated.
- *
- * Need to enable IRQs because it can take longer and then
- * the NMI watchdog might kill us.
*/
- local_irq_enable();
calibrate_delay();
cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy;
- local_irq_disable();
pr_debug("Stack at about %p\n", &cpuid);
/*
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index e9eaec5..e0ffaf0 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -22,7 +22,7 @@ extern int cpuset_init(void);
extern void cpuset_init_smp(void);
extern void cpuset_update_active_cpus(void);
extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
-extern int cpuset_cpus_allowed_fallback(struct task_struct *p);
+extern void cpuset_cpus_allowed_fallback(struct task_struct *p);
extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
#define cpuset_current_mems_allowed (current->mems_allowed)
void cpuset_init_current_mems_allowed(void);
@@ -144,10 +144,8 @@ static inline void cpuset_cpus_allowed(struct task_struct *p,
cpumask_copy(mask, cpu_possible_mask);
}
-static inline int cpuset_cpus_allowed_fallback(struct task_struct *p)
+static inline void cpuset_cpus_allowed_fallback(struct task_struct *p)
{
- do_set_cpus_allowed(p, cpu_possible_mask);
- return cpumask_any(cpu_active_mask);
}
static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index a09ac2b..4ef4d7e 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2195,10 +2195,9 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
mutex_unlock(&callback_mutex);
}
-int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
+void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
const struct cpuset *cs;
- int cpu;
rcu_read_lock();
cs = task_cs(tsk);
@@ -2219,22 +2218,10 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
* changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
* set any mask even if it is not right from task_cs() pov,
* the pending set_cpus_allowed_ptr() will fix things.
+ *
+ * select_fallback_rq() will fix things ups and set cpu_possible_mask
+ * if required.
*/
-
- cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
- if (cpu >= nr_cpu_ids) {
- /*
- * Either tsk->cpus_allowed is wrong (see above) or it
- * is actually empty. The latter case is only possible
- * if we are racing with remove_tasks_in_empty_cpuset().
- * Like above we can temporary set any mask and rely on
- * set_cpus_allowed_ptr() as synchronization point.
- */
- do_set_cpus_allowed(tsk, cpu_possible_mask);
- cpu = cpumask_any(cpu_active_mask);
- }
-
- return cpu;
}
void cpuset_init_current_mems_allowed(void)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e3ccc13..9c1629c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1263,29 +1263,59 @@ EXPORT_SYMBOL_GPL(kick_process);
*/
static int select_fallback_rq(int cpu, struct task_struct *p)
{
- int dest_cpu;
const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
+ enum { cpuset, possible, fail } state = cpuset;
+ int dest_cpu;
/* Look for allowed, online CPU in same node. */
- for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
+ for_each_cpu_mask(dest_cpu, *nodemask) {
+ if (!cpu_online(dest_cpu))
+ continue;
+ if (!cpu_active(dest_cpu))
+ continue;
if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
return dest_cpu;
+ }
- /* Any allowed, online CPU? */
- dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);
- if (dest_cpu < nr_cpu_ids)
- return dest_cpu;
+ for (;;) {
+ /* Any allowed, online CPU? */
+ for_each_cpu_mask(dest_cpu, *tsk_cpus_allowed(p)) {
+ if (!cpu_online(dest_cpu))
+ continue;
+ if (!cpu_active(dest_cpu))
+ continue;
+ goto out;
+ }
- /* No more Mr. Nice Guy. */
- dest_cpu = cpuset_cpus_allowed_fallback(p);
- /*
- * Don't tell them about moving exiting tasks or
- * kernel threads (both mm NULL), since they never
- * leave kernel.
- */
- if (p->mm && printk_ratelimit()) {
- printk_sched("process %d (%s) no longer affine to cpu%d\n",
- task_pid_nr(p), p->comm, cpu);
+ switch (state) {
+ case cpuset:
+ /* No more Mr. Nice Guy. */
+ cpuset_cpus_allowed_fallback(p);
+ state = possible;
+ break;
+
+ case possible:
+ do_set_cpus_allowed(p, cpu_possible_mask);
+ state = fail;
+ break;
+
+ case fail:
+ BUG();
+ break;
+ }
+ }
+
+out:
+ if (state != cpuset) {
+ /*
+ * Don't tell them about moving exiting tasks or
+ * kernel threads (both mm NULL), since they never
+ * leave kernel.
+ */
+ if (p->mm && printk_ratelimit()) {
+ printk_sched("process %d (%s) no longer affine to cpu%d\n",
+ task_pid_nr(p), p->comm, cpu);
+ }
}
return dest_cpu;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 11f3979..258f430 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -416,8 +416,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
#endif /* CONFIG_FAIR_GROUP_SCHED */
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
- unsigned long delta_exec);
+static __always_inline
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
/**************************************************************
* Scheduling class tree data structure manipulation methods:
@@ -1162,7 +1162,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
__clear_buddies_skip(se);
}
-static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
@@ -1546,8 +1546,8 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
resched_task(rq_of(cfs_rq)->curr);
}
-static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
- unsigned long delta_exec)
+static __always_inline
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
{
if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
return;
@@ -2073,11 +2073,11 @@ void unthrottle_offline_cfs_rqs(struct rq *rq)
}
#else /* CONFIG_CFS_BANDWIDTH */
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
- unsigned long delta_exec) {}
+static __always_inline
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {}
static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
-static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index b60dad7..44af55e 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1428,7 +1428,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
next_idx:
if (idx >= MAX_RT_PRIO)
continue;
- if (next && next->prio < idx)
+ if (next && next->prio <= idx)
continue;
list_for_each_entry(rt_se, array->queue + idx, run_list) {
struct task_struct *p;
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2012-02-02 10:07 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2012-02-02 10:07 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
HEAD: cb297a3e433dbdcf7ad81e0564e7b804c941ff0d sched/rt: Fix task stack corruption under __ARCH_WANT_INTERRUPTS_ON_CTXSW
Thanks,
Ingo
------------------>
Chanho Min (1):
sched/rt: Fix task stack corruption under __ARCH_WANT_INTERRUPTS_ON_CTXSW
Christian Borntraeger (1):
sched/s390: Fix compile error in sched/core.c
Peter Zijlstra (1):
sched: Fix rq->nr_uninterruptible update race
Suresh Siddha (1):
sched/nohz: Fix nohz cpu idle load balancing state with cpu hotplug
Yasunori Goto (1):
sched: Fix ancient race in do_exit()
kernel/exit.c | 16 ++++++++++++++++
kernel/sched/core.c | 19 +++++++------------
kernel/sched/fair.c | 34 +++++++++++++++++++++++++++++-----
kernel/sched/rt.c | 5 +++++
4 files changed, 57 insertions(+), 17 deletions(-)
diff --git a/kernel/exit.c b/kernel/exit.c
index 294b170..4b4042f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1038,6 +1038,22 @@ void do_exit(long code)
if (tsk->nr_dirtied)
__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
exit_rcu();
+
+ /*
+ * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
+ * when the following two conditions become true.
+ * - There is race condition of mmap_sem (It is acquired by
+ * exit_mm()), and
+ * - SMI occurs before setting TASK_RUNINNG.
+ * (or hypervisor of virtual machine switches to other guest)
+ * As a result, we may become TASK_RUNNING after becoming TASK_DEAD
+ *
+ * To avoid it, we have to wait for releasing tsk->pi_lock which
+ * is held by try_to_wake_up()
+ */
+ smp_mb();
+ raw_spin_unlock_wait(&tsk->pi_lock);
+
/* causes final put_task_struct in finish_task_switch(). */
tsk->state = TASK_DEAD;
tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index df00cb0..5255c9d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -74,6 +74,7 @@
#include <asm/tlb.h>
#include <asm/irq_regs.h>
+#include <asm/mutex.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif
@@ -723,9 +724,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
p->sched_class->dequeue_task(rq, p, flags);
}
-/*
- * activate_task - move a task to the runqueue.
- */
void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
if (task_contributes_to_load(p))
@@ -734,9 +732,6 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
enqueue_task(rq, p, flags);
}
-/*
- * deactivate_task - remove a task from the runqueue.
- */
void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
if (task_contributes_to_load(p))
@@ -4134,7 +4129,7 @@ recheck:
on_rq = p->on_rq;
running = task_current(rq, p);
if (on_rq)
- deactivate_task(rq, p, 0);
+ dequeue_task(rq, p, 0);
if (running)
p->sched_class->put_prev_task(rq, p);
@@ -4147,7 +4142,7 @@ recheck:
if (running)
p->sched_class->set_curr_task(rq);
if (on_rq)
- activate_task(rq, p, 0);
+ enqueue_task(rq, p, 0);
check_class_changed(rq, p, prev_class, oldprio);
task_rq_unlock(rq, p, &flags);
@@ -4998,9 +4993,9 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
* placed properly.
*/
if (p->on_rq) {
- deactivate_task(rq_src, p, 0);
+ dequeue_task(rq_src, p, 0);
set_task_cpu(p, dest_cpu);
- activate_task(rq_dest, p, 0);
+ enqueue_task(rq_dest, p, 0);
check_preempt_curr(rq_dest, p, 0);
}
done:
@@ -7032,10 +7027,10 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
on_rq = p->on_rq;
if (on_rq)
- deactivate_task(rq, p, 0);
+ dequeue_task(rq, p, 0);
__setscheduler(rq, p, SCHED_NORMAL, 0);
if (on_rq) {
- activate_task(rq, p, 0);
+ enqueue_task(rq, p, 0);
resched_task(rq->curr);
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 84adb2d..7c6414f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4866,6 +4866,15 @@ static void nohz_balancer_kick(int cpu)
return;
}
+static inline void clear_nohz_tick_stopped(int cpu)
+{
+ if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
+ cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
+ atomic_dec(&nohz.nr_cpus);
+ clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
+ }
+}
+
static inline void set_cpu_sd_state_busy(void)
{
struct sched_domain *sd;
@@ -4904,6 +4913,12 @@ void select_nohz_load_balancer(int stop_tick)
{
int cpu = smp_processor_id();
+ /*
+ * If this cpu is going down, then nothing needs to be done.
+ */
+ if (!cpu_active(cpu))
+ return;
+
if (stop_tick) {
if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
return;
@@ -4914,6 +4929,18 @@ void select_nohz_load_balancer(int stop_tick)
}
return;
}
+
+static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DYING:
+ clear_nohz_tick_stopped(smp_processor_id());
+ return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
+ }
+}
#endif
static DEFINE_SPINLOCK(balancing);
@@ -5070,11 +5097,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
* busy tick after returning from idle, we will update the busy stats.
*/
set_cpu_sd_state_busy();
- if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
- clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
- cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
- atomic_dec(&nohz.nr_cpus);
- }
+ clear_nohz_tick_stopped(cpu);
/*
* None are in tickless mode and hence no need for NOHZ idle load
@@ -5590,6 +5613,7 @@ __init void init_sched_fair_class(void)
#ifdef CONFIG_NO_HZ
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
+ cpu_notifier(sched_ilb_notifier, 0);
#endif
#endif /* SMP */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 3640ebb..f42ae7f 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1587,6 +1587,11 @@ static int push_rt_task(struct rq *rq)
if (!next_task)
return 0;
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ if (unlikely(task_running(rq, next_task)))
+ return 0;
+#endif
+
retry:
if (unlikely(next_task == rq->curr)) {
WARN_ON(1);
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2012-01-12 6:15 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2012-01-12 6:15 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
HEAD: bced76aeaca03b45e3b4bdb868cada328e497847 sched: Fix lockup by limiting load-balance retries on lock-break
Thanks,
Ingo
------------------>
Fabio Estevam (1):
sched: Fix CONFIG_CGROUP_SCHED dependency
Hiroshi Shimamoto (1):
sched: Remove empty #ifdefs
Peter Zijlstra (1):
sched: Fix lockup by limiting load-balance retries on lock-break
init/Kconfig | 1 -
kernel/sched/core.c | 7 -------
kernel/sched/fair.c | 10 +++++++---
3 files changed, 7 insertions(+), 11 deletions(-)
diff --git a/init/Kconfig b/init/Kconfig
index 82b6a4c..a34cd17 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -702,7 +702,6 @@ config CGROUP_PERF
menuconfig CGROUP_SCHED
bool "Group CPU scheduler"
- depends on EXPERIMENTAL
default n
help
This feature lets CPU scheduler recognize task groups and control CPU
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4dbfd04..457c881 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7136,10 +7136,6 @@ void set_curr_task(int cpu, struct task_struct *p)
#endif
-#ifdef CONFIG_RT_GROUP_SCHED
-#else /* !CONFIG_RT_GROUP_SCHED */
-#endif /* CONFIG_RT_GROUP_SCHED */
-
#ifdef CONFIG_CGROUP_SCHED
/* task_group_lock serializes the addition/removal of task groups */
static DEFINE_SPINLOCK(task_group_lock);
@@ -7248,9 +7244,6 @@ void sched_move_task(struct task_struct *tsk)
}
#endif /* CONFIG_CGROUP_SCHED */
-#ifdef CONFIG_FAIR_GROUP_SCHED
-#endif
-
#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
static unsigned long to_ratio(u64 period, u64 runtime)
{
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8e42de9..84adb2d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3130,8 +3130,10 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
}
#define LBF_ALL_PINNED 0x01
-#define LBF_NEED_BREAK 0x02
-#define LBF_ABORT 0x04
+#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */
+#define LBF_HAD_BREAK 0x04
+#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */
+#define LBF_ABORT 0x10
/*
* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
@@ -4508,7 +4510,9 @@ redo:
goto out_balanced;
if (lb_flags & LBF_NEED_BREAK) {
- lb_flags &= ~LBF_NEED_BREAK;
+ lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK;
+ if (lb_flags & LBF_ABORT)
+ goto out_balanced;
goto redo;
}
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2011-12-17 20:56 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2011-12-17 20:56 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
Thanks,
Ingo
------------------>
Peter Zijlstra (2):
MAINTAINERS: Update tip.git related git trees
sched: Fix select_idle_sibling() regression in selecting an idle SMT sibling
MAINTAINERS | 13 +++++++++----
kernel/sched_fair.c | 14 ++++++++------
2 files changed, 17 insertions(+), 10 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index 4475602..9706a21 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3118,6 +3118,7 @@ F: include/linux/hid*
HIGH-RESOLUTION TIMERS, CLOCKEVENTS, DYNTICKS
M: Thomas Gleixner <tglx@linutronix.de>
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core
S: Maintained
F: Documentation/timers/
F: kernel/hrtimer.c
@@ -3627,7 +3628,7 @@ F: net/irda/
IRQ SUBSYSTEM
M: Thomas Gleixner <tglx@linutronix.de>
S: Maintained
-T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git irq/core
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/core
F: kernel/irq/
ISAPNP
@@ -4115,7 +4116,7 @@ F: drivers/hwmon/lm90.c
LOCKDEP AND LOCKSTAT
M: Peter Zijlstra <peterz@infradead.org>
M: Ingo Molnar <mingo@redhat.com>
-T: git git://git.kernel.org/pub/scm/linux/kernel/git/peterz/linux-2.6-lockdep.git
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git core/locking
S: Maintained
F: Documentation/lockdep*.txt
F: Documentation/lockstat.txt
@@ -5102,6 +5103,7 @@ M: Peter Zijlstra <a.p.zijlstra@chello.nl>
M: Paul Mackerras <paulus@samba.org>
M: Ingo Molnar <mingo@elte.hu>
M: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git perf/core
S: Supported
F: kernel/events/*
F: include/linux/perf_event.h
@@ -5181,6 +5183,7 @@ F: drivers/scsi/pm8001/
POSIX CLOCKS and TIMERS
M: Thomas Gleixner <tglx@linutronix.de>
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core
S: Supported
F: fs/timerfd.c
F: include/linux/timer*
@@ -5696,6 +5699,7 @@ F: drivers/dma/dw_dmac.c
TIMEKEEPING, NTP
M: John Stultz <johnstul@us.ibm.com>
M: Thomas Gleixner <tglx@linutronix.de>
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core
S: Supported
F: include/linux/clocksource.h
F: include/linux/time.h
@@ -5720,6 +5724,7 @@ F: drivers/watchdog/sc1200wdt.c
SCHEDULER
M: Ingo Molnar <mingo@elte.hu>
M: Peter Zijlstra <peterz@infradead.org>
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core
S: Maintained
F: kernel/sched*
F: include/linux/sched.h
@@ -6647,7 +6652,7 @@ TRACING
M: Steven Rostedt <rostedt@goodmis.org>
M: Frederic Weisbecker <fweisbec@gmail.com>
M: Ingo Molnar <mingo@redhat.com>
-T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git perf/core
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git perf/core
S: Maintained
F: Documentation/trace/ftrace.txt
F: arch/*/*/*/ftrace.h
@@ -7397,7 +7402,7 @@ M: Thomas Gleixner <tglx@linutronix.de>
M: Ingo Molnar <mingo@redhat.com>
M: "H. Peter Anvin" <hpa@zytor.com>
M: x86@kernel.org
-T: git git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-x86.git
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/core
S: Maintained
F: Documentation/x86/
F: arch/x86/
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a78ed27..8a39fa3 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2352,13 +2352,11 @@ again:
if (!smt && (sd->flags & SD_SHARE_CPUPOWER))
continue;
- if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) {
- if (!smt) {
- smt = 1;
- goto again;
- }
+ if (smt && !(sd->flags & SD_SHARE_CPUPOWER))
+ break;
+
+ if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
break;
- }
sg = sd->groups;
do {
@@ -2378,6 +2376,10 @@ next:
sg = sg->next;
} while (sg != sd->groups);
}
+ if (!smt) {
+ smt = 1;
+ goto again;
+ }
done:
rcu_read_unlock();
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2011-12-05 18:49 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2011-12-05 18:49 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus
Thanks,
Ingo
------------------>
Carsten Emde (1):
sched: Set the command name of the idle tasks in SMP kernels
Hui Kang (1):
sched_fair: Fix a typo in the comment describing update_sd_lb_stats
J. Bruce Fields (1):
sched: Document wait_for_completion_*() return values
Paul Turner (1):
sched: Fix buglet in return_cfs_rq_runtime()
Peter Zijlstra (3):
sched: Add a comment to effective_load() since it's a pain
sched, rt: Provide means of disabling cross-cpu bandwidth sharing
sched: Avoid SMT siblings in select_idle_sibling() if possible
Salman Qazi (1):
sched, x86: Avoid unnecessary overflow in sched_clock
arch/x86/include/asm/timer.h | 23 ++++++-
include/linux/init_task.h | 4 +-
kernel/sched.c | 17 +++++
kernel/sched_fair.c | 159 +++++++++++++++++++++++++++++++++---------
kernel/sched_features.h | 1 +
kernel/sched_rt.c | 3 +
6 files changed, 171 insertions(+), 36 deletions(-)
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index fa7b917..431793e 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -32,6 +32,22 @@ extern int no_timer_check;
* (mathieu.desnoyers@polymtl.ca)
*
* -johnstul@us.ibm.com "math is hard, lets go shopping!"
+ *
+ * In:
+ *
+ * ns = cycles * cyc2ns_scale / SC
+ *
+ * Although we may still have enough bits to store the value of ns,
+ * in some cases, we may not have enough bits to store cycles * cyc2ns_scale,
+ * leading to an incorrect result.
+ *
+ * To avoid this, we can decompose 'cycles' into quotient and remainder
+ * of division by SC. Then,
+ *
+ * ns = (quot * SC + rem) * cyc2ns_scale / SC
+ * = quot * cyc2ns_scale + (rem * cyc2ns_scale) / SC
+ *
+ * - sqazi@google.com
*/
DECLARE_PER_CPU(unsigned long, cyc2ns);
@@ -41,9 +57,14 @@ DECLARE_PER_CPU(unsigned long long, cyc2ns_offset);
static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
{
+ unsigned long long quot;
+ unsigned long long rem;
int cpu = smp_processor_id();
unsigned long long ns = per_cpu(cyc2ns_offset, cpu);
- ns += cyc * per_cpu(cyc2ns, cpu) >> CYC2NS_SCALE_FACTOR;
+ quot = (cyc >> CYC2NS_SCALE_FACTOR);
+ rem = cyc & ((1ULL << CYC2NS_SCALE_FACTOR) - 1);
+ ns += quot * per_cpu(cyc2ns, cpu) +
+ ((rem * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR);
return ns;
}
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 08ffab0..b6e5b8b 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -126,6 +126,8 @@ extern struct cred init_cred;
# define INIT_PERF_EVENTS(tsk)
#endif
+#define INIT_TASK_COMM "swapper"
+
/*
* INIT_TASK is used to set up the first task table, touch at
* your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -162,7 +164,7 @@ extern struct cred init_cred;
.group_leader = &tsk, \
RCU_INIT_POINTER(.real_cred, &init_cred), \
RCU_INIT_POINTER(.cred, &init_cred), \
- .comm = "swapper", \
+ .comm = INIT_TASK_COMM, \
.thread = INIT_THREAD, \
.fs = &init_fs, \
.files = &init_files, \
diff --git a/kernel/sched.c b/kernel/sched.c
index 0e9344a..d6b149c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -71,6 +71,7 @@
#include <linux/ctype.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
+#include <linux/init_task.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
@@ -4810,6 +4811,9 @@ EXPORT_SYMBOL(wait_for_completion);
* This waits for either a completion of a specific task to be signaled or for a
* specified timeout to expire. The timeout is in jiffies. It is not
* interruptible.
+ *
+ * The return value is 0 if timed out, and positive (at least 1, or number of
+ * jiffies left till timeout) if completed.
*/
unsigned long __sched
wait_for_completion_timeout(struct completion *x, unsigned long timeout)
@@ -4824,6 +4828,8 @@ EXPORT_SYMBOL(wait_for_completion_timeout);
*
* This waits for completion of a specific task to be signaled. It is
* interruptible.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if completed.
*/
int __sched wait_for_completion_interruptible(struct completion *x)
{
@@ -4841,6 +4847,9 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
*
* This waits for either a completion of a specific task to be signaled or for a
* specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
+ * positive (at least 1, or number of jiffies left till timeout) if completed.
*/
long __sched
wait_for_completion_interruptible_timeout(struct completion *x,
@@ -4856,6 +4865,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
*
* This waits to be signaled for completion of a specific task. It can be
* interrupted by a kill signal.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if completed.
*/
int __sched wait_for_completion_killable(struct completion *x)
{
@@ -4874,6 +4885,9 @@ EXPORT_SYMBOL(wait_for_completion_killable);
* This waits for either a completion of a specific task to be
* signaled or for a specified timeout to expire. It can be
* interrupted by a kill signal. The timeout is in jiffies.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
+ * positive (at least 1, or number of jiffies left till timeout) if completed.
*/
long __sched
wait_for_completion_killable_timeout(struct completion *x,
@@ -6099,6 +6113,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
*/
idle->sched_class = &idle_sched_class;
ftrace_graph_init_idle_task(idle, cpu);
+#if defined(CONFIG_SMP)
+ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
+#endif
}
/*
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5c9e679..a78ed27 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -772,19 +772,32 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
list_del_leaf_cfs_rq(cfs_rq);
}
+static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
+{
+ long tg_weight;
+
+ /*
+ * Use this CPU's actual weight instead of the last load_contribution
+ * to gain a more accurate current total weight. See
+ * update_cfs_rq_load_contribution().
+ */
+ tg_weight = atomic_read(&tg->load_weight);
+ tg_weight -= cfs_rq->load_contribution;
+ tg_weight += cfs_rq->load.weight;
+
+ return tg_weight;
+}
+
static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
- long load_weight, load, shares;
+ long tg_weight, load, shares;
+ tg_weight = calc_tg_weight(tg, cfs_rq);
load = cfs_rq->load.weight;
- load_weight = atomic_read(&tg->load_weight);
- load_weight += load;
- load_weight -= cfs_rq->load_contribution;
-
shares = (tg->shares * load);
- if (load_weight)
- shares /= load_weight;
+ if (tg_weight)
+ shares /= tg_weight;
if (shares < MIN_SHARES)
shares = MIN_SHARES;
@@ -1743,7 +1756,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
- if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running)
+ if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
return;
__return_cfs_rq_runtime(cfs_rq);
@@ -2036,36 +2049,100 @@ static void task_waking_fair(struct task_struct *p)
* Adding load to a group doesn't make a group heavier, but can cause movement
* of group shares between cpus. Assuming the shares were perfectly aligned one
* can calculate the shift in shares.
+ *
+ * Calculate the effective load difference if @wl is added (subtracted) to @tg
+ * on this @cpu and results in a total addition (subtraction) of @wg to the
+ * total group weight.
+ *
+ * Given a runqueue weight distribution (rw_i) we can compute a shares
+ * distribution (s_i) using:
+ *
+ * s_i = rw_i / \Sum rw_j (1)
+ *
+ * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
+ * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
+ * shares distribution (s_i):
+ *
+ * rw_i = { 2, 4, 1, 0 }
+ * s_i = { 2/7, 4/7, 1/7, 0 }
+ *
+ * As per wake_affine() we're interested in the load of two CPUs (the CPU the
+ * task used to run on and the CPU the waker is running on), we need to
+ * compute the effect of waking a task on either CPU and, in case of a sync
+ * wakeup, compute the effect of the current task going to sleep.
+ *
+ * So for a change of @wl to the local @cpu with an overall group weight change
+ * of @wl we can compute the new shares distribution (s'_i) using:
+ *
+ * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
+ *
+ * Suppose we're interested in CPUs 0 and 1, and want to compute the load
+ * differences in waking a task to CPU 0. The additional task changes the
+ * weight and shares distributions like:
+ *
+ * rw'_i = { 3, 4, 1, 0 }
+ * s'_i = { 3/8, 4/8, 1/8, 0 }
+ *
+ * We can then compute the difference in effective weight by using:
+ *
+ * dw_i = S * (s'_i - s_i) (3)
+ *
+ * Where 'S' is the group weight as seen by its parent.
+ *
+ * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
+ * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
+ * 4/7) times the weight of the group.
*/
static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
struct sched_entity *se = tg->se[cpu];
- if (!tg->parent)
+ if (!tg->parent) /* the trivial, non-cgroup case */
return wl;
for_each_sched_entity(se) {
- long lw, w;
+ long w, W;
tg = se->my_q->tg;
- w = se->my_q->load.weight;
- /* use this cpu's instantaneous contribution */
- lw = atomic_read(&tg->load_weight);
- lw -= se->my_q->load_contribution;
- lw += w + wg;
+ /*
+ * W = @wg + \Sum rw_j
+ */
+ W = wg + calc_tg_weight(tg, se->my_q);
- wl += w;
+ /*
+ * w = rw_i + @wl
+ */
+ w = se->my_q->load.weight + wl;
- if (lw > 0 && wl < lw)
- wl = (wl * tg->shares) / lw;
+ /*
+ * wl = S * s'_i; see (2)
+ */
+ if (W > 0 && w < W)
+ wl = (w * tg->shares) / W;
else
wl = tg->shares;
- /* zero point is MIN_SHARES */
+ /*
+ * Per the above, wl is the new se->load.weight value; since
+ * those are clipped to [MIN_SHARES, ...) do so now. See
+ * calc_cfs_shares().
+ */
if (wl < MIN_SHARES)
wl = MIN_SHARES;
+
+ /*
+ * wl = dw_i = S * (s'_i - s_i); see (3)
+ */
wl -= se->load.weight;
+
+ /*
+ * Recursively apply this logic to all parent groups to compute
+ * the final effective load change on the root group. Since
+ * only the @tg group gets extra weight, all parent groups can
+ * only redistribute existing shares. @wl is the shift in shares
+ * resulting from this level per the above.
+ */
wg = 0;
}
@@ -2249,7 +2326,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
int cpu = smp_processor_id();
int prev_cpu = task_cpu(p);
struct sched_domain *sd;
- int i;
+ struct sched_group *sg;
+ int i, smt = 0;
/*
* If the task is going to be woken-up on this cpu and if it is
@@ -2269,25 +2347,38 @@ static int select_idle_sibling(struct task_struct *p, int target)
* Otherwise, iterate the domains and find an elegible idle cpu.
*/
rcu_read_lock();
+again:
for_each_domain(target, sd) {
- if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
- break;
+ if (!smt && (sd->flags & SD_SHARE_CPUPOWER))
+ continue;
- for_each_cpu_and(i, sched_domain_span(sd), tsk_cpus_allowed(p)) {
- if (idle_cpu(i)) {
- target = i;
- break;
+ if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) {
+ if (!smt) {
+ smt = 1;
+ goto again;
}
+ break;
}
- /*
- * Lets stop looking for an idle sibling when we reached
- * the domain that spans the current cpu and prev_cpu.
- */
- if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
- cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
- break;
+ sg = sd->groups;
+ do {
+ if (!cpumask_intersects(sched_group_cpus(sg),
+ tsk_cpus_allowed(p)))
+ goto next;
+
+ for_each_cpu(i, sched_group_cpus(sg)) {
+ if (!idle_cpu(i))
+ goto next;
+ }
+
+ target = cpumask_first_and(sched_group_cpus(sg),
+ tsk_cpus_allowed(p));
+ goto done;
+next:
+ sg = sg->next;
+ } while (sg != sd->groups);
}
+done:
rcu_read_unlock();
return target;
@@ -3511,7 +3602,7 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
}
/**
- * update_sd_lb_stats - Update sched_group's statistics for load balancing.
+ * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @sd: sched_domain whose statistics are to be updated.
* @this_cpu: Cpu for which load balance is currently performed.
* @idle: Idle status of this_cpu
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index efa0a7b..8480224 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -67,3 +67,4 @@ SCHED_FEAT(NONTASK_POWER, 1)
SCHED_FEAT(TTWU_QUEUE, 1)
SCHED_FEAT(FORCE_SD_OVERLAP, 0)
+SCHED_FEAT(RT_RUNTIME_SHARE, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 056cbd2..583a136 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -560,6 +560,9 @@ static int balance_runtime(struct rt_rq *rt_rq)
{
int more = 0;
+ if (!sched_feat(RT_RUNTIME_SHARE))
+ return more;
+
if (rt_rq->rt_time > rt_rq->rt_runtime) {
raw_spin_unlock(&rt_rq->rt_runtime_lock);
more = do_balance_runtime(rt_rq);
^ permalink raw reply related [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2011-09-30 18:36 Ingo Molnar
@ 2011-09-30 19:44 ` Thomas Gleixner
0 siblings, 0 replies; 384+ messages in thread
From: Thomas Gleixner @ 2011-09-30 19:44 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Linus Torvalds, linux-kernel, Peter Zijlstra, Andrew Morton
On Fri, 30 Sep 2011, Ingo Molnar wrote:
> Linus,
>
> Please pull the latest sched-urgent-for-linus git tree from:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git sched-urgent-for-linus
git://tesla.tglx.de/git/linux-2.6-tip sched-urgent-for-linus
^ permalink raw reply [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2011-09-30 18:36 Ingo Molnar
2011-09-30 19:44 ` Thomas Gleixner
0 siblings, 1 reply; 384+ messages in thread
From: Ingo Molnar @ 2011-09-30 18:36 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git sched-urgent-for-linus
Thanks,
Ingo
------------------>
Shawn Bohrer (1):
sched/rt: Migrate equal priority tasks to available CPUs
Simon Kirby (1):
sched: Fix up wchan borkage
kernel/sched.c | 2 +-
kernel/sched_rt.c | 4 ++--
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index ec5f472..d249ea8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4372,7 +4372,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
blk_schedule_flush_plug(tsk);
}
-asmlinkage void schedule(void)
+asmlinkage void __sched schedule(void)
{
struct task_struct *tsk = current;
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 97540f0..af11778 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1050,7 +1050,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
*/
if (curr && unlikely(rt_task(curr)) &&
(curr->rt.nr_cpus_allowed < 2 ||
- curr->prio < p->prio) &&
+ curr->prio <= p->prio) &&
(p->rt.nr_cpus_allowed > 1)) {
int target = find_lowest_rq(p);
@@ -1581,7 +1581,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
p->rt.nr_cpus_allowed > 1 &&
rt_task(rq->curr) &&
(rq->curr->rt.nr_cpus_allowed < 2 ||
- rq->curr->prio < p->prio))
+ rq->curr->prio <= p->prio))
push_rt_tasks(rq);
}
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2011-07-20 21:06 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2011-07-20 21:06 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git sched-urgent-for-linus
Thanks,
Ingo
------------------>
Peter Zijlstra (3):
sched: Break out cpu_power from the sched_group structure
sched: Allow for overlapping sched_domain spans
sched: Avoid creating superfluous NUMA domains on non-NUMA systems
include/linux/sched.h | 14 +++-
kernel/sched.c | 189 ++++++++++++++++++++++++++++++++++++++---------
kernel/sched_fair.c | 46 ++++++------
kernel/sched_features.h | 2 +
4 files changed, 190 insertions(+), 61 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 496770a..bde99d5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -844,6 +844,7 @@ enum cpu_idle_type {
#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */
#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
+#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */
enum powersavings_balance_level {
POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */
@@ -893,16 +894,21 @@ static inline int sd_power_saving_flags(void)
return 0;
}
-struct sched_group {
- struct sched_group *next; /* Must be a circular list */
+struct sched_group_power {
atomic_t ref;
-
/*
* CPU power of this group, SCHED_LOAD_SCALE being max power for a
* single CPU.
*/
- unsigned int cpu_power, cpu_power_orig;
+ unsigned int power, power_orig;
+};
+
+struct sched_group {
+ struct sched_group *next; /* Must be a circular list */
+ atomic_t ref;
+
unsigned int group_weight;
+ struct sched_group_power *sgp;
/*
* The CPUs this group covers.
diff --git a/kernel/sched.c b/kernel/sched.c
index 3dc716f..14168c4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6557,7 +6557,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- if (!group->cpu_power) {
+ if (!group->sgp->power) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: domain->cpu_power not "
"set\n");
@@ -6581,9 +6581,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
printk(KERN_CONT " %s", str);
- if (group->cpu_power != SCHED_POWER_SCALE) {
+ if (group->sgp->power != SCHED_POWER_SCALE) {
printk(KERN_CONT " (cpu_power = %d)",
- group->cpu_power);
+ group->sgp->power);
}
group = group->next;
@@ -6774,11 +6774,39 @@ static struct root_domain *alloc_rootdomain(void)
return rd;
}
+static void free_sched_groups(struct sched_group *sg, int free_sgp)
+{
+ struct sched_group *tmp, *first;
+
+ if (!sg)
+ return;
+
+ first = sg;
+ do {
+ tmp = sg->next;
+
+ if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
+ kfree(sg->sgp);
+
+ kfree(sg);
+ sg = tmp;
+ } while (sg != first);
+}
+
static void free_sched_domain(struct rcu_head *rcu)
{
struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
- if (atomic_dec_and_test(&sd->groups->ref))
+
+ /*
+ * If its an overlapping domain it has private groups, iterate and
+ * nuke them all.
+ */
+ if (sd->flags & SD_OVERLAP) {
+ free_sched_groups(sd->groups, 1);
+ } else if (atomic_dec_and_test(&sd->groups->ref)) {
+ kfree(sd->groups->sgp);
kfree(sd->groups);
+ }
kfree(sd);
}
@@ -6945,6 +6973,7 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
struct sd_data {
struct sched_domain **__percpu sd;
struct sched_group **__percpu sg;
+ struct sched_group_power **__percpu sgp;
};
struct s_data {
@@ -6964,15 +6993,73 @@ struct sched_domain_topology_level;
typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
+#define SDTL_OVERLAP 0x01
+
struct sched_domain_topology_level {
sched_domain_init_f init;
sched_domain_mask_f mask;
+ int flags;
struct sd_data data;
};
-/*
- * Assumes the sched_domain tree is fully constructed
- */
+static int
+build_overlap_sched_groups(struct sched_domain *sd, int cpu)
+{
+ struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
+ const struct cpumask *span = sched_domain_span(sd);
+ struct cpumask *covered = sched_domains_tmpmask;
+ struct sd_data *sdd = sd->private;
+ struct sched_domain *child;
+ int i;
+
+ cpumask_clear(covered);
+
+ for_each_cpu(i, span) {
+ struct cpumask *sg_span;
+
+ if (cpumask_test_cpu(i, covered))
+ continue;
+
+ sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(i));
+
+ if (!sg)
+ goto fail;
+
+ sg_span = sched_group_cpus(sg);
+
+ child = *per_cpu_ptr(sdd->sd, i);
+ if (child->child) {
+ child = child->child;
+ cpumask_copy(sg_span, sched_domain_span(child));
+ } else
+ cpumask_set_cpu(i, sg_span);
+
+ cpumask_or(covered, covered, sg_span);
+
+ sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
+ atomic_inc(&sg->sgp->ref);
+
+ if (cpumask_test_cpu(cpu, sg_span))
+ groups = sg;
+
+ if (!first)
+ first = sg;
+ if (last)
+ last->next = sg;
+ last = sg;
+ last->next = first;
+ }
+ sd->groups = groups;
+
+ return 0;
+
+fail:
+ free_sched_groups(first, 0);
+
+ return -ENOMEM;
+}
+
static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
{
struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
@@ -6981,24 +7068,24 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
if (child)
cpu = cpumask_first(sched_domain_span(child));
- if (sg)
+ if (sg) {
*sg = *per_cpu_ptr(sdd->sg, cpu);
+ (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
+ atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
+ }
return cpu;
}
/*
- * build_sched_groups takes the cpumask we wish to span, and a pointer
- * to a function which identifies what group(along with sched group) a CPU
- * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
- * (due to the fact that we keep track of groups covered with a struct cpumask).
- *
* build_sched_groups will build a circular linked list of the groups
* covered by the given span, and will set each group's ->cpumask correctly,
* and ->cpu_power to 0.
+ *
+ * Assumes the sched_domain tree is fully constructed
*/
-static void
-build_sched_groups(struct sched_domain *sd)
+static int
+build_sched_groups(struct sched_domain *sd, int cpu)
{
struct sched_group *first = NULL, *last = NULL;
struct sd_data *sdd = sd->private;
@@ -7006,6 +7093,12 @@ build_sched_groups(struct sched_domain *sd)
struct cpumask *covered;
int i;
+ get_group(cpu, sdd, &sd->groups);
+ atomic_inc(&sd->groups->ref);
+
+ if (cpu != cpumask_first(sched_domain_span(sd)))
+ return 0;
+
lockdep_assert_held(&sched_domains_mutex);
covered = sched_domains_tmpmask;
@@ -7020,7 +7113,7 @@ build_sched_groups(struct sched_domain *sd)
continue;
cpumask_clear(sched_group_cpus(sg));
- sg->cpu_power = 0;
+ sg->sgp->power = 0;
for_each_cpu(j, span) {
if (get_group(j, sdd, NULL) != group)
@@ -7037,6 +7130,8 @@ build_sched_groups(struct sched_domain *sd)
last = sg;
}
last->next = first;
+
+ return 0;
}
/*
@@ -7051,12 +7146,17 @@ build_sched_groups(struct sched_domain *sd)
*/
static void init_sched_groups_power(int cpu, struct sched_domain *sd)
{
- WARN_ON(!sd || !sd->groups);
+ struct sched_group *sg = sd->groups;
- if (cpu != group_first_cpu(sd->groups))
- return;
+ WARN_ON(!sd || !sg);
- sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
+ do {
+ sg->group_weight = cpumask_weight(sched_group_cpus(sg));
+ sg = sg->next;
+ } while (sg != sd->groups);
+
+ if (cpu != group_first_cpu(sg))
+ return;
update_group_power(sd, cpu);
}
@@ -7177,15 +7277,15 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
static void claim_allocations(int cpu, struct sched_domain *sd)
{
struct sd_data *sdd = sd->private;
- struct sched_group *sg = sd->groups;
WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
*per_cpu_ptr(sdd->sd, cpu) = NULL;
- if (cpu == cpumask_first(sched_group_cpus(sg))) {
- WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
+ if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
*per_cpu_ptr(sdd->sg, cpu) = NULL;
- }
+
+ if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
+ *per_cpu_ptr(sdd->sgp, cpu) = NULL;
}
#ifdef CONFIG_SCHED_SMT
@@ -7210,7 +7310,7 @@ static struct sched_domain_topology_level default_topology[] = {
#endif
{ sd_init_CPU, cpu_cpu_mask, },
#ifdef CONFIG_NUMA
- { sd_init_NODE, cpu_node_mask, },
+ { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
{ sd_init_ALLNODES, cpu_allnodes_mask, },
#endif
{ NULL, },
@@ -7234,9 +7334,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
if (!sdd->sg)
return -ENOMEM;
+ sdd->sgp = alloc_percpu(struct sched_group_power *);
+ if (!sdd->sgp)
+ return -ENOMEM;
+
for_each_cpu(j, cpu_map) {
struct sched_domain *sd;
struct sched_group *sg;
+ struct sched_group_power *sgp;
sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
@@ -7251,6 +7356,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
return -ENOMEM;
*per_cpu_ptr(sdd->sg, j) = sg;
+
+ sgp = kzalloc_node(sizeof(struct sched_group_power),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sgp)
+ return -ENOMEM;
+
+ *per_cpu_ptr(sdd->sgp, j) = sgp;
}
}
@@ -7266,11 +7378,15 @@ static void __sdt_free(const struct cpumask *cpu_map)
struct sd_data *sdd = &tl->data;
for_each_cpu(j, cpu_map) {
- kfree(*per_cpu_ptr(sdd->sd, j));
+ struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
+ if (sd && (sd->flags & SD_OVERLAP))
+ free_sched_groups(sd->groups, 0);
kfree(*per_cpu_ptr(sdd->sg, j));
+ kfree(*per_cpu_ptr(sdd->sgp, j));
}
free_percpu(sdd->sd);
free_percpu(sdd->sg);
+ free_percpu(sdd->sgp);
}
}
@@ -7316,8 +7432,13 @@ static int build_sched_domains(const struct cpumask *cpu_map,
struct sched_domain_topology_level *tl;
sd = NULL;
- for (tl = sched_domain_topology; tl->init; tl++)
+ for (tl = sched_domain_topology; tl->init; tl++) {
sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
+ if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
+ sd->flags |= SD_OVERLAP;
+ if (cpumask_equal(cpu_map, sched_domain_span(sd)))
+ break;
+ }
while (sd->child)
sd = sd->child;
@@ -7329,13 +7450,13 @@ static int build_sched_domains(const struct cpumask *cpu_map,
for_each_cpu(i, cpu_map) {
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
sd->span_weight = cpumask_weight(sched_domain_span(sd));
- get_group(i, sd->private, &sd->groups);
- atomic_inc(&sd->groups->ref);
-
- if (i != cpumask_first(sched_domain_span(sd)))
- continue;
-
- build_sched_groups(sd);
+ if (sd->flags & SD_OVERLAP) {
+ if (build_overlap_sched_groups(sd, i))
+ goto error;
+ } else {
+ if (build_sched_groups(sd, i))
+ goto error;
+ }
}
}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 433491c..c768588 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1585,7 +1585,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
}
/* Adjust by relative CPU power of the group */
- avg_load = (avg_load * SCHED_POWER_SCALE) / group->cpu_power;
+ avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
if (local_group) {
this_load = avg_load;
@@ -2631,7 +2631,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
power >>= SCHED_POWER_SHIFT;
}
- sdg->cpu_power_orig = power;
+ sdg->sgp->power_orig = power;
if (sched_feat(ARCH_POWER))
power *= arch_scale_freq_power(sd, cpu);
@@ -2647,7 +2647,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
power = 1;
cpu_rq(cpu)->cpu_power = power;
- sdg->cpu_power = power;
+ sdg->sgp->power = power;
}
static void update_group_power(struct sched_domain *sd, int cpu)
@@ -2665,11 +2665,11 @@ static void update_group_power(struct sched_domain *sd, int cpu)
group = child->groups;
do {
- power += group->cpu_power;
+ power += group->sgp->power;
group = group->next;
} while (group != child->groups);
- sdg->cpu_power = power;
+ sdg->sgp->power = power;
}
/*
@@ -2691,7 +2691,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
/*
* If ~90% of the cpu_power is still there, we're good.
*/
- if (group->cpu_power * 32 > group->cpu_power_orig * 29)
+ if (group->sgp->power * 32 > group->sgp->power_orig * 29)
return 1;
return 0;
@@ -2771,7 +2771,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
}
/* Adjust by relative CPU power of the group */
- sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->cpu_power;
+ sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
/*
* Consider the group unbalanced when the imbalance is larger
@@ -2788,7 +2788,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
sgs->group_imb = 1;
- sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power,
+ sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
SCHED_POWER_SCALE);
if (!sgs->group_capacity)
sgs->group_capacity = fix_small_capacity(sd, group);
@@ -2877,7 +2877,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
return;
sds->total_load += sgs.group_load;
- sds->total_pwr += sg->cpu_power;
+ sds->total_pwr += sg->sgp->power;
/*
* In case the child domain prefers tasks go to siblings
@@ -2962,7 +2962,7 @@ static int check_asym_packing(struct sched_domain *sd,
if (this_cpu > busiest_cpu)
return 0;
- *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
+ *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power,
SCHED_POWER_SCALE);
return 1;
}
@@ -2993,7 +2993,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
scaled_busy_load_per_task = sds->busiest_load_per_task
* SCHED_POWER_SCALE;
- scaled_busy_load_per_task /= sds->busiest->cpu_power;
+ scaled_busy_load_per_task /= sds->busiest->sgp->power;
if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
(scaled_busy_load_per_task * imbn)) {
@@ -3007,28 +3007,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
* moving them.
*/
- pwr_now += sds->busiest->cpu_power *
+ pwr_now += sds->busiest->sgp->power *
min(sds->busiest_load_per_task, sds->max_load);
- pwr_now += sds->this->cpu_power *
+ pwr_now += sds->this->sgp->power *
min(sds->this_load_per_task, sds->this_load);
pwr_now /= SCHED_POWER_SCALE;
/* Amount of load we'd subtract */
tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
- sds->busiest->cpu_power;
+ sds->busiest->sgp->power;
if (sds->max_load > tmp)
- pwr_move += sds->busiest->cpu_power *
+ pwr_move += sds->busiest->sgp->power *
min(sds->busiest_load_per_task, sds->max_load - tmp);
/* Amount of load we'd add */
- if (sds->max_load * sds->busiest->cpu_power <
+ if (sds->max_load * sds->busiest->sgp->power <
sds->busiest_load_per_task * SCHED_POWER_SCALE)
- tmp = (sds->max_load * sds->busiest->cpu_power) /
- sds->this->cpu_power;
+ tmp = (sds->max_load * sds->busiest->sgp->power) /
+ sds->this->sgp->power;
else
tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
- sds->this->cpu_power;
- pwr_move += sds->this->cpu_power *
+ sds->this->sgp->power;
+ pwr_move += sds->this->sgp->power *
min(sds->this_load_per_task, sds->this_load + tmp);
pwr_move /= SCHED_POWER_SCALE;
@@ -3074,7 +3074,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
- load_above_capacity /= sds->busiest->cpu_power;
+ load_above_capacity /= sds->busiest->sgp->power;
}
/*
@@ -3090,8 +3090,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
/* How much load to actually move to equalise the imbalance */
- *imbalance = min(max_pull * sds->busiest->cpu_power,
- (sds->avg_load - sds->this_load) * sds->this->cpu_power)
+ *imbalance = min(max_pull * sds->busiest->sgp->power,
+ (sds->avg_load - sds->this_load) * sds->this->sgp->power)
/ SCHED_POWER_SCALE;
/*
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index be40f73..1e7066d 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -70,3 +70,5 @@ SCHED_FEAT(NONIRQ_POWER, 1)
* using the scheduler IPI. Reduces rq->lock contention/bounces.
*/
SCHED_FEAT(TTWU_QUEUE, 1)
+
+SCHED_FEAT(FORCE_SD_OVERLAP, 0)
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2011-07-07 18:19 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2011-07-07 18:19 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Mike Galbraith,
Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git sched-urgent-for-linus
Thanks,
Ingo
------------------>
Mike Galbraith (1):
sched, cgroups: Fix MIN_SHARES on 64-bit boxen
Peter Zijlstra (1):
sched: Disable (revert) SCHED_LOAD_SCALE increase
include/linux/sched.h | 2 +-
kernel/sched.c | 9 +++------
2 files changed, 4 insertions(+), 7 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a837b20..496770a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -808,7 +808,7 @@ enum cpu_idle_type {
* when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
* increased costs.
*/
-#if BITS_PER_LONG > 32
+#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */
# define SCHED_LOAD_RESOLUTION 10
# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION)
# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION)
diff --git a/kernel/sched.c b/kernel/sched.c
index 3f2e502..9769c75 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -292,8 +292,8 @@ static DEFINE_SPINLOCK(task_group_lock);
* (The default weight is 1024 - so there's no practical
* limitation from this.)
*/
-#define MIN_SHARES 2
-#define MAX_SHARES (1UL << (18 + SCHED_LOAD_RESOLUTION))
+#define MIN_SHARES (1UL << 1)
+#define MAX_SHARES (1UL << 18)
static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
#endif
@@ -8450,10 +8450,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
if (!tg->se[0])
return -EINVAL;
- if (shares < MIN_SHARES)
- shares = MIN_SHARES;
- else if (shares > MAX_SHARES)
- shares = MAX_SHARES;
+ shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
mutex_lock(&shares_mutex);
if (tg->shares == shares)
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2011-06-15 20:04 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2011-06-15 20:04 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git sched-urgent-for-linus
Thanks,
Ingo
------------------>
Hillf Danton (1):
sched: Fix need_resched() when checking peempt
Steven Rostedt (1):
sched: Check if lowest_mask is initialized in find_lowest_rq()
kernel/sched_rt.c | 6 +++++-
1 files changed, 5 insertions(+), 1 deletions(-)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 88725c9..10d0182 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1096,7 +1096,7 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
* to move current somewhere else, making room for our non-migratable
* task.
*/
- if (p->prio == rq->curr->prio && !need_resched())
+ if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
check_preempt_equal_prio(rq, p);
#endif
}
@@ -1239,6 +1239,10 @@ static int find_lowest_rq(struct task_struct *task)
int this_cpu = smp_processor_id();
int cpu = task_cpu(task);
+ /* Make sure the mask is initialized first */
+ if (unlikely(!lowest_mask))
+ return -1;
+
if (task->rt.nr_cpus_allowed == 1)
return -1; /* No other targets possible */
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2011-06-07 18:01 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2011-06-07 18:01 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git sched-urgent-for-linus
Thanks,
Ingo
------------------>
Peter Zijlstra (4):
sched: Fix cross-cpu clock sync on remote wakeups
sched: Fix schedstat.nr_wakeups_migrate
lockdep: Fix lock_is_held() on recursion
sched: Fix/clarify set_task_cpu() locking rules
include/linux/sched.h | 1 +
kernel/lockdep.c | 2 +-
kernel/sched.c | 33 ++++++++++++++++++++++++---------
3 files changed, 26 insertions(+), 10 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8da84b7..483c1ed 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1063,6 +1063,7 @@ struct sched_domain;
*/
#define WF_SYNC 0x01 /* waker goes to sleep after wakup */
#define WF_FORK 0x02 /* child wakeup after fork */
+#define WF_MIGRATED 0x04 /* internal use, task got migrated */
#define ENQUEUE_WAKEUP 1
#define ENQUEUE_HEAD 2
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 63437d0..298c927 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3426,7 +3426,7 @@ int lock_is_held(struct lockdep_map *lock)
int ret = 0;
if (unlikely(current->lockdep_recursion))
- return ret;
+ return 1; /* avoid false negative lockdep_assert_held() */
raw_local_irq_save(flags);
check_flags(flags);
diff --git a/kernel/sched.c b/kernel/sched.c
index cbb3a0e..3f2e502 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -605,10 +605,10 @@ static inline int cpu_of(struct rq *rq)
/*
* Return the group to which this tasks belongs.
*
- * We use task_subsys_state_check() and extend the RCU verification
- * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
- * holds that lock for each task it moves into the cgroup. Therefore
- * by holding that lock, we pin the task to the current cgroup.
+ * We use task_subsys_state_check() and extend the RCU verification with
+ * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
+ * task it moves into the cgroup. Therefore by holding either of those locks,
+ * we pin the task to the current cgroup.
*/
static inline struct task_group *task_group(struct task_struct *p)
{
@@ -616,7 +616,8 @@ static inline struct task_group *task_group(struct task_struct *p)
struct cgroup_subsys_state *css;
css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
- lockdep_is_held(&p->pi_lock));
+ lockdep_is_held(&p->pi_lock) ||
+ lockdep_is_held(&task_rq(p)->lock));
tg = container_of(css, struct task_group, css);
return autogroup_task_group(p, tg);
@@ -2200,6 +2201,16 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
#ifdef CONFIG_LOCKDEP
+ /*
+ * The caller should hold either p->pi_lock or rq->lock, when changing
+ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
+ *
+ * sched_move_task() holds both and thus holding either pins the cgroup,
+ * see set_task_rq().
+ *
+ * Furthermore, all task_rq users should acquire both locks, see
+ * task_rq_lock().
+ */
WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
lockdep_is_held(&task_rq(p)->lock)));
#endif
@@ -2447,6 +2458,10 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
}
rcu_read_unlock();
}
+
+ if (wake_flags & WF_MIGRATED)
+ schedstat_inc(p, se.statistics.nr_wakeups_migrate);
+
#endif /* CONFIG_SMP */
schedstat_inc(rq, ttwu_count);
@@ -2455,9 +2470,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
if (wake_flags & WF_SYNC)
schedstat_inc(p, se.statistics.nr_wakeups_sync);
- if (cpu != task_cpu(p))
- schedstat_inc(p, se.statistics.nr_wakeups_migrate);
-
#endif /* CONFIG_SCHEDSTATS */
}
@@ -2600,6 +2612,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
#if defined(CONFIG_SMP)
if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
+ sched_clock_cpu(cpu); /* sync clocks x-cpu */
ttwu_queue_remote(p, cpu);
return;
}
@@ -2674,8 +2687,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
p->sched_class->task_waking(p);
cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
- if (task_cpu(p) != cpu)
+ if (task_cpu(p) != cpu) {
+ wake_flags |= WF_MIGRATED;
set_task_cpu(p, cpu);
+ }
#endif /* CONFIG_SMP */
ttwu_queue(p, cpu);
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2011-05-28 16:40 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2011-05-28 16:40 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-urgent-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git sched-urgent-for-linus
Thanks,
Ingo
------------------>
KOSAKI Motohiro (1):
cpuset: Fix cpuset_cpus_allowed_fallback(), don't update tsk->rt.nr_cpus_allowed
Peter Zijlstra (2):
sched: Fix ttwu() for __ARCH_WANT_INTERRUPTS_ON_CTXSW
sched: Fix ->min_vruntime calculation in dequeue_entity()
Xiaotian Feng (1):
sched: More sched_domain iterations fixes
include/linux/cpuset.h | 2 +-
include/linux/sched.h | 7 ++++++
kernel/cpuset.c | 4 +-
kernel/kthread.c | 4 +-
kernel/sched.c | 56 ++++++++++++++++++++++++++++++++++-------------
kernel/sched_fair.c | 5 ++-
kernel/sched_rt.c | 10 ++++++-
kernel/sched_stats.h | 4 +-
8 files changed, 65 insertions(+), 27 deletions(-)
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index f20eb8f..e9eaec5 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -146,7 +146,7 @@ static inline void cpuset_cpus_allowed(struct task_struct *p,
static inline int cpuset_cpus_allowed_fallback(struct task_struct *p)
{
- cpumask_copy(&p->cpus_allowed, cpu_possible_mask);
+ do_set_cpus_allowed(p, cpu_possible_mask);
return cpumask_any(cpu_active_mask);
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dc88712..8da84b7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1841,9 +1841,16 @@ static inline void rcu_copy_process(struct task_struct *p)
#endif
#ifdef CONFIG_SMP
+extern void do_set_cpus_allowed(struct task_struct *p,
+ const struct cpumask *new_mask);
+
extern int set_cpus_allowed_ptr(struct task_struct *p,
const struct cpumask *new_mask);
#else
+static inline void do_set_cpus_allowed(struct task_struct *p,
+ const struct cpumask *new_mask)
+{
+}
static inline int set_cpus_allowed_ptr(struct task_struct *p,
const struct cpumask *new_mask)
{
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1ceeb04..9c9b754 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2190,7 +2190,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
rcu_read_lock();
cs = task_cs(tsk);
if (cs)
- cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
+ do_set_cpus_allowed(tsk, cs->cpus_allowed);
rcu_read_unlock();
/*
@@ -2217,7 +2217,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
* Like above we can temporary set any mask and rely on
* set_cpus_allowed_ptr() as synchronization point.
*/
- cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
+ do_set_cpus_allowed(tsk, cpu_possible_mask);
cpu = cpumask_any(cpu_active_mask);
}
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3b34d27..4ba7ccc 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -202,8 +202,8 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
return;
}
- p->cpus_allowed = cpumask_of_cpu(cpu);
- p->rt.nr_cpus_allowed = 1;
+ /* It's safe because the task is inactive. */
+ do_set_cpus_allowed(p, cpumask_of(cpu));
p->flags |= PF_THREAD_BOUND;
}
EXPORT_SYMBOL(kthread_bind);
diff --git a/kernel/sched.c b/kernel/sched.c
index 5e43e9d..cbb3a0e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2573,7 +2573,26 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu)
if (!next)
smp_send_reschedule(cpu);
}
-#endif
+
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
+{
+ struct rq *rq;
+ int ret = 0;
+
+ rq = __task_rq_lock(p);
+ if (p->on_cpu) {
+ ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+ ttwu_do_wakeup(rq, p, wake_flags);
+ ret = 1;
+ }
+ __task_rq_unlock(rq);
+
+ return ret;
+
+}
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+#endif /* CONFIG_SMP */
static void ttwu_queue(struct task_struct *p, int cpu)
{
@@ -2631,17 +2650,17 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
while (p->on_cpu) {
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
/*
- * If called from interrupt context we could have landed in the
- * middle of schedule(), in this case we should take care not
- * to spin on ->on_cpu if p is current, since that would
- * deadlock.
+ * In case the architecture enables interrupts in
+ * context_switch(), we cannot busy wait, since that
+ * would lead to deadlocks when an interrupt hits and
+ * tries to wake up @prev. So bail and do a complete
+ * remote wakeup.
*/
- if (p == current) {
- ttwu_queue(p, cpu);
+ if (ttwu_activate_remote(p, wake_flags))
goto stat;
- }
-#endif
+#else
cpu_relax();
+#endif
}
/*
* Pairs with the smp_wmb() in finish_lock_switch().
@@ -5841,7 +5860,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
- cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
+ do_set_cpus_allowed(idle, cpumask_of(cpu));
/*
* We're having a chicken and egg problem, even though we are
* holding rq->lock, the cpu isn't yet set to this cpu so the
@@ -5929,6 +5948,16 @@ static inline void sched_init_granularity(void)
}
#ifdef CONFIG_SMP
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+ if (p->sched_class && p->sched_class->set_cpus_allowed)
+ p->sched_class->set_cpus_allowed(p, new_mask);
+ else {
+ cpumask_copy(&p->cpus_allowed, new_mask);
+ p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
+ }
+}
+
/*
* This is how migration works:
*
@@ -5974,12 +6003,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
goto out;
}
- if (p->sched_class->set_cpus_allowed)
- p->sched_class->set_cpus_allowed(p, new_mask);
- else {
- cpumask_copy(&p->cpus_allowed, new_mask);
- p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
- }
+ do_set_cpus_allowed(p, new_mask);
/* Can the task run on the task's current CPU? If so, we're done */
if (cpumask_test_cpu(task_cpu(p), new_mask))
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e32a9b7..433491c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1076,8 +1076,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
se->on_rq = 0;
update_cfs_load(cfs_rq, 0);
account_entity_dequeue(cfs_rq, se);
- update_min_vruntime(cfs_rq);
- update_cfs_shares(cfs_rq);
/*
* Normalize the entity after updating the min_vruntime because the
@@ -1086,6 +1084,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*/
if (!(flags & DEQUEUE_SLEEP))
se->vruntime -= cfs_rq->min_vruntime;
+
+ update_min_vruntime(cfs_rq);
+ update_cfs_shares(cfs_rq);
}
/*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 64b2a37..88725c9 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1263,6 +1263,7 @@ static int find_lowest_rq(struct task_struct *task)
if (!cpumask_test_cpu(this_cpu, lowest_mask))
this_cpu = -1; /* Skip this_cpu opt if not among lowest */
+ rcu_read_lock();
for_each_domain(cpu, sd) {
if (sd->flags & SD_WAKE_AFFINE) {
int best_cpu;
@@ -1272,15 +1273,20 @@ static int find_lowest_rq(struct task_struct *task)
* remote processor.
*/
if (this_cpu != -1 &&
- cpumask_test_cpu(this_cpu, sched_domain_span(sd)))
+ cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
+ rcu_read_unlock();
return this_cpu;
+ }
best_cpu = cpumask_first_and(lowest_mask,
sched_domain_span(sd));
- if (best_cpu < nr_cpu_ids)
+ if (best_cpu < nr_cpu_ids) {
+ rcu_read_unlock();
return best_cpu;
+ }
}
}
+ rcu_read_unlock();
/*
* And finally, if there were no matches within the domains
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 48ddf43..331e01b 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -37,7 +37,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
#ifdef CONFIG_SMP
/* domain-specific stats */
- preempt_disable();
+ rcu_read_lock();
for_each_domain(cpu, sd) {
enum cpu_idle_type itype;
@@ -64,7 +64,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
sd->ttwu_wake_remote, sd->ttwu_move_affine,
sd->ttwu_move_balance);
}
- preempt_enable();
+ rcu_read_unlock();
#endif
}
kfree(mask_str);
^ permalink raw reply related [flat|nested] 384+ messages in thread
* [GIT PULL] scheduler fixes
@ 2011-04-16 10:07 Ingo Molnar
0 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2011-04-16 10:07 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
Linus,
Please pull the latest sched-fixes-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git sched-fixes-for-linus
Thanks,
Ingo
------------------>
Ken Chen (2):
sched: Fix sched-domain avg_load calculation
sched: Fix erroneous all_pinned logic
kernel/sched_fair.c | 14 ++++++--------
1 files changed, 6 insertions(+), 8 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 7f00772..6fa833a 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2104,21 +2104,20 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
enum cpu_idle_type idle, int *all_pinned,
int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
{
- int loops = 0, pulled = 0, pinned = 0;
+ int loops = 0, pulled = 0;
long rem_load_move = max_load_move;
struct task_struct *p, *n;
if (max_load_move == 0)
goto out;
- pinned = 1;
-
list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
if (loops++ > sysctl_sched_nr_migrate)
break;
if ((p->se.load.weight >> 1) > rem_load_move ||
- !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
+ !can_migrate_task(p, busiest, this_cpu, sd, idle,
+ all_pinned))
continue;
pull_task(busiest, p, this_rq, this_cpu);
@@ -2153,9 +2152,6 @@ out:
*/
schedstat_add(sd, lb_gained[idle], pulled);
- if (all_pinned)
- *all_pinned = pinned;
-
return max_load_move - rem_load_move;
}
@@ -3127,6 +3123,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
if (!sds.busiest || sds.busiest_nr_running == 0)
goto out_balanced;
+ sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
+
/*
* If the busiest group is imbalanced the below checks don't
* work because they assumes all things are equal, which typically
@@ -3151,7 +3149,6 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
* Don't pull any tasks if this group is already above the domain
* average load.
*/
- sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
if (sds.this_load >= sds.avg_load)
goto out_balanced;
@@ -3340,6 +3337,7 @@ redo:
* still unbalanced. ld_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
+ all_pinned = 1;
local_irq_save(flags);
double_rq_lock(this_rq, busiest);
ld_moved = move_tasks(this_rq, this_cpu, busiest,
^ permalink raw reply related [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2011-04-04 15:45 ` Linus Torvalds
` (2 preceding siblings ...)
2011-04-04 19:19 ` Ingo Molnar
@ 2011-04-05 8:14 ` Peter Zijlstra
3 siblings, 0 replies; 384+ messages in thread
From: Peter Zijlstra @ 2011-04-05 8:14 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Ingo Molnar, linux-kernel, Thomas Gleixner, Andrew Morton
On Mon, 2011-04-04 at 08:45 -0700, Linus Torvalds wrote:
> So I pulled this, but I think this:
>
> On Sat, Apr 2, 2011 at 3:31 AM, Ingo Molnar <mingo@elte.hu> wrote:
> >
> > - if (interval > HZ*NR_CPUS/10)
> > - interval = HZ*NR_CPUS/10;
> > + if (interval > HZ*num_online_cpus()/10)
> > + interval = HZ*num_online_cpus()/10;
>
> is a horrible patch.
>
> Think about what that expands to. It's going to potentially be two
> function calls. And the code is just ugly.
>
> So please, when doing search-and-replace changes like this, just clean
> up the code at the same time. Back when it was about pure constants,
> there was only a typing/ugly overhead from duplicating the constant,
> but the compiler would see a single constant.
>
> Now it's _possible_ that the compiler could do the analysis and fold
> it all back to a single thing. But it's unlikely to happen except for
> configurations that end up making it all trivial.
>
> So just add something like a
>
> int max_interval = HZ*num_online_cpus()/10;
>
> possibly even with a comment about _why_ that is the max interval allowed. Ok?
How about something like the below?
---
Subject: sched: Clean up load-balance interval calculation
Instead of the possible multiple-evaluation of num_online_cpus() avoid
it all together in the normal case since its implemented with a hamming
weight function over a cpu bitmask which can be darn expensive for those
with big iron.
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
kernel/sched.c | 3 +++
kernel/sched_fair.c | 16 ++++++++++++----
2 files changed, 15 insertions(+), 4 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index a884551..17b4d22 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6331,6 +6331,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
#endif
}
+
+ update_max_interval();
+
return NOTIFY_OK;
}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c7ec5c8..a14a04e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3820,6 +3820,17 @@ void select_nohz_load_balancer(int stop_tick)
static DEFINE_SPINLOCK(balancing);
+static unsigned long max_load_balance_interval = HZ/10;
+
+/*
+ * Scale the max load_balance interval with the number of CPUs in the system.
+ * This trades load-balance latency on larger machines for less cross talk.
+ */
+static void update_max_interval(void)
+{
+ max_load_balance_interval = HZ*num_online_cpus()/10;
+}
+
/*
* It checks each scheduling domain to see if it is due to be balanced,
* and initiates a balancing operation if so.
@@ -3849,10 +3860,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
/* scale ms to jiffies */
interval = msecs_to_jiffies(interval);
- if (unlikely(!interval))
- interval = 1;
- if (interval > HZ*num_online_cpus()/10)
- interval = HZ*num_online_cpus()/10;
+ interval = clamp(interval, 1UL, max_load_balance_interval);
need_serialize = sd->flags & SD_SERIALIZE;
^ permalink raw reply related [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2011-04-04 15:45 ` Linus Torvalds
2011-04-04 16:08 ` Sisir Koppaka
2011-04-04 16:16 ` Andrew Morton
@ 2011-04-04 19:19 ` Ingo Molnar
2011-04-05 8:14 ` Peter Zijlstra
3 siblings, 0 replies; 384+ messages in thread
From: Ingo Molnar @ 2011-04-04 19:19 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton
* Linus Torvalds <torvalds@linux-foundation.org> wrote:
> So I pulled this, but I think this:
>
> On Sat, Apr 2, 2011 at 3:31 AM, Ingo Molnar <mingo@elte.hu> wrote:
> >
> > - if (interval > HZ*NR_CPUS/10)
> > - interval = HZ*NR_CPUS/10;
> > + if (interval > HZ*num_online_cpus()/10)
> > + interval = HZ*num_online_cpus()/10;
>
> is a horrible patch.
Indeed, will fix it.
Thanks,
Ingo
^ permalink raw reply [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2011-04-04 15:45 ` Linus Torvalds
2011-04-04 16:08 ` Sisir Koppaka
@ 2011-04-04 16:16 ` Andrew Morton
2011-04-04 19:19 ` Ingo Molnar
2011-04-05 8:14 ` Peter Zijlstra
3 siblings, 0 replies; 384+ messages in thread
From: Andrew Morton @ 2011-04-04 16:16 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Ingo Molnar, linux-kernel, Peter Zijlstra, Thomas Gleixner
On Mon, 4 Apr 2011 08:45:34 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote:
> So I pulled this, but I think this:
>
> On Sat, Apr 2, 2011 at 3:31 AM, Ingo Molnar <mingo@elte.hu> wrote:
> >
> > - if (interval > HZ*NR_CPUS/10)
> > - interval = HZ*NR_CPUS/10;
> > + if (interval > HZ*num_online_cpus()/10)
> > + interval = HZ*num_online_cpus()/10;
>
> is a horrible patch.
>
> Think about what that expands to. It's going to potentially be two
> function calls. And the code is just ugly.
>
> So please, when doing search-and-replace changes like this, just clean
> up the code at the same time. Back when it was about pure constants,
> there was only a typing/ugly overhead from duplicating the constant,
> but the compiler would see a single constant.
>
> Now it's _possible_ that the compiler could do the analysis and fold
> it all back to a single thing. But it's unlikely to happen except for
> configurations that end up making it all trivial.
>
> So just add something like a
>
> int max_interval = HZ*num_online_cpus()/10;
>
> possibly even with a comment about _why_ that is the max interval allowed. Ok?
>
max() conveniently avoids the double-evaluation:
#define max(x, y) ({ \
typeof(x) _max1 = (x); \
typeof(y) _max2 = (y); \
(void) (&_max1 == &_max2); \
_max1 > _max2 ? _max1 : _max2; })
^ permalink raw reply [flat|nested] 384+ messages in thread
* Re: [GIT PULL] scheduler fixes
2011-04-04 15:45 ` Linus Torvalds
@ 2011-04-04 16:08 ` Sisir Koppaka
2011-04-04 16:16 ` Andrew Morton
` (2 subsequent siblings)
3 siblings, 0 replies; 384+ messages in thread
From: Sisir Koppaka @ 2011-04-04 16:08 UTC (permalink / raw)
To: Linus Torvalds
Cc: Ingo Molnar, linux-kernel, Peter Zijlstra, Thomas Gleixner,
Andrew Morton
On Mon, Apr 4, 2011 at 9:15 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> So I pulled this, but I think this:
>
> On Sat, Apr 2, 2011 at 3:31 AM, Ingo Molnar <mingo@elte.hu> wrote:
>>
>> - if (interval > HZ*NR_CPUS/10)
>> - interval = HZ*NR_CPUS/10;
>> + if (interval > HZ*num_online_cpus()/10)
>> + interval = HZ*num_online_cpus()/10;
>
> is a horrible patch.
>
> Think about what that expands to. It's going to potentially be two
> function calls. And the code is just ugly.
>
> So please, when doing search-and-replace changes like this, just clean
> up the code at the same time. Back when it was about pure constants,
> there was only a typing/ugly overhead from duplicating the constant,
> but the compiler would see a single constant.
>
> Now it's _possible_ that the compiler could do the analysis and fold
> it all back to a single thing. But it's unlikely to happen except for
> configurations that end up making it all trivial.
>
> So just add something like a
>
> int max_interval = HZ*num_online_cpus()/10;
>
> possibly even with a comment about _why_ that is the max interval allowed. Ok?
>
> Linus
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>
Ok sure. You're right, I'll resend the patch.
Sisir
^ permalink raw reply [flat|nested] 384+ messages in thread<