LKML Archive on lore.kernel.org
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
To: LKML <linux-kernel@vger.kernel.org>, Ingo Molnar <mingo@elte.hu>
Cc: Dmitry Adamushko <dmitry.adamushko@gmail.com>,
	Mike Galbraith <efault@gmx.de>,
	Dhaval Giani <dhaval@linux.vnet.ibm.com>,
	Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>,
	Peter Zijlstra <a.p.zijlstra@chello.nl>
Subject: [RFC/PATCH 01/17] sched: mix tasks and groups
Date: Sun, 09 Mar 2008 18:08:51 +0100
Message-ID: <20080309170925.857156000@chello.nl>
In-Reply-To: <20080309170850.256853000@chello.nl>

[-- Attachment #1: dhaval-fairness-model.patch --]
[-- Type: text/plain, Size: 7813 bytes --]

This patch allows tasks and groups to coexist in the same cfs_rq. With this
change, CFS group scheduling moves from a 1/(1+N) fairness model to a
1/(M+N) model, where M tasks and N groups exist at the same cfs_rq level.
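
To see what that shift means in numbers, here is a minimal userspace
sketch (illustrative only, not part of the patch; it assumes equal
weights of 1024, the NICE_0 default, for every task and group):

	#include <stdio.h>

	int main(void)
	{
		int m = 10, n = 2;	/* M tasks and N groups on one cfs_rq */

		/* old 1/(1+N) model: the M tasks hide behind one entity */
		printf("old, per task:  %.2f%%\n", 100.0 / (1 + n) / m);

		/* new 1/(M+N) model: tasks and groups compete side by side */
		printf("new, per task:  %.2f%%\n", 100.0 / (m + n));
		printf("new, per group: %.2f%%\n", 100.0 / (m + n));
		return 0;
	}

With 10 tasks and 2 groups this prints 3.33%, 8.33% and 8.33%; the 8.33%
figure matches the A0 example in the sched_init() comment below.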

[a.p.zijlstra@chello.nl: rt bits]
Signed-off-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched.c      |   54 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 kernel/sched_fair.c |   48 +++++++++++++++++++++++++++++++++++++++++++---
 kernel/sched_rt.c   |   15 ++++++++------
 3 files changed, 106 insertions(+), 11 deletions(-)

Index: linux-2.6-2/kernel/sched.c
===================================================================
--- linux-2.6-2.orig/kernel/sched.c
+++ linux-2.6-2/kernel/sched.c
@@ -273,18 +273,23 @@ struct task_group {
 };
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+
+#ifdef CONFIG_USER_SCHED
 /* Default task group's sched entity on each cpu */
 static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
 /* Default task group's cfs_rq on each cpu */
 static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
+#endif
 
 static struct sched_entity *init_sched_entity_p[NR_CPUS];
 static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
+#ifdef CONFIG_USER_SCHED
 static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
 static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
+#endif
 
 static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS];
 static struct rt_rq *init_rt_rq_p[NR_CPUS];
@@ -7279,6 +7284,10 @@ static void init_tg_cfs_entry(struct rq 
 		list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
 
 	tg->se[cpu] = se;
+	/* se could be NULL for init_task_group */
+	if (!se)
+		return;
+
 	se->cfs_rq = &rq->cfs;
 	se->my_q = cfs_rq;
 	se->load.weight = tg->shares;
@@ -7301,6 +7310,9 @@ static void init_tg_rt_entry(struct rq *
 		list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
 
 	tg->rt_se[cpu] = rt_se;
+	if (!rt_se)
+		return;
+
 	rt_se->rt_rq = &rq->rt;
 	rt_se->my_q = rt_rq;
 	rt_se->parent = NULL;
@@ -7343,18 +7355,56 @@ void __init sched_init(void)
 #ifdef CONFIG_FAIR_GROUP_SCHED
 		init_task_group.shares = init_task_group_load;
 		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
+#ifdef CONFIG_CGROUP_SCHED
+		/*
+		 * How much cpu bandwidth does init_task_group get?
+		 *
+		 * In case of task-groups formed through the cgroup filesystem, it
+		 * gets 100% of the cpu resources in the system. This overall
+		 * system cpu resource is divided among the tasks of
+		 * init_task_group and its child task-groups in a fair manner,
+		 * based on each entity's (task or task-group's) weight
+		 * (se->load.weight).
+		 *
+		 * In other words, if init_task_group has 10 tasks of weight
+		 * 1024 and two child groups A0 and A1 (of weight 1024 each),
+		 * then A0's share of the cpu resource is:
+		 *
+		 * 	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
+		 *
+		 * We achieve this by letting init_task_group's tasks sit
+		 * directly in rq->cfs (i.e. init_task_group->se[] = NULL).
+		 */
+		init_tg_cfs_entry(rq, &init_task_group, &rq->cfs, NULL, i, 1);
+#elif defined CONFIG_USER_SCHED
+		/*
+		 * In case of task-groups formed through the user id of tasks,
+		 * init_task_group represents tasks belonging to the root user.
+		 * Hence it is a sibling of all subsequently formed task groups.
+		 * In this case, init_task_group gets only a fraction of overall
+		 * system cpu resource, based on the weight assigned to root
+		 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
+		 * by letting tasks of init_task_group sit in a separate cfs_rq
+		 * (init_cfs_rq) and having one entity represent this group of
+		 * tasks in rq->cfs (i.e. init_task_group->se[] != NULL).
+		 */
 		init_tg_cfs_entry(rq, &init_task_group,
 				&per_cpu(init_cfs_rq, i),
 				&per_cpu(init_sched_entity, i), i, 1);
 
 #endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
 #ifdef CONFIG_RT_GROUP_SCHED
 		INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
+#ifdef CONFIG_CGROUP_SCHED
+		init_tg_rt_entry(rq, &init_task_group, &rq->rt, NULL, i, 1);
+#elif defined CONFIG_USER_SCHED
 		init_tg_rt_entry(rq, &init_task_group,
 				&per_cpu(init_rt_rq, i),
 				&per_cpu(init_sched_rt_entity, i), i, 1);
-#else
-		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
+#endif
 #endif
 
 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
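
The NULL-entity convention wired up above can be condensed into a small
standalone sketch (hypothetical, heavily simplified types -- not the
kernel's): under CONFIG_CGROUP_SCHED the root group's tasks sit directly
on rq->cfs and its per-cpu entity stays NULL, so the entry setup must
bail out early; under CONFIG_USER_SCHED the root group gets its own
cfs_rq plus a representative entity.

	#include <stdio.h>

	struct cfs_rq { const char *name; };
	struct sched_entity { struct cfs_rq *my_q; };

	/* mirrors the early return added to init_tg_cfs_entry() */
	static void init_entry(struct sched_entity **slot,
			       struct sched_entity *se, struct cfs_rq *q)
	{
		*slot = se;
		if (!se)		/* root group: tasks live in rq->cfs */
			return;
		se->my_q = q;		/* group entity owns its own queue */
	}

	int main(void)
	{
		struct cfs_rq q = { "init_cfs_rq" };
		struct sched_entity se = { 0 }, *slot;

		init_entry(&slot, NULL, &q);	/* CGROUP_SCHED style */
		printf("cgroup root: se=%p\n", (void *)slot);

		init_entry(&slot, &se, &q);	/* USER_SCHED style */
		printf("user   root: se=%p my_q=%s\n",
		       (void *)slot, slot->my_q->name);
		return 0;
	}
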
Index: linux-2.6-2/kernel/sched_fair.c
===================================================================
--- linux-2.6-2.orig/kernel/sched_fair.c
+++ linux-2.6-2/kernel/sched_fair.c
@@ -1029,6 +1029,17 @@ out_set_cpu:
 }
 #endif /* CONFIG_SMP */
 
+/* return the depth at which a sched entity is present in the hierarchy */
+static inline int depth_se(struct sched_entity *se)
+{
+	int depth = 0;
+
+	for_each_sched_entity(se)
+		depth++;
+
+	return depth;
+}
+
 
 /*
  * Preempt the current task with a newly woken task if needed:
@@ -1039,6 +1050,7 @@ static void check_preempt_wakeup(struct 
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 	struct sched_entity *se = &curr->se, *pse = &p->se;
 	unsigned long gran;
+	int se_depth, pse_depth;
 
 	if (unlikely(rt_prio(p->prio))) {
 		update_rq_clock(rq);
@@ -1056,6 +1068,27 @@ static void check_preempt_wakeup(struct 
 	if (!sched_feat(WAKEUP_PREEMPT))
 		return;
 
+	/*
+	 * The preemption test can only be made between sibling entities that
+	 * are in the same cfs_rq, i.e. that have a common parent. Walk up
+	 * the hierarchy of both tasks until we find ancestors that are
+	 * siblings under a common parent.
+	 */
+
+	/* First walk up until both entities are at same depth */
+	se_depth = depth_se(se);
+	pse_depth = depth_se(pse);
+
+	while (se_depth > pse_depth) {
+		se_depth--;
+		se = parent_entity(se);
+	}
+
+	while (pse_depth > se_depth) {
+		pse_depth--;
+		pse = parent_entity(pse);
+	}
+
 	while (!is_same_group(se, pse)) {
 		se = parent_entity(se);
 		pse = parent_entity(pse);
@@ -1122,13 +1155,22 @@ static void put_prev_task_fair(struct rq
 static struct task_struct *
 __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr)
 {
-	struct task_struct *p;
+	struct task_struct *p = NULL;
+	struct sched_entity *se;
 
 	if (!curr)
 		return NULL;
 
-	p = rb_entry(curr, struct task_struct, se.run_node);
-	cfs_rq->rb_load_balance_curr = rb_next(curr);
+	/* Skip over entities that are not tasks */
+	do {
+		se = rb_entry(curr, struct sched_entity, run_node);
+		curr = rb_next(curr);
+	} while (curr && !entity_is_task(se));
+
+	cfs_rq->rb_load_balance_curr = curr;
+
+	if (entity_is_task(se))
+		p = task_of(se);
 
 	return p;
 }
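
The depth-matching walk added to check_preempt_wakeup() above is a
classic lowest-common-ancestor step on a parent-pointer tree. A
self-contained sketch (hypothetical node type, single shared root
assumed -- not the kernel's structures):

	#include <stdio.h>

	struct node { struct node *parent; const char *name; };

	static int depth(struct node *n)
	{
		int d = 0;

		for (; n; n = n->parent)
			d++;
		return d;
	}

	/* walk both cursors up until they are siblings under one parent */
	static void to_siblings(struct node **a, struct node **b)
	{
		int da = depth(*a), db = depth(*b);

		while (da > db) { da--; *a = (*a)->parent; }
		while (db > da) { db--; *b = (*b)->parent; }
		while ((*a)->parent != (*b)->parent) {
			*a = (*a)->parent;
			*b = (*b)->parent;
		}
	}

	int main(void)
	{
		struct node root = { NULL, "root" };
		struct node g1 = { &root, "g1" }, g2 = { &root, "g2" };
		struct node t1 = { &g1, "t1" }, t2 = { &g2, "t2" };
		struct node deep = { &t1, "deep" };
		struct node *a = &deep, *b = &t2;

		to_siblings(&a, &b);
		printf("%s vs %s\n", a->name, b->name);	/* "g1 vs g2" */
		return 0;
	}

Once both entities sit in the same cfs_rq their vruntimes are directly
comparable, which is what the wakeup-preemption test needs.
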
Index: linux-2.6-2/kernel/sched_rt.c
===================================================================
--- linux-2.6-2.orig/kernel/sched_rt.c
+++ linux-2.6-2/kernel/sched_rt.c
@@ -374,11 +374,15 @@ static void update_curr_rt(struct rq *rq
 	curr->se.exec_start = rq->clock;
 	cpuacct_charge(curr, delta_exec);
 
-	spin_lock(&rt_rq->rt_runtime_lock);
-	rt_rq->rt_time += delta_exec;
-	if (sched_rt_runtime_exceeded(rt_rq))
-		resched_task(curr);
-	spin_unlock(&rt_rq->rt_runtime_lock);
+	for_each_sched_rt_entity(rt_se) {
+		rt_rq = rt_rq_of_se(rt_se);
+
+		spin_lock(&rt_rq->rt_runtime_lock);
+		rt_rq->rt_time += delta_exec;
+		if (sched_rt_runtime_exceeded(rt_rq))
+			resched_task(curr);
+		spin_unlock(&rt_rq->rt_runtime_lock);
+	}
 }
 
 static inline
@@ -477,7 +481,6 @@ static void dequeue_rt_entity(struct sch
  * entries, we must remove entries top - down.
  *
  * XXX: O(1/2 h^2) because we can only walk up, not down the chain.
- *      doesn't matter much for now, as h=2 for GROUP_SCHED.
  */
 static void dequeue_rt_stack(struct task_struct *p)
 {

--


Thread overview: 26+ messages
2008-03-09 17:08 [RFC/PATCH 00/17] Current sched patch stack Peter Zijlstra
2008-03-09 17:08 ` Peter Zijlstra [this message]
2008-04-01 12:12   ` [RFC/PATCH 01/17] sched: mix tasks and groups Srivatsa Vaddagiri
2008-04-01 12:05     ` Peter Zijlstra
2008-03-09 17:08 ` [RFC/PATCH 02/17] sched: allow the CFS group scheduler to have multiple levels Peter Zijlstra
2008-03-12 14:47   ` Dhaval Giani
2008-03-09 17:08 ` [RFC/PATCH 03/17] sched: rt-group: full hierarchy support for the rt groups Peter Zijlstra
2008-03-12  8:25   ` Dhaval Giani
     [not found]     ` <661de9470803120129l66497656o948c13c6c0c26766@mail.gmail.com>
2008-03-12  9:29       ` Peter Zijlstra
2008-03-12 13:34         ` Balbir Singh
2008-03-12 13:39           ` Dhaval Giani
2008-03-12 14:08         ` Balbir Singh
2008-03-09 17:08 ` [RFC/PATCH 04/17] sched: fair-group: SMP-nice for group scheduling Peter Zijlstra
2008-03-09 17:08 ` [RFC/PATCH 05/17] sched: rt-group: optimize dequeue_rt_stack Peter Zijlstra
2008-03-09 17:08 ` [RFC/PATCH 06/17] sched: fair-group scheduling vs latency Peter Zijlstra
2008-03-09 17:08 ` [RFC/PATCH 07/17] sched: fair-group: de-couple load-balancing from the rb-trees Peter Zijlstra
2008-03-09 17:08 ` [RFC/PATCH 08/17] sched: fair-group: single RQ approach Peter Zijlstra
2008-03-09 17:08 ` [RFC/PATCH 09/17] sched: fair: remove moved_group() Peter Zijlstra
2008-03-09 17:09 ` [RFC/PATCH 10/17] sched: remove sysctl_sched_batch_wakeup_granularity Peter Zijlstra
2008-03-09 17:09 ` [RFC/PATCH 11/17] sched: fair: optimize sched_slice() Peter Zijlstra
2008-03-09 17:09 ` [RFC/PATCH 12/17] sched: fair: cfs_root_rq Peter Zijlstra
2008-03-09 17:09 ` [RFC/PATCH 13/17] sched: fair: calc_delta_weight() Peter Zijlstra
2008-03-09 17:09 ` [RFC/PATCH 14/17] sched: fair: avg_vruntime Peter Zijlstra
2008-03-09 17:09 ` [RFC/PATCH 15/17] sched: fair-group: EEVDF scheduling Peter Zijlstra
2008-03-09 17:09 ` [RFC/PATCH 16/17] sched: fair: bound tardiness Peter Zijlstra
2008-03-09 17:09 ` [RFC/PATCH 17/17] sched: fair: optimize the elegebility test Peter Zijlstra
