LKML Archive on lore.kernel.org
* [PATCH 2/2] Customize sched domain via cpuset
@ 2008-04-01 11:27 Hidetoshi Seto
  2008-04-01 11:51 ` Peter Zijlstra
  0 siblings, 1 reply; 4+ messages in thread
From: Hidetoshi Seto @ 2008-04-01 11:27 UTC (permalink / raw)
  To: linux-kernel

The implementation is here.

 - Add 2 new cpuset files (a short usage sketch follows below):
     sched_wake_idle_far
     sched_balance_newidle_far

 - Modify partition_sched_domains() and build_sched_domains()
   to take a flags parameter passed from cpuset.

 - Fill in newidle_idx for node domains; it is currently unused but
   might be required for sched_balance_newidle_far.
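
A short usage sketch (editorial addition; it assumes the cpuset
filesystem is mounted at /dev/cpuset and that a cpuset named "mygroup"
already exists; both are illustrative assumptions, not part of this
patch):

	/* Write "1" into the new per-cpuset files; update_flag() parses
	 * the 0/1 and triggers a sched-domain rebuild when needed. */
	#include <fcntl.h>
	#include <unistd.h>

	static void enable(const char *path)
	{
		int fd = open(path, O_WRONLY);

		if (fd >= 0) {
			write(fd, "1", 1);
			close(fd);
		}
	}

	int main(void)
	{
		enable("/dev/cpuset/mygroup/sched_wake_idle_far");
		enable("/dev/cpuset/mygroup/sched_balance_newidle_far");
		return 0;
	}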

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>

---
 include/asm-ia64/topology.h |    2
 include/asm-sh/topology.h   |    2
 include/asm-x86/topology.h  |    2
 include/linux/sched.h       |    4 +
 kernel/cpuset.c             |   89 ++++++++++++++++++++++++++++++++++++++++++--
 kernel/sched.c              |   38 ++++++++++++++++--
 kernel/sched_fair.c         |    4 +
 7 files changed, 128 insertions(+), 13 deletions(-)

Index: GIT-torvalds/kernel/sched_fair.c
===================================================================
--- GIT-torvalds.orig/kernel/sched_fair.c
+++ GIT-torvalds/kernel/sched_fair.c
@@ -957,7 +957,9 @@ static int wake_idle(int cpu, struct tas
 		return cpu;

 	for_each_domain(cpu, sd) {
-		if (sd->flags & SD_WAKE_IDLE) {
+		if ((sd->flags & SD_WAKE_IDLE)
+		    || ((sd->flags & SD_WAKE_IDLE_FAR)
+			&& !task_hot(p, task_rq(p)->clock, sd))) {
 			cpus_and(tmp, sd->span, p->cpus_allowed);
 			for_each_cpu_mask(i, tmp) {
 				if (idle_cpu(i)) {
Index: GIT-torvalds/kernel/cpuset.c
===================================================================
--- GIT-torvalds.orig/kernel/cpuset.c
+++ GIT-torvalds/kernel/cpuset.c
@@ -126,6 +126,8 @@ typedef enum {
 	CS_MEM_EXCLUSIVE,
 	CS_MEMORY_MIGRATE,
 	CS_SCHED_LOAD_BALANCE,
+	CS_SCHED_BALANCE_NEWIDLE_FAR,
+	CS_SCHED_WAKE_IDLE_FAR,
 	CS_SPREAD_PAGE,
 	CS_SPREAD_SLAB,
 } cpuset_flagbits_t;
@@ -146,6 +148,16 @@ static inline int is_sched_load_balance(
 	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
 }

+static inline int is_sched_balance_newidle_far(const struct cpuset *cs)
+{
+	return test_bit(CS_SCHED_BALANCE_NEWIDLE_FAR, &cs->flags);
+}
+
+static inline int is_sched_wake_idle_far(const struct cpuset *cs)
+{
+	return test_bit(CS_SCHED_WAKE_IDLE_FAR, &cs->flags);
+}
+
 static inline int is_memory_migrate(const struct cpuset *cs)
 {
 	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
@@ -161,6 +173,11 @@ static inline int is_spread_slab(const s
 	return test_bit(CS_SPREAD_SLAB, &cs->flags);
 }

+static inline int is_sched_custom_domain(const struct cpuset *cs)
+{
+	return is_sched_balance_newidle_far(cs) || is_sched_wake_idle_far(cs);
+}
+
 /*
  * Increment this integer everytime any cpuset changes its
  * mems_allowed value.  Users of cpusets can track this generation
@@ -553,12 +570,14 @@ static void rebuild_sched_domains(void)
 	int csn;		/* how many cpuset ptrs in csa so far */
 	int i, j, k;		/* indices for partition finding loops */
 	cpumask_t *doms;	/* resulting partition; i.e. sched domains */
+	int *flags;		/* flags for custom sched domains */
 	int ndoms;		/* number of sched domains in result */
 	int nslot;		/* next empty doms[] cpumask_t slot */

 	q = NULL;
 	csa = NULL;
 	doms = NULL;
+	flags = NULL;

 	/* Special case for the 99% of systems with one, full, sched domain */
 	if (is_sched_load_balance(&top_cpuset)) {
@@ -566,6 +585,13 @@ static void rebuild_sched_domains(void)
 		doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
 		if (!doms)
 			goto rebuild;
+		if (is_sched_custom_domain(&top_cpuset)) {
+			flags = kzalloc(sizeof(int), GFP_KERNEL);
+			if (flags && is_sched_balance_newidle_far(&top_cpuset))
+				*flags |= SD_BALANCE_NEWIDLE;
+			if (flags && is_sched_wake_idle_far(&top_cpuset))
+				*flags |= SD_WAKE_IDLE_FAR;
+		}
 		*doms = top_cpuset.cpus_allowed;
 		goto rebuild;
 	}
@@ -622,6 +648,7 @@ restart:
 	doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
 	if (!doms)
 		goto rebuild;
+	flags = kzalloc(ndoms * sizeof(int), GFP_KERNEL);

 	for (nslot = 0, i = 0; i < csn; i++) {
 		struct cpuset *a = csa[i];
@@ -650,6 +677,13 @@ restart:
 				if (apn == b->pn) {
 					cpus_or(*dp, *dp, b->cpus_allowed);
 					b->pn = -1;
+					if (flags
+					    && is_sched_balance_newidle_far(b))
+						*(flags + nslot) |=
+							SD_BALANCE_NEWIDLE;
+					if (flags && is_sched_wake_idle_far(b))
+						*(flags + nslot) |=
+							SD_WAKE_IDLE_FAR;
 				}
 			}
 			nslot++;
@@ -660,7 +694,7 @@ restart:
 rebuild:
 	/* Have scheduler rebuild sched domains */
 	get_online_cpus();
-	partition_sched_domains(ndoms, doms);
+	partition_sched_domains(ndoms, doms, flags);
 	put_online_cpus();

 done:
@@ -668,6 +702,7 @@ done:
 		kfifo_free(q);
 	kfree(csa);
 	/* Don't kfree(doms) -- partition_sched_domains() does that. */
+	/* Don't kfree(flags) -- partition_sched_domains() does that. */
 }

 static inline int started_after_time(struct task_struct *t1,
@@ -1011,10 +1046,26 @@ static int update_memory_pressure_enable
 	return 0;
 }

+static int need_rebuild_domains(struct cpuset *cs, struct cpuset *tcs)
+{
+	if (is_sched_load_balance(cs) != is_sched_load_balance(tcs))
+		return 1;
+	if (!is_sched_load_balance(tcs))
+		return 0;
+	if (is_sched_balance_newidle_far(cs) !=
+					is_sched_balance_newidle_far(tcs))
+		return 1;
+	if (is_sched_wake_idle_far(cs) != is_sched_wake_idle_far(tcs))
+		return 1;
+	return 0;
+}
+
 /*
  * update_flag - read a 0 or a 1 in a file and update associated flag
  * bit:	the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
  *				CS_SCHED_LOAD_BALANCE,
+ *				CS_SCHED_BALANCE_NEW_IDLE_FAR,
+ *				CS_SCHED_WAKE_IDLE_FAR,
  *				CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE,
  *				CS_SPREAD_PAGE, CS_SPREAD_SLAB)
  * cs:	the cpuset to update
@@ -1043,8 +1094,7 @@ static int update_flag(cpuset_flagbits_t
 		return err;

 	cpus_nonempty = !cpus_empty(trialcs.cpus_allowed);
-	balance_flag_changed = (is_sched_load_balance(cs) !=
-		 			is_sched_load_balance(&trialcs));
+	balance_flag_changed = need_rebuild_domains(cs, &trialcs);

 	mutex_lock(&callback_mutex);
 	cs->flags = trialcs.flags;
@@ -1202,6 +1252,8 @@ typedef enum {
 	FILE_CPU_EXCLUSIVE,
 	FILE_MEM_EXCLUSIVE,
 	FILE_SCHED_LOAD_BALANCE,
+	FILE_SCHED_BALANCE_NEWIDLE_FAR,
+	FILE_SCHED_WAKE_IDLE_FAR,
 	FILE_MEMORY_PRESSURE_ENABLED,
 	FILE_MEMORY_PRESSURE,
 	FILE_SPREAD_PAGE,
@@ -1256,6 +1308,12 @@ static ssize_t cpuset_common_file_write(
 	case FILE_SCHED_LOAD_BALANCE:
 		retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer);
 		break;
+	case FILE_SCHED_BALANCE_NEWIDLE_FAR:
+		retval = update_flag(CS_SCHED_BALANCE_NEWIDLE_FAR, cs, buffer);
+		break;
+	case FILE_SCHED_WAKE_IDLE_FAR:
+		retval = update_flag(CS_SCHED_WAKE_IDLE_FAR, cs, buffer);
+		break;
 	case FILE_MEMORY_MIGRATE:
 		retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
 		break;
@@ -1354,6 +1412,12 @@ static ssize_t cpuset_common_file_read(s
 	case FILE_SCHED_LOAD_BALANCE:
 		*s++ = is_sched_load_balance(cs) ? '1' : '0';
 		break;
+	case FILE_SCHED_BALANCE_NEWIDLE_FAR:
+		*s++ = is_sched_balance_newidle_far(cs) ? '1' : '0';
+		break;
+	case FILE_SCHED_WAKE_IDLE_FAR:
+		*s++ = is_sched_wake_idle_far(cs) ? '1' : '0';
+		break;
 	case FILE_MEMORY_MIGRATE:
 		*s++ = is_memory_migrate(cs) ? '1' : '0';
 		break;
@@ -1424,6 +1488,20 @@ static struct cftype cft_sched_load_bala
 	.private = FILE_SCHED_LOAD_BALANCE,
 };

+static struct cftype cft_sched_balance_newidle_far = {
+	.name = "sched_balance_newidle_far",
+	.read = cpuset_common_file_read,
+	.write = cpuset_common_file_write,
+	.private = FILE_SCHED_BALANCE_NEWIDLE_FAR,
+};
+
+static struct cftype cft_sched_wake_idle_far = {
+	.name = "sched_wake_idle_far",
+	.read = cpuset_common_file_read,
+	.write = cpuset_common_file_write,
+	.private = FILE_SCHED_WAKE_IDLE_FAR,
+};
+
 static struct cftype cft_memory_migrate = {
 	.name = "memory_migrate",
 	.read = cpuset_common_file_read,
@@ -1475,6 +1553,11 @@ static int cpuset_populate(struct cgroup
 		return err;
 	if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0)
 		return err;
+	if ((err = cgroup_add_file(cont, ss,
+					&cft_sched_balance_newidle_far)) < 0)
+		return err;
+	if ((err = cgroup_add_file(cont, ss, &cft_sched_wake_idle_far)) < 0)
+		return err;
 	if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0)
 		return err;
 	if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0)
Index: GIT-torvalds/include/linux/sched.h
===================================================================
--- GIT-torvalds.orig/include/linux/sched.h
+++ GIT-torvalds/include/linux/sched.h
@@ -704,6 +704,7 @@ enum cpu_idle_type {
 #define SD_POWERSAVINGS_BALANCE	256	/* Balance for power savings */
 #define SD_SHARE_PKG_RESOURCES	512	/* Domain members share cpu pkg resources */
 #define SD_SERIALIZE		1024	/* Only a single load balancing instance */
+#define SD_WAKE_IDLE_FAR	2048	/* Gain latency sacrificing cache hit */

 #define BALANCE_FOR_MC_POWER	\
 	(sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0)
@@ -789,7 +790,8 @@ struct sched_domain {
 #endif
 };

-extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new);
+extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+				    int *flags_new);
 extern int arch_reinit_sched_domains(void);

 #endif	/* CONFIG_SMP */
Index: GIT-torvalds/kernel/sched.c
===================================================================
--- GIT-torvalds.orig/kernel/sched.c
+++ GIT-torvalds/kernel/sched.c
@@ -6586,7 +6586,7 @@ static void init_sched_groups_power(int
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
  */
-static int build_sched_domains(const cpumask_t *cpu_map)
+static int __build_sched_domains(const cpumask_t *cpu_map, int flags)
 {
 	int i;
 	struct root_domain *rd;
@@ -6627,6 +6627,7 @@ static int build_sched_domains(const cpu
 			sd = &per_cpu(allnodes_domains, i);
 			*sd = SD_ALLNODES_INIT;
 			sd->span = *cpu_map;
+			/* prohibit "sd->flags |= flags" for allnodes_domain */
 			cpu_to_allnodes_group(i, cpu_map, &sd->groups);
 			p = sd;
 			sd_allnodes = 1;
@@ -6636,6 +6637,7 @@ static int build_sched_domains(const cpu
 		sd = &per_cpu(node_domains, i);
 		*sd = SD_NODE_INIT;
 		sd->span = sched_domain_node_span(cpu_to_node(i));
+		sd->flags |= flags;
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -6646,6 +6648,7 @@ static int build_sched_domains(const cpu
 		sd = &per_cpu(phys_domains, i);
 		*sd = SD_CPU_INIT;
 		sd->span = nodemask;
+		sd->flags |= flags;
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -6657,6 +6660,7 @@ static int build_sched_domains(const cpu
 		*sd = SD_MC_INIT;
 		sd->span = cpu_coregroup_map(i);
 		cpus_and(sd->span, sd->span, *cpu_map);
+		sd->flags |= flags;
 		sd->parent = p;
 		p->child = sd;
 		cpu_to_core_group(i, cpu_map, &sd->groups);
@@ -6668,6 +6672,7 @@ static int build_sched_domains(const cpu
 		*sd = SD_SIBLING_INIT;
 		sd->span = per_cpu(cpu_sibling_map, i);
 		cpus_and(sd->span, sd->span, *cpu_map);
+		sd->flags |= flags;
 		sd->parent = p;
 		p->child = sd;
 		cpu_to_cpu_group(i, cpu_map, &sd->groups);
@@ -6840,8 +6845,14 @@ error:
 #endif
 }

+static int build_sched_domains(const cpumask_t *cpu_map)
+{
+	return __build_sched_domains(cpu_map, 0);
+}
+
 static cpumask_t *doms_cur;	/* current sched domains */
 static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
+static int *flags_cur;		/* custom flags of domains in 'doms_cur' */

 /*
  * Special case: If a kmalloc of a doms_cur partition (array of
@@ -6868,6 +6879,7 @@ static int arch_init_sched_domains(const
 	doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
 	if (!doms_cur)
 		doms_cur = &fallback_doms;
+	flags_cur = NULL;
 	cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
 	err = build_sched_domains(doms_cur);
 	register_sched_domain_sysctl();
@@ -6896,6 +6908,16 @@ static void detach_destroy_domains(const
 	arch_destroy_sched_domains(cpu_map);
 }

+/* handle null as 0s array */
+static inline int flags_equal(int *cur, int idx_cur, int *new, int idx_new)
+{
+	if (!new)
+		return (!cur || !cur[idx_cur]);
+	if (!cur)
+		return (!new[idx_new]);
+	return (cur[idx_cur] == new[idx_new]);
+}
+
 /*
  * Partition sched domains as specified by the 'ndoms_new'
  * cpumasks in the array doms_new[] of cpumasks. This compares
@@ -6917,7 +6939,7 @@ static void detach_destroy_domains(const
  *
  * Call with hotplug lock held
  */
-void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
+void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, int *flags_new)
 {
 	int i, j;

@@ -6929,13 +6951,15 @@ void partition_sched_domains(int ndoms_n
 	if (doms_new == NULL) {
 		ndoms_new = 1;
 		doms_new = &fallback_doms;
+		flags_new = NULL;
 		cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
 	}

 	/* Destroy deleted domains */
 	for (i = 0; i < ndoms_cur; i++) {
 		for (j = 0; j < ndoms_new; j++) {
-			if (cpus_equal(doms_cur[i], doms_new[j]))
+			if (cpus_equal(doms_cur[i], doms_new[j])
+			    && flags_equal(flags_cur, i, flags_new, j))
 				goto match1;
 		}
 		/* no match - a current sched domain not in new doms_new[] */
@@ -6947,11 +6971,13 @@ match1:
 	/* Build new domains */
 	for (i = 0; i < ndoms_new; i++) {
 		for (j = 0; j < ndoms_cur; j++) {
-			if (cpus_equal(doms_new[i], doms_cur[j]))
+			if (cpus_equal(doms_new[i], doms_cur[j])
+			    && flags_equal(flags_new, i, flags_cur, j))
 				goto match2;
 		}
 		/* no match - add a new doms_new */
-		build_sched_domains(doms_new + i);
+		__build_sched_domains(doms_new + i,
+					flags_new ? flags_new[i] : 0);
 match2:
 		;
 	}
@@ -6959,7 +6985,9 @@ match2:
 	/* Remember the new sched domains */
 	if (doms_cur != &fallback_doms)
 		kfree(doms_cur);
+	kfree(flags_cur);	/* kfree(NULL) is safe */
 	doms_cur = doms_new;
+	flags_cur = flags_new;
 	ndoms_cur = ndoms_new;

 	register_sched_domain_sysctl();
Index: GIT-torvalds/include/asm-ia64/topology.h
===================================================================
--- GIT-torvalds.orig/include/asm-ia64/topology.h
+++ GIT-torvalds/include/asm-ia64/topology.h
@@ -93,7 +93,7 @@ void build_cpu_to_node_map(void);
 	.cache_nice_tries	= 2,			\
 	.busy_idx		= 3,			\
 	.idle_idx		= 2,			\
-	.newidle_idx		= 0, /* unused */	\
+	.newidle_idx		= 2,			\
 	.wake_idx		= 1,			\
 	.forkexec_idx		= 1,			\
 	.flags			= SD_LOAD_BALANCE	\
Index: GIT-torvalds/include/asm-sh/topology.h
===================================================================
--- GIT-torvalds.orig/include/asm-sh/topology.h
+++ GIT-torvalds/include/asm-sh/topology.h
@@ -16,7 +16,7 @@
 	.cache_nice_tries	= 2,			\
 	.busy_idx		= 3,			\
 	.idle_idx		= 2,			\
-	.newidle_idx		= 0,			\
+	.newidle_idx		= 2,			\
 	.wake_idx		= 1,			\
 	.forkexec_idx		= 1,			\
 	.flags			= SD_LOAD_BALANCE	\
Index: GIT-torvalds/include/asm-x86/topology.h
===================================================================
--- GIT-torvalds.orig/include/asm-x86/topology.h
+++ GIT-torvalds/include/asm-x86/topology.h
@@ -129,7 +129,7 @@ extern unsigned long node_remap_size[];

 # define SD_CACHE_NICE_TRIES	2
 # define SD_IDLE_IDX		2
-# define SD_NEWIDLE_IDX		0
+# define SD_NEWIDLE_IDX		2
 # define SD_FORKEXEC_IDX	1

 #endif
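
(Editorial aside: the flags_equal() helper in the sched.c hunk above
treats a NULL flags array as all zeros, so domains that were built
without custom flags still match a NULL array and are not rebuilt
needlessly. A self-contained userspace restatement, with made-up
values for illustration:)

	#include <assert.h>
	#include <stddef.h>

	/* Same logic as flags_equal() in the patch: NULL acts as 0s. */
	static int flags_equal(int *cur, int idx_cur, int *new, int idx_new)
	{
		if (!new)
			return (!cur || !cur[idx_cur]);
		if (!cur)
			return (!new[idx_new]);
		return (cur[idx_cur] == new[idx_new]);
	}

	int main(void)
	{
		int flags[] = { 0, 2048 };	/* 2048: SD_WAKE_IDLE_FAR above */

		assert(flags_equal(NULL, 0, flags, 0));		/* NULL matches a zero entry */
		assert(!flags_equal(NULL, 0, flags, 1));	/* but not a nonzero one */
		assert(flags_equal(flags, 1, flags, 1));	/* equal entries match */
		return 0;
	}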



* Re: [PATCH 2/2] Customize sched domain via cpuset
  2008-04-01 11:27 [PATCH 2/2] Customize sched domain via cpuset Hidetoshi Seto
@ 2008-04-01 11:51 ` Peter Zijlstra
  2008-04-02  8:41   ` Hidetoshi Seto
  0 siblings, 1 reply; 4+ messages in thread
From: Peter Zijlstra @ 2008-04-01 11:51 UTC (permalink / raw)
  To: Hidetoshi Seto; +Cc: linux-kernel, Ingo Molnar, Paul Jackson

On Tue, 2008-04-01 at 20:27 +0900, Hidetoshi Seto wrote:
> The implementation is here.
> 
>  - Add 2 new cpuset files:
>      sched_wake_idle_far
>      sched_balance_newidle_far
> 
>  - Modify partition_sched_domains() and build_sched_domains()
>    to take a flags parameter passed from cpuset.
> 
>  - Fill in newidle_idx for node domains; it is currently unused but
>    might be required for sched_balance_newidle_far.

Just to be clear; the same effect can be had by poking into:

 /proc/sys/kernel/sched_domain/$cpu/$domain/flags

but the interface you now propose is more stable: with the sysctl
interface you'd have to re-do your settings after every cpuset change
(admittedly those are rare, but I see how it could be a nuisance).
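
(Editorial aside: a rough sketch of poking that debug interface. The
cpu0/domain0 path and the SD_* values below are assumptions based on
the SD_* definitions of this era, not something given in this thread.)

	#include <stdio.h>

	#define SD_BALANCE_NEWIDLE	2	/* assumed value */
	#define SD_WAKE_IDLE		16	/* assumed value */

	int main(void)
	{
		const char *path =
			"/proc/sys/kernel/sched_domain/cpu0/domain0/flags";
		FILE *f = fopen(path, "r");
		int flags = 0;

		if (!f)
			return 1;
		if (fscanf(f, "%d", &flags) != 1) {
			fclose(f);
			return 1;
		}
		fclose(f);

		f = fopen(path, "w");	/* needs root; CONFIG_SCHED_DEBUG only */
		if (!f)
			return 1;
		fprintf(f, "%d\n", flags | SD_BALANCE_NEWIDLE | SD_WAKE_IDLE);
		fclose(f);
		return 0;
	}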

Or do you actually add something that wasn't available through the
initial domain interface?

> [The remainder of the patch was quoted verbatim in this reply; snipped
> here, as it is identical to the patch in the message above.]



* Re: [PATCH 2/2] Customize sched domain via cpuset
  2008-04-01 11:51 ` Peter Zijlstra
@ 2008-04-02  8:41   ` Hidetoshi Seto
  2008-04-02  8:44     ` Peter Zijlstra
  0 siblings, 1 reply; 4+ messages in thread
From: Hidetoshi Seto @ 2008-04-02  8:41 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: linux-kernel, Ingo Molnar, Paul Jackson

Peter Zijlstra wrote:
> On Tue, 2008-04-01 at 20:27 +0900, Hidetoshi Seto wrote:
>> The implementation is here.
>>
>>  - Add 2 new cpuset files:
>>      sched_wake_idle_far
>>      sched_balance_newidle_far
>>
>>  - Modify partition_sched_domains() and build_sched_domains()
>>    to take a flags parameter passed from cpuset.
>>
>>  - Fill in newidle_idx for node domains; it is currently unused but
>>    might be required for sched_balance_newidle_far.
> 
> Just to be clear; the same effect can be had by poking into:
> 
>  /proc/sys/kernel/sched_domain/$cpu/$domain/flags
> 
> but the interface you now propose is more stable: with the sysctl
> interface you'd have to re-do your settings after every cpuset change
> (admittedly those are rare, but I see how it could be a nuisance).

And the sysctl entry "sched_domain" is not available unless SCHED_DEBUG
is set.

So it is commonly understood that this sysctl is not open to the public
yet, and that the expected users are scheduler developers, Ingo and
friends.

> Or do you actually add something that wasn't available through the
> initial domain interface?

At this time I have no concrete example, but it would be possible if
there were something unreasonable on the global system but acceptable
on a part of it.

In other words, we can invent other sched_* families which have special
effects that the "default scheduler" never has.

Thanks,
H.Seto

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH 2/2] Customize sched domain via cpuset
  2008-04-02  8:41   ` Hidetoshi Seto
@ 2008-04-02  8:44     ` Peter Zijlstra
  0 siblings, 0 replies; 4+ messages in thread
From: Peter Zijlstra @ 2008-04-02  8:44 UTC (permalink / raw)
  To: Hidetoshi Seto; +Cc: linux-kernel, Ingo Molnar, Paul Jackson

On Wed, 2008-04-02 at 17:41 +0900, Hidetoshi Seto wrote:
> Peter Zijlstra wrote:
> > On Tue, 2008-04-01 at 20:27 +0900, Hidetoshi Seto wrote:
> >> The implementation is here.
> >>
> >>  - Add 2 new cpuset files:
> >>      sched_wake_idle_far
> >>      sched_balance_newidle_far
> >>
> >>  - Modify partition_sched_domains() and build_sched_domains()
> >>    to take a flags parameter passed from cpuset.
> >>
> >>  - Fill in newidle_idx for node domains; it is currently unused but
> >>    might be required for sched_balance_newidle_far.
> > 
> > Just to be clear; the same effect can be had by poking into:
> > 
> >  /proc/sys/kernel/sched_domain/$cpu/$domain/flags
> > 
> > but the interface you now propose is more stable: with the sysctl
> > interface you'd have to re-do your settings after every cpuset change
> > (admittedly those are rare, but I see how it could be a nuisance).
> 
> And the sysctl entry "sched_domain" is not available unless SCHED_DEBUG
> is set.
> 
> So it is commonly understood that this sysctl is not open to the public
> yet, and that the expected users are scheduler developers, Ingo and
> friends.

Ah, right, totally forgot about that :-)

> > Or do you actually add something that wasn't available through the
> > initial domain interface?
> 
> At this time I have no concrete example, but it would be possible if
> there were something unreasonable on the global system but acceptable
> on a part of it.
> 
> In other words, we can invent other sched_* families which have special
> effects that the "default scheduler" never has.

I was asking about this patch specifically, and the answer seems to be:
no, we don't add anything that wasn't already available.

And yes, I see the possibilities to extend this :-)



