LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH] sched: Add a new version sysctl to control child runs first
@ 2021-09-12  4:12 cgel.zte
  2021-09-13  8:13 ` Peter Zijlstra
  0 siblings, 1 reply; 6+ messages in thread
From: cgel.zte @ 2021-09-12  4:12 UTC (permalink / raw)
  To: peterz, yzaikin, liu.hailong6
  Cc: mingo, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, mcgrof, keescook, pjt, yang.yang29,
	joshdon, linux-kernel, linux-fsdevel, Zeal Robot

From: Yang Yang <yang.yang29@zte.com.cn>

The old version sysctl has some problems. First, it allows set value
bigger than 1, which is unnecessary. Second, it didn't follow the
rule of capabilities. Thirdly, it didn't use static key. This new
version fixes all the problems.

Signed-off-by: Yang Yang <yang.yang29@zte.com.cn>
Reported-by: Zeal Robot <zealci@zte.com.cm>
---
 include/linux/sched/sysctl.h |  2 ++
 kernel/sched/core.c          | 35 +++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c          |  3 ++-
 kernel/sched/sched.h         |  1 +
 kernel/sysctl.c              |  6 ++++--
 5 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 304f431178fd..0a194d0cf692 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -74,6 +74,8 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer,
 		size_t *lenp, loff_t *ppos);
 int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
 		size_t *lenp, loff_t *ppos);
+int sysctl_child_runs_first(struct ctl_table *table, int write,
+		void *buffer, size_t *lenp, loff_t *ppos);
 
 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
 extern unsigned int sysctl_sched_energy_aware;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c4462c454ab9..bfea7ecf3b83 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4323,6 +4323,41 @@ int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
 #endif /* CONFIG_PROC_SYSCTL */
 #endif /* CONFIG_SCHEDSTATS */
 
+DEFINE_STATIC_KEY_FALSE(child_runs_first);
+
+static void set_child_runs_first(bool enabled)
+{
+	if (enabled) {
+		static_branch_enable(&child_runs_first);
+		sysctl_sched_child_runs_first = 1;
+	} else {
+		static_branch_disable(&child_runs_first);
+		sysctl_sched_child_runs_first = 0;
+	}
+}
+
+#ifdef CONFIG_PROC_SYSCTL
+int sysctl_child_runs_first(struct ctl_table *table, int write,
+		void *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table t;
+	int err;
+	int state = static_branch_likely(&child_runs_first);
+
+	if (write && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	t = *table;
+	t.data = &state;
+	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+	if (err < 0)
+		return err;
+	if (write)
+		set_child_runs_first(state);
+	return err;
+}
+#endif /* CONFIG_PROC_SYSCTL */
+
 /*
  * fork()/clone()-time setup:
  */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ff69f245b939..f6d4307bd654 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -11099,7 +11099,8 @@ static void task_fork_fair(struct task_struct *p)
 	}
 	place_entity(cfs_rq, se, 1);
 
-	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
+	if (static_branch_unlikely(&child_runs_first) &&
+	    curr && entity_before(curr, se)) {
 		/*
 		 * Upon rescheduling, sched_class::put_prev_task() will place
 		 * 'current' within the tree based on its new key value.
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3d3e5793e117..89ac11e48173 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2002,6 +2002,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features =
 
 extern struct static_key_false sched_numa_balancing;
 extern struct static_key_false sched_schedstats;
+DECLARE_STATIC_KEY_FALSE(child_runs_first);
 
 static inline u64 global_rt_period(void)
 {
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 083be6af29d7..72063cffc565 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1773,10 +1773,12 @@ int proc_do_static_key(struct ctl_table *table, int write,
 static struct ctl_table kern_table[] = {
 	{
 		.procname	= "sched_child_runs_first",
-		.data		= &sysctl_sched_child_runs_first,
+		.data		= NULL,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= sysctl_child_runs_first,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #ifdef CONFIG_SCHEDSTATS
 	{
-- 
2.25.1


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] sched: Add a new version sysctl to control child runs first
  2021-09-12  4:12 [PATCH] sched: Add a new version sysctl to control child runs first cgel.zte
@ 2021-09-13  8:13 ` Peter Zijlstra
  2021-09-13 11:37   ` CGEL
  0 siblings, 1 reply; 6+ messages in thread
From: Peter Zijlstra @ 2021-09-13  8:13 UTC (permalink / raw)
  To: cgel.zte
  Cc: yzaikin, liu.hailong6, mingo, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot, mcgrof,
	keescook, pjt, yang.yang29, joshdon, linux-kernel, linux-fsdevel,
	Zeal Robot

On Sun, Sep 12, 2021 at 04:12:23AM +0000, cgel.zte@gmail.com wrote:
> From: Yang Yang <yang.yang29@zte.com.cn>
> 
> The old version sysctl has some problems. First, it allows set value
> bigger than 1, which is unnecessary. Second, it didn't follow the
> rule of capabilities. Thirdly, it didn't use static key. This new
> version fixes all the problems.

Does any of that actually matter?

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] sched: Add a new version sysctl to control child runs first
  2021-09-13  8:13 ` Peter Zijlstra
@ 2021-09-13 11:37   ` CGEL
  2021-09-13 13:42     ` Peter Zijlstra
  0 siblings, 1 reply; 6+ messages in thread
From: CGEL @ 2021-09-13 11:37 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: yzaikin, liu.hailong6, mingo, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot, mcgrof,
	keescook, pjt, yang.yang29, joshdon, linux-kernel, linux-fsdevel,
	Zeal Robot

On Mon, Sep 13, 2021 at 10:13:54AM +0200, Peter Zijlstra wrote:
> On Sun, Sep 12, 2021 at 04:12:23AM +0000, cgel.zte@gmail.com wrote:
> > From: Yang Yang <yang.yang29@zte.com.cn>
> > 
> > The old version sysctl has some problems. First, it allows set value
> > bigger than 1, which is unnecessary. Second, it didn't follow the
> > rule of capabilities. Thirdly, it didn't use static key. This new
> > version fixes all the problems.
> 
> Does any of that actually matter?

For the first problem, I think the reason why sysctl_schedstats() only
accepts 0 or 1, is suitbale for sysctl_child_runs_first(). Since
task_fork_fair() only need sysctl_sched_child_runs_first to be
zero or non-zero.

For the second problem, I remember there is a rule: try to
administration system through capilities but not depends on
root identity. Just like sysctl_schedstats() or other
sysctl_xx().

For the thirdly problem, sysctl_child_runs_first maynot changes
often, but may accessed often, like static_key delayacct_key
controlled by sysctl_delayacct().

Thanks!

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] sched: Add a new version sysctl to control child runs first
  2021-09-13 11:37   ` CGEL
@ 2021-09-13 13:42     ` Peter Zijlstra
  2021-09-14  4:05       ` CGEL
       [not found]       ` <20210914040524.GA141438@cgel.zte@gmail.com>
  0 siblings, 2 replies; 6+ messages in thread
From: Peter Zijlstra @ 2021-09-13 13:42 UTC (permalink / raw)
  To: CGEL
  Cc: yzaikin, liu.hailong6, mingo, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot, mcgrof,
	keescook, pjt, yang.yang29, joshdon, linux-kernel, linux-fsdevel,
	Zeal Robot

On Mon, Sep 13, 2021 at 11:37:31AM +0000, CGEL wrote:
> On Mon, Sep 13, 2021 at 10:13:54AM +0200, Peter Zijlstra wrote:
> > On Sun, Sep 12, 2021 at 04:12:23AM +0000, cgel.zte@gmail.com wrote:
> > > From: Yang Yang <yang.yang29@zte.com.cn>
> > > 
> > > The old version sysctl has some problems. First, it allows set value
> > > bigger than 1, which is unnecessary. Second, it didn't follow the
> > > rule of capabilities. Thirdly, it didn't use static key. This new
> > > version fixes all the problems.
> > 
> > Does any of that actually matter?
> 
> For the first problem, I think the reason why sysctl_schedstats() only
> accepts 0 or 1, is suitbale for sysctl_child_runs_first(). Since
> task_fork_fair() only need sysctl_sched_child_runs_first to be
> zero or non-zero.

This could potentially break people that already write a larger value in
it -- by accident or otherwise.

> For the second problem, I remember there is a rule: try to
> administration system through capilities but not depends on
> root identity. Just like sysctl_schedstats() or other
> sysctl_xx().

It seems entirely daft to me; those files are already 644, if root opens
the file and passes it along, it gets to keep the pieces.

> For the thirdly problem, sysctl_child_runs_first maynot changes
> often, but may accessed often, like static_key delayacct_key
> controlled by sysctl_delayacct().

Can you actually show it makes a performance difference in a fork
micro-bench? Given the amount of gunk fork() already does, I don't think
it'll matter one way or the other, and in that case, simpler is better.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] sched: Add a new version sysctl to control child runs first
  2021-09-13 13:42     ` Peter Zijlstra
@ 2021-09-14  4:05       ` CGEL
       [not found]       ` <20210914040524.GA141438@cgel.zte@gmail.com>
  1 sibling, 0 replies; 6+ messages in thread
From: CGEL @ 2021-09-14  4:05 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: yzaikin, liu.hailong6, mingo, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot, mcgrof,
	keescook, pjt, yang.yang29, joshdon, linux-kernel, linux-fsdevel,
	Zeal Robot

esOn Mon, Sep 13, 2021 at 03:42:45PM +0200, Peter Zijlstra wrote:
> On Mon, Sep 13, 2021 at 11:37:31AM +0000, CGEL wrote:
> > On Mon, Sep 13, 2021 at 10:13:54AM +0200, Peter Zijlstra wrote:
> > > On Sun, Sep 12, 2021 at 04:12:23AM +0000, cgel.zte@gmail.com wrote:
> > > > From: Yang Yang <yang.yang29@zte.com.cn>
> > > > 
> > > > The old version sysctl has some problems. First, it allows set value
> > > > bigger than 1, which is unnecessary. Second, it didn't follow the
> > > > rule of capabilities. Thirdly, it didn't use static key. This new
> > > > version fixes all the problems.
> > > 
> > > Does any of that actually matter?
> > 
> > For the first problem, I think the reason why sysctl_schedstats() only
> > accepts 0 or 1, is suitbale for sysctl_child_runs_first(). Since
> > task_fork_fair() only need sysctl_sched_child_runs_first to be
> > zero or non-zero.
> 
> This could potentially break people that already write a larger value in
> it -- by accident or otherwise.

Thanks for reply!

You mean it's right to set sched_child_runs_first 0 or 1, but consider about
compatibility, just leave it?
Should stable/longterm branches keep compatibility, but linux-next fixes it?

Let's take a look at negative influence about unnecessary values of sysctl.
Some tune tools will automatic to set different values of sysctl to see
performance impact. So invalid values may waste tune tools's time, specially
when the range of values is big.

For example A-Tune, see below:
https://docs.openeuler.org/zh/docs/20.03_LTS/docs/A-Tune/%E8%AE%A4%E8%AF%86A-Tune.html 
Since it's wroten in Chinese, I try to explain it in short.
A-Tune modeling sysctls first(what values sysctls accept), then automatic to iterate
different values to find the best combination of sysctl values for the workload.

> 
> > For the second problem, I remember there is a rule: try to
> > administration system through capilities but not depends on
> > root identity. Just like sysctl_schedstats() or other
> > sysctl_xx().
> 
> It seems entirely daft to me; those files are already 644, if root opens
> the file and passes it along, it gets to keep the pieces.
> 

I think it's indeed a little tricky: root may drop it's own capabilites.
Let's see another example of netdev_store(), root can't modify netdev
attribute without CAP_NET_ADMIN, even it pass the 644 DAC check.

> > For the thirdly problem, sysctl_child_runs_first maynot changes
> > often, but may accessed often, like static_key delayacct_key
> > controlled by sysctl_delayacct().
> 
> Can you actually show it makes a performance difference in a fork
> micro-bench? Given the amount of gunk fork() already does, I don't think
> it'll matter one way or the other, and in that case, simpler is better.

With 5.14-rc6 and gcc6.2.0, this patch will reduce test instruct in
task_fork_fair() as Documentation/staging/static-keys.rst said.
Since task_fork_fair() may called often, I think it's OK to use static
key, actually there are quit a lot static keys in kernel/xx.

When talk about simply, maybe keep in consistent with other sysctls like
task_delayacct() is also a kind of simply in code style.

Before this patch:
ffff810a5c60 <task_fork_fair>:
..
ffffffff810a5cf3: e8 a8 b3 ff ff       callq ffffffff810a10a0 <place_entity>
ffffffff810a5cf8: 8b 05 e2 b5 5d 01    mov 0x15db5e2(%rip),%eax # ffffffff826812e0 <sysctl_sched_child_runs_first>
ffffffff810a5cfe: 85 c0                test %eax,%eax
ffffffff810a5d00: 74 5b                je ffffffff810a5d5d <task_fork_fair+0xfd>
ffffffff810a5d02: 49 8b 55 50          mov 0x50(%r13),%rdx
ffffffff810a5d06: 49 8b 84 24 10 01 00 mov 0x110(%r12),%rax
ffffffff810a5d0d: 00 
ffffffff810a5d0e: 48 39 c2             cmp %rax,%rdx
ffffffff810a5d11: 78 36                js ffffffff810a5d49 <task_fork_fair+0xe9>
ffffffff810a5d13: 48 2b 45 28          sub    0x28(%rbp),%rax

After this patch:
ffffffff810a5c60 <task_fork_fair>:
..
ffffffff810a5cf3: e8 a8 b3 ff ff       callq  ffffffff810a10a0 <place_entity>
ffffffff810a5cf8: 66 90                xchg   %ax,%ax
ffffffff810a5cfa: 49 8b 84 24 10 01 00 mov    0x110(%r12),%rax
ffffffff810a5d01: 00
ffffffff810a5d02: 48 2b 45 28          sub    0x28(%rbp),%rax

Thanks!

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] sched: Add a new version sysctl to control child runs first
       [not found]       ` <20210914040524.GA141438@cgel.zte@gmail.com>
@ 2021-10-07  3:26         ` CGEL
  0 siblings, 0 replies; 6+ messages in thread
From: CGEL @ 2021-10-07  3:26 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: yzaikin, mingo, juri.lelli, vincent.guittot, dietmar.eggemann,
	rostedt, bsegall, mgorman, bristot, mcgrof, keescook, pjt,
	yang.yang29, joshdon, linux-kernel, linux-fsdevel, Zeal Robot

On Tue, Sep 14, 2021 at 04:05:26AM +0000, CGEL wrote:
> esOn Mon, Sep 13, 2021 at 03:42:45PM +0200, Peter Zijlstra wrote:
> > On Mon, Sep 13, 2021 at 11:37:31AM +0000, CGEL wrote:
> > > On Mon, Sep 13, 2021 at 10:13:54AM +0200, Peter Zijlstra wrote:
> > > > On Sun, Sep 12, 2021 at 04:12:23AM +0000, cgel.zte@gmail.com wrote:
> > > > > From: Yang Yang <yang.yang29@zte.com.cn>
> > > > > 
> > > > > The old version sysctl has some problems. First, it allows set value
> > > > > bigger than 1, which is unnecessary. Second, it didn't follow the
> > > > > rule of capabilities. Thirdly, it didn't use static key. This new
> > > > > version fixes all the problems.
> > > > 
> > > > Does any of that actually matter?
> > > 
> > > For the first problem, I think the reason why sysctl_schedstats() only
> > > accepts 0 or 1, is suitbale for sysctl_child_runs_first(). Since
> > > task_fork_fair() only need sysctl_sched_child_runs_first to be
> > > zero or non-zero.
> > 
> > This could potentially break people that already write a larger value in
> > it -- by accident or otherwise.
> 
> Thanks for reply!
> 
> You mean it's right to set sched_child_runs_first 0 or 1, but consider about
> compatibility, just leave it?
> Should stable/longterm branches keep compatibility, but linux-next fixes it?
> 
> Let's take a look at negative influence about unnecessary values of sysctl.
> Some tune tools will automatic to set different values of sysctl to see
> performance impact. So invalid values may waste tune tools's time, specially
> when the range of values is big.
> 
> For example A-Tune, see below:
> https://docs.openeuler.org/zh/docs/20.03_LTS/docs/A-Tune/%E8%AE%A4%E8%AF%86A-Tune.html 
> Since it's wroten in Chinese, I try to explain it in short.
> A-Tune modeling sysctls first(what values sysctls accept), then automatic to iterate
> different values to find the best combination of sysctl values for the workload.
>
Hi 
Should modify this path or just abandon it?

> > 
> > > For the second problem, I remember there is a rule: try to
> > > administration system through capilities but not depends on
> > > root identity. Just like sysctl_schedstats() or other
> > > sysctl_xx().
> > 
> > It seems entirely daft to me; those files are already 644, if root opens
> > the file and passes it along, it gets to keep the pieces.
> > 
> 
> I think it's indeed a little tricky: root may drop it's own capabilites.
> Let's see another example of netdev_store(), root can't modify netdev
> attribute without CAP_NET_ADMIN, even it pass the 644 DAC check.
> 
> > > For the thirdly problem, sysctl_child_runs_first maynot changes
> > > often, but may accessed often, like static_key delayacct_key
> > > controlled by sysctl_delayacct().
> > 
> > Can you actually show it makes a performance difference in a fork
> > micro-bench? Given the amount of gunk fork() already does, I don't think
> > it'll matter one way or the other, and in that case, simpler is better.
> 
> With 5.14-rc6 and gcc6.2.0, this patch will reduce test instruct in
> task_fork_fair() as Documentation/staging/static-keys.rst said.
> Since task_fork_fair() may called often, I think it's OK to use static
> key, actually there are quit a lot static keys in kernel/xx.
> 
> When talk about simply, maybe keep in consistent with other sysctls like
> task_delayacct() is also a kind of simply in code style.
> 
> Before this patch:
> ffff810a5c60 <task_fork_fair>:
> ..
> ffffffff810a5cf3: e8 a8 b3 ff ff       callq ffffffff810a10a0 <place_entity>
> ffffffff810a5cf8: 8b 05 e2 b5 5d 01    mov 0x15db5e2(%rip),%eax # ffffffff826812e0 <sysctl_sched_child_runs_first>
> ffffffff810a5cfe: 85 c0                test %eax,%eax
> ffffffff810a5d00: 74 5b                je ffffffff810a5d5d <task_fork_fair+0xfd>
> ffffffff810a5d02: 49 8b 55 50          mov 0x50(%r13),%rdx
> ffffffff810a5d06: 49 8b 84 24 10 01 00 mov 0x110(%r12),%rax
> ffffffff810a5d0d: 00 
> ffffffff810a5d0e: 48 39 c2             cmp %rax,%rdx
> ffffffff810a5d11: 78 36                js ffffffff810a5d49 <task_fork_fair+0xe9>
> ffffffff810a5d13: 48 2b 45 28          sub    0x28(%rbp),%rax
> 
> After this patch:
> ffffffff810a5c60 <task_fork_fair>:
> ..
> ffffffff810a5cf3: e8 a8 b3 ff ff       callq  ffffffff810a10a0 <place_entity>
> ffffffff810a5cf8: 66 90                xchg   %ax,%ax
> ffffffff810a5cfa: 49 8b 84 24 10 01 00 mov    0x110(%r12),%rax
> ffffffff810a5d01: 00
> ffffffff810a5d02: 48 2b 45 28          sub    0x28(%rbp),%rax
> 
> Thanks!

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2021-10-07  3:26 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-09-12  4:12 [PATCH] sched: Add a new version sysctl to control child runs first cgel.zte
2021-09-13  8:13 ` Peter Zijlstra
2021-09-13 11:37   ` CGEL
2021-09-13 13:42     ` Peter Zijlstra
2021-09-14  4:05       ` CGEL
     [not found]       ` <20210914040524.GA141438@cgel.zte@gmail.com>
2021-10-07  3:26         ` CGEL

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).