LKML Archive on lore.kernel.org
* [rfc] balance-on-fork NUMA placement
@ 2007-07-31  5:41 Nick Piggin
  2007-07-31  8:01 ` Ingo Molnar
  2007-07-31  9:14 ` Andi Kleen
  0 siblings, 2 replies; 27+ messages in thread
From: Nick Piggin @ 2007-07-31  5:41 UTC (permalink / raw)
  To: Andi Kleen, Ingo Molnar, Linux Kernel Mailing List,
	Linux Memory Management List

Hi,

I haven't given this idea testing yet, but I just wanted to get some
opinions on it first. NUMA placement still isn't ideal (eg. tasks with
a memory policy will not do any placement, and process migrations of
course will leave the memory behind...), but it does give a bit more
chance for the memory controllers and interconnects to get evenly
loaded.

The primary reason we currently do balance-on-fork is to improve the
NUMA placement of user memory; on the basis that this is useful, I think
it should be useful for kernel memory too?

---
NUMA balance-on-fork code is in a good position to allocate all of a new
process's memory on a chosen node. However, it really only starts allocating
on the correct node after the process starts running.

task and thread structures, stack, mm_struct, vmas, page tables etc. are
all allocated on the parent's node.

This patch uses memory policies to attempt to improve this. It requires
that we ask the scheduler to suggest the child's new CPU earlier in the
fork, but that is not a fundamental difference.
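
In short, the resulting copy_process() ordering is (a rough sketch only;
error handling omitted; the real sequence is in the patch below):

	cpu = sched_fork_suggest_cpu(clone_flags);  /* pick the child's CPU early */
	mpol_arg = mpol_prefer_cpu_start(cpu);      /* prefer that CPU's node */
	p = dup_task_struct(current);               /* task_struct, thread_info, stack */
	/* ... mm_struct, vmas, page tables, etc. get copied here ... */
	sched_fork(p, clone_flags);
	__set_task_cpu(p, cpu);                     /* commit the CPU chosen above */
	/* ... */
	mpol_prefer_cpu_end(mpol_arg);              /* restore the parent's policy */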



Index: linux-2.6/include/linux/mempolicy.h
===================================================================
--- linux-2.6.orig/include/linux/mempolicy.h
+++ linux-2.6/include/linux/mempolicy.h
@@ -141,6 +141,8 @@ void mpol_free_shared_policy(struct shar
 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
 					    unsigned long idx);
 
+extern int mpol_prefer_cpu_start(int cpu);
+extern void mpol_prefer_cpu_end(int arg);
 extern void numa_default_policy(void);
 extern void numa_policy_init(void);
 extern void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *new);
@@ -227,6 +229,15 @@ mpol_shared_policy_lookup(struct shared_
 #define vma_policy(vma) NULL
 #define vma_set_policy(vma, pol) do {} while(0)
 
+static inline int mpol_prefer_cpu_start(int cpu)
+{
+	return 0;
+}
+
+static inline void mpol_prefer_cpu_end(int arg)
+{
+}
+
 static inline void numa_policy_init(void)
 {
 }
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1460,6 +1460,7 @@ extern void FASTCALL(wake_up_new_task(st
 #else
  static inline void kick_process(struct task_struct *tsk) { }
 #endif
+extern int sched_fork_suggest_cpu(int clone_flags);
 extern void sched_fork(struct task_struct *p, int clone_flags);
 extern void sched_dead(struct task_struct *p);
 
@@ -1782,6 +1783,7 @@ static inline unsigned int task_cpu(cons
 }
 
 extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
+extern void __set_task_cpu(struct task_struct *p, unsigned int cpu);
 
 #else
 
@@ -1794,6 +1796,10 @@ static inline void set_task_cpu(struct t
 {
 }
 
+static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+}
+
 #endif /* CONFIG_SMP */
 
 #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
Index: linux-2.6/kernel/fork.c
===================================================================
--- linux-2.6.orig/kernel/fork.c
+++ linux-2.6/kernel/fork.c
@@ -964,6 +964,7 @@ static struct task_struct *copy_process(
 					int __user *child_tidptr,
 					struct pid *pid)
 {
+	int cpu, mpol_arg;
 	int retval;
 	struct task_struct *p = NULL;
 
@@ -989,10 +990,13 @@ static struct task_struct *copy_process(
 	if (retval)
 		goto fork_out;
 
+	cpu = sched_fork_suggest_cpu(clone_flags);
+	mpol_arg = mpol_prefer_cpu_start(cpu);
+
 	retval = -ENOMEM;
 	p = dup_task_struct(current);
 	if (!p)
-		goto fork_out;
+		goto fork_mpol;
 
 	rt_mutex_init_task(p);
 
@@ -1183,7 +1187,7 @@ static struct task_struct *copy_process(
 	INIT_LIST_HEAD(&p->ptrace_children);
 	INIT_LIST_HEAD(&p->ptrace_list);
 
-	/* Perform scheduler related setup. Assign this task to a CPU. */
+	/* Perform scheduler related setup. */
 	sched_fork(p, clone_flags);
 
 	/* Need tasklist lock for parent etc handling! */
@@ -1193,6 +1197,7 @@ static struct task_struct *copy_process(
 	p->ioprio = current->ioprio;
 
 	/*
+	 * Assign this task to a CPU.
 	 * The task hasn't been attached yet, so its cpus_allowed mask will
 	 * not be changed, nor will its assigned CPU.
 	 *
@@ -1202,9 +1207,10 @@ static struct task_struct *copy_process(
 	 * parent's CPU). This avoids alot of nasty races.
 	 */
 	p->cpus_allowed = current->cpus_allowed;
-	if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
-			!cpu_online(task_cpu(p))))
-		set_task_cpu(p, smp_processor_id());
+	if (unlikely(!cpu_isset(cpu, p->cpus_allowed) ||
+			!cpu_online(cpu)))
+		cpu = smp_processor_id();
+	__set_task_cpu(p, cpu);
 
 	/* CLONE_PARENT re-uses the old parent */
 	if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
@@ -1274,6 +1280,7 @@ static struct task_struct *copy_process(
 	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
 	proc_fork_connector(p);
+	mpol_prefer_cpu_end(mpol_arg);
 	return p;
 
 bad_fork_cleanup_namespaces:
@@ -1315,6 +1322,8 @@ bad_fork_cleanup_count:
 	free_uid(p->user);
 bad_fork_free:
 	free_task(p);
+fork_mpol:
+	mpol_prefer_cpu_end(mpol_arg);
 fork_out:
 	return ERR_PTR(retval);
 }
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -981,16 +981,13 @@ unsigned long weighted_cpuload(const int
 	return cpu_rq(cpu)->ls.load.weight;
 }
 
-static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
-{
 #ifdef CONFIG_SMP
+void __set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
 	task_thread_info(p)->cpu = cpu;
 	set_task_cfs_rq(p);
-#endif
 }
 
-#ifdef CONFIG_SMP
-
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
 	int old_cpu = task_cpu(p);
@@ -1601,20 +1598,26 @@ static void __sched_fork(struct task_str
 	p->state = TASK_RUNNING;
 }
 
+int sched_fork_suggest_cpu(int clone_flags)
+{
+#ifdef CONFIG_SMP
+	int cpu, new_cpu;
+	cpu = get_cpu();
+	new_cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
+	put_cpu();
+	return new_cpu;
+#else
+	return 0;
+#endif
+}
+
 /*
  * fork()/clone()-time setup:
  */
 void sched_fork(struct task_struct *p, int clone_flags)
 {
-	int cpu = get_cpu();
-
 	__sched_fork(p);
 
-#ifdef CONFIG_SMP
-	cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
-#endif
-	__set_task_cpu(p, cpu);
-
 	/*
 	 * Make sure we do not leak PI boosting priority to the child:
 	 */
@@ -1631,7 +1634,6 @@ void sched_fork(struct task_struct *p, i
 	/* Want to start with kernel preemption disabled. */
 	task_thread_info(p)->preempt_count = 1;
 #endif
-	put_cpu();
 }
 
 /*
Index: linux-2.6/mm/mempolicy.c
===================================================================
--- linux-2.6.orig/mm/mempolicy.c
+++ linux-2.6/mm/mempolicy.c
@@ -1596,6 +1596,29 @@ void mpol_free_shared_policy(struct shar
 	spin_unlock(&p->lock);
 }
 
+int mpol_prefer_cpu_start(int cpu)
+{
+	nodemask_t prefer_node = nodemask_of_node(cpu_to_node(cpu));
+
+	/* Only change if we are MPOL_DEFAULT */
+	if (current->mempolicy)
+		return 0;
+
+	if (do_set_mempolicy(MPOL_PREFERRED, &prefer_node))
+		return 0;
+
+	return 1;
+}
+
+void mpol_prefer_cpu_end(int arg)
+{
+	if (!arg)
+		return;
+
+	if (do_set_mempolicy(MPOL_DEFAULT, NULL))
+		BUG();
+}
+
 /* assumes fs == KERNEL_DS */
 void __init numa_policy_init(void)
 {



* Re: [rfc] balance-on-fork NUMA placement
  2007-07-31  5:41 [rfc] balance-on-fork NUMA placement Nick Piggin
@ 2007-07-31  8:01 ` Ingo Molnar
  2007-08-01  0:21   ` Nick Piggin
  2007-07-31  9:14 ` Andi Kleen
  1 sibling, 1 reply; 27+ messages in thread
From: Ingo Molnar @ 2007-07-31  8:01 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andi Kleen, Linux Kernel Mailing List, Linux Memory Management List


* Nick Piggin <npiggin@suse.de> wrote:

> This patch uses memory policies to attempt to improve this. It 
> requires that we ask the scheduler to suggest the child's new CPU 
> earlier in the fork, but that is not a fundamental difference.

no fundamental objections, but i think we could simply move sched_fork() 
to the following place:

> @@ -989,10 +990,13 @@ static struct task_struct *copy_process(
>  	if (retval)
>  		goto fork_out;
>  
> +	cpu = sched_fork_suggest_cpu(clone_flags);
> +	mpol_arg = mpol_prefer_cpu_start(cpu);
> +
>  	retval = -ENOMEM;
>  	p = dup_task_struct(current);
>  	if (!p)
> -		goto fork_out;
> +		goto fork_mpol;
>  
>  	rt_mutex_init_task(p);


_after_ the dup_task_struct(). Then change sched_fork() to return a CPU 
number - hence we don't have a separate sched_fork_suggest_cpu() 
initialization function, only one, obvious sched_fork() function. 
Agreed?

	Ingo


* Re: [rfc] balance-on-fork NUMA placement
  2007-07-31  5:41 [rfc] balance-on-fork NUMA placement Nick Piggin
  2007-07-31  8:01 ` Ingo Molnar
@ 2007-07-31  9:14 ` Andi Kleen
  2007-07-31 23:40   ` Christoph Lameter
  2007-08-01  0:23   ` Nick Piggin
  1 sibling, 2 replies; 27+ messages in thread
From: Andi Kleen @ 2007-07-31  9:14 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Ingo Molnar, Linux Kernel Mailing List, Linux Memory Management List

On Tuesday 31 July 2007 07:41, Nick Piggin wrote:

> I haven't given this idea testing yet, but I just wanted to get some
> opinions on it first. NUMA placement still isn't ideal (eg. tasks with
> a memory policy will not do any placement, and process migrations of
> course will leave the memory behind...), but it does give a bit more
> chance for the memory controllers and interconnects to get evenly
> loaded.

I didn't think slab honored mempolicies by default? 
At least you seem to need to set special process flags.

> NUMA balance-on-fork code is in a good position to allocate all of a new
> process's memory on a chosen node. However, it really only starts
> allocating on the correct node after the process starts running.
>
> task and thread structures, stack, mm_struct, vmas, page tables etc. are
> all allocated on the parent's node.

The page tables should be only allocated when the process runs; except
for the PGD.

-Andi


* Re: [rfc] balance-on-fork NUMA placement
  2007-07-31  9:14 ` Andi Kleen
@ 2007-07-31 23:40   ` Christoph Lameter
  2007-08-01  8:39     ` Andi Kleen
  2007-08-02  3:42     ` Nick Piggin
  2007-08-01  0:23   ` Nick Piggin
  1 sibling, 2 replies; 27+ messages in thread
From: Christoph Lameter @ 2007-07-31 23:40 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Nick Piggin, Ingo Molnar, Linux Kernel Mailing List,
	Linux Memory Management List

On Tue, 31 Jul 2007, Andi Kleen wrote:

> On Tuesday 31 July 2007 07:41, Nick Piggin wrote:
> 
> > I haven't given this idea testing yet, but I just wanted to get some
> > opinions on it first. NUMA placement still isn't ideal (eg. tasks with
> > a memory policy will not do any placement, and process migrations of
> > course will leave the memory behind...), but it does give a bit more
> > chance for the memory controllers and interconnects to get evenly
> > loaded.
> 
> I didn't think slab honored mempolicies by default? 
> At least you seem to need to set special process flags.

It does in the sense that slabs are allocated following policies. If you 
want to place individual objects then you need to use kmalloc_node().


* Re: [rfc] balance-on-fork NUMA placement
  2007-07-31  8:01 ` Ingo Molnar
@ 2007-08-01  0:21   ` Nick Piggin
  2007-08-01  6:19     ` Ingo Molnar
  0 siblings, 1 reply; 27+ messages in thread
From: Nick Piggin @ 2007-08-01  0:21 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andi Kleen, Linux Kernel Mailing List, Linux Memory Management List

On Tue, Jul 31, 2007 at 10:01:14AM +0200, Ingo Molnar wrote:
> 
> * Nick Piggin <npiggin@suse.de> wrote:
> 
> > This patch uses memory policies to attempt to improve this. It 
> > requires that we ask the scheduler to suggest the child's new CPU 
> > earlier in the fork, but that is not a fundamental difference.
> 
> no fundamental objections, but i think we could simply move sched_fork() 
> to the following place:
> 
> > @@ -989,10 +990,13 @@ static struct task_struct *copy_process(
> >  	if (retval)
> >  		goto fork_out;
> >  
> > +	cpu = sched_fork_suggest_cpu(clone_flags);
> > +	mpol_arg = mpol_prefer_cpu_start(cpu);
> > +
> >  	retval = -ENOMEM;
> >  	p = dup_task_struct(current);
> >  	if (!p)
> > -		goto fork_out;
> > +		goto fork_mpol;
> >  
> >  	rt_mutex_init_task(p);
> 
> 
> _after_ the dup_task_struct(). Then change sched_fork() to return a CPU 
> number - hence we dont have a separate sched_fork_suggest_cpu() 
> initialization function, only one, obvious sched_fork() function. 
> Agreed?

That puts task struct, kernel stack, thread info on the wrong node.



* Re: [rfc] balance-on-fork NUMA placement
  2007-07-31  9:14 ` Andi Kleen
  2007-07-31 23:40   ` Christoph Lameter
@ 2007-08-01  0:23   ` Nick Piggin
  2007-08-01 17:53     ` Martin Bligh
  1 sibling, 1 reply; 27+ messages in thread
From: Nick Piggin @ 2007-08-01  0:23 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Ingo Molnar, Linux Kernel Mailing List, Linux Memory Management List

On Tue, Jul 31, 2007 at 11:14:08AM +0200, Andi Kleen wrote:
> On Tuesday 31 July 2007 07:41, Nick Piggin wrote:
> 
> > I haven't given this idea testing yet, but I just wanted to get some
> > opinions on it first. NUMA placement still isn't ideal (eg. tasks with
> > a memory policy will not do any placement, and process migrations of
> > course will leave the memory behind...), but it does give a bit more
> > chance for the memory controllers and interconnects to get evenly
> > loaded.
> 
> I didn't think slab honored mempolicies by default? 
> At least you seem to need to set special process flags.
> 
> > NUMA balance-on-fork code is in a good position to allocate all of a new
> > process's memory on a chosen node. However, it really only starts
> > allocating on the correct node after the process starts running.
> >
> > task and thread structures, stack, mm_struct, vmas, page tables etc. are
> > all allocated on the parent's node.
> 
> The page tables should be only allocated when the process runs; except
> for the PGD.

We certainly used to copy all page tables on fork. Not any more, but we
must still copy anonymous page tables.


* Re: [rfc] balance-on-fork NUMA placement
  2007-08-01  0:21   ` Nick Piggin
@ 2007-08-01  6:19     ` Ingo Molnar
  0 siblings, 0 replies; 27+ messages in thread
From: Ingo Molnar @ 2007-08-01  6:19 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andi Kleen, Linux Kernel Mailing List, Linux Memory Management List


* Nick Piggin <npiggin@suse.de> wrote:

> > _after_ the dup_task_struct(). Then change sched_fork() to return a 
> > CPU number - hence we dont have a separate sched_fork_suggest_cpu() 
> > initialization function, only one, obvious sched_fork() function. 
> > Agreed?
> 
> That puts task struct, kernel stack, thread info on the wrong node.

ok, i missed that - your patch looks then fine to me.

	Ingo


* Re: [rfc] balance-on-fork NUMA placement
  2007-07-31 23:40   ` Christoph Lameter
@ 2007-08-01  8:39     ` Andi Kleen
  2007-08-02  3:42     ` Nick Piggin
  1 sibling, 0 replies; 27+ messages in thread
From: Andi Kleen @ 2007-08-01  8:39 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nick Piggin, Ingo Molnar, Linux Kernel Mailing List,
	Linux Memory Management List

On Wednesday 01 August 2007 01:40:18 Christoph Lameter wrote:
 
> It does in the sense that slabs are allocated following policies. If you 
> want to place individual objects then you need to use kmalloc_node().

Nick wants to place individual objects here.

-Andi




* Re: [rfc] balance-on-fork NUMA placement
  2007-08-01  0:23   ` Nick Piggin
@ 2007-08-01 17:53     ` Martin Bligh
  2007-08-01 18:32       ` Lee Schermerhorn
  0 siblings, 1 reply; 27+ messages in thread
From: Martin Bligh @ 2007-08-01 17:53 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andi Kleen, Ingo Molnar, Linux Kernel Mailing List,
	Linux Memory Management List

Nick Piggin wrote:
> On Tue, Jul 31, 2007 at 11:14:08AM +0200, Andi Kleen wrote:
>> On Tuesday 31 July 2007 07:41, Nick Piggin wrote:
>>
>>> I haven't given this idea testing yet, but I just wanted to get some
>>> opinions on it first. NUMA placement still isn't ideal (eg. tasks with
>>> a memory policy will not do any placement, and process migrations of
>>> course will leave the memory behind...), but it does give a bit more
>>> chance for the memory controllers and interconnects to get evenly
>>> loaded.
>> I didn't think slab honored mempolicies by default? 
>> At least you seem to need to set special process flags.
>>
>>> NUMA balance-on-fork code is in a good position to allocate all of a new
>>> process's memory on a chosen node. However, it really only starts
>>> allocating on the correct node after the process starts running.
>>>
>>> task and thread structures, stack, mm_struct, vmas, page tables etc. are
>>> all allocated on the parent's node.
>> The page tables should be only allocated when the process runs; except
>> for the PGD.
> 
> We certainly used to copy all page tables on fork. Not any more, but we
> must still copy anonymous page tables.

This topic seems to come up periodically ever since we first introduced
the NUMA scheduler, and every time we decide it's a bad idea. What's
changed? What workloads does this improve (aside from some artificial
benchmark like stream)?

To repeat the conclusions of last time ... the primary problem is that
99% of the time, we exec after we fork, and it makes that fork/exec
cycle slower, not faster, so exec is generally a much better time to do
this. There's no good predictor of whether we'll exec after fork, unless
one has magically appeared since late 2.5.x ?

M.



* Re: [rfc] balance-on-fork NUMA placement
  2007-08-01 17:53     ` Martin Bligh
@ 2007-08-01 18:32       ` Lee Schermerhorn
  2007-08-01 22:52         ` Martin Bligh
  0 siblings, 1 reply; 27+ messages in thread
From: Lee Schermerhorn @ 2007-08-01 18:32 UTC (permalink / raw)
  To: Martin Bligh
  Cc: Nick Piggin, Andi Kleen, Ingo Molnar, Linux Kernel Mailing List,
	Linux Memory Management List, Eric Whitney

On Wed, 2007-08-01 at 10:53 -0700, Martin Bligh wrote:
> Nick Piggin wrote:
> > On Tue, Jul 31, 2007 at 11:14:08AM +0200, Andi Kleen wrote:
> >> On Tuesday 31 July 2007 07:41, Nick Piggin wrote:
> >>
> >>> I haven't given this idea testing yet, but I just wanted to get some
> >>> opinions on it first. NUMA placement still isn't ideal (eg. tasks with
> >>> a memory policy will not do any placement, and process migrations of
> >>> course will leave the memory behind...), but it does give a bit more
> >>> chance for the memory controllers and interconnects to get evenly
> >>> loaded.
> >> I didn't think slab honored mempolicies by default? 
> >> At least you seem to need to set special process flags.
> >>
> >>> NUMA balance-on-fork code is in a good position to allocate all of a new
> >>> process's memory on a chosen node. However, it really only starts
> >>> allocating on the correct node after the process starts running.
> >>>
> >>> task and thread structures, stack, mm_struct, vmas, page tables etc. are
> >>> all allocated on the parent's node.
> >> The page tables should be only allocated when the process runs; except
> >> for the PGD.
> > 
> > We certainly used to copy all page tables on fork. Not any more, but we
> > must still copy anonymous page tables.
> 
> This topic seems to come up periodically every since we first introduced
> the NUMA scheduler, and every time we decide it's a bad idea. What's
> changed? What workloads does this improve (aside from some artificial
> benchmark like stream)?
> 
> To repeat the conclusions of last time ... the primary problem is that
> 99% of the time, we exec after we fork, and it makes that fork/exec
> cycle slower, not faster, so exec is generally a much better time to do
> this. There's no good predictor of whether we'll exec after fork, unless
> one has magically appeared since late 2.5.x ?
> 

As Nick points out, one reason to balance on fork() rather than exec()
is that with balance on exec you already have the new task's kernel
structs allocated on the "wrong" node.  However, as you point out, this
slows down the fork/exec cycle.  This is especially noticeable on larger
node-count systems in, e.g., shell scripts that spawn a lot of short
lived child processes.  "Back in the day", we got bitten by this on the
Alpha EV7 [a.k.a. Marvel] platform with just ~64 nodes--small compared
to, say, the current Altix platform.  

On the other hand, if you're launching a few larger, long-lived
applications with any significant %-age of system time, you might want
to consider spreading them out across nodes and having their warmer
kernel data structures close to them.  A dilemma.

Altho' I was no longer working on this platform when this issue came up,
I believe that the kernel developers came up with something along these
lines:

+ define a "credit" member of the "task" struct, initialized to, say,
zero.

+ when "credit" is zero, or below some threshold, balance on fork--i.e.,
spread out the load--otherwise fork "locally" and decrement credit
[maybe not < 0].

+ when reaping dead children, if the poor thing's cpu utilization is
below some threshold, give the parent some credit.  [blood money?]

And so forth.  Initial forks will balance.  If the children refuse to
die, forks will continue to balance.  If the parent starts seeing short
lived children, fork()s will eventually start to stay local.  
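
A minimal sketch of that heuristic, in Linux terms, might look like the
following. All of the names here (fork_credit, FORK_CREDIT_MAX,
SHORT_LIVED_UTIL) are hypothetical; this is not that platform's actual
code, just an illustration of the idea:

#define FORK_CREDIT_MAX		16
#define SHORT_LIVED_UTIL	10	/* %cpu over the child's lifetime */

/* at fork time: spread the child out, or keep it local and spend credit */
static int fork_balance_cpu(struct task_struct *parent, int this_cpu)
{
	if (parent->fork_credit > 0) {
		parent->fork_credit--;		/* fork locally */
		return this_cpu;
	}
	return sched_balance_self(this_cpu, SD_BALANCE_FORK);
}

/* when reaping a child: short-lived, mostly idle children earn the
 * parent credit toward future local forks */
static void fork_credit_reap(struct task_struct *parent, int child_util)
{
	if (child_util < SHORT_LIVED_UTIL &&
	    parent->fork_credit < FORK_CREDIT_MAX)
		parent->fork_credit++;
}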

I believe that this solved the pathological behavior we were seeing with
shell scripts taking way longer on the larger, supposedly more powerful,
platforms.

Of course, that OS could migrate the equivalent of task structs and
kernel stack [the old Unix user struct that was traditionally swappable,
so fairly easy to migrate].  On Linux, all bets are off, once the
scheduler starts migrating tasks away from the node that contains their
task struct, ...  [Remember Eric Focht's "NUMA Affine Scheduler" patch
with its "home node"?]

Lee



* Re: [rfc] balance-on-fork NUMA placement
  2007-08-01 18:32       ` Lee Schermerhorn
@ 2007-08-01 22:52         ` Martin Bligh
  2007-08-02  1:36           ` Nick Piggin
  2007-08-02 14:49           ` Lee Schermerhorn
  0 siblings, 2 replies; 27+ messages in thread
From: Martin Bligh @ 2007-08-01 22:52 UTC (permalink / raw)
  To: Lee Schermerhorn
  Cc: Nick Piggin, Andi Kleen, Ingo Molnar, Linux Kernel Mailing List,
	Linux Memory Management List, Eric Whitney


>> This topic seems to come up periodically every since we first introduced
>> the NUMA scheduler, and every time we decide it's a bad idea. What's
>> changed? What workloads does this improve (aside from some artificial
>> benchmark like stream)?
>>
>> To repeat the conclusions of last time ... the primary problem is that
>> 99% of the time, we exec after we fork, and it makes that fork/exec
>> cycle slower, not faster, so exec is generally a much better time to do
>> this. There's no good predictor of whether we'll exec after fork, unless
>> one has magically appeared since late 2.5.x ?
>>
> 
> As Nick points out, one reason to balance on fork() rather than exec()
> is that with balance on exec you already have the new task's kernel
> structs allocated on the "wrong" node.  However, as you point out, this
> slows down the fork/exec cycle.  This is especially noticeable on larger
> node-count systems in, e.g., shell scripts that spawn a lot of short
> lived child processes.  "Back in the day", we got bitten by this on the
> Alpha EV7 [a.k.a. Marvel] platform with just ~64 nodes--small compared
> to, say, the current Altix platform.  
> 
> On the other hand, if you're launching a few larger, long-lived
> applications with any significant %-age of system time, you might want
> to consider spreading them out across nodes and having their warmer
> kernel data structures close to them.  A dilemma.
> 
> Altho' I was no longer working on this platform when this issue came up,
> I believe that the kernel developers came up with something along these
> lines:
> 
> + define a "credit" member of the "task" struct, initialized to, say,
> zero.
> 
> + when "credit" is zero, or below some threshold, balance on fork--i.e.,
> spread out the load--otherwise fork "locally" and decrement credit
> [maybe not < 0].
> 
> + when reaping dead children, if the poor thing's cpu utilization is
> below some threshold, give the parent some credit.  [blood money?]
> 
> And so forth.  Initial forks will balance.  If the children refuse to
> die, forks will continue to balance.  If the parent starts seeing short
> lived children, fork()s will eventually start to stay local.  

Fork without exec is much more rare than with. Optimising for
the uncommon case is the Wrong Thing to Do (tm). What we decided
the last time(s) this came up was to allow userspace to pass
a hint in if they wanted to fork and not exec.

> I believe that this solved the pathological behavior we were seeing with
> shell scripts taking way longer on the larger, supposedly more powerful,
> platforms.
> 
> Of course, that OS could migrate the equivalent of task structs and
> kernel stack [the old Unix user struct that was traditionally swappable,
> so fairly easy to migrate].  On Linux, all bets are off, once the
> scheduler starts migrating tasks away from the node that contains their
> task struct, ...  [Remember Eric Focht's "NUMA Affine Scheduler" patch
> with it's "home node"?]

Task migration doesn't work well at all without userspace hints.
SGI tried for ages (with IRIX) and failed. There are long discussions
of all of these things back in the days when we merged the original
NUMA scheduler in late 2.5 ...


* Re: [rfc] balance-on-fork NUMA placement
  2007-08-01 22:52         ` Martin Bligh
@ 2007-08-02  1:36           ` Nick Piggin
  2007-08-02 18:33             ` Martin Bligh
  2007-08-02 14:49           ` Lee Schermerhorn
  1 sibling, 1 reply; 27+ messages in thread
From: Nick Piggin @ 2007-08-02  1:36 UTC (permalink / raw)
  To: Martin Bligh
  Cc: Lee Schermerhorn, Andi Kleen, Ingo Molnar,
	Linux Kernel Mailing List, Linux Memory Management List,
	Eric Whitney

On Wed, Aug 01, 2007 at 03:52:11PM -0700, Martin Bligh wrote:
> 
> >And so forth.  Initial forks will balance.  If the children refuse to
> >die, forks will continue to balance.  If the parent starts seeing short
> >lived children, fork()s will eventually start to stay local.  
> 
> Fork without exec is much more rare than without. Optimising for
> the uncommon case is the Wrong Thing to Do (tm). What we decided

It's only the wrong thing to do if it hurts the common case too
much. Considering we _already_ balance on exec, then adding another
balance on fork is not going to introduce some order of magnitude
problem -- at worst it would be 2x but it really isn't too slow
anyway (at least nobody complained when we added it).

One place where we found it helps is clone for threads.

If we didn't do such a bad job at keeping tasks together with their
local memory, then we might indeed reduce some of the balance-on-crap
and increase the aggressiveness of periodic balancing.

Considering we _already_ balance on fork/clone, I don't know what
your argument against this patch is? Doing the balance earlier
and allocating more stuff on the local node is surely not a bad
idea.


> the last time(s) this came up was to allow userspace to pass
> a hint in if they wanted to fork and not exec.
> 
> >I believe that this solved the pathological behavior we were seeing with
> >shell scripts taking way longer on the larger, supposedly more powerful,
> >platforms.
> >
> >Of course, that OS could migrate the equivalent of task structs and
> >kernel stack [the old Unix user struct that was traditionally swappable,
> >so fairly easy to migrate].  On Linux, all bets are off, once the
> >scheduler starts migrating tasks away from the node that contains their
> >task struct, ...  [Remember Eric Focht's "NUMA Affine Scheduler" patch
> >with it's "home node"?]
> 
> Task migration doesn't work well at all without userspace hints.
> SGI tried for ages (with IRIX) and failed. There's long discussions
> of all of these things back in the days when we merged the original
> NUMA scheduler in late 2.5 ...

Task migration? Automatic memory migration you mean? I think it deserves
another look regardless of what SGI could or could not do, and Lee and I
are slowly getting things in place. We'll see what happens...



* Re: [rfc] balance-on-fork NUMA placement
  2007-07-31 23:40   ` Christoph Lameter
  2007-08-01  8:39     ` Andi Kleen
@ 2007-08-02  3:42     ` Nick Piggin
  2007-08-02 19:58       ` Christoph Lameter
  1 sibling, 1 reply; 27+ messages in thread
From: Nick Piggin @ 2007-08-02  3:42 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andi Kleen, Ingo Molnar, Linux Kernel Mailing List,
	Linux Memory Management List

On Tue, Jul 31, 2007 at 04:40:18PM -0700, Christoph Lameter wrote:
> On Tue, 31 Jul 2007, Andi Kleen wrote:
> 
> > On Tuesday 31 July 2007 07:41, Nick Piggin wrote:
> > 
> > > I haven't given this idea testing yet, but I just wanted to get some
> > > opinions on it first. NUMA placement still isn't ideal (eg. tasks with
> > > a memory policy will not do any placement, and process migrations of
> > > course will leave the memory behind...), but it does give a bit more
> > > chance for the memory controllers and interconnects to get evenly
> > > loaded.
> > 
> > I didn't think slab honored mempolicies by default? 
> > At least you seem to need to set special process flags.
> 
> It does in the sense that slabs are allocated following policies. If you 
> want to place individual objects then you need to use kmalloc_node().

Is there no way to place objects via policy? At least kernel stack and page
tables on x86-64 should be covered by page allocator policy, so the patch
will still be useful.



* Re: [rfc] balance-on-fork NUMA placement
  2007-08-01 22:52         ` Martin Bligh
  2007-08-02  1:36           ` Nick Piggin
@ 2007-08-02 14:49           ` Lee Schermerhorn
  1 sibling, 0 replies; 27+ messages in thread
From: Lee Schermerhorn @ 2007-08-02 14:49 UTC (permalink / raw)
  To: Martin Bligh
  Cc: Nick Piggin, Andi Kleen, Ingo Molnar, Linux Kernel Mailing List,
	Linux Memory Management List, Eric Whitney

On Wed, 2007-08-01 at 15:52 -0700, Martin Bligh wrote:
> >> This topic seems to come up periodically every since we first introduced
> >> the NUMA scheduler, and every time we decide it's a bad idea. What's
> >> changed? What workloads does this improve (aside from some artificial
> >> benchmark like stream)?
> >>
> >> To repeat the conclusions of last time ... the primary problem is that
> >> 99% of the time, we exec after we fork, and it makes that fork/exec
> >> cycle slower, not faster, so exec is generally a much better time to do
> >> this. There's no good predictor of whether we'll exec after fork, unless
> >> one has magically appeared since late 2.5.x ?
> >>
> > 
> > As Nick points out, one reason to balance on fork() rather than exec()
> > is that with balance on exec you already have the new task's kernel
> > structs allocated on the "wrong" node.  However, as you point out, this
> > slows down the fork/exec cycle.  This is especially noticeable on larger
> > node-count systems in, e.g., shell scripts that spawn a lot of short
> > lived child processes.  "Back in the day", we got bitten by this on the
> > Alpha EV7 [a.k.a. Marvel] platform with just ~64 nodes--small compared
> > to, say, the current Altix platform.  
> > 
> > On the other hand, if you're launching a few larger, long-lived
> > applications with any significant %-age of system time, you might want
> > to consider spreading them out across nodes and having their warmer
> > kernel data structures close to them.  A dilemma.
> > 
> > Altho' I was no longer working on this platform when this issue came up,
> > I believe that the kernel developers came up with something along these
> > lines:
> > 
> > + define a "credit" member of the "task" struct, initialized to, say,
> > zero.
> > 
> > + when "credit" is zero, or below some threshold, balance on fork--i.e.,
> > spread out the load--otherwise fork "locally" and decrement credit
> > [maybe not < 0].
> > 
> > + when reaping dead children, if the poor thing's cpu utilization is
> > below some threshold, give the parent some credit.  [blood money?]
> > 
> > And so forth.  Initial forks will balance.  If the children refuse to
> > die, forks will continue to balance.  If the parent starts seeing short
> > lived children, fork()s will eventually start to stay local.  
> 
> Fork without exec is much more rare than without. Optimising for
> the uncommon case is the Wrong Thing to Do (tm). What we decided
> the last time(s) this came up was to allow userspace to pass
> a hint in if they wanted to fork and not exec.

I understand.  Again, as Nick mentioned, at exec time, you use the
existing task struct, kernel stack, ... which might [probably will?] end
up on the wrong node.  If the task uses a significant amount of system
time, this can hurt performance/scalability.  And, for short lived, low
cpu usage tasks, such as you can get with shell scripts, you might not
even want to balance at exec time.

I agree with your assertion regarding optimizing for uncommon cases.
The mechanism I described [probably poorly, memory fades and it was only
a "hallway conversation" with the person who implemented it--in response
to a customer complaint] attempted to detect situations where local vs
balanced fork would be beneficial.  I will note, however, that when
balancing, we did look across the entire system.  Linux scheduling
domains have the intermediate "node" level that constrains this balancing
to a subset of the system.  

I'm not suggesting we submit this, nor am I particularly interested in
investigating it myself.  Just pointing out a solution to a workload
scalability issue on an existing, albeit dated, numa platform.  

> 
> > I believe that this solved the pathological behavior we were seeing with
> > shell scripts taking way longer on the larger, supposedly more powerful,
> > platforms.
> > 
> > Of course, that OS could migrate the equivalent of task structs and
> > kernel stack [the old Unix user struct that was traditionally swappable,
> > so fairly easy to migrate].  On Linux, all bets are off, once the
> > scheduler starts migrating tasks away from the node that contains their
> > task struct, ...  [Remember Eric Focht's "NUMA Affine Scheduler" patch
> > with it's "home node"?]
> 
> Task migration doesn't work well at all without userspace hints.
> SGI tried for ages (with IRIX) and failed. There's long discussions
> of all of these things back in the days when we merged the original
> NUMA scheduler in late 2.5 ...

I'm not one to cast aspersions on the IRIX engineers.  However, as I
recall [could be wrong here], they were trying to use hardware counters
to predict what pages to migrate.  On the same OS discussed above, we
found that automatic, lazy migration of pages worked very well for some
workloads.  

I have patches and data [presented at LCA 2007] that show, on a heavily
loaded 4-node, 16-cpu ia64 numa platform, ~14% reduction in real time
for a kernel build [make -j 32] and something like 22% reduction in
system time and 4% reduction in user time.  This with automatic, lazy
migration enabled vs not, on the same build of a 2.6.19-rc6-mm? kernel.
I'll also note that the reduction in system time was in spite of the
cost of the auto/lazy page migration whenever the tasks migrated to a
different node.

Later,
Lee



* Re: [rfc] balance-on-fork NUMA placement
  2007-08-02  1:36           ` Nick Piggin
@ 2007-08-02 18:33             ` Martin Bligh
  2007-08-03  0:20               ` Nick Piggin
  0 siblings, 1 reply; 27+ messages in thread
From: Martin Bligh @ 2007-08-02 18:33 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Lee Schermerhorn, Andi Kleen, Ingo Molnar,
	Linux Kernel Mailing List, Linux Memory Management List,
	Eric Whitney

Nick Piggin wrote:
> On Wed, Aug 01, 2007 at 03:52:11PM -0700, Martin Bligh wrote:
>>> And so forth.  Initial forks will balance.  If the children refuse to
>>> die, forks will continue to balance.  If the parent starts seeing short
>>> lived children, fork()s will eventually start to stay local.  
>> Fork without exec is much more rare than without. Optimising for
>> the uncommon case is the Wrong Thing to Do (tm). What we decided
> 
> It's only the wrong thing to do if it hurts the common case too
> much. Considering we _already_ balance on exec, then adding another
> balance on fork is not going to introduce some order of magnitude
> problem -- at worst it would be 2x but it really isn't too slow
> anyway (at least nobody complained when we added it).
> 
> One place where we found it helps is clone for threads.
> 
> If we didn't do such a bad job at keeping tasks together with their
> local memory, then we might indeed reduce some of the balance-on-crap
> and increase the aggressiveness of periodic balancing.
> 
> Considering we _already_ balance on fork/clone, I don't know what
> your argument is against this patch is? Doing the balance earlier
> and allocating more stuff on the local node is surely not a bad
> idea.

I don't know who turned that on ;-( I suspect nobody bothered
actually measuring it at the time though, or used some crap
benchmark like stream to do so. It should get reverted.



* Re: [rfc] balance-on-fork NUMA placement
  2007-08-02  3:42     ` Nick Piggin
@ 2007-08-02 19:58       ` Christoph Lameter
  2007-08-03  0:26         ` Nick Piggin
  0 siblings, 1 reply; 27+ messages in thread
From: Christoph Lameter @ 2007-08-02 19:58 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andi Kleen, Ingo Molnar, Linux Kernel Mailing List,
	Linux Memory Management List

On Thu, 2 Aug 2007, Nick Piggin wrote:

> > It does in the sense that slabs are allocated following policies. If you 
> > want to place individual objects then you need to use kmalloc_node().
> 
> Is there no way to place objects via policy? At least kernel stack and page
> tables on x86-64 should be covered by page allocator policy, so the patch
> will still be useful.

Implementing policies on an object level introduces significant allocator
overhead. I tried to do it in SLAB, which created a mess.

Add a (slow) kmalloc_policy? Strict object round-robin for interleave,
right? It probably needs its own RR counter, otherwise it disturbs the
per-task page RR.

For interleave, kmalloc() does allocate the slabs round-robin, not the
objects.
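
Something along these lines would be one (slow) way to get strict object
interleave with its own cursor. kmalloc_interleave and obj_il_next are
made-up names, and a per-cpu cursor is used here only to keep the sketch
simple:

static DEFINE_PER_CPU(int, obj_il_next);	/* separate object RR cursor */

void *kmalloc_interleave(size_t size, gfp_t flags, const nodemask_t *nodes)
{
	int nid;
	int *next = &get_cpu_var(obj_il_next);

	/* advance our own round-robin cursor, wrapping within the nodemask */
	nid = next_node(*next, *nodes);
	if (nid >= MAX_NUMNODES)
		nid = first_node(*nodes);
	*next = nid;
	put_cpu_var(obj_il_next);

	return kmalloc_node(size, flags, nid);
}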


* Re: [rfc] balance-on-fork NUMA placement
  2007-08-02 18:33             ` Martin Bligh
@ 2007-08-03  0:20               ` Nick Piggin
  2007-08-03 20:10                 ` Siddha, Suresh B
  0 siblings, 1 reply; 27+ messages in thread
From: Nick Piggin @ 2007-08-03  0:20 UTC (permalink / raw)
  To: Martin Bligh
  Cc: Lee Schermerhorn, Andi Kleen, Ingo Molnar,
	Linux Kernel Mailing List, Linux Memory Management List,
	Eric Whitney

On Thu, Aug 02, 2007 at 11:33:39AM -0700, Martin Bligh wrote:
> Nick Piggin wrote:
> >On Wed, Aug 01, 2007 at 03:52:11PM -0700, Martin Bligh wrote:
> >>>And so forth.  Initial forks will balance.  If the children refuse to
> >>>die, forks will continue to balance.  If the parent starts seeing short
> >>>lived children, fork()s will eventually start to stay local.  
> >>Fork without exec is much more rare than without. Optimising for
> >>the uncommon case is the Wrong Thing to Do (tm). What we decided
> >
> >It's only the wrong thing to do if it hurts the common case too
> >much. Considering we _already_ balance on exec, then adding another
> >balance on fork is not going to introduce some order of magnitude
> >problem -- at worst it would be 2x but it really isn't too slow
> >anyway (at least nobody complained when we added it).
> >
> >One place where we found it helps is clone for threads.
> >
> >If we didn't do such a bad job at keeping tasks together with their
> >local memory, then we might indeed reduce some of the balance-on-crap
> >and increase the aggressiveness of periodic balancing.
> >
> >Considering we _already_ balance on fork/clone, I don't know what
> >your argument is against this patch is? Doing the balance earlier
> >and allocating more stuff on the local node is surely not a bad
> >idea.
> 
> I don't know who turned that on ;-( I suspect nobody bothered
> actually measuring it at the time though, or used some crap
> benchmark like stream to do so. It should get reverted.

So you have numbers to show it hurts? I tested some things where it
is not supposed to help, and it didn't make any difference. Nobody
else noticed either.

If the cost of doing the double balance is _really_ that painful,
then we could skip balance-on-exec for domains with balance-on-fork
set.


* Re: [rfc] balance-on-fork NUMA placement
  2007-08-02 19:58       ` Christoph Lameter
@ 2007-08-03  0:26         ` Nick Piggin
  2007-08-03  0:52           ` Christoph Lameter
  0 siblings, 1 reply; 27+ messages in thread
From: Nick Piggin @ 2007-08-03  0:26 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andi Kleen, Ingo Molnar, Linux Kernel Mailing List,
	Linux Memory Management List

On Thu, Aug 02, 2007 at 12:58:13PM -0700, Christoph Lameter wrote:
> On Thu, 2 Aug 2007, Nick Piggin wrote:
> 
> > > It does in the sense that slabs are allocated following policies. If you 
> > > want to place individual objects then you need to use kmalloc_node().
> > 
> > Is there no way to place objects via policy? At least kernel stack and page
> > tables on x86-64 should be covered by page allocator policy, so the patch
> > will still be useful.
> 
> Implementing policies on an object level introduces significant allocator 
> overhead. Tried to do it in SLAB which created a mess.
> 
> Add a (slow) kmalloc_policy? Strict Object round robin for interleave 
> right? It probably needs its own RR counter otherwise it disturbs the per 
> task page RR.

I guess interleave could be nice for other things, but for this, I
just want MPOL_BIND to work. The problem is that the pagetable copying
etc. codepaths cover a lot of code, and some of it (e.g. pagetable
allocation) is used for other paths as well... so I was just hoping to do
something less intrusive for now if possible.


> For interleave kmalloc() does allocate the slabs round robin not the 
> objects.


* Re: [rfc] balance-on-fork NUMA placement
  2007-08-03  0:26         ` Nick Piggin
@ 2007-08-03  0:52           ` Christoph Lameter
  2007-08-03  0:57             ` Nick Piggin
  0 siblings, 1 reply; 27+ messages in thread
From: Christoph Lameter @ 2007-08-03  0:52 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andi Kleen, Ingo Molnar, Linux Kernel Mailing List,
	Linux Memory Management List

On Fri, 3 Aug 2007, Nick Piggin wrote:

> > Add a (slow) kmalloc_policy? Strict Object round robin for interleave 
> > right? It probably needs its own RR counter otherwise it disturbs the per 
> > task page RR.
> 
> I guess interleave could be nice for other things, but for this, I
> just want MPOL_BIND to work. The problem is that the pagetable copying
> etc codepaths cover a lot of code and some of it (eg pagetable allocation)
> is used for other paths as well.. so I was just hoping to do something
> less intrusive for now if possible.

Ok. So MPOL_BIND on a single node. We would have to save the current 
memory policy on the stack and then restore it later. Then you would need 
a special call anyways.

Or is there some way to execute the code on the target cpu? That may be 
the easiest solution.


* Re: [rfc] balance-on-fork NUMA placement
  2007-08-03  0:52           ` Christoph Lameter
@ 2007-08-03  0:57             ` Nick Piggin
  2007-08-03  1:02               ` Christoph Lameter
  0 siblings, 1 reply; 27+ messages in thread
From: Nick Piggin @ 2007-08-03  0:57 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andi Kleen, Ingo Molnar, Linux Kernel Mailing List,
	Linux Memory Management List

On Thu, Aug 02, 2007 at 05:52:28PM -0700, Christoph Lameter wrote:
> On Fri, 3 Aug 2007, Nick Piggin wrote:
> 
> > > Add a (slow) kmalloc_policy? Strict Object round robin for interleave 
> > > right? It probably needs its own RR counter otherwise it disturbs the per 
> > > task page RR.
> > 
> > I guess interleave could be nice for other things, but for this, I
> > just want MPOL_BIND to work. The problem is that the pagetable copying
> > etc codepaths cover a lot of code and some of it (eg pagetable allocation)
> > is used for other paths as well.. so I was just hoping to do something
> > less intrusive for now if possible.
> 
> Ok. So MPOL_BIND on a single node. We would have to save the current 
> memory policy on the stack and then restore it later. Then you would need 
> a special call anyways.

Well the memory policy will already be set to MPOL_BIND at this point.
The slab allocator I think would just have to honour the node at the
object level.


 
> Or is there some way to execute the code on the target cpu? That may be 
> the easiest solution.

It isn't so easy... we'd have to migrate the parent process to the new
node to perform the setup, and then migrate it back again afterwards.


* Re: [rfc] balance-on-fork NUMA placement
  2007-08-03  0:57             ` Nick Piggin
@ 2007-08-03  1:02               ` Christoph Lameter
  2007-08-03  1:14                 ` Nick Piggin
  0 siblings, 1 reply; 27+ messages in thread
From: Christoph Lameter @ 2007-08-03  1:02 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andi Kleen, Ingo Molnar, Linux Kernel Mailing List,
	Linux Memory Management List

On Fri, 3 Aug 2007, Nick Piggin wrote:

> > Ok. So MPOL_BIND on a single node. We would have to save the current 
> > memory policy on the stack and then restore it later. Then you would need 
> > a special call anyways.
> 
> Well the memory policy will already be set to MPOL_BIND at this point.
> The slab allocator I think would just have to honour the node at the
> object level.

Who set the policy? The parent process may have its own memory policy. If 
you set that then the earlier policy is lost.


* Re: [rfc] balance-on-fork NUMA placement
  2007-08-03  1:02               ` Christoph Lameter
@ 2007-08-03  1:14                 ` Nick Piggin
  2007-08-03  1:34                   ` Christoph Lameter
  0 siblings, 1 reply; 27+ messages in thread
From: Nick Piggin @ 2007-08-03  1:14 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andi Kleen, Ingo Molnar, Linux Kernel Mailing List,
	Linux Memory Management List

On Thu, Aug 02, 2007 at 06:02:56PM -0700, Christoph Lameter wrote:
> On Fri, 3 Aug 2007, Nick Piggin wrote:
> 
> > > Ok. So MPOL_BIND on a single node. We would have to save the current 
> > > memory policy on the stack and then restore it later. Then you would need 
> > > a special call anyways.
> > 
> > Well the memory policy will already be set to MPOL_BIND at this point.
> > The slab allocator I think would just have to honour the node at the
> > object level.
> 
> Who set the policy? The parent process may have its own memory policy. If 
> you set that then the earlier policy is lost.

Yeah it only gets set if the parent is initially using a default policy
at this stage (and then is restored afterwards of course).



* Re: [rfc] balance-on-fork NUMA placement
  2007-08-03  1:14                 ` Nick Piggin
@ 2007-08-03  1:34                   ` Christoph Lameter
  2007-08-03  3:14                     ` Nick Piggin
  0 siblings, 1 reply; 27+ messages in thread
From: Christoph Lameter @ 2007-08-03  1:34 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andi Kleen, Ingo Molnar, Linux Kernel Mailing List,
	Linux Memory Management List

On Fri, 3 Aug 2007, Nick Piggin wrote:

> Yeah it only gets set if the parent is initially using a default policy
> at this stage (and then is restored afterwards of course).

Uggh. Looks like more hackery ahead. I think this cannot be done in the 
desired clean way until we have some revving of the memory policy 
subsystem that makes policies task context independent so that you can do

alloc_pages(...., memory_policy)

The cleanest solution that I can think of at this point is certainly to 
switch to another processor and do the allocation and copying actions from 
there. We have the migration process context right? Can that be used to 
start the new thread and can the original processor wait on some flag 
until that is complete?

Forking off from there not only places the data correctly but it also 
warms up the caches for the new process and avoids evicting cachelines on 
the original processor.


* Re: [rfc] balance-on-fork NUMA placement
  2007-08-03  1:34                   ` Christoph Lameter
@ 2007-08-03  3:14                     ` Nick Piggin
  2007-08-03  5:47                       ` Christoph Lameter
  0 siblings, 1 reply; 27+ messages in thread
From: Nick Piggin @ 2007-08-03  3:14 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andi Kleen, Ingo Molnar, Linux Kernel Mailing List,
	Linux Memory Management List

On Thu, Aug 02, 2007 at 06:34:04PM -0700, Christoph Lameter wrote:
> On Fri, 3 Aug 2007, Nick Piggin wrote:
> 
> > Yeah it only gets set if the parent is initially using a default policy
> > at this stage (and then is restored afterwards of course).
> 
> Uggh. Looks like more hackery ahead. I think this cannot be done in the 
> desired clean way until we have some revving of the memory policy 
> subsystem that makes policies task context independent so that you can do

Well what's wrong with it? It seems to use memory policies for exactly
what they are intended (aside from it being kernel directed...).


> alloc_pages(...., memory_policy)

That still doesn't completely help because again it would require modifying
call sites (at which point I could just do alloc_pages_node).


> The cleanest solution that I can think of at this point is certainly to 
> switch to another processor and do the allocation and copying actions from 
> there. We have the migration process context right? Can that be used to 
> start the new thread and can the original processor wait on some flag 
> until that is complete?

I guess you could, but that is going to add a context switch to fork
(although it usually already has one in the single-CPU situation because we
run child first)... I bet it will slow something down, but it would be
interesting to see.

I don't know the fork path well enough off the top of my head to know if
it will be that simple (with error handling etc). But I think it could
be done.


> Forking off from there not only places the data correctly but it also 
> warms up the caches for the new process and avoids evicting cacheline on 
> the original processor.

Yeah, you might be right there. If the numbers say that approach is
better, then I'd not be against it. But we'd still need the simpler
mpol approach to compare it with. 


* Re: [rfc] balance-on-fork NUMA placement
  2007-08-03  3:14                     ` Nick Piggin
@ 2007-08-03  5:47                       ` Christoph Lameter
  0 siblings, 0 replies; 27+ messages in thread
From: Christoph Lameter @ 2007-08-03  5:47 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andi Kleen, Ingo Molnar, Linux Kernel Mailing List,
	Linux Memory Management List

On Fri, 3 Aug 2007, Nick Piggin wrote:

> Well what's wrong with it? It seems to use memory policies for exactly
> what they are intended (aside from it being kernel directed...).

Sure I think you could do it with some effort. They were primarily 
designed for user space. Lots of little side issues where surprises await 
you. I think Lee documented many of them. See the recent mm commits.

> > start the new thread and can the original processor wait on some flag 
> > until that is complete?
> 
> I guess you could, but that is going to add a context switch to fork
> (although it usually already has one in single-CPU situation because we
> run child first)... I bet it will slow something down, but it would be
> interesting to see.

The context switch is needed at some point anyways to get the new process 
running on the new CPU? Just do it before allocating structures. That way 
the potential memory policy and cpuset context is preserved and followed.

> I don't know the fork path well enough off the top of my head to know if
> it will be that simple (with error handling etc). But I think it could
> be done.

I would think that the forking process has to wait on completion anyways
and get an error code.

> > Forking off from there not only places the data correctly but it also 
> > warms up the caches for the new process and avoids evicting cacheline on 
> > the original processor.
> 
> Yeah, you might be right there. If the numbers say that approach is
> better, then I'd not be against it. But we'd still need the simpler
> mpol approach to compare it with. 

Let's hope that the simpler approach is really simpler after all the corner 
cases have been dealt with.


* Re: [rfc] balance-on-fork NUMA placement
  2007-08-03  0:20               ` Nick Piggin
@ 2007-08-03 20:10                 ` Siddha, Suresh B
  2007-08-06  1:20                   ` Nick Piggin
  0 siblings, 1 reply; 27+ messages in thread
From: Siddha, Suresh B @ 2007-08-03 20:10 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Martin Bligh, Lee Schermerhorn, Andi Kleen, Ingo Molnar,
	Linux Kernel Mailing List, Linux Memory Management List,
	Eric Whitney

On Fri, Aug 03, 2007 at 02:20:10AM +0200, Nick Piggin wrote:
> On Thu, Aug 02, 2007 at 11:33:39AM -0700, Martin Bligh wrote:
> > Nick Piggin wrote:
> > >On Wed, Aug 01, 2007 at 03:52:11PM -0700, Martin Bligh wrote:
> > >>>And so forth.  Initial forks will balance.  If the children refuse to
> > >>>die, forks will continue to balance.  If the parent starts seeing short
> > >>>lived children, fork()s will eventually start to stay local.  
> > >>Fork without exec is much more rare than without. Optimising for
> > >>the uncommon case is the Wrong Thing to Do (tm). What we decided
> > >
> > >It's only the wrong thing to do if it hurts the common case too
> > >much. Considering we _already_ balance on exec, then adding another
> > >balance on fork is not going to introduce some order of magnitude
> > >problem -- at worst it would be 2x but it really isn't too slow
> > >anyway (at least nobody complained when we added it).
> > >
> > >One place where we found it helps is clone for threads.
> > >
> > >If we didn't do such a bad job at keeping tasks together with their
> > >local memory, then we might indeed reduce some of the balance-on-crap
> > >and increase the aggressiveness of periodic balancing.
> > >
> > >Considering we _already_ balance on fork/clone, I don't know what
> > >your argument is against this patch is? Doing the balance earlier
> > >and allocating more stuff on the local node is surely not a bad
> > >idea.
> > 
> > I don't know who turned that on ;-( I suspect nobody bothered
> > actually measuring it at the time though, or used some crap
> > benchmark like stream to do so. It should get reverted.
> 
> So you have numbers to show it hurts? I tested some things where it
> is not supposed to help, and it didn't make any difference. Nobody
> else noticed either.
> 
> If the cost of doing the double balance is _really_ that painful,
> then we ccould skip balance-on-exec for domains with balance-on-fork
> set.

Nick, even if it is not painful, can we skip balance-on-exec if
balance-on-fork is set? There is no need for double balance, right?

Especially with the optimization you are trying to do with this patch,
balance-on-exec may lead to a wrong decision, making this optimization
not work as expected.

or perhaps do balance-on-fork based on clone_flags..
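
For the first part, a rough and untested sketch against the current
sched_exec() could look like this; only the SD_BALANCE_FORK check is new:

void sched_exec(void)
{
	struct sched_domain *sd;
	int new_cpu, this_cpu = get_cpu();
	int fork_balanced = 0;

	/* if any of this CPU's domains already balances at fork time,
	 * trust the fork-time placement and skip the exec-time balance */
	for_each_domain(this_cpu, sd) {
		if (sd->flags & SD_BALANCE_FORK)
			fork_balanced = 1;
	}

	new_cpu = this_cpu;
	if (!fork_balanced)
		new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
	put_cpu();
	if (new_cpu != this_cpu)
		sched_migrate_task(current, new_cpu);
}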

thanks,
suresh


* Re: [rfc] balance-on-fork NUMA placement
  2007-08-03 20:10                 ` Siddha, Suresh B
@ 2007-08-06  1:20                   ` Nick Piggin
  0 siblings, 0 replies; 27+ messages in thread
From: Nick Piggin @ 2007-08-06  1:20 UTC (permalink / raw)
  To: Siddha, Suresh B
  Cc: Martin Bligh, Lee Schermerhorn, Andi Kleen, Ingo Molnar,
	Linux Kernel Mailing List, Linux Memory Management List,
	Eric Whitney

On Fri, Aug 03, 2007 at 01:10:13PM -0700, Suresh B wrote:
> On Fri, Aug 03, 2007 at 02:20:10AM +0200, Nick Piggin wrote:
> > On Thu, Aug 02, 2007 at 11:33:39AM -0700, Martin Bligh wrote:
> > > Nick Piggin wrote:
> > > >On Wed, Aug 01, 2007 at 03:52:11PM -0700, Martin Bligh wrote:
> > > >>>And so forth.  Initial forks will balance.  If the children refuse to
> > > >>>die, forks will continue to balance.  If the parent starts seeing short
> > > >>>lived children, fork()s will eventually start to stay local.  
> > > >>Fork without exec is much more rare than without. Optimising for
> > > >>the uncommon case is the Wrong Thing to Do (tm). What we decided
> > > >
> > > >It's only the wrong thing to do if it hurts the common case too
> > > >much. Considering we _already_ balance on exec, then adding another
> > > >balance on fork is not going to introduce some order of magnitude
> > > >problem -- at worst it would be 2x but it really isn't too slow
> > > >anyway (at least nobody complained when we added it).
> > > >
> > > >One place where we found it helps is clone for threads.
> > > >
> > > >If we didn't do such a bad job at keeping tasks together with their
> > > >local memory, then we might indeed reduce some of the balance-on-crap
> > > >and increase the aggressiveness of periodic balancing.
> > > >
> > > >Considering we _already_ balance on fork/clone, I don't know what
> > > >your argument is against this patch is? Doing the balance earlier
> > > >and allocating more stuff on the local node is surely not a bad
> > > >idea.
> > > 
> > > I don't know who turned that on ;-( I suspect nobody bothered
> > > actually measuring it at the time though, or used some crap
> > > benchmark like stream to do so. It should get reverted.
> > 
> > So you have numbers to show it hurts? I tested some things where it
> > is not supposed to help, and it didn't make any difference. Nobody
> > else noticed either.
> > 
> > If the cost of doing the double balance is _really_ that painful,
> > then we ccould skip balance-on-exec for domains with balance-on-fork
> > set.
> 
> Nick, Even if it is not painful, can we skip balance-on-exec if
> balance-on-fork is set. There is no need for double balance, right?

I guess we could. There is no need for the double balance if the exec
happens immediately after the fork, which is surely the common case. I
think there can be some other weird cases (e.g. multi-threaded code) that
do funny things though...


> Especially with the optimization you are trying to do with this patch,
> balance-on-exec may lead to wrong decision making this optimization
> not work as expected.

That's true.



Thread overview: 27+ messages
2007-07-31  5:41 [rfc] balance-on-fork NUMA placement Nick Piggin
2007-07-31  8:01 ` Ingo Molnar
2007-08-01  0:21   ` Nick Piggin
2007-08-01  6:19     ` Ingo Molnar
2007-07-31  9:14 ` Andi Kleen
2007-07-31 23:40   ` Christoph Lameter
2007-08-01  8:39     ` Andi Kleen
2007-08-02  3:42     ` Nick Piggin
2007-08-02 19:58       ` Christoph Lameter
2007-08-03  0:26         ` Nick Piggin
2007-08-03  0:52           ` Christoph Lameter
2007-08-03  0:57             ` Nick Piggin
2007-08-03  1:02               ` Christoph Lameter
2007-08-03  1:14                 ` Nick Piggin
2007-08-03  1:34                   ` Christoph Lameter
2007-08-03  3:14                     ` Nick Piggin
2007-08-03  5:47                       ` Christoph Lameter
2007-08-01  0:23   ` Nick Piggin
2007-08-01 17:53     ` Martin Bligh
2007-08-01 18:32       ` Lee Schermerhorn
2007-08-01 22:52         ` Martin Bligh
2007-08-02  1:36           ` Nick Piggin
2007-08-02 18:33             ` Martin Bligh
2007-08-03  0:20               ` Nick Piggin
2007-08-03 20:10                 ` Siddha, Suresh B
2007-08-06  1:20                   ` Nick Piggin
2007-08-02 14:49           ` Lee Schermerhorn
