LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
To: Ingo Molnar <mingo@kernel.org>, Peter Zijlstra <peterz@infradead.org>
Cc: LKML <linux-kernel@vger.kernel.org>,
	Mel Gorman <mgorman@techsingularity.net>,
	Rik van Riel <riel@surriel.com>,
	Srikar Dronamraju <srikar@linux.vnet.ibm.com>,
	Thomas Gleixner <tglx@linutronix.de>
Subject: [PATCH 10/19] sched/numa: Stop multiple tasks from moving to the cpu at the same time
Date: Mon,  4 Jun 2018 15:30:19 +0530	[thread overview]
Message-ID: <1528106428-19992-11-git-send-email-srikar@linux.vnet.ibm.com> (raw)
In-Reply-To: <1528106428-19992-1-git-send-email-srikar@linux.vnet.ibm.com>

Task migration under numa balancing can happen in parallel. More than
one task might choose to migrate to the same cpu at the same time. This
can result in
- During task swap, choosing a task that was not part of the evaluation.
- During task swap, task which just got moved into its preferred node,
  moving to a completely different node.
- During task swap, task failing to move to the preferred node, will have
  to wait an extra interval for the next migrate opportunity.
- During task movement, multiple task movements can cause load imbalance.

This problem is more likely if there are more cores per node or more
nodes in the system.

Use a per run-queue variable to check if numa-balance is active on the
run-queue.

Testcase       Time:         Min         Max         Avg      StdDev
numa01.sh      Real:      414.64      819.20      556.08      147.70
numa01.sh       Sys:       77.52      205.04      139.40       52.05
numa01.sh      User:    37043.24    61757.88    45517.48     9290.38
numa02.sh      Real:       60.80       63.32       61.63        0.88
numa02.sh       Sys:       17.35       39.37       25.71        7.33
numa02.sh      User:     5213.79     5374.73     5268.90       55.09
numa03.sh      Real:      780.09      948.64      831.43       63.02
numa03.sh       Sys:      104.96      136.92      116.31       11.34
numa03.sh      User:    60465.42    73339.78    64368.03     4700.14
numa04.sh      Real:      412.60      681.92      521.29       96.64
numa04.sh       Sys:      210.32      314.10      251.77       37.71
numa04.sh      User:    34026.38    45581.20    38534.49     4198.53
numa05.sh      Real:      394.79      439.63      411.35       16.87
numa05.sh       Sys:      238.32      330.09      292.31       38.32
numa05.sh      User:    33456.45    34876.07    34138.62      609.45

Testcase       Time:         Min         Max         Avg      StdDev 	 %Change
numa01.sh      Real:      434.84      676.90      550.53      106.24 	 1.008%
numa01.sh       Sys:      125.98      217.34      179.41       30.35 	 -22.3%
numa01.sh      User:    38318.48    53789.56    45864.17     6620.80 	 -0.75%
numa02.sh      Real:       60.06       61.27       60.59        0.45 	 1.716%
numa02.sh       Sys:       14.25       17.86       16.09        1.28 	 59.78%
numa02.sh      User:     5190.13     5225.67     5209.24       13.19 	 1.145%
numa03.sh      Real:      748.21      960.25      823.15       73.51 	 1.005%
numa03.sh       Sys:       96.68      122.10      110.42       11.29 	 5.334%
numa03.sh      User:    58222.16    72595.27    63552.22     5048.87 	 1.283%
numa04.sh      Real:      433.08      630.55      499.30       68.15 	 4.404%
numa04.sh       Sys:      245.22      386.75      306.09       63.32 	 -17.7%
numa04.sh      User:    35014.68    46151.72    38530.26     3924.65 	 0.010%
numa05.sh      Real:      394.77      410.07      401.41        5.99 	 2.476%
numa05.sh       Sys:      212.40      301.82      256.23       35.41 	 14.08%
numa05.sh      User:    33224.86    34201.40    33665.61      313.40 	 1.405%

Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
 kernel/sched/fair.c  | 17 +++++++++++++++++
 kernel/sched/sched.h |  1 +
 2 files changed, 18 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 46d773c..3e19e32 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1478,6 +1478,16 @@ struct task_numa_env {
 static void task_numa_assign(struct task_numa_env *env,
 			     struct task_struct *p, long imp)
 {
+	struct rq *rq = cpu_rq(env->dst_cpu);
+
+	if (xchg(&rq->numa_migrate_on, 1))
+		return;
+
+	if (env->best_cpu != -1) {
+		rq = cpu_rq(env->best_cpu);
+		WRITE_ONCE(rq->numa_migrate_on, 0);
+	}
+
 	if (env->best_task)
 		put_task_struct(env->best_task);
 	if (p)
@@ -1533,6 +1543,9 @@ static void task_numa_compare(struct task_numa_env *env,
 	long moveimp = imp;
 	int dist = env->dist;
 
+	if (READ_ONCE(dst_rq->numa_migrate_on))
+		return;
+
 	rcu_read_lock();
 	cur = task_rcu_dereference(&dst_rq->curr);
 	if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
@@ -1699,6 +1712,7 @@ static int task_numa_migrate(struct task_struct *p)
 		.best_cpu = -1,
 	};
 	struct sched_domain *sd;
+	struct rq *best_rq;
 	unsigned long taskweight, groupweight;
 	int nid, ret, dist;
 	long taskimp, groupimp;
@@ -1803,14 +1817,17 @@ static int task_numa_migrate(struct task_struct *p)
 	 */
 	p->numa_scan_period = task_scan_start(p);
 
+	best_rq = cpu_rq(env.best_cpu);
 	if (env.best_task == NULL) {
 		ret = migrate_task_to(p, env.best_cpu);
+		WRITE_ONCE(best_rq->numa_migrate_on, 0);
 		if (ret != 0)
 			trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
 		return ret;
 	}
 
 	ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
+	WRITE_ONCE(best_rq->numa_migrate_on, 0);
 
 	if (ret != 0)
 		trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 211841e..55bc6e1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -756,6 +756,7 @@ struct rq {
 #ifdef CONFIG_NUMA_BALANCING
 	unsigned int		nr_numa_running;
 	unsigned int		nr_preferred_running;
+	unsigned int		numa_migrate_on;
 #endif
 	#define CPU_LOAD_IDX_MAX 5
 	unsigned long		cpu_load[CPU_LOAD_IDX_MAX];
-- 
1.8.3.1

  parent reply	other threads:[~2018-06-04 10:01 UTC|newest]

Thread overview: 66+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-06-04 10:00 [PATCH 00/19] Fixes for sched/numa_balancing Srikar Dronamraju
2018-06-04 10:00 ` [PATCH 01/19] sched/numa: Remove redundant field Srikar Dronamraju
2018-06-04 14:53   ` Rik van Riel
2018-06-05  8:41   ` Mel Gorman
2018-06-04 10:00 ` [PATCH 02/19] sched/numa: Evaluate move once per node Srikar Dronamraju
2018-06-04 14:51   ` Rik van Riel
2018-06-04 15:45     ` Srikar Dronamraju
2018-06-04 10:00 ` [PATCH 03/19] sched/numa: Simplify load_too_imbalanced Srikar Dronamraju
2018-06-04 14:57   ` Rik van Riel
2018-06-05  8:46   ` Mel Gorman
2018-06-04 10:00 ` [PATCH 04/19] sched/numa: Set preferred_node based on best_cpu Srikar Dronamraju
2018-06-04 12:18   ` Peter Zijlstra
2018-06-04 12:53     ` Srikar Dronamraju
2018-06-04 12:23   ` Peter Zijlstra
2018-06-04 12:59     ` Srikar Dronamraju
2018-06-04 13:39       ` Peter Zijlstra
2018-06-04 13:48         ` Srikar Dronamraju
2018-06-04 14:37       ` Rik van Riel
2018-06-04 15:56         ` Srikar Dronamraju
2018-06-04 10:00 ` [PATCH 05/19] sched/numa: Use task faults only if numa_group is not yet setup Srikar Dronamraju
2018-06-04 12:24   ` Peter Zijlstra
2018-06-04 13:09     ` Srikar Dronamraju
2018-06-04 10:00 ` [PATCH 06/19] sched/debug: Reverse the order of printing faults Srikar Dronamraju
2018-06-04 16:28   ` Rik van Riel
2018-06-05  8:50   ` Mel Gorman
2018-06-04 10:00 ` [PATCH 07/19] sched/numa: Skip nodes that are at hoplimit Srikar Dronamraju
2018-06-04 16:27   ` Rik van Riel
2018-06-05  8:50   ` Mel Gorman
2018-06-04 10:00 ` [PATCH 08/19] sched/numa: Remove unused task_capacity from numa_stats Srikar Dronamraju
2018-06-04 16:28   ` Rik van Riel
2018-06-05  8:57   ` Mel Gorman
2018-06-04 10:00 ` [PATCH 09/19] sched/numa: Modify migrate_swap to accept additional params Srikar Dronamraju
2018-06-04 17:00   ` Rik van Riel
2018-06-05  8:58   ` Mel Gorman
2018-06-04 10:00 ` Srikar Dronamraju [this message]
2018-06-04 17:57   ` [PATCH 10/19] sched/numa: Stop multiple tasks from moving to the cpu at the same time Rik van Riel
2018-06-05  9:51   ` Mel Gorman
2018-06-04 10:00 ` [PATCH 11/19] sched/numa: Restrict migrating in parallel to the same node Srikar Dronamraju
2018-06-04 17:59   ` Rik van Riel
2018-06-05  9:53   ` Mel Gorman
2018-06-06 12:58     ` Srikar Dronamraju
2018-06-04 10:00 ` [PATCH 12/19] sched:numa Remove numa_has_capacity Srikar Dronamraju
2018-06-04 18:07   ` Rik van Riel
2018-06-04 10:00 ` [PATCH 13/19] mm/migrate: Use xchg instead of spinlock Srikar Dronamraju
2018-06-04 18:22   ` Rik van Riel
2018-06-04 19:28   ` Peter Zijlstra
2018-06-05  7:24     ` Srikar Dronamraju
2018-06-05  8:16       ` Peter Zijlstra
2018-06-04 10:00 ` [PATCH 14/19] sched/numa: Updation of scan period need not be in lock Srikar Dronamraju
2018-06-04 18:24   ` Rik van Riel
2018-06-04 10:00 ` [PATCH 15/19] sched/numa: Use group_weights to identify if migration degrades locality Srikar Dronamraju
2018-06-04 18:56   ` Rik van Riel
2018-06-04 10:00 ` [PATCH 16/19] sched/numa: Detect if node actively handling migration Srikar Dronamraju
2018-06-04 20:05   ` Rik van Riel
2018-06-05  3:56     ` Srikar Dronamraju
2018-06-05 13:07       ` Rik van Riel
2018-06-06 12:55         ` Srikar Dronamraju
2018-06-06 13:55           ` Rik van Riel
2018-06-06 15:32             ` Srikar Dronamraju
2018-06-06 17:06               ` Rik van Riel
2018-06-04 10:00 ` [PATCH 17/19] sched/numa: Pass destination cpu as a parameter to migrate_task_rq Srikar Dronamraju
2018-06-04 10:00 ` [PATCH 18/19] sched/numa: Reset scan rate whenever task moves across nodes Srikar Dronamraju
2018-06-04 20:08   ` Rik van Riel
2018-06-05  9:58   ` Mel Gorman
2018-06-06 13:47     ` Srikar Dronamraju
2018-06-04 10:00 ` [PATCH 19/19] sched/numa: Move task_placement closer to numa_migrate_preferred Srikar Dronamraju

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1528106428-19992-11-git-send-email-srikar@linux.vnet.ibm.com \
    --to=srikar@linux.vnet.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mgorman@techsingularity.net \
    --cc=mingo@kernel.org \
    --cc=peterz@infradead.org \
    --cc=riel@surriel.com \
    --cc=tglx@linutronix.de \
    --subject='Re: [PATCH 10/19] sched/numa: Stop multiple tasks from moving to the cpu at the same time' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).