LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
From: Gregory Haskins <ghaskins@novell.com>
To: linux-rt-users@vger.kernel.org
Cc: linux-kernel@vger.kernel.org,
	Gregory Haskins <ghaskins@novell.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	Dmitry Adamushko <dmitry.adamushko@gmail.com>,
	Peter Zijlstra <a.p.zijlstra@chello.nl>,
	Ingo Molnar <mingo@elte.hu>, Darren Hart <dvhltc@us.ibm.com>
Subject: [PATCH 13/13] RT: Cache cpus_allowed weight for optimizing migration
Date: Tue, 23 Oct 2007 12:51:30 -0400	[thread overview]
Message-ID: <20071023165130.5536.73345.stgit@novell1.haskins.net> (raw)
In-Reply-To: <20071023164156.5536.95573.stgit@novell1.haskins.net>

Some RT tasks (particularly kthreads) are bound to one specific CPU.
It is fairly common for one or more bound tasks to get queued up at the
same time.  Consider, for instance, softirq_timer and softirq_sched.  A
timer goes off in an ISR which schedules softirq_thread to run at RT50.
Then during the handling of the timer, the system determines that it's
time to smp-rebalance the system so it schedules softirq_sched to run
from within the softirq_timer kthread context. So we are in a situation
where we have two RT50 tasks queued, and the system will go into
rt-overload condition to request other CPUs for help.

The problem is that these tasks cannot ever be pulled away since they
are already running on their one and only valid RQ.  However, the other
CPUs cannot determine that the tasks are unpullable without going
through expensive checks/locking.  Therefore the helping CPUs
experience unnecessary overhead/latencies regardless as they
ineffectively try to process the overload condition.

This patch tries to optimize the situation by utilizing the hamming
weight of the task->cpus_allowed mask.  A weight of 1 indicates that
the task cannot be migrated, which may be utilized by the overload
handling code to eliminate unnecessary rebalance attempts.  We also
introduce a per-rq variable to count the number of migratable tasks
that are currently running.  We only go into overload if we have more
than one RT task, AND at least one of them is migratable.

Calculating the weight is probably relatively expensive, so it is only
done when the cpus_allowed mask is updated (which should be relatively
infrequent, especially compared to scheduling frequency) and cached in
the task_struct.


Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 include/linux/sched.h |    1 
 kernel/fork.c         |    1 
 kernel/sched.c        |  121 +++++++++++++++++++++++++++++++++++++------------
 3 files changed, 94 insertions(+), 29 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7a3829f..b657c13 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1144,6 +1144,7 @@ struct task_struct {
 
 	unsigned int policy;
 	cpumask_t cpus_allowed;
+	int nr_cpus_allowed;
 	unsigned int time_slice;
 
 #ifdef CONFIG_PREEMPT_RCU
diff --git a/kernel/fork.c b/kernel/fork.c
index 5f11f23..f808e18 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1257,6 +1257,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	 */
 	preempt_disable();
 	p->cpus_allowed = current->cpus_allowed;
+	p->nr_cpus_allowed = current->nr_cpus_allowed;
 	if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
 			!cpu_online(task_cpu(p))))
 		set_task_cpu(p, smp_processor_id());
diff --git a/kernel/sched.c b/kernel/sched.c
index bdb6be0..9120b41 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -305,6 +305,7 @@ struct rq {
 
 #ifdef CONFIG_PREEMPT_RT
 	unsigned long rt_nr_running;
+	unsigned long rt_nr_migratory;
 	unsigned long rt_nr_uninterruptible;
 	int highest_prio;
 #endif
@@ -665,20 +666,43 @@ static inline struct rq *this_rq_lock(void)
 #if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
 static __cacheline_aligned_in_smp atomic_t rt_overload;
 static cpumask_t rto_cpus;
-#endif
+
+static void inc_rt_migration(struct rq *rq)
+{
+	rq->rt_nr_migratory++;
+
+	if ((rq->rt_nr_running > 1) && !cpu_isset(rq->cpu, rto_cpus)) {
+		cpu_set(rq->cpu, rto_cpus);
+		smp_wmb();
+		atomic_inc(&rt_overload);
+	}
+}
+
+static void dec_rt_migration(struct rq *rq)
+{
+	WARN_ON(!rq->rt_nr_migratory);
+	rq->rt_nr_migratory--;
+
+	if (((rq->rt_nr_running <= 1) || !rq->rt_nr_migratory)
+	    && cpu_isset(rq->cpu, rto_cpus)) {
+		atomic_dec(&rt_overload);
+		cpu_clear(rq->cpu, rto_cpus);
+	}
+}
+
+#else
+#define inc_rt_migration(rq) do { } while(0)
+#define dec_rt_migration(rq) do { } while(0)
+#endif /* defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP) */
+
 
 static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
 {
 #ifdef CONFIG_PREEMPT_RT
 	if (rt_task(p)) {
 		rq->rt_nr_running++;
-# ifdef CONFIG_SMP
-		if (rq->rt_nr_running == 2) {
-			cpu_set(rq->cpu, rto_cpus);
-			smp_wmb();
-			atomic_inc(&rt_overload);
-		}
-# endif
+		if (p->nr_cpus_allowed > 1)
+			inc_rt_migration(rq);
 	}
 #endif
 }
@@ -689,12 +713,8 @@ static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq)
 	if (rt_task(p)) {
 		WARN_ON(!rq->rt_nr_running);
 		rq->rt_nr_running--;
-# ifdef CONFIG_SMP
-		if (rq->rt_nr_running == 1) {
-			atomic_dec(&rt_overload);
-			cpu_clear(rq->cpu, rto_cpus);
-		}
-# endif
+		if (p->nr_cpus_allowed > 1)
+			dec_rt_migration(rq);
 	}
 #endif
 }
@@ -1526,6 +1546,27 @@ next_in_queue:
 
 static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
 
+static int lock_migration_target(struct task_struct *p, struct rq *target_rq)
+{
+	struct rq *this_rq = task_rq(p);
+
+	if (double_lock_balance(this_rq, target_rq)) {
+		/*
+		 * We had to unlock the run queue. In
+		 * the mean time, task could have
+		 * migrated already or had its affinity changed.
+		 */
+		if (unlikely(task_running(this_rq, p) ||
+			     task_rq(p) != this_rq ||
+			     !cpu_isset(target_rq->cpu, p->cpus_allowed))) {
+			spin_unlock(&target_rq->lock);
+			return 0;
+		}
+	}	
+	
+	return 1;
+}
+
 /* Only try this algorithm three times */
 #define RT_PUSH_MAX_TRIES 3
 
@@ -1537,6 +1578,28 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task,
 	int cpu;
 	int tries;
 
+	/*
+	 * We can optimize if the hamming weight of the cpus_allowed mask
+	 * is 1 because the task has nowhere to go but one CPU.  So don't
+	 * waste the time trying to find the lowest RQ in this case.
+	 */
+	if (task->nr_cpus_allowed == 1) {
+		/* If the task is already on the RQ, we are done */
+		if (cpu_isset(this_rq->cpu, task->cpus_allowed))
+			return NULL;
+
+		cpu = first_cpu(task->cpus_allowed);
+
+		lowest_rq = cpu_rq(cpu);
+		BUG_ON(this_rq == lowest_rq);
+
+		/* Otherwise, we can simply grab the new RQ */
+		if (lock_migration_target(task, lowest_rq))
+			return lowest_rq;
+		else
+			return NULL;
+	}
+
 	for (tries = 0; tries < RT_PUSH_MAX_TRIES; tries++) {
 		cpu = cpupri_find(this_rq->cpu, task);
 
@@ -1546,21 +1609,8 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task,
 		lowest_rq = cpu_rq(cpu);
 
 		/* if the prio of this runqueue changed, try again */
-		if (double_lock_balance(this_rq, lowest_rq)) {
-			/*
-			 * We had to unlock the run queue. In
-			 * the mean time, task could have
-			 * migrated already or had its affinity changed.
-			 */
-			if (unlikely(task_running(this_rq, task) ||
-				     task_rq(task) != this_rq ||
-				     !cpu_isset(lowest_rq->cpu, task->cpus_allowed))) {
-				spin_unlock(&lowest_rq->lock);
-				lowest_rq = NULL;
-				break;
-			}
-
-		}
+		if (!lock_migration_target(task, lowest_rq))
+			return NULL;
 
 		/* If this rq is still suitable use it. */
 		if (lowest_rq->highest_prio > task->prio)
@@ -5794,6 +5844,7 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
 	unsigned long flags;
 	struct rq *rq;
 	int ret = 0;
+	int weight = cpus_weight(new_mask);
 
 	rq = task_rq_lock(p, &flags);
 	if (!cpus_intersects(new_mask, cpu_online_map)) {
@@ -5801,7 +5852,19 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
 		goto out;
 	}
 
+#ifdef CONFIG_SMP
+	if (rt_task(p) && task_running(rq,p)) {
+		if (weight != p->nr_cpus_allowed) {
+			if ((p->nr_cpus_allowed <= 1) && (weight > 1))
+				inc_rt_migration(rq);
+			else if((p->nr_cpus_allowed > 1) && (weight <= 1))
+				dec_rt_migration(rq);
+		}
+	}
+#endif
 	p->cpus_allowed = new_mask;
+	p->nr_cpus_allowed = weight;
+
 	/* Can the task run on the task's current CPU? If so, we're done */
 	if (cpu_isset(task_cpu(p), new_mask))
 		goto out;


  parent reply	other threads:[~2007-10-23 17:09 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-10-23 16:50 [PATCH 00/13] Balance RT tasks v5 Gregory Haskins
2007-10-23 16:50 ` [PATCH 01/13] RT: push-rt Gregory Haskins
2007-10-23 16:50 ` [PATCH 02/13] RT: Condense the next-task search into one function Gregory Haskins
2007-10-23 16:50 ` [PATCH 03/13] RT: Add a per-cpu rt_overload indication Gregory Haskins
2007-10-23 16:50 ` [PATCH 04/13] RT: Wrap the RQ notion of priority to make it conditional Gregory Haskins
2007-10-23 16:50 ` [PATCH 05/13] RT: Initialize the priority value Gregory Haskins
2007-10-23 16:50 ` [PATCH 06/13] RT: Maintain the highest RQ priority Gregory Haskins
2007-10-23 16:50 ` [PATCH 07/13] RT: Clean up some of the push-rt logic Gregory Haskins
2007-10-23 16:51 ` [PATCH 08/13] RT: Add support for low-priority wake-up to push_rt feature Gregory Haskins
2007-10-23 16:51 ` [PATCH 09/13] RT: Only dirty a cacheline if the priority is actually changing Gregory Haskins
2007-10-23 16:51 ` [PATCH 10/13] RT: Fixes for push-rt patch Gregory Haskins
2007-10-23 16:51 ` [PATCH 11/13] RT: Condense NORMAL and IDLE priorities Gregory Haskins
2007-10-23 16:51 ` [PATCH 12/13] RT: CPU priority management Gregory Haskins
2007-10-23 16:51 ` Gregory Haskins [this message]
2007-10-24  0:19   ` [PATCH 13/13] RT: Cache cpus_allowed weight for optimizing migration Ingo Oeser
2007-10-24  3:20     ` Gregory Haskins

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20071023165130.5536.73345.stgit@novell1.haskins.net \
    --to=ghaskins@novell.com \
    --cc=a.p.zijlstra@chello.nl \
    --cc=dmitry.adamushko@gmail.com \
    --cc=dvhltc@us.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-rt-users@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=rostedt@goodmis.org \
    --subject='Re: [PATCH 13/13] RT: Cache cpus_allowed weight for optimizing migration' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).