LKML Archive on lore.kernel.org
* [PATCH] sched: staircase deadline misc fixes
@ 2007-03-28 16:37 Con Kolivas
  2007-03-28 17:34 ` [ck] " Prakash Punnoor
                   ` (3 more replies)
  0 siblings, 4 replies; 92+ messages in thread
From: Con Kolivas @ 2007-03-28 16:37 UTC (permalink / raw)
  To: linux list, Andrew Morton, Ingo Molnar, Andy Whitcroft, ck list

test.kernel.org found some idle time regressions in the latest update to
the staircase deadline scheduler, and Andy Whitcroft helped me track down
the offending problem, which was present in all previous RSDL schedulers
but would not manifest there without changes in nice. So here is a bugfix
for the incorrectly ordered set_load_weight() call and a few other minor
improvements. Thanks Andy!

I'm cautiously optimistic that we're at the thin edge of the bugfix wedge now.

---
set_load_weight() should be performed after p->quota is set. This fixes a
large SMP performance regression.

Make sure rr_interval is never set to less than one jiffy.

Some sanity checking in update_cpu_clock() will prevent bogus sched_clock()
values.

SCHED_BATCH tasks should not set the rq->best_static_prio field.

Correct sysctl rr_interval description to describe the value in milliseconds.

Style fixes.
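
To illustrate the rr_interval clamp above, a stand-alone user-space
sketch (HZ=250 is an assumed config; any HZ below 1000 shows the
truncation being guarded against):

#include <stdio.h>

#define HZ 250			/* assumed kernel config */
#define MS_TO_JIFFIES(TIME)	((TIME) * HZ / 1000)

int main(void)
{
	/* a 3ms rr_interval truncates to 0 jiffies at HZ=250, hence the
	 * "never less than one jiffy" clamp */
	printf("MS_TO_JIFFIES(3) = %d\n", MS_TO_JIFFIES(3));
	return 0;
}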

Signed-off-by: Con Kolivas <kernel@kolivas.org>

---
 Documentation/sysctl/kernel.txt |    8 ++--
 kernel/sched.c                  |   73 +++++++++++++++++++++++++++++-----------
 2 files changed, 58 insertions(+), 23 deletions(-)

Index: linux-2.6.21-rc5-mm2/kernel/sched.c
===================================================================
--- linux-2.6.21-rc5-mm2.orig/kernel/sched.c	2007-03-28 09:01:03.000000000 +1000
+++ linux-2.6.21-rc5-mm2/kernel/sched.c	2007-03-29 00:02:33.000000000 +1000
@@ -88,10 +88,13 @@ unsigned long long __attribute__((weak))
 #define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
 #define SCHED_PRIO(p)		((p)+MAX_RT_PRIO)
 
-/* Some helpers for converting to/from nanosecond timing */
+/* Some helpers for converting to/from various scales. */
 #define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
-#define NS_TO_MS(TIME)		((TIME) / 1000000)
+#define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
 #define MS_TO_NS(TIME)		((TIME) * 1000000)
+/* Can return 0 */
+#define MS_TO_JIFFIES(TIME)	((TIME) * HZ / 1000)
+#define JIFFIES_TO_MS(TIME)	((TIME) * 1000 / HZ)
 
 #define TASK_PREEMPTS_CURR(p, curr)	((p)->prio < (curr)->prio)
 
@@ -852,16 +855,15 @@ static void requeue_task(struct task_str
 
 /*
  * task_timeslice - the total duration a task can run during one major
- * rotation.
+ * rotation. Returns value in jiffies.
  */
 static inline int task_timeslice(struct task_struct *p)
 {
-	int slice, rr;
+	int slice;
 
-	slice = rr = p->quota;
+	slice = NS_TO_JIFFIES(p->quota);
 	if (!rt_task(p))
-		slice += (PRIO_RANGE - 1 - TASK_USER_PRIO(p)) * rr;
-	slice = NS_TO_JIFFIES(slice) ? : 1;
+		slice += (PRIO_RANGE - 1 - TASK_USER_PRIO(p)) * slice;
 	return slice;
 }
 
@@ -875,7 +877,7 @@ static inline int task_timeslice(struct 
 	(((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
 #define TASK_LOAD_WEIGHT(p)	LOAD_WEIGHT(task_timeslice(p))
 #define RTPRIO_TO_LOAD_WEIGHT(rp)	\
-	(LOAD_WEIGHT((rr_interval + 20 + (rp))))
+	(LOAD_WEIGHT((MS_TO_JIFFIES(rr_interval) + 20 + (rp))))
 
 static void set_load_weight(struct task_struct *p)
 {
@@ -973,11 +975,15 @@ static int effective_prio(struct task_st
  * tick still. Below nice 0 they get progressively larger.
  * ie nice -6..0 = rr_interval. nice -10 = 2.5 * rr_interval
  * nice -20 = 10 * rr_interval. nice 1-19 = rr_interval / 2.
+ * Value returned is in nanoseconds.
  */
 static unsigned int rr_quota(struct task_struct *p)
 {
 	int nice = TASK_NICE(p), rr = rr_interval;
 
+	/* Ensure that rr_interval is at least 1 tick */
+	if (unlikely(!MS_TO_JIFFIES(rr)))
+		rr = rr_interval = JIFFIES_TO_MS(1) ? : 1;
 	if (!rt_task(p)) {
 		if (nice < -6) {
 			rr *= nice * nice;
@@ -3198,13 +3204,34 @@ EXPORT_PER_CPU_SYMBOL(kstat);
 /*
  * This is called on clock ticks and on context switches.
  * Bank in p->sched_time the ns elapsed since the last tick or switch.
+ * CPU scheduler quota accounting is also performed here.
+ * sched_clock() occasionally returns bogus values so some sanity
+ * checking is required.
  */
 static inline void
-update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
+update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now,
+		 int tick)
 {
 	cputime64_t time_diff = now - p->last_ran;
+	unsigned int min_diff = 1000;
 
-	/* cpu scheduler quota accounting is performed here */
+	if (tick) {
+		/*
+		 * Called from scheduler_tick() there should be less than two
+		 * jiffies worth, and not negative/overflow.
+		 */
+		if (time_diff > JIFFIES_TO_NS(2) || time_diff < min_diff)
+			time_diff = JIFFIES_TO_NS(1);
+	} else {
+		/*
+		 * Called from context_switch there should be less than one
+		 * jiffy worth, and not negative/overflowed. In the case when
+		 * sched_clock fails to return high resolution values this
+		 * also ensures at least 1 min_diff gets banked.
+		 */
+		if (time_diff > JIFFIES_TO_NS(1) || time_diff < min_diff)
+			time_diff = min_diff;
+	}
 	if (p != rq->idle && p->policy != SCHED_FIFO)
 		p->time_slice -= time_diff;
 	p->sched_time += time_diff;
@@ -3353,7 +3380,7 @@ void scheduler_tick(void)
 	int idle_at_tick = idle_cpu(cpu);
 	struct rq *rq = cpu_rq(cpu);
 
-	update_cpu_clock(p, rq, now);
+	update_cpu_clock(p, rq, now, 1);
 
 	if (!idle_at_tick)
 		task_running_tick(rq, p);
@@ -3425,7 +3452,7 @@ retry:
 	}
 	queue = array->queue + idx;
 	next = list_entry(queue->next, struct task_struct, run_list);
-	if (unlikely(next->time_slice < 0)) {
+	if (unlikely(next->time_slice <= 0)) {
 		/*
 		 * Unlucky enough that this task ran out of time_slice
 		 * before it hit a scheduler_tick so it should have its
@@ -3438,7 +3465,8 @@ retry:
 	}
 	rq->prio_level = idx;
 	next->rotation = rq->prio_rotation;
-	if (next->static_prio < rq->best_static_prio)
+	if (next->static_prio < rq->best_static_prio &&
+	    next->policy != SCHED_BATCH)
 		rq->best_static_prio = next->static_prio;
 	return next;
 }
@@ -3533,7 +3561,7 @@ switch_tasks:
 	clear_tsk_need_resched(prev);
 	rcu_qsctr_inc(task_cpu(prev));
 
-	update_cpu_clock(prev, rq, now);
+	update_cpu_clock(prev, rq, now, 0);
 	prev->timestamp = prev->last_ran = now;
 
 	sched_info_switch(prev, next);
@@ -3978,7 +4006,8 @@ void rt_mutex_setprio(struct task_struct
 	rq = task_rq_lock(p, &flags);
 
 	oldprio = p->prio;
-	if ((queued = task_queued(p)))
+	queued = task_queued(p);
+	if (queued)
 		dequeue_task(p, rq);
 	p->prio = prio;
 
@@ -4023,15 +4052,17 @@ void set_user_nice(struct task_struct *p
 		p->static_prio = NICE_TO_PRIO(nice);
 		goto out_unlock;
 	}
-	if ((queued = task_queued(p))) {
+	queued = task_queued(p);
+	if (queued) {
 		dequeue_task(p, rq);
 		dec_raw_weighted_load(rq, p);
 	}
 
 	p->static_prio = NICE_TO_PRIO(nice);
-	set_load_weight(p);
 	old_prio = p->prio;
 	p->prio = effective_prio(p);
+	p->quota = rr_quota(p);
+	set_load_weight(p);
 	delta = p->prio - old_prio;
 
 	if (queued) {
@@ -4045,7 +4076,6 @@ void set_user_nice(struct task_struct *p
 			resched_task(rq->curr);
 	}
 out_unlock:
-	p->quota = rr_quota(p);
 	task_rq_unlock(rq, &flags);
 }
 EXPORT_SYMBOL(set_user_nice);
@@ -4166,6 +4196,7 @@ static void __setscheduler(struct task_s
 	p->normal_prio = normal_prio(p);
 	/* we are holding p->pi_lock already */
 	p->prio = rt_mutex_getprio(p);
+	p->quota = rr_quota(p);
 	set_load_weight(p);
 }
 
@@ -4254,7 +4285,8 @@ recheck:
 		spin_unlock_irqrestore(&p->pi_lock, flags);
 		goto recheck;
 	}
-	if ((queued = task_queued(p)))
+	queued = task_queued(p);
+	if (queued)
 		deactivate_task(p, rq);
 	oldprio = p->prio;
 	__setscheduler(p, policy, param->sched_priority);
@@ -7088,7 +7120,8 @@ void normalize_rt_tasks(void)
 		spin_lock_irqsave(&p->pi_lock, flags);
 		rq = __task_rq_lock(p);
 
-		if ((queued = task_queued(p)))
+		queued = task_queued(p);
+		if (queued)
 			deactivate_task(p, task_rq(p));
 		__setscheduler(p, SCHED_NORMAL, 0);
 		if (queued) {
Index: linux-2.6.21-rc5-mm2/Documentation/sysctl/kernel.txt
===================================================================
--- linux-2.6.21-rc5-mm2.orig/Documentation/sysctl/kernel.txt	2007-03-28 09:01:03.000000000 +1000
+++ linux-2.6.21-rc5-mm2/Documentation/sysctl/kernel.txt	2007-03-28 09:01:04.000000000 +1000
@@ -294,9 +294,11 @@ rr_interval:
 This is the smallest duration that any cpu process scheduling unit
 will run for. Increasing this value can increase throughput of cpu
 bound tasks substantially but at the expense of increased latencies
-overall. This value is in _ticks_ and the default value chosen depends
-on the number of cpus available at scheduler initialisation. Valid
-values are from 1-100.
+overall. This value is in milliseconds and the default value chosen
+depends on the number of cpus available at scheduler initialisation
+with a minimum of 8.
+
+Valid values are from 1-100.
 
 ==============================================================
 

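As a reading aid, a stand-alone sketch of the nice-to-quota scaling
described in the rr_quota() comment above. The divisor of 40 is an
assumption inferred from the 2.5x and 10x figures quoted there; the
kernel code itself is authoritative:

#include <stdio.h>

/* multiplier applied to rr_interval per nice level, as per the comment:
 * nice -6..0 -> 1x, -10 -> 2.5x, -20 -> 10x, 1..19 -> 0.5x */
static double quota_multiplier(int nice)
{
	if (nice > 0)
		return 0.5;
	if (nice >= -6)
		return 1.0;
	return (double)(nice * nice) / 40.0;	/* assumed divisor */
}

int main(void)
{
	int nice;

	for (nice = -20; nice <= 15; nice += 5)
		printf("nice %3d: %4.1f * rr_interval\n",
		       nice, quota_multiplier(nice));
	return 0;
}
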
-- 
-ck

* Re: [ck] [PATCH] sched: staircase deadline misc fixes
  2007-03-28 16:37 [PATCH] sched: staircase deadline misc fixes Con Kolivas
@ 2007-03-28 17:34 ` Prakash Punnoor
  2007-04-01  6:40   ` Prakash Punnoor
  2007-03-28 18:48 ` Ingo Molnar
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 92+ messages in thread
From: Prakash Punnoor @ 2007-03-28 17:34 UTC (permalink / raw)
  To: ck; +Cc: Con Kolivas, linux list

[-- Attachment #1: Type: text/plain, Size: 848 bytes --]

Am Mittwoch 28 März 2007 schrieb Con Kolivas:
> I'm cautiously optimistic that we're at the thin edge of the bugfix wedge
> now.
>
> ---
> set_load_weight() should be performed after p->quota is set. This fixes a
> large SMP performance regression.

Hi, I am using 2.6.21-rc5 with rsdl 0.37 and think I still see a regression 
with my Athlon X2. Namely using this ac3 encoder 
(http://aften.sourceforge.net/), which I parallelized in a simple way, with 
my test sample I remember having encoding times of ~5.4sec with vanilla and 
~5.8 sec with rsdl - once the whole test wave is in cache. Otherwise you can 
easily I/O limit the encoder. ;-) You need to get sources from svn though. 
The current 0.06 release doesn't have threads support.

Cheers,

-- 
(°=                 =°)
//\ Prakash Punnoor /\\
V_/                 \_V


* Re: [PATCH] sched: staircase deadline misc fixes
  2007-03-28 16:37 [PATCH] sched: staircase deadline misc fixes Con Kolivas
  2007-03-28 17:34 ` [ck] " Prakash Punnoor
@ 2007-03-28 18:48 ` Ingo Molnar
  2007-03-28 23:44   ` Con Kolivas
  2007-03-29  6:36 ` Con Kolivas
  2007-04-23  8:58 ` Andrew Morton
  3 siblings, 1 reply; 92+ messages in thread
From: Ingo Molnar @ 2007-03-28 18:48 UTC (permalink / raw)
  To: Con Kolivas; +Cc: linux list, Andrew Morton


* Con Kolivas <kernel@kolivas.org> wrote:

> I'm cautiously optimistic that we're at the thin edge of the bugfix 
> wedge now.

hm, how about the questions Mike raised (there were a couple of cases of 
friction between 'the design as documented and announced' and 'the code 
as implemented')? As far as i saw they were still largely unanswered - 
but let me know if they are all answered and addressed:

 http://marc.info/?l=linux-kernel&m=117465220309006&w=2
 http://marc.info/?l=linux-kernel&m=117489673929124&w=2
 http://marc.info/?l=linux-kernel&m=117489831930240&w=2

and the numbers he posted:

 http://marc.info/?l=linux-kernel&m=117448900626028&w=2

his test conclusion was that under CPU load, RSDL (SD) generally does 
not hold up to mainline's interactivity.

	Ingo

* Re: [PATCH] sched: staircase deadline misc fixes
  2007-03-28 18:48 ` Ingo Molnar
@ 2007-03-28 23:44   ` Con Kolivas
  2007-03-29  5:50     ` Mike Galbraith
  0 siblings, 1 reply; 92+ messages in thread
From: Con Kolivas @ 2007-03-28 23:44 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: linux list, Andrew Morton, ck list

On Thursday 29 March 2007 04:48, Ingo Molnar wrote:
> hm, how about the questions Mike raised (there were a couple of cases of
> friction between 'the design as documented and announced' and 'the code
> as implemented')? As far as i saw they were still largely unanswered -
> but let me know if they are all answered and addressed:

I spent less time emailing and more time coding. I have been working on 
addressing whatever people brought up.

>  http://marc.info/?l=linux-kernel&m=117465220309006&w=2

Attended to.

>  http://marc.info/?l=linux-kernel&m=117489673929124&w=2

Attended to.

>  http://marc.info/?l=linux-kernel&m=117489831930240&w=2

Checked fine.

> and the numbers he posted:
>
>  http://marc.info/?l=linux-kernel&m=117448900626028&w=2

Attended to.

> his test conclusion was that under CPU load, RSDL (SD) generally does
> not hold up to mainline's interactivity.

There have been improvements since the earlier iterations but it's still a 
fairness based design. Mike's "sticking point" test case should be improved 
as well.

My call based on my own testing and feedback from users is: 

Under niced loads it is 99% in favour of SD.

Under light loads it is 95% in favour of SD.

Under heavy loads it becomes proportionately in favour of mainline. The 
crossover is somewhere around a load of 4.

If the reluctance to renice X goes away I'd say it was 99% across the board 
and to much higher loads.

> 	Ingo

-- 
-ck

* Re: [PATCH] sched: staircase deadline misc fixes
  2007-03-28 23:44   ` Con Kolivas
@ 2007-03-29  5:50     ` Mike Galbraith
  2007-03-29  6:29       ` Mike Galbraith
                         ` (2 more replies)
  0 siblings, 3 replies; 92+ messages in thread
From: Mike Galbraith @ 2007-03-29  5:50 UTC (permalink / raw)
  To: Con Kolivas; +Cc: Ingo Molnar, linux list, Andrew Morton, ck list

On Thu, 2007-03-29 at 09:44 +1000, Con Kolivas wrote:
> On Thursday 29 March 2007 04:48, Ingo Molnar wrote:
> > hm, how about the questions Mike raised (there were a couple of cases of
> > friction between 'the design as documented and announced' and 'the code
> > as implemented')? As far as i saw they were still largely unanswered -
> > but let me know if they are all answered and addressed:
> 
> I spent less time emailing and more time coding. I have been working on 
> addressing whatever people brought up.
> 
> >  http://marc.info/?l=linux-kernel&m=117465220309006&w=2
> 
> Attended to.
> 
> >  http://marc.info/?l=linux-kernel&m=117489673929124&w=2
> 
> Attended to.
> 
> >  http://marc.info/?l=linux-kernel&m=117489831930240&w=2
> 
> Checked fine.

That one's not fine.

+static void recalc_task_prio(struct task_struct *p, struct rq *rq)
+{
+	struct prio_array *array = rq->active;
+	int queue_prio;
+
+	update_if_moved(p, rq);
+	if (p->rotation == rq->prio_rotation) {
+		if (p->array == array) {
+			if (p->time_slice > 0)
+				return;
+			p->time_slice = p->quota;
+		} else if (p->array == rq->expired) {

You implemented nanosecond accounting, but here you hand a shiny new $8
bill to a task which has either missed the tick often enough, or
accumulated enough cross-cpu clock drift, to have an I.O.U. in its
wallet.
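
In miniature (a stand-alone sketch of the refill quoted above; the ns
figures are purely illustrative):

#include <stdio.h>

int main(void)
{
	long quota = 8000000;		/* an 8ms quota in ns, illustrative */
	long time_slice = -3000000;	/* the task overdrew by 3ms */

	/* the recalc_task_prio() refill above: debt is not carried over */
	if (time_slice <= 0)
		time_slice = quota;

	printf("refilled to %ld ns; the 3ms I.O.U. is forgiven\n", time_slice);
	return 0;
}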

WRT clock drift/timewarps, your latest code cedes that these do occur,
but where these timewarps can be anywhere from minuscule (Intel
same-package processors) up to a tick elsewhere, it charges a full tick.
 
-	/* cpu scheduler quota accounting is performed here */
+	if (tick) {
+		/*
+		 * Called from scheduler_tick() there should be less than two
+		 * jiffies worth, and not negative/overflow.
+		 */
+		if (time_diff > JIFFIES_TO_NS(2) || time_diff < min_diff)
+			time_diff = JIFFIES_TO_NS(1);
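
Concretely, a stand-alone sketch of that tick-path clamp (HZ=1000 and
the patch's min_diff of 1000 assumed): a warped ~100ns reading gets
billed as a full tick.

#include <stdio.h>

#define HZ 1000			/* assumed */
#define JIFFIES_TO_NS(TIME)	((unsigned long long)(TIME) * (1000000000 / HZ))

int main(void)
{
	unsigned long long time_diff = 100;	/* bogus/warped interval, ns */
	unsigned int min_diff = 1000;

	/* the clamp quoted above */
	if (time_diff > JIFFIES_TO_NS(2) || time_diff < min_diff)
		time_diff = JIFFIES_TO_NS(1);

	printf("billed %llu ns for a ~100 ns interval\n", time_diff);
	return 0;
}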

> > and the numbers he posted:
> >
> >  http://marc.info/?l=linux-kernel&m=117448900626028&w=2
> 
> Attended to.

Hm.  How, where?

I'm getting inconsistent results with current, but sleeping tasks still
don't _appear_ to be able to compete with hogs on an equal footing, and
I don't see how they really can.

What happens if a sleeper sleeps after using, say, half of its slice, and
the hog it's sharing the CPU with then sleeps briefly after using most
of its slice?  That's the end of the rotation.  They are put back on an
equal footing, but what just happened to the differential in cpu usage?
Say the sleeper had 4 ms of an 8 ms quota left and the hog 0.1 ms: both
get a fresh quota, and the sleeper's 3.9 ms advantage simply evaporates.

> > his test conclusion was that under CPU load, RSDL (SD) generally does
> > not hold up to mainline's interactivity.
> 
> There have been improvements since the earlier iterations but it's still a 
> fairness based design. Mike's "sticking point" test case should be improved 
> as well.

The behavior is different, and is less ragged, but I wouldn't say it's
really been improved.  The below was added as a workaround.

+ * This contains a bitmap for each dynamic priority level with empty slots
+ * for the valid priorities each different nice level can have. It allows
+ * us to stagger the slots where differing priorities run in a way that
+ * keeps latency differences between different nice levels at a minimum.
+ * ie, where 0 means a slot for that priority, priority running from left to
+ * right:
+ * nice -20 0000000000000000000000000000000000000000
+ * nice -10 1001000100100010001001000100010010001000
+ * nice   0 0101010101010101010101010101010101010101
+ * nice   5 1101011010110101101011010110101101011011
+ * nice  10 0110111011011101110110111011101101110111
+ * nice  15 0111110111111011111101111101111110111111
+ * nice  19 1111111111111111111011111111111111111111

I don't really know what to say about this.  I think it explains reduced
context switching, but I don't see how this could be a good thing.
Consider a nice -20 fast/light task trying to get CPU with nice 0 tasks
being constantly spawned.  How can this latency-bound fast mover perform
if it can't preempt?  What am I missing?

> My call based on my own testing and feedback from users is: 
> 
> Under niced loads it is 99% in favour of SD.
> 
> Under light loads it is 95% in favour of SD.
> 
> Under heavy loads it becomes proportionately in favour of mainline. The 
> crossover is somewhere around a load of 4.

Opinion polls are nice, but I'm more interested in gathering numbers
which either validate or invalidate the claims of the design documents.
 
WRT this subjective opinion thing, I see regressions with all loads, and
I don't see what a < 95% load really means.  If CPU isn't contended,
dishing it out is dirt simple.  Just give everybody frequent, and fairly
short chunks, and everybody is fairly happy.  The only time scheduling
becomes interesting is when there IS contention, and mainline seems to
do much better at this, with the caveat that the history mechanism
indeed doesn't always get it right.

	-Mike


* Re: [PATCH] sched: staircase deadline misc fixes
  2007-03-29  5:50     ` Mike Galbraith
@ 2007-03-29  6:29       ` Mike Galbraith
  2007-03-29  6:54         ` Mike Galbraith
  2007-03-29  8:18       ` Mike Galbraith
  2007-04-03  2:37       ` Con Kolivas
  2 siblings, 1 reply; 92+ messages in thread
From: Mike Galbraith @ 2007-03-29  6:29 UTC (permalink / raw)
  To: Con Kolivas; +Cc: Ingo Molnar, linux list, Andrew Morton, ck list

On Thu, 2007-03-29 at 07:50 +0200, Mike Galbraith wrote:

> Opinion polls are nice, but I'm more interested in gathering numbers
> which either validate or invalidate the claims of the design documents.

Suggestion: try the testcase that Satoru Takeuchi posted.  The numbers I
got with latest SD were no better than the numbers I got with the patch
I posted to try to solve it.  Seems to me the numbers with SD should
have been much better, but they in fact were not.

Running that thing, mainline's GUI was not usable, even with my patch,
but neither was it usable with SD.  What's the difference between
horrible with mainline and merely terrible with SD?  In both, the GUI
ends up doing round-robin with a slew of hogs.  In mainline, this
happens because the history logic can and does get it wrong sometimes,
which this exploit deliberately triggers.  With SD, it's by design.

	-Mike


* Re: [PATCH] sched: staircase deadline misc fixes
  2007-03-28 16:37 [PATCH] sched: staircase deadline misc fixes Con Kolivas
  2007-03-28 17:34 ` [ck] " Prakash Punnoor
  2007-03-28 18:48 ` Ingo Molnar
@ 2007-03-29  6:36 ` Con Kolivas
  2007-04-23  8:58 ` Andrew Morton
  3 siblings, 0 replies; 92+ messages in thread
From: Con Kolivas @ 2007-03-29  6:36 UTC (permalink / raw)
  To: linux list; +Cc: Andrew Morton, Ingo Molnar, Andy Whitcroft, ck list

On Thursday 29 March 2007 02:37, Con Kolivas wrote:
> I'm cautiously optimistic that we're at the thin edge of the bugfix wedge
> now.

My neck condition got a lot worse today. I'm forced offline for a week and 
will be uncontactable.

-- 
-ck

* Re: [PATCH] sched: staircase deadline misc fixes
  2007-03-29  6:29       ` Mike Galbraith
@ 2007-03-29  6:54         ` Mike Galbraith
  0 siblings, 0 replies; 92+ messages in thread
From: Mike Galbraith @ 2007-03-29  6:54 UTC (permalink / raw)
  To: Con Kolivas; +Cc: Ingo Molnar, linux list, Andrew Morton, ck list

Oh my, I'm on a roll here... somebody stop me ;-)

Some emphasis:

On Thu, 2007-03-29 at 08:29 +0200, Mike Galbraith wrote:
> On Thu, 2007-03-29 at 07:50 +0200, Mike Galbraith wrote:
> 
> > Opinion polls are nice, but I'm more interested in gathering numbers
> > which either validate or invalidate the claims of the design documents.
> 
> Suggestion: try the testcase that Satoru Takeuchi posted.  The numbers I
> got with latest SD were no better than the numbers I got with the patch
> I posted to try to solve it.  Seems to me the numbers with SD should
> have been much better, but they in fact were not.
> 
> Running that thing, mainline's GUI was not usable, even with my patch,
> but neither was it usable with SD.  What's the difference between
> horrible with mainline and merely terrible with SD?  In both, the GUI
> ends up doing round-robin with a slew of hogs.  In mainline, this
> happens because the history logic can and does get it wrong sometimes,
> which this exploit deliberately triggers.  With SD, it's by design.

The much maligned history mechanism in mainline didn't start its life
as an interactivity estimator, that's a name it acquired later.  What it
was first put there for was to ensure fairness for sleeping tasks.

I found it most ironic that the numbers I posted showed that mechanism
working perfectly, with an exploit that was designed specifically to
expose its weakness, despite the deliberate tweaks that have gone in
skewing it very heavily in the unfair direction, and this went
uncommented.  If I had run more of them, it would have shown that
weakness very well.  We all know that weakness exists.

What the numbers clearly showed was that sleeping tasks did not get the
fairness RSDL advertised with the particular test I ran, yet it went
uncommented/uncontested.  Anyone could have tested with the trivial
proggy of their choice... but nobody did.

The history mechanism is not only about interactivity, and never was. 

	-Mike

I'm gonna go piddle around with code now, much more fun than yacking :)


* Re: [PATCH] sched: staircase deadline misc fixes
  2007-03-29  5:50     ` Mike Galbraith
  2007-03-29  6:29       ` Mike Galbraith
@ 2007-03-29  8:18       ` Mike Galbraith
  2007-03-29 12:55         ` [ck] " michael chang
  2007-04-03  2:35         ` Con Kolivas
  2007-04-03  2:37       ` Con Kolivas
  2 siblings, 2 replies; 92+ messages in thread
From: Mike Galbraith @ 2007-03-29  8:18 UTC (permalink / raw)
  To: Con Kolivas; +Cc: Ingo Molnar, linux list, Andrew Morton, ck list

Rereading to make sure I wasn't unclear anywhere...

On Thu, 2007-03-29 at 07:50 +0200, Mike Galbraith wrote:
>  
> I don't see what a < 95% load really means.

Egad.  Here I'm pondering the numbers and light load as I'm typing, and
my fingers (seemingly independent when mind wanders off) typed < 95% as
in not fully committed, instead of "light".

	-Mike


* Re: [ck] Re: [PATCH] sched: staircase deadline misc fixes
  2007-03-29  8:18       ` Mike Galbraith
@ 2007-03-29 12:55         ` michael chang
  2007-04-03  2:35         ` Con Kolivas
  1 sibling, 0 replies; 92+ messages in thread
From: michael chang @ 2007-03-29 12:55 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Con Kolivas, ck list, linux list, Andrew Morton

On 3/29/07, Mike Galbraith <efault@gmx.de> wrote:
> Rereading to make sure I wasn't unclear anywhere...
>
> On Thu, 2007-03-29 at 07:50 +0200, Mike Galbraith wrote:
> >
> > I don't see what a < 95% load really means.
>
> Egad.  Here I'm pondering the numbers and light load as I'm typing, and
> my fingers (seemingly independent when mind wanders off) typed < 95% as
> in not fully committed, instead of "light".

While I don't know the _exact_ figure for this, my hunch is that a
good ballpark figure is anything that is not a heavy load (less than
4, perhaps even lower, maybe <0.75 or <2?) and that is not a "niced"
load.

-- 
-- Michael Chang
~Just the crazy copy cat~

* Re: [ck] [PATCH] sched: staircase deadline misc fixes
  2007-03-28 17:34 ` [ck] " Prakash Punnoor
@ 2007-04-01  6:40   ` Prakash Punnoor
       [not found]     ` <b14e81f00704010724i3155a16en91074ab789416f3d@mail.gmail.com>
  0 siblings, 1 reply; 92+ messages in thread
From: Prakash Punnoor @ 2007-04-01  6:40 UTC (permalink / raw)
  To: ck; +Cc: linux list

[-- Attachment #1: Type: text/plain, Size: 1178 bytes --]

Am Mittwoch 28 März 2007 schrieb Prakash Punnoor:
> Am Mittwoch 28 März 2007 schrieb Con Kolivas:
> > I'm cautiously optimistic that we're at the thin edge of the bugfix wedge
> > now.
> >
> > ---
> > set_load_weight() should be performed after p->quota is set. This fixes a
> > large SMP performance regression.
>
> Hi, I am using 2.6.21-rc5 with rsdl 0.37 and think I still see a regression
> with my Athlon X2. Namely using this ac3 encoder
> (http://aften.sourceforge.net/), which I parallelized in a simple way, with
> my test sample I remember having encoding times of ~5.4sec with vanilla and
> ~5.8 sec with rsdl - once the whole test wave is in cache. Otherwise you
> can easily I/O limit the encoder. ;-) You need to get sources from svn
> though. The current 0.06 release doesn't have threads support.

BTW, I confirmed this regression. With vanilla 2.6.21-rc5 I get back my 5.4 
secs with the test sample and two threads. Furthermore, vanilla actually 
feels nicer to me on my dual core, even with load - just subjectively; 
that's why I ditched rsdl...

Cheers,
-- 
(°=                 =°)
//\ Prakash Punnoor /\\
V_/                 \_V


* Re: [ck] [PATCH] sched: staircase deadline misc fixes
       [not found]     ` <b14e81f00704010724i3155a16en91074ab789416f3d@mail.gmail.com>
@ 2007-04-01 20:03       ` Prakash Punnoor
  0 siblings, 0 replies; 92+ messages in thread
From: Prakash Punnoor @ 2007-04-01 20:03 UTC (permalink / raw)
  To: michael chang; +Cc: ck, linux list

[-- Attachment #1: Type: text/plain, Size: 1261 bytes --]

Am Sonntag 01 April 2007 schrieb michael chang:
> On 4/1/07, Prakash Punnoor <prakash@punnoor.de> wrote:
> > Am Mittwoch 28 März 2007 schrieb Prakash Punnoor:

> > >
> > > Hi, I am using 2.6.21-rc5 with rsdl 0.37 and think I still see a
> > > regression with my Athlon X2. Namely using this ac3 encoder
> > > (http://aften.sourceforge.net/), which I parallelized in a simple way,
> > > with my test sample I remember having encoding times of ~5.4sec with
> > > vanilla and ~5.8 sec with rsdl - once the whole test wave is in cache.

> > BTW, I confirmed this regression. With vanilla 2.76.21-rc5 I get back my
> > 5.4 secs with the test sample and two threads. Furtmermore for me vanilla
>
> Which version of RSDL were you comparing to 2.6.21-rc5? Did you try
> the patch in the first message (http://lkml.org/lkml/2007/3/28/146)?
> The patch that _began_ this thread had SMP fixes in it... (Also, IIRC,
> the latest version of the scheduler no longer has the rotating
> component - so it's just SD now.)

As I said, I tried 0.37. Didn't it have the fix included? Actually I am 
reluctant to go back to (r)sdl, as it hasn't shown improvements for me yet.

-- 
(°=                 =°)
//\ Prakash Punnoor /\\
V_/                 \_V


* Re: [PATCH] sched: staircase deadline misc fixes
  2007-03-29  8:18       ` Mike Galbraith
  2007-03-29 12:55         ` [ck] " michael chang
@ 2007-04-03  2:35         ` Con Kolivas
  1 sibling, 0 replies; 92+ messages in thread
From: Con Kolivas @ 2007-04-03  2:35 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Ingo Molnar, linux list, Andrew Morton, ck list

On Thursday 29 March 2007 18:18, Mike Galbraith wrote:
> Rereading to make sure I wasn't unclear anywhere...
>
> On Thu, 2007-03-29 at 07:50 +0200, Mike Galbraith wrote:
> > I don't see what a < 95% load really means.
>
> Egad.  Here I'm pondering the numbers and light load as I'm typing, and
> my fingers (seemingly independent when mind wanders off) typed < 95% as
> in not fully committed, instead of "light".

95% of cases where load is less than 4; not 95% load.

-- 
-ck

* Re: [PATCH] sched: staircase deadline misc fixes
  2007-03-29  5:50     ` Mike Galbraith
  2007-03-29  6:29       ` Mike Galbraith
  2007-03-29  8:18       ` Mike Galbraith
@ 2007-04-03  2:37       ` Con Kolivas
  2007-04-03  5:31         ` Mike Galbraith
  2 siblings, 1 reply; 92+ messages in thread
From: Con Kolivas @ 2007-04-03  2:37 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Ingo Molnar, linux list, Andrew Morton, ck list

[-- Attachment #1: Type: text/plain, Size: 1059 bytes --]

On Thursday 29 March 2007 15:50, Mike Galbraith wrote:
> On Thu, 2007-03-29 at 09:44 +1000, Con Kolivas wrote:
> + * This contains a bitmap for each dynamic priority level with empty slots
> + * for the valid priorities each different nice level can have. It allows
> + * us to stagger the slots where differing priorities run in a way that
> + * keeps latency differences between different nice levels at a minimum.
> + * ie, where 0 means a slot for that priority, priority running from left
> to + * right:
> + * nice -20 0000000000000000000000000000000000000000
> + * nice -10 1001000100100010001001000100010010001000
> + * nice   0 0101010101010101010101010101010101010101
> + * nice   5 1101011010110101101011010110101101011011
> + * nice  10 0110111011011101110110111011101101110111
> + * nice  15 0111110111111011111101111101111110111111
> + * nice  19 1111111111111111111011111111111111111111

Try two instances of chew.c at _differing_ nice levels on one cpu on mainline, 
and then SD. This is why you can't renice X on mainline.

> 	-Mike

-- 
-ck

[-- Attachment #2: chew.c --]
[-- Type: text/x-csrc, Size: 1027 bytes --]

/*
 * original idea by Chris Friesen.  Thanks.
 *
 * Build with e.g. "gcc -o chew chew.c" and run one instance per nice
 * level, pinned to one cpu, to compare schedulers.
 */

#include <stdio.h>
#include <unistd.h>
#include <sched.h>
#include <sys/time.h>
#include <sys/resource.h>

#define THRESHOLD_USEC 2000

unsigned long long stamp()
{
        struct timeval tv;
        gettimeofday(&tv, 0);
        return (unsigned long long) tv.tv_usec + ((unsigned long long) tv.tv_sec)*1000000;
}

int main()
{
        unsigned long long thresh_ticks = THRESHOLD_USEC;
        unsigned long long cur,last;
        struct timespec ts;

        sched_rr_get_interval(0, &ts);
        printf("pid %d, prio %3d, interval of %ld nsec\n", getpid(),
               getpriority(PRIO_PROCESS, 0), ts.tv_nsec);

        last = stamp();
        while(1) {
                cur = stamp();
                unsigned long long delta = cur-last;
                if (delta > thresh_ticks) {
                        printf("pid %d, prio %3d, out for %4llu ms\n", getpid(), getpriority(PRIO_PROCESS, 0), delta/1000);
                        cur = stamp();
                }
                last = cur;
        }

        return 0;
}

* Re: [PATCH] sched: staircase deadline misc fixes
  2007-04-03  2:37       ` Con Kolivas
@ 2007-04-03  5:31         ` Mike Galbraith
  2007-04-03  6:00           ` Mike Galbraith
                             ` (2 more replies)
  0 siblings, 3 replies; 92+ messages in thread
From: Mike Galbraith @ 2007-04-03  5:31 UTC (permalink / raw)
  To: Con Kolivas; +Cc: Ingo Molnar, linux list, Andrew Morton, ck list

On Tue, 2007-04-03 at 12:37 +1000, Con Kolivas wrote:
> On Thursday 29 March 2007 15:50, Mike Galbraith wrote:
> > On Thu, 2007-03-29 at 09:44 +1000, Con Kolivas wrote:
> > + * This contains a bitmap for each dynamic priority level with empty slots
> > + * for the valid priorities each different nice level can have. It allows
> > + * us to stagger the slots where differing priorities run in a way that
> > + * keeps latency differences between different nice levels at a minimum.
> > + * ie, where 0 means a slot for that priority, priority running from left
> > to + * right:
> > + * nice -20 0000000000000000000000000000000000000000
> > + * nice -10 1001000100100010001001000100010010001000
> > + * nice   0 0101010101010101010101010101010101010101
> > + * nice   5 1101011010110101101011010110101101011011
> > + * nice  10 0110111011011101110110111011101101110111
> > + * nice  15 0111110111111011111101111101111110111111
> > + * nice  19 1111111111111111111011111111111111111111
> 
> Try two instances of chew.c at _differing_ nice levels on one cpu on mainline, 
> and then SD. This is why you can't renice X on mainline.

How about something more challenging instead :)

The numbers below are from my scheduler tree with massive_intr running
at nice 0, and chew at nice 5.  Below these numbers are 100 lines from
the exact center of chew's output.

(interactivity remains intact with this rather heavy load)

root@Homer: ./massive_intr 30 180
005671  00001506
005657  00001506
005651  00001491
005647  00001466
005661  00001484
005660  00001475
005645  00001514
005668  00001384
005673  00001516
005656  00001449
005664  00001512
005659  00001507
005667  00001513
005663  00001521
005670  00001440
005649  00001522
005652  00001487
005648  00001405
005665  00001472
005669  00001418
005662  00001489
005674  00001523
005650  00001480
005655  00001476
005672  00001530
005653  00001463
005654  00001427
005646  00001499
005658  00001510
005666  00001476

100 sequential lines from the middle of chew's logged output.

pid 5642, prio   5, out for    2 ms, ran for    1 ms, load  34%
pid 5642, prio   5, out for 1268 ms, ran for   63 ms, load   4%
pid 5642, prio   5, out for   52 ms, ran for    0 ms, load   0%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  14%
pid 5642, prio   5, out for    9 ms, ran for    1 ms, load  12%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  17%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  15%
pid 5642, prio   5, out for    9 ms, ran for    1 ms, load  17%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  15%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  12%
pid 5642, prio   5, out for    7 ms, ran for    1 ms, load  18%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  11%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  18%
pid 5642, prio   5, out for    4 ms, ran for    1 ms, load  22%
pid 5642, prio   5, out for 1395 ms, ran for   50 ms, load   3%
pid 5642, prio   5, out for   26 ms, ran for    0 ms, load   3%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  17%
pid 5642, prio   5, out for    7 ms, ran for    1 ms, load  15%
pid 5642, prio   5, out for    9 ms, ran for    1 ms, load  11%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  13%
pid 5642, prio   5, out for    7 ms, ran for    0 ms, load  11%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  11%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  14%
pid 5642, prio   5, out for    7 ms, ran for    1 ms, load  20%
pid 5642, prio   5, out for    7 ms, ran for    1 ms, load  14%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  13%
pid 5642, prio   5, out for 1400 ms, ran for   53 ms, load   3%
pid 5642, prio   5, out for   22 ms, ran for    1 ms, load   6%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  19%
pid 5642, prio   5, out for    7 ms, ran for    1 ms, load  19%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  19%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  19%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  19%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  18%
pid 5642, prio   5, out for    9 ms, ran for    1 ms, load  17%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  17%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  17%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  19%
pid 5642, prio   5, out for    2 ms, ran for    1 ms, load  49%
pid 5642, prio   5, out for 1281 ms, ran for   50 ms, load   3%
pid 5642, prio   5, out for   50 ms, ran for    0 ms, load   1%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  15%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  16%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  19%
pid 5642, prio   5, out for    7 ms, ran for    1 ms, load  17%
pid 5642, prio   5, out for    7 ms, ran for    1 ms, load  13%
pid 5642, prio   5, out for    7 ms, ran for    1 ms, load  11%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  12%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  19%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  17%
pid 5642, prio   5, out for    4 ms, ran for    1 ms, load  31%
pid 5642, prio   5, out for 1248 ms, ran for   53 ms, load   4%
pid 5642, prio   5, out for   44 ms, ran for    0 ms, load   1%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  16%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  17%
pid 5642, prio   5, out for    9 ms, ran for    1 ms, load  15%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  17%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  11%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  15%
pid 5642, prio   5, out for    7 ms, ran for    1 ms, load  19%
pid 5642, prio   5, out for    7 ms, ran for    1 ms, load  13%
pid 5642, prio   5, out for    7 ms, ran for    1 ms, load  15%
pid 5642, prio   5, out for 1311 ms, ran for   55 ms, load   4%
pid 5642, prio   5, out for  121 ms, ran for    0 ms, load   0%
pid 5642, prio   5, out for   22 ms, ran for    0 ms, load   1%
pid 5642, prio   5, out for    7 ms, ran for    1 ms, load  19%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  16%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  17%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  19%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  18%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  17%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  19%
pid 5642, prio   5, out for    7 ms, ran for    1 ms, load  19%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  12%
pid 5642, prio   5, out for    7 ms, ran for    1 ms, load  16%
pid 5642, prio   5, out for    6 ms, ran for    1 ms, load  14%
pid 5642, prio   5, out for 1289 ms, ran for   50 ms, load   3%
pid 5642, prio   5, out for   38 ms, ran for    0 ms, load   1%
pid 5642, prio   5, out for    9 ms, ran for    1 ms, load  14%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  17%
pid 5642, prio   5, out for    7 ms, ran for    1 ms, load  19%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  12%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  15%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  14%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  16%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  14%
pid 5642, prio   5, out for    9 ms, ran for    1 ms, load  11%
pid 5642, prio   5, out for    6 ms, ran for    1 ms, load  22%
pid 5642, prio   5, out for 1348 ms, ran for   53 ms, load   3%
pid 5642, prio   5, out for    8 ms, ran for    0 ms, load  10%
pid 5642, prio   5, out for    7 ms, ran for    1 ms, load  17%
pid 5642, prio   5, out for    7 ms, ran for    1 ms, load  15%
pid 5642, prio   5, out for    7 ms, ran for    0 ms, load  11%
pid 5642, prio   5, out for    8 ms, ran for    1 ms, load  12%
pid 5642, prio   5, out for 1385 ms, ran for   65 ms, load   4%
pid 5642, prio   5, out for 1385 ms, ran for   74 ms, load   5%
pid 5642, prio   5, out for    7 ms, ran for    1 ms, load  17%
pid 5642, prio   5, out for    7 ms, ran for    1 ms, load  17%
pid 5642, prio   5, out for    7 ms, ran for    1 ms, load  17%
pid 5642, prio   5, out for    6 ms, ran for    1 ms, load  20%
pid 5642, prio   5, out for 1375 ms, ran for   66 ms, load   4%



* Re: [PATCH] sched: staircase deadline misc fixes
  2007-04-03  5:31         ` Mike Galbraith
@ 2007-04-03  6:00           ` Mike Galbraith
  2007-04-03  6:01           ` Ingo Molnar
  2007-04-03 10:57           ` [PATCH] sched: staircase deadline misc fixes Mike Galbraith
  2 siblings, 0 replies; 92+ messages in thread
From: Mike Galbraith @ 2007-04-03  6:00 UTC (permalink / raw)
  To: Con Kolivas; +Cc: Ingo Molnar, linux list, Andrew Morton, ck list

On Tue, 2007-04-03 at 07:31 +0200, Mike Galbraith wrote:
> On Tue, 2007-04-03 at 12:37 +1000, Con Kolivas wrote:
> > On Thursday 29 March 2007 15:50, Mike Galbraith wrote:
> > > On Thu, 2007-03-29 at 09:44 +1000, Con Kolivas wrote:
> > > + * This contains a bitmap for each dynamic priority level with empty slots
> > > + * for the valid priorities each different nice level can have. It allows
> > > + * us to stagger the slots where differing priorities run in a way that
> > > + * keeps latency differences between different nice levels at a minimum.
> > > + * ie, where 0 means a slot for that priority, priority running from left
> > > to + * right:
> > > + * nice -20 0000000000000000000000000000000000000000
> > > + * nice -10 1001000100100010001001000100010010001000
> > > + * nice   0 0101010101010101010101010101010101010101
> > > + * nice   5 1101011010110101101011010110101101011011
> > > + * nice  10 0110111011011101110110111011101101110111
> > > + * nice  15 0111110111111011111101111101111110111111
> > > + * nice  19 1111111111111111111011111111111111111111
> > 
> > Try two instances of chew.c at _differing_ nice levels on one cpu on mainline, 
> > and then SD. This is why you can't renice X on mainline.
> 
> How about something more challenging instead :)
> 
> The numbers below are from my scheduler tree with massive_intr running
> at nice 0, and chew at nice 5.  Below these numbers are 100 lines from
> the exact center of chew's output.
> 
> (interactivity remains intact with this rather heavy load)

Here are the numbers for 2.6.21-rc5 with only the earlier mentioned
patch.  Chew's log is only 20% as long as that from my other tree, and
interactivity suffers badly while running this exploit, but as you can
see, chew isn't dying of boredom.

	-Mike

root@Homer: ./massive_intr 30 180
006701  00001509
006693  00001571
006707  00001072
006690  00001582
006691  00001547
006692  00001336
006695  00001759
006710  00001766
006699  00001531
006688  00001405
006709  00001907
006703  00001572
006705  00001501
006697  00001617
006686  00001344
006713  00001922
006714  00001885
006704  00001491
006694  00001482
006689  00001395
006711  00001176
006715  00001471
006708  00001527
006687  00001200
006706  00001451
006698  00001246
006702  00001495
006696  00001421
006712  00001414
006700  00001047


pid 6683, prio   5, out for   46 ms, ran for    0 ms, load   0%
pid 6683, prio   5, out for    7 ms, ran for    1 ms, load  17%
pid 6683, prio   5, out for    8 ms, ran for    1 ms, load  16%
pid 6683, prio   5, out for    6 ms, ran for    1 ms, load  18%
pid 6683, prio   5, out for 3527 ms, ran for   69 ms, load   1%
pid 6683, prio   5, out for   52 ms, ran for    1 ms, load   2%
pid 6683, prio   5, out for   15 ms, ran for    1 ms, load   6%
pid 6683, prio   5, out for    7 ms, ran for    1 ms, load  15%
pid 6683, prio   5, out for    7 ms, ran for    1 ms, load  13%
pid 6683, prio   5, out for    7 ms, ran for    1 ms, load  18%
pid 6683, prio   5, out for    8 ms, ran for    1 ms, load  18%
pid 6683, prio   5, out for    8 ms, ran for    1 ms, load  18%
pid 6683, prio   5, out for    8 ms, ran for    1 ms, load  17%
pid 6683, prio   5, out for    7 ms, ran for    1 ms, load  17%
pid 6683, prio   5, out for 3925 ms, ran for   56 ms, load   1%
pid 6683, prio   5, out for   30 ms, ran for    1 ms, load   3%
pid 6683, prio   5, out for   24 ms, ran for    1 ms, load   6%
pid 6683, prio   5, out for    7 ms, ran for    1 ms, load  18%
pid 6683, prio   5, out for    7 ms, ran for    1 ms, load  18%
pid 6683, prio   5, out for    7 ms, ran for    1 ms, load  11%
pid 6683, prio   5, out for    5 ms, ran for    0 ms, load  16%
pid 6683, prio   5, out for  376 ms, ran for   54 ms, load  12%
pid 6683, prio   5, out for 3320 ms, ran for    9 ms, load   0%
pid 6683, prio   5, out for 3895 ms, ran for   74 ms, load   1%
pid 6683, prio   5, out for    8 ms, ran for    1 ms, load  16%
pid 6683, prio   5, out for    7 ms, ran for    1 ms, load  19%
pid 6683, prio   5, out for    3 ms, ran for    1 ms, load  26%
pid 6683, prio   5, out for 3364 ms, ran for   68 ms, load   2%
pid 6683, prio   5, out for 4676 ms, ran for   74 ms, load   1%
pid 6683, prio   5, out for 3726 ms, ran for   74 ms, load   1%
pid 6683, prio   5, out for 3223 ms, ran for   74 ms, load   2%
pid 6683, prio   5, out for    7 ms, ran for    0 ms, load   4%
pid 6683, prio   5, out for    8 ms, ran for    1 ms, load  13%
pid 6683, prio   5, out for    7 ms, ran for    1 ms, load  20%
pid 6683, prio   5, out for    9 ms, ran for    1 ms, load  12%
pid 6683, prio   5, out for    8 ms, ran for    1 ms, load  16%
pid 6683, prio   5, out for 3562 ms, ran for   67 ms, load   1%
pid 6683, prio   5, out for 4372 ms, ran for   74 ms, load   1%
pid 6683, prio   5, out for 6831 ms, ran for   74 ms, load   1%
pid 6683, prio   5, out for  756 ms, ran for   74 ms, load   9%
pid 6683, prio   5, out for   27 ms, ran for    0 ms, load   1%
pid 6683, prio   5, out for    4 ms, ran for    1 ms, load  20%
pid 6683, prio   5, out for 3619 ms, ran for   71 ms, load   1%
pid 6683, prio   5, out for    7 ms, ran for    0 ms, load  11%
pid 6683, prio   5, out for    3 ms, ran for    1 ms, load  30%
pid 6683, prio   5, out for    7 ms, ran for   34 ms, load  82%
pid 6683, prio   5, out for    3 ms, ran for    1 ms, load  30%
pid 6683, prio   5, out for 3182 ms, ran for   34 ms, load   1%
pid 6683, prio   5, out for 4559 ms, ran for   74 ms, load   1%
pid 6683, prio   5, out for 2937 ms, ran for   74 ms, load   2%
pid 6683, prio   5, out for   19 ms, ran for    1 ms, load   8%
pid 6683, prio   5, out for 3869 ms, ran for   72 ms, load   1%
pid 6683, prio   5, out for    5 ms, ran for    0 ms, load   3%
pid 6683, prio   5, out for 3375 ms, ran for   75 ms, load   2%
pid 6683, prio   5, out for 4300 ms, ran for   74 ms, load   1%
pid 6683, prio   5, out for    7 ms, ran for    1 ms, load  19%
pid 6683, prio   5, out for    3 ms, ran for    1 ms, load  31%
pid 6683, prio   5, out for 5949 ms, ran for   72 ms, load   1%
pid 6683, prio   5, out for 5314 ms, ran for   73 ms, load   1%
pid 6683, prio   5, out for    7 ms, ran for    1 ms, load  14%
pid 6683, prio   5, out for    9 ms, ran for    1 ms, load  14%
pid 6683, prio   5, out for    3 ms, ran for    1 ms, load  34%
pid 6683, prio   5, out for 4067 ms, ran for   70 ms, load   1%
pid 6683, prio   5, out for   16 ms, ran for    7 ms, load  32%
pid 6683, prio   5, out for 4149 ms, ran for   66 ms, load   1%
pid 6683, prio   5, out for    3 ms, ran for    1 ms, load  27%
pid 6683, prio   5, out for 2366 ms, ran for   72 ms, load   2%
pid 6683, prio   5, out for    8 ms, ran for    1 ms, load  16%
pid 6683, prio   5, out for    7 ms, ran for    0 ms, load  10%
pid 6683, prio   5, out for 1459 ms, ran for   73 ms, load   4%
pid 6683, prio   5, out for 3121 ms, ran for   74 ms, load   2%
pid 6683, prio   5, out for 3070 ms, ran for   74 ms, load   2%
pid 6683, prio   5, out for    8 ms, ran for    1 ms, load  16%
pid 6683, prio   5, out for    7 ms, ran for    1 ms, load  11%
pid 6683, prio   5, out for    7 ms, ran for    1 ms, load  12%
pid 6683, prio   5, out for    7 ms, ran for    1 ms, load  17%
pid 6683, prio   5, out for 1303 ms, ran for   66 ms, load   4%
pid 6683, prio   5, out for   10 ms, ran for    1 ms, load  10%
pid 6683, prio   5, out for    7 ms, ran for    1 ms, load  16%
pid 6683, prio   5, out for    5 ms, ran for    1 ms, load  17%
pid 6683, prio   5, out for 2350 ms, ran for   68 ms, load   2%
pid 6683, prio   5, out for    5 ms, ran for    0 ms, load  15%
pid 6683, prio   5, out for 3242 ms, ran for   75 ms, load   2%
pid 6683, prio   5, out for 2684 ms, ran for   74 ms, load   2%
pid 6683, prio   5, out for 4941 ms, ran for   75 ms, load   1%
pid 6683, prio   5, out for 1119 ms, ran for   74 ms, load   6%
pid 6683, prio   5, out for    8 ms, ran for    0 ms, load  10%
pid 6683, prio   5, out for    7 ms, ran for    1 ms, load  19%
pid 6683, prio   5, out for    7 ms, ran for    1 ms, load  18%
pid 6683, prio   5, out for    5 ms, ran for    1 ms, load  17%
pid 6683, prio   5, out for 3701 ms, ran for   67 ms, load   1%
pid 6683, prio   5, out for    2 ms, ran for    1 ms, load  43%
pid 6683, prio   5, out for 3486 ms, ran for   72 ms, load   2%
pid 6683, prio   5, out for    8 ms, ran for    0 ms, load   5%
pid 6683, prio   5, out for    7 ms, ran for    1 ms, load  20%
pid 6683, prio   5, out for    5 ms, ran for    1 ms, load  24%
pid 6683, prio   5, out for 5413 ms, ran for   69 ms, load   1%
pid 6683, prio   5, out for 2251 ms, ran for   74 ms, load   3%
pid 6683, prio   5, out for    8 ms, ran for    1 ms, load  18%
pid 6683, prio   5, out for    7 ms, ran for    1 ms, load  20%
pid 6683, prio   5, out for    5 ms, ran for    1 ms, load  20%



* Re: [PATCH] sched: staircase deadline misc fixes
  2007-04-03  5:31         ` Mike Galbraith
  2007-04-03  6:00           ` Mike Galbraith
@ 2007-04-03  6:01           ` Ingo Molnar
  2007-04-03  6:11             ` Mike Galbraith
  2007-04-05 11:02             ` Mike Galbraith
  2007-04-03 10:57           ` [PATCH] sched: staircase deadline misc fixes Mike Galbraith
  2 siblings, 2 replies; 92+ messages in thread
From: Ingo Molnar @ 2007-04-03  6:01 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Con Kolivas, linux list, Andrew Morton, ck list


* Mike Galbraith <efault@gmx.de> wrote:

> > Try two instances of chew.c at _differing_ nice levels on one cpu on 
> > mainline, and then SD. This is why you can't renice X on mainline.
> 
> How about something more challenging instead :)
> 
> The numbers below are from my scheduler tree with massive_intr running 
> at nice 0, and chew at nice 5.  Below these numbers are 100 lines from 
> the exact center of chew's output.
> 
> (interactivity remains intact with this rather heavy load)

looks interesting - could you send the patch?

	Ingo

* Re: [PATCH] sched: staircase deadline misc fixes
  2007-04-03  6:01           ` Ingo Molnar
@ 2007-04-03  6:11             ` Mike Galbraith
  2007-04-05 11:02             ` Mike Galbraith
  1 sibling, 0 replies; 92+ messages in thread
From: Mike Galbraith @ 2007-04-03  6:11 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Con Kolivas, linux list, Andrew Morton, ck list

On Tue, 2007-04-03 at 08:01 +0200, Ingo Molnar wrote:
> * Mike Galbraith <efault@gmx.de> wrote:
> 
> > > Try two instances of chew.c at _differing_ nice levels on one cpu on 
> > > mainline, and then SD. This is why you can't renice X on mainline.
> > 
> > How about something more challenging instead :)
> > 
> > The numbers below are from my scheduler tree with massive_intr running 
> > at nice 0, and chew at nice 5.  Below these numbers are 100 lines from 
> > the exact center of chew's output.
> > 
> > (interactivity remains intact with this rather heavy load)
> 
> looks interesting - could you send the patch?

Sorry, that tree is not _even_ ready for viewing yet.
(and it's got an occasional oops bug I have to kill)

	-Mike


* Re: [PATCH] sched: staircase deadline misc fixes
  2007-04-03  5:31         ` Mike Galbraith
  2007-04-03  6:00           ` Mike Galbraith
  2007-04-03  6:01           ` Ingo Molnar
@ 2007-04-03 10:57           ` Mike Galbraith
  2 siblings, 0 replies; 92+ messages in thread
From: Mike Galbraith @ 2007-04-03 10:57 UTC (permalink / raw)
  To: Con Kolivas; +Cc: Ingo Molnar, linux list, Andrew Morton, ck list

On Tue, 2007-04-03 at 07:31 +0200, Mike Galbraith wrote:
> On Tue, 2007-04-03 at 12:37 +1000, Con Kolivas wrote:
> > On Thursday 29 March 2007 15:50, Mike Galbraith wrote:
> > > On Thu, 2007-03-29 at 09:44 +1000, Con Kolivas wrote:
> > > + * This contains a bitmap for each dynamic priority level with empty slots
> > > + * for the valid priorities each different nice level can have. It allows
> > > + * us to stagger the slots where differing priorities run in a way that
> > > + * keeps latency differences between different nice levels at a minimum.
> > > + * ie, where 0 means a slot for that priority, priority running from left
> > > to + * right:
> > > + * nice -20 0000000000000000000000000000000000000000
> > > + * nice -10 1001000100100010001001000100010010001000
> > > + * nice   0 0101010101010101010101010101010101010101
> > > + * nice   5 1101011010110101101011010110101101011011
> > > + * nice  10 0110111011011101110110111011101101110111
> > > + * nice  15 0111110111111011111101111101111110111111
> > > + * nice  19 1111111111111111111011111111111111111111
> > 
> > Try two instances of chew.c at _differing_ nice levels on one cpu on mainline, 
> > and then SD. This is why you can't renice X on mainline.
> 
> How about something more challenging instead :)
> 
> The numbers below are from my scheduler tree with massive_intr running
> at nice 0, and chew at nice 5.  Below these numbers are 100 lines from
> the exact center of chew's output.
> 
> (interactivity remains intact with this rather heavy load)
> 
> root@Homer: ./massive_intr 30 180
> 005671  00001506
> 005657  00001506
> 005651  00001491
> 005647  00001466
> 005661  00001484
> 005660  00001475
> 005645  00001514
> 005668  00001384
> 005673  00001516
> 005656  00001449
> 005664  00001512
> 005659  00001507
> 005667  00001513
> 005663  00001521
> 005670  00001440
> 005649  00001522
> 005652  00001487
> 005648  00001405
> 005665  00001472
> 005669  00001418
> 005662  00001489
> 005674  00001523
> 005650  00001480
> 005655  00001476
> 005672  00001530
> 005653  00001463
> 005654  00001427
> 005646  00001499
> 005658  00001510
> 005666  00001476

Taking a little break from tinkering, I built/ran sd-0.38 as well.
While chew usually says "out for N < 500ms", I see spikes like those
below the massive_intr numbers.

root@Homer: ./massive_intr 30 180 (nice 0)
006596  00001346
006613  00001475
006605  00001463
006606  00001423
006598  00001279
006609  00001458
006600  00001378
006591  00001491
006610  00001413
006588  00001361
006602  00001401
006601  00001412
006607  00001373
006604  00001449
006599  00001398
006608  00001269
006611  00001464
006593  00001349
006614  00001335
006612  00001512
006615  00001422
006589  00001363
006617  00001362
006597  00001435
006592  00001354
006595  00001425
006616  00001348
006603  00001308
006594  00001360
006590  00001397

(spikes from run above)
pid 6585, prio   0, out for  178 ms, ran for   12 ms, load   6%
pid 6585, prio   0, out for  175 ms, ran for   13 ms, load   7%
pid 6585, prio   0, out for 1901 ms, ran for   12 ms, load   0%
pid 6585, prio   0, out for   61 ms, ran for   12 ms, load  17%
...
pid 6585, prio   0, out for  148 ms, ran for   11 ms, load   7%
pid 6585, prio   0, out for  229 ms, ran for   13 ms, load   5%
pid 6585, prio   0, out for  182 ms, ran for   11 ms, load   6%
pid 6585, prio   0, out for 1306 ms, ran for   11 ms, load   0%
pid 6585, prio   0, out for   72 ms, ran for   12 ms, load  15%
pid 6585, prio   0, out for  252 ms, ran for   11 ms, load   4%
....
(spikes from massive_intr at nice 0 and chew at nice -20)
pid 6547, prio -20, out for  132 ms, ran for  119 ms, load  47%
pid 6547, prio -20, out for   52 ms, ran for  119 ms, load  69%
pid 6547, prio -20, out for    4 ms, ran for   96 ms, load  95%
pid 6547, prio -20, out for 1251 ms, ran for   24 ms, load   1%
pid 6547, prio -20, out for   78 ms, ran for 1561 ms, load  95%
pid 6547, prio -20, out for   89 ms, ran for  120 ms, load  57%
pid 6547, prio -20, out for   69 ms, ran for  119 ms, load  63%
pid 6547, prio -20, out for 4125 ms, ran for  119 ms, load   2%
pid 6547, prio -20, out for   73 ms, ran for  119 ms, load  62%
pid 6547, prio -20, out for  110 ms, ran for  120 ms, load  52%
pid 6547, prio -20, out for   57 ms, ran for  119 ms, load  67%
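
For reference, chew.c itself is not reposted in this thread.  A minimal
sketch of a chew-style latency probe, reconstructed purely from the output
format shown above, looks like the following; the 2 ms gap threshold and
the use of CLOCK_MONOTONIC are assumptions, not taken from the original.

// gcc -O2 -o chew-sketch chew-sketch.c -lrt
#include <stdio.h>
#include <time.h>
#include <unistd.h>
#include <sys/resource.h>

static unsigned long long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (unsigned long long)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

int main(void)
{
	/* A gap longer than this means we were scheduled out. */
	const unsigned long long thresh_ns = 2000000;	/* assumed: 2 ms */
	unsigned long long last = now_ns(), start = last, t;
	int pid = getpid(), prio = getpriority(PRIO_PROCESS, 0);

	for (;;) {
		t = now_ns();
		if (t - last > thresh_ns) {
			unsigned long long ran = last - start;
			unsigned long long out = t - last;

			printf("pid %d, prio %3d, out for %4llu ms, "
			       "ran for %4llu ms, load %3llu%%\n",
			       pid, prio, out / 1000000, ran / 1000000,
			       100 * ran / (ran + out));
			start = t;
		}
		last = t;
	}
	return 0;
}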



^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [PATCH] sched: staircase deadline misc fixes
  2007-04-03  6:01           ` Ingo Molnar
  2007-04-03  6:11             ` Mike Galbraith
@ 2007-04-05 11:02             ` Mike Galbraith
  2007-04-05 11:09               ` Ingo Molnar
  2007-04-05 11:54               ` [test] sched: SD-latest versus Mike's latest Ingo Molnar
  1 sibling, 2 replies; 92+ messages in thread
From: Mike Galbraith @ 2007-04-05 11:02 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Con Kolivas, linux list, Andrew Morton, ck list

On Tue, 2007-04-03 at 08:01 +0200, Ingo Molnar wrote:

> looks interesting - could you send the patch?

Ok, this is looking/feeling pretty good in testing.  Comments on
fugliness etc much appreciated.

Below the numbers is a snapshot of my experimental tree.  It's a mixture
of my old throttling/anti-starvation tree and the task promotion patch,
with the addition of a scheduling class for interactive tasks to dish
out some of that targeted unfairness I mentioned.  SCHED_INTERACTIVE is
also targeted at the scenario where X or one of its clients uses enough
CPU to end up in the expired array.

(note:  Xorg was not set SCHED_INTERACTIVE during the test runs below)

	-Mike

top - 12:31:34 up 16 min, 13 users,  load average: 7.37, 8.74, 6.58

  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  P COMMAND
 6542 root      15   0  1568  108   24 S   43  0.0   0:58.98 1 fiftypercent
 6540 root      17   0  1568  440  356 R   30  0.0   1:00.04 0 fiftypercent
 6544 root      18   0  1568  108   24 R   28  0.0   0:58.36 0 fiftypercent
 6541 root      20   0  1568  108   24 R   26  0.0   0:57.70 1 fiftypercent
 6536 root      25   0  1436  356  296 R   24  0.0   0:45.76 1 chew
 6538 root      25   0  1436  356  296 R   20  0.0   0:49.73 0 chew
 6543 root      19   0  1568  108   24 R   19  0.0   0:58.04 1 fiftypercent
 6409 root      15   0  154m  63m  27m R    2  6.3   0:13.09 0 amarokapp
 6410 root      15   0  154m  63m  27m S    2  6.3   0:14.36 0 amarokapp
 6376 root      15   0  2380 1092  764 R    2  0.1   0:15.63 0 top
 5591 root      18   0  4736 1036  736 S    1  0.1   0:00.14 1 smpppd
 5678 root      15   0  167m  24m 4848 S    1  2.4   0:19.37 0 Xorg
 6202 root      15   0 32364  18m  12m S    1  1.8   0:04.25 1 konsole

50 lines from the center of the log of chew nailed to cpu0

pid 6538, prio   0, out for   27 ms, ran for    1 ms, load   6%
pid 6538, prio   0, out for   26 ms, ran for    4 ms, load  14%
pid 6538, prio   0, out for   27 ms, ran for    7 ms, load  20%
pid 6538, prio   0, out for   13 ms, ran for    5 ms, load  27%
pid 6538, prio   0, out for    8 ms, ran for    7 ms, load  49%
pid 6538, prio   0, out for   10 ms, ran for    7 ms, load  43%
pid 6538, prio   0, out for    9 ms, ran for    6 ms, load  42%
pid 6538, prio   0, out for    9 ms, ran for    7 ms, load  46%
pid 6538, prio   0, out for    9 ms, ran for    7 ms, load  43%
pid 6538, prio   0, out for    9 ms, ran for    7 ms, load  43%
pid 6538, prio   0, out for    8 ms, ran for    7 ms, load  48%
pid 6538, prio   0, out for    9 ms, ran for   27 ms, load  74%
pid 6538, prio   0, out for   27 ms, ran for    4 ms, load  13%
pid 6538, prio   0, out for   26 ms, ran for    5 ms, load  17%
pid 6538, prio   0, out for   27 ms, ran for    5 ms, load  17%
pid 6538, prio   0, out for   28 ms, ran for    6 ms, load  18%
pid 6538, prio   0, out for   30 ms, ran for    4 ms, load  14%
pid 6538, prio   0, out for   18 ms, ran for    5 ms, load  24%
pid 6538, prio   0, out for    9 ms, ran for    7 ms, load  42%
pid 6538, prio   0, out for    8 ms, ran for    7 ms, load  45%
pid 6538, prio   0, out for    8 ms, ran for    7 ms, load  45%
pid 6538, prio   0, out for    9 ms, ran for    7 ms, load  44%
pid 6538, prio   0, out for    9 ms, ran for    7 ms, load  43%
pid 6538, prio   0, out for    2 ms, ran for    7 ms, load  78%
pid 6538, prio   0, out for   45 ms, ran for   22 ms, load  33%
pid 6538, prio   0, out for   31 ms, ran for    2 ms, load   7%
pid 6538, prio   0, out for   62 ms, ran for    1 ms, load   3%
pid 6538, prio   0, out for   29 ms, ran for    3 ms, load  11%
pid 6538, prio   0, out for   26 ms, ran for    4 ms, load  13%
pid 6538, prio   0, out for  134 ms, ran for    5 ms, load   4%
pid 6538, prio   0, out for   78 ms, ran for    2 ms, load   3%
pid 6538, prio   0, out for    9 ms, ran for    3 ms, load  28%
pid 6538, prio   0, out for   10 ms, ran for    7 ms, load  42%
pid 6538, prio   0, out for   10 ms, ran for    7 ms, load  42%
pid 6538, prio   0, out for    8 ms, ran for    7 ms, load  48%
pid 6538, prio   0, out for    8 ms, ran for    7 ms, load  46%
pid 6538, prio   0, out for    9 ms, ran for    7 ms, load  43%
pid 6538, prio   0, out for   10 ms, ran for    7 ms, load  43%
pid 6538, prio   0, out for    9 ms, ran for    6 ms, load  39%
pid 6538, prio   0, out for    9 ms, ran for    7 ms, load  42%
pid 6538, prio   0, out for    8 ms, ran for    7 ms, load  46%
pid 6538, prio   0, out for   14 ms, ran for    6 ms, load  30%
pid 6538, prio   0, out for   27 ms, ran for    3 ms, load  12%
pid 6538, prio   0, out for   29 ms, ran for    4 ms, load  12%
pid 6538, prio   0, out for   29 ms, ran for    4 ms, load  13%
pid 6538, prio   0, out for   26 ms, ran for    4 ms, load  14%
pid 6538, prio   0, out for   29 ms, ran for    5 ms, load  14%
pid 6538, prio   0, out for   27 ms, ran for    4 ms, load  14%
pid 6538, prio   0, out for   26 ms, ran for    5 ms, load  16%
pid 6538, prio   0, out for   24 ms, ran for    6 ms, load  20%
pid 6538, prio   0, out for    7 ms, ran for    7 ms, load  49%


root@Homer: ./massive_intr 30 180
006502	00002373
006495	00002687
006518	00002417
006490	00002544
006500	00002417
006494	00002427
006498	00003032
006517	00003060
006505	00002401
006507	00002375
006514	00002398
006497	00002483
006506	00002388
006504	00002415
006510	00002472
006516	00002365
006509	00002441
006503	00002498
006512	00002930
006496	00002565
006492	00002389
006501	00002337
006508	00002395
006491	00002486
006499	00002394
006493	00002667
006515	00002569
006511	00002555
006513	00002637
006519	00002556

--- linux-2.6.21-rc5-x/include/linux/sched.h.org	2007-03-30 05:08:47.000000000 +0200
+++ linux-2.6.21-rc5-x/include/linux/sched.h	2007-04-02 08:17:30.000000000 +0200
@@ -34,6 +34,7 @@
 #define SCHED_FIFO		1
 #define SCHED_RR		2
 #define SCHED_BATCH		3
+#define SCHED_INTERACTIVE	4
 
 #ifdef __KERNEL__
 
@@ -528,7 +529,7 @@ struct signal_struct {
 #define rt_prio(prio)		unlikely((prio) < MAX_RT_PRIO)
 #define rt_task(p)		rt_prio((p)->prio)
 #define batch_task(p)		(unlikely((p)->policy == SCHED_BATCH))
-#define is_rt_policy(p)		((p) != SCHED_NORMAL && (p) != SCHED_BATCH)
+#define is_rt_policy(p)		((p) == SCHED_RR || (p) == SCHED_FIFO)
 #define has_rt_policy(p)	unlikely(is_rt_policy((p)->policy))
 
 /*
@@ -820,14 +821,14 @@ struct task_struct {
 #ifdef CONFIG_BLK_DEV_IO_TRACE
 	unsigned int btrace_seq;
 #endif
-	unsigned long sleep_avg;
+	unsigned long sleep_avg, last_slice, throttle;
 	unsigned long long timestamp, last_ran;
 	unsigned long long sched_time; /* sched_clock time spent running */
 	enum sleep_type sleep_type;
 
 	unsigned long policy;
 	cpumask_t cpus_allowed;
-	unsigned int time_slice, first_time_slice;
+	unsigned int time_slice, slice_info;
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	struct sched_info sched_info;
--- linux-2.6.21-rc5-x/include/linux/sysctl.h.org	2007-03-31 12:52:52.000000000 +0200
+++ linux-2.6.21-rc5-x/include/linux/sysctl.h	2007-04-01 08:04:02.000000000 +0200
@@ -165,6 +165,8 @@ enum
 	KERN_MAX_LOCK_DEPTH=74,
 	KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */
 	KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */
+	KERN_SCHED_THROTTLE1=77,  /* int: throttling credit period 1 in secs */
+	KERN_SCHED_THROTTLE2=78,  /* int: throttling credit period 2 in secs */
 };
 
 
--- linux-2.6.21-rc5-x/kernel/sched.c.org	2007-03-27 15:47:49.000000000 +0200
+++ linux-2.6.21-rc5-x/kernel/sched.c	2007-04-05 12:06:38.000000000 +0200
@@ -90,6 +90,20 @@ unsigned long long __attribute__((weak))
 #define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
 #define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
 
+#if (BITS_PER_LONG < 64)
+#define JIFFIES_TO_NS64(TIME) \
+	((unsigned long long)(TIME) * ((unsigned long) (1000000000 / HZ)))
+
+#define NS64_TO_JIFFIES(TIME) \
+	((((unsigned long long)((TIME)) >> BITS_PER_LONG) * \
+	(1 + NS_TO_JIFFIES(~0UL))) + NS_TO_JIFFIES((unsigned long)(TIME)))
+#else /* BITS_PER_LONG < 64 */
+
+#define NS64_TO_JIFFIES(TIME) NS_TO_JIFFIES(TIME)
+#define JIFFIES_TO_NS64(TIME) JIFFIES_TO_NS(TIME)
+
+#endif /* BITS_PER_LONG < 64 */
+
 /*
  * These are the 'tuning knobs' of the scheduler:
  *
@@ -109,6 +123,8 @@ unsigned long long __attribute__((weak))
 #define MAX_SLEEP_AVG		(DEF_TIMESLICE * MAX_BONUS)
 #define STARVATION_LIMIT	(MAX_SLEEP_AVG)
 #define NS_MAX_SLEEP_AVG	(JIFFIES_TO_NS(MAX_SLEEP_AVG))
+#define PCNT_PER_DYNPRIO	(100 / MAX_BONUS)
+#define INTERACTIVE_LIMIT	(DEF_TIMESLICE * 4)
 
 /*
  * If a task is 'interactive' then we reinsert it in the active
@@ -167,6 +183,133 @@ unsigned long long __attribute__((weak))
 	(JIFFIES_TO_NS(MAX_SLEEP_AVG * \
 		(MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
 
+#define INTERACTIVE_LIMIT_EXCEEDED(rq) \
+	((rq)->active->interactive_ticks + (rq)->expired->interactive_ticks > \
+		INTERACTIVE_LIMIT)
+
+/*
+ * Interactive boost can lead to starvation if the decision to
+ * boost a task turns out to be a bad one.  To combat this, we
+ * compute the sane upper limit for cpu usage 'slice_avg' based
+ * upon a task's sleep_avg, and use this information combined
+ * with a timer to determine when intervention is required.
+ *
+ * When a task is behaving as its sleep_avg indicates it should,
+ * its throttle is moved forward; otherwise it will time out, and
+ * its priority will be lowered.
+ *
+ * Throttling tunables.
+ *
+ * CREDIT_C1: The amount of cpu time in seconds that a new task
+ *           will run completely free, ie the head start a task
+ *           has before it has to push its timer forward to avoid
+ *           being throttled.  Each conforming slice thereafter
+ *           increases its stored credit, and vice versa.
+ *
+ * CREDIT_C2: The maximum amount of CPU time in seconds a task
+ *           can store for later use.  When a task has no stored
+ *           credit left, now is time C2.  Tasks begin life with
+ *           C1 seconds credit, ie C2 is C1 seconds in front of
+ *           them, and the 'buffer' will grow in front of them
+ *           if they perform in a conformant manner.  The maximum
+ *           credit that fits in 32 bits jiffies is 42949 seconds.
+ */
+
+int credit_c1 = 0;
+int credit_c2 = 14400;
+int credit_max = 42949;
+
+#define C1 (credit_c1 * MAX_BONUS * HZ)
+#define C2 (credit_c2 * MAX_BONUS * HZ + C1)
+#define C3 (MAX_BONUS * C2)
+
+#define credit_exhausted(p, credit) \
+	(time_after_eq(jiffies, (p)->throttle + (credit)))
+
+/*
+ * Masks for p->slice_info, formerly p->first_time_slice.
+ * SLICE_FTS:   0x80000000  Task is in its first ever timeslice.
+ * SLICE_NEW:   0x40000000  Slice refreshed.
+ * SLICE_INT:   0x20000000  Task is a SCHED_INTERACTIVE task partner.
+ * SLICE_SPA:   0x1FFE0000  Spare bits.
+ * SLICE_LTS:   0x0001FF80  Last time slice
+ * SLICE_AVG:   0x0000007F  Task slice_avg stored as percentage.
+ */
+#define SLICE_AVG_BITS    7
+#define SLICE_LTS_BITS   10
+#define SLICE_SPA_BITS   12
+#define SLICE_INT_BITS    1
+#define SLICE_NEW_BITS    1
+#define SLICE_FTS_BITS    1
+
+#define SLICE_AVG_SHIFT   0
+#define SLICE_LTS_SHIFT   (SLICE_AVG_SHIFT + SLICE_AVG_BITS)
+#define SLICE_SPA_SHIFT   (SLICE_LTS_SHIFT + SLICE_LTS_BITS)
+#define SLICE_INT_SHIFT   (SLICE_SPA_SHIFT + SLICE_SPA_BITS)
+#define SLICE_NEW_SHIFT   (SLICE_INT_SHIFT + SLICE_INT_BITS)
+#define SLICE_FTS_SHIFT   (SLICE_NEW_SHIFT + SLICE_NEW_BITS)
+
+#define INFO_MASK(x)      ((1U << (x))-1)
+#define SLICE_AVG_MASK    (INFO_MASK(SLICE_AVG_BITS) << SLICE_AVG_SHIFT)
+#define SLICE_LTS_MASK    (INFO_MASK(SLICE_LTS_BITS) << SLICE_LTS_SHIFT)
+#define SLICE_SPA_MASK    (INFO_MASK(SLICE_SPA_BITS) << SLICE_SPA_SHIFT)
+#define SLICE_INT_MASK    (INFO_MASK(SLICE_INT_BITS) << SLICE_INT_SHIFT)
+#define SLICE_NEW_MASK    (INFO_MASK(SLICE_NEW_BITS) << SLICE_NEW_SHIFT)
+#define SLICE_FTS_MASK    (INFO_MASK(SLICE_FTS_BITS) << SLICE_FTS_SHIFT)
+
+/* p->slice_info access macros. */
+#define first_time_slice(p) ((p)->slice_info & SLICE_FTS_MASK)
+#define set_first_time_slice(p) ((p)->slice_info |= SLICE_FTS_MASK)
+#define clr_first_time_slice(p) ((p)->slice_info &= ~SLICE_FTS_MASK)
+
+#define slice_is_new(p) ((p)->slice_info & SLICE_NEW_MASK)
+#define set_slice_is_new(p) ((p)->slice_info |= SLICE_NEW_MASK)
+#define clr_slice_is_new(p) ((p)->slice_info &= ~SLICE_NEW_MASK)
+
+#define task_is_interactive(p) ((p)->slice_info & SLICE_INT_MASK)
+#define set_task_is_interactive(p) ((p)->slice_info |= SLICE_INT_MASK)
+#define clr_task_is_interactive(p) ((p)->slice_info &= ~SLICE_INT_MASK)
+
+#define last_slice(p) (((p)->slice_info & SLICE_LTS_MASK) >> SLICE_LTS_SHIFT)
+#define set_last_slice(p, n) ((p)->slice_info = (((p)->slice_info & \
+	~SLICE_LTS_MASK) | (((n) << SLICE_LTS_SHIFT) & SLICE_LTS_MASK)))
+
+#define NS_SLEEP_AVG_PCNT (NS_MAX_SLEEP_AVG / 100)
+
+/* Note: raw storage format of slice_avg is %cpu. */
+#define slice_avg(p) ((typeof((p)->sleep_avg)) \
+	((((p)->slice_info & SLICE_AVG_MASK) >> SLICE_AVG_SHIFT) * \
+	NS_SLEEP_AVG_PCNT))
+#define set_slice_avg(p, n) ((p)->slice_info = (((p)->slice_info & \
+	~SLICE_AVG_MASK) | ((((n) / NS_SLEEP_AVG_PCNT) \
+	<< SLICE_AVG_SHIFT) & SLICE_AVG_MASK)))
+#define slice_avg_raw(p)  \
+	(((p)->slice_info & SLICE_AVG_MASK) >> SLICE_AVG_SHIFT)
+#define set_slice_avg_raw(p, n) ((p)->slice_info = (((p)->slice_info & \
+	~SLICE_AVG_MASK) | (((n) << SLICE_AVG_SHIFT) & SLICE_AVG_MASK)))
+
+/* cpu usage macros. */
+#define cpu_avg(p) \
+	(100 - slice_avg_raw(p))
+
+#define cpu_max(p) \
+	(100 - ((p)->sleep_avg / NS_SLEEP_AVG_PCNT))
+
+#define time_this_slice(p) \
+	(jiffies - (p)->last_slice)
+
+#define cpu_this_slice(p) \
+	(100 * last_slice(p) / max((unsigned) time_this_slice(p), \
+	(unsigned) last_slice(p)))
+
+#define cpu_avg_rq(rq) \
+	(100 * DEF_TIMESLICE / max((unsigned) (rq)->slice_avg, \
+		(unsigned) DEF_TIMESLICE))
+
+/* Positively identified interactive tasks. */
+#define task_interactive(p) \
+	((p)->policy == SCHED_INTERACTIVE || task_is_interactive(p))
+
 #define TASK_PREEMPTS_CURR(p, rq) \
 	((p)->prio < (rq)->curr->prio)
 
@@ -201,6 +344,7 @@ static inline unsigned int task_timeslic
 
 struct prio_array {
 	unsigned int nr_active;
+	int interactive_ticks;
 	DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
 	struct list_head queue[MAX_PRIO];
 };
@@ -234,7 +378,8 @@ struct rq {
 	 */
 	unsigned long nr_uninterruptible;
 
-	unsigned long expired_timestamp;
+	unsigned long switch_timestamp;
+	unsigned long slice_avg;
 	/* Cached timestamp set by update_cpu_clock() */
 	unsigned long long most_recent_timestamp;
 	struct task_struct *curr, *idle;
@@ -691,6 +836,8 @@ static void dequeue_task(struct task_str
 	list_del(&p->run_list);
 	if (list_empty(array->queue + p->prio))
 		__clear_bit(p->prio, array->bitmap);
+	if (TASK_INTERACTIVE(p))
+		array->interactive_ticks -= p->time_slice;
 }
 
 static void enqueue_task(struct task_struct *p, struct prio_array *array)
@@ -700,6 +847,8 @@ static void enqueue_task(struct task_str
 	__set_bit(p->prio, array->bitmap);
 	array->nr_active++;
 	p->array = array;
+	if (TASK_INTERACTIVE(p))
+		array->interactive_ticks += p->time_slice;
 }
 
 /*
@@ -882,7 +1031,11 @@ static int recalc_task_prio(struct task_
 	/* Caller must always ensure 'now >= p->timestamp' */
 	unsigned long sleep_time = now - p->timestamp;
 
-	if (batch_task(p))
+	/*
+	 * Migration timestamp adjustment may induce negative time.
+	 * Ignore unquantifiable values as well as SCHED_BATCH tasks.
+	 */ 
+	if (now < p->timestamp || batch_task(p))
 		sleep_time = 0;
 
 	if (likely(sleep_time > 0)) {
@@ -893,7 +1046,14 @@ static int recalc_task_prio(struct task_
 		 */
 		unsigned long ceiling = INTERACTIVE_SLEEP(p);
 
-		if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
+	 	/*
+		 * Update throttle position.
+		 */
+		p->throttle += NS64_TO_JIFFIES(sleep_time);
+		if (time_before(jiffies, p->throttle))
+			p->throttle = jiffies;
+
+		if (sleep_time > ceiling && p->sleep_avg < ceiling) {
 			/*
 			 * Prevents user tasks from achieving best priority
 			 * with one single large enough sleep.
@@ -915,7 +1075,7 @@ static int recalc_task_prio(struct task_
 			 * limited in their sleep_avg rise as they
 			 * are likely to be waiting on I/O
 			 */
-			if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
+			if (p->sleep_type == SLEEP_NONINTERACTIVE) {
 				if (p->sleep_avg >= ceiling)
 					sleep_time = 0;
 				else if (p->sleep_avg + sleep_time >=
@@ -1531,16 +1691,23 @@ out_activate:
 		 * sleep_avg beyond just interactive state.
 		 */
 		p->sleep_type = SLEEP_NONINTERACTIVE;
-	} else
+	} else if (task_interactive(current)) {
+		/*
+		 * Tasks tagged as being truly interactive
+		 * pass temporary interactive status on to
+		 * the task they are waking.
+		 */
+		set_task_is_interactive(p);
+		p->sleep_type = SLEEP_INTERACTIVE;
+	}
 
 	/*
 	 * Tasks that have marked their sleep as noninteractive get
 	 * woken up with their sleep average not weighted in an
 	 * interactive way.
 	 */
-		if (old_state & TASK_NONINTERACTIVE)
-			p->sleep_type = SLEEP_NONINTERACTIVE;
-
+	else if (old_state & TASK_NONINTERACTIVE)
+		p->sleep_type = SLEEP_NONINTERACTIVE;
 
 	activate_task(p, rq, cpu == this_cpu);
 	/*
@@ -1628,9 +1795,24 @@ void fastcall sched_fork(struct task_str
 	 * The remainder of the first timeslice might be recovered by
 	 * the parent if the child exits early enough.
 	 */
-	p->first_time_slice = 1;
 	current->time_slice >>= 1;
 	p->timestamp = sched_clock();
+
+	/*
+	 * Set up slice_info and initial throttle position for the child.
+	 */
+	set_slice_avg(p, p->sleep_avg);
+	set_last_slice(p, p->time_slice);
+	set_slice_is_new(p);
+	set_first_time_slice(p);
+	p->last_slice = jiffies;
+	p->throttle = jiffies - C2 + C1;
+	/*
+	 * SCHED_INTERACTIVE policy cannot be inherited.
+	 */
+	if (unlikely(current->policy == SCHED_INTERACTIVE))
+		p->policy = SCHED_NORMAL;
+
 	if (unlikely(!current->time_slice)) {
 		/*
 		 * This case is rare, it happens when the parent has only
@@ -1745,7 +1927,7 @@ void fastcall sched_exit(struct task_str
 	 * the sleep_avg of the parent as well.
 	 */
 	rq = task_rq_lock(p->parent, &flags);
-	if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
+	if (first_time_slice(p) && task_cpu(p) == task_cpu(p->parent)) {
 		p->parent->time_slice += p->time_slice;
 		if (unlikely(p->parent->time_slice > task_timeslice(p)))
 			p->parent->time_slice = task_timeslice(p);
@@ -3051,9 +3233,10 @@ static inline int expired_starving(struc
 {
 	if (rq->curr->static_prio > rq->best_expired_prio)
 		return 1;
-	if (!STARVATION_LIMIT || !rq->expired_timestamp)
+	if (!STARVATION_LIMIT)
 		return 0;
-	if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
+	if (jiffies - rq->switch_timestamp > rq->nr_running * DEF_TIMESLICE +
+			STARVATION_LIMIT)
 		return 1;
 	return 0;
 }
@@ -3131,8 +3314,165 @@ void account_steal_time(struct task_stru
 		cpustat->steal = cputime64_add(cpustat->steal, tmp);
 }
 
+/*
+ * Promote and requeue the next lower priority task.  If no task
+ * is available in the active array, switch to the expired array.
+ * @rq: runqueue to search.
+ * @prio: priority at which to begin search.
+ */
+static inline void promote_next_lower(struct rq *rq, int prio)
+{
+	struct prio_array *array = rq->active;
+	struct task_struct *p = NULL;
+	unsigned long long now = rq->most_recent_timestamp;
+	unsigned long *bitmap;
+	unsigned long starving = JIFFIES_TO_NS(rq->slice_avg);
+	int idx = prio + 1, found_noninteractive = 0;
+	int ticks = rq->active->interactive_ticks + rq->expired->interactive_ticks;
+
+repeat:
+	bitmap = array->bitmap;
+	idx = find_next_bit(bitmap, MAX_PRIO, idx);
+	if (idx < MAX_PRIO) {
+		struct list_head *queue = array->queue + idx;
+
+		p = list_entry(queue->next, struct task_struct, run_list);
+		if (!TASK_INTERACTIVE(p))
+			found_noninteractive = 1;
+
+		/* Skip non-starved queues. */
+		if (now < p->last_ran + starving) {
+			idx++;
+			p = NULL;
+			goto repeat;
+		}
+	} else if (!found_noninteractive && array == rq->active) {
+		/* Nobody home, check the expired array. */
+		array = rq->expired;
+		idx = prio;
+		p = NULL;
+		goto repeat;
+	}
+
+	/* Found one, requeue it. */
+	if (p) {
+		dequeue_task(p, p->array);
+		if (array == rq->active)
+			p->prio--;
+		/*
+		 * If we pulled a task from the expired array, correct
+		 * expired array info.  We can't afford a full search
+		 * for best_expired_prio, but do the best we can.
+		 */
+		else {
+			idx = sched_find_first_bit(array->bitmap);
+			if (idx < MAX_PRIO) {
+				if (rq->best_expired_prio > idx)
+					rq->best_expired_prio = idx;
+			} else {
+				/* We emptied the array */
+				rq->best_expired_prio = MAX_PRIO;
+				/*
+				 * If we have excessive interactive load,
+				 * do not inhibit forced array switching.
+				 */
+				if (ticks < INTERACTIVE_LIMIT)
+					rq->switch_timestamp = jiffies;
+			}
+		}
+		enqueue_task(p, rq->active);
+	}
+}
+
+/*
+ * Refresh timeslice and associated slice information.
+ * @p: the process to refresh.
+ */
+static void refresh_timeslice(struct task_struct *p)
+{
+	struct rq *rq = task_rq(p);
+	unsigned long slice_time = jiffies - p->last_slice;
+	int idle, cpu, cpu_avg, slice = last_slice(p);
+	int w = MAX_BONUS, delta, bonus;
+
+	if (unlikely(slice_time < slice))
+		slice_time = slice;
+
+	/* Update task's CPU usage. */
+	cpu_avg = slice_avg_raw(p);
+	cpu = cpu_this_slice(p);
+	idle = 100 - cpu;
+	delta = max(cpu_avg, idle) - min(cpu_avg, idle);
+	w = 1 + (delta / w);
+	cpu_avg = (w * cpu_avg + idle) / (w + 1);
+	set_slice_avg_raw(p, cpu_avg);
+
+	/*
+	 * If we've hit the throttle timeout, we aren't draining enough
+	 * sleep_avg to keep up with the task's cpu usage.  Up the ante
+	 * to bring the task back toward balance.
+	 */
+	if (credit_exhausted(p, C2) && p->sleep_avg > slice_avg(p)) {
+		unsigned long run_time = p->sleep_avg - slice_avg(p);
+		run_time /= w;
+		if (p->sleep_avg >= run_time)
+			p->sleep_avg -= run_time;
+	}
+
+	/*
+	 * Update throttle position and sanity check it.
+	 */
+	if (task_is_interactive(p))
+		p->throttle += slice_time - slice;
+	else if (INTERACTIVE_LIMIT_EXCEEDED(rq) &&
+			cpu_avg - cpu_avg_rq(rq) >= PCNT_PER_DYNPRIO) {
+		bonus = (cpu_avg - cpu_avg_rq(rq)) / PCNT_PER_DYNPRIO;
+		p->throttle -= slice_time * bonus;
+	} else if (cpu < cpu_max(p) + PCNT_PER_DYNPRIO) {
+		bonus = idle * PCNT_PER_DYNPRIO / 100;
+		p->throttle += (slice_time - slice) * bonus;
+	} else if (cpu >= cpu_max(p) + PCNT_PER_DYNPRIO) {
+		bonus = (cpu - cpu_max(p)) / PCNT_PER_DYNPRIO;
+		p->throttle -= slice_time * bonus;
+	}
+
+	if (time_before(jiffies, p->throttle))
+		p->throttle = jiffies;
+	else if (credit_exhausted(p, C3))
+		p->throttle = jiffies - C3;
+
+	/* Add our slice time to the runqueue average. */
+	if (slice_time < HZ || slice_time < rq->nr_running * DEF_TIMESLICE) {
+		rq->slice_avg <<= 4;
+		rq->slice_avg += slice_time;
+		rq->slice_avg >>= 4;
+	}
+
+	/*
+	 * Ensure that SCHED_INTERACTIVE tasks and their partners will
+	 * always be classified correctly by TASK_INTERACTIVE(). Clear
+ * propagated interactive task status.  Propagated status is
+	 * inherited from the parent, but is good for only one slice.
+	 */
+	if (task_is_interactive(p) && p->sleep_avg < INTERACTIVE_SLEEP(p))
+		p->sleep_avg = INTERACTIVE_SLEEP(p);
+	clr_task_is_interactive(p);
+
+	/* Update dynamic priority and time slice. */
+	p->prio = effective_prio(p);
+	p->time_slice = task_timeslice(p);
+	set_last_slice(p, p->time_slice);
+
+	/* And finally, stamp and flag the new slice. */
+	clr_first_time_slice(p);
+	set_slice_is_new(p);
+	p->last_slice = jiffies;
+}
+
 static void task_running_tick(struct rq *rq, struct task_struct *p)
 {
+	int task_was_interactive;
+
 	if (p->array != rq->active) {
 		/* Task has expired but was not scheduled yet */
 		set_tsk_need_resched(p);
@@ -3152,8 +3492,7 @@ static void task_running_tick(struct rq 
 		 * FIFO tasks have no timeslices.
 		 */
 		if ((p->policy == SCHED_RR) && !--p->time_slice) {
-			p->time_slice = task_timeslice(p);
-			p->first_time_slice = 0;
+			refresh_timeslice(p);
 			set_tsk_need_resched(p);
 
 			/* put it at the end of the queue: */
@@ -3161,21 +3500,36 @@ static void task_running_tick(struct rq 
 		}
 		goto out_unlock;
 	}
+
+	/*
+	 * Tick off interactive task ticks from the active array.
+	 */
+	task_was_interactive = TASK_INTERACTIVE(p);
+	if (task_was_interactive && --rq->active->interactive_ticks < 0)
+		rq->active->interactive_ticks = 0;
+
 	if (!--p->time_slice) {
 		dequeue_task(p, rq->active);
+		refresh_timeslice(p);
 		set_tsk_need_resched(p);
-		p->prio = effective_prio(p);
-		p->time_slice = task_timeslice(p);
-		p->first_time_slice = 0;
-
-		if (!rq->expired_timestamp)
-			rq->expired_timestamp = jiffies;
-		if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
+
+		if (!TASK_INTERACTIVE(p) || expired_starving(rq) ||
+				credit_exhausted(p, C2)) {
 			enqueue_task(p, rq->expired);
 			if (p->static_prio < rq->best_expired_prio)
 				rq->best_expired_prio = p->static_prio;
 		} else
 			enqueue_task(p, rq->active);
+
+		/*
+		 * Always look to see if any queue under you is starving,
+		 * and requeue a task if that is the case.  This prevents
+		 * things like multiple tasks at any priority waking in
+		 * streams and starving their less fortunate peers via
+		 * preempt, ie ensures that the less fortunate will have
+		 * bounded latency.
+		 */
+		promote_next_lower(rq, p->prio);
 	} else {
 		/*
 		 * Prevent a too long timeslice allowing a task to monopolize
@@ -3285,7 +3639,7 @@ asmlinkage void __sched schedule(void)
 	struct list_head *queue;
 	unsigned long long now;
 	unsigned long run_time;
-	int cpu, idx, new_prio;
+	int cpu, idx, new_prio, throttle;
 	long *switch_count;
 	struct rq *rq;
 
@@ -3332,9 +3686,13 @@ need_resched_nonpreemptible:
 
 	/*
 	 * Tasks charged proportionately less run_time at high sleep_avg to
-	 * delay them losing their interactive status
-	 */
-	run_time /= (CURRENT_BONUS(prev) ? : 1);
+	 * delay them losing their interactive status.  If we have too many
+	 * interactive ticks queued or this task is being throttled, switch
+	 * behavior to linear decay.
+	 */
+	throttle = INTERACTIVE_LIMIT_EXCEEDED(rq) || credit_exhausted(prev, C2);
+	if (!throttle)
+		run_time /= 1 + CURRENT_BONUS(prev);
 
 	spin_lock_irq(&rq->lock);
 
@@ -3356,7 +3714,7 @@ need_resched_nonpreemptible:
 		idle_balance(cpu, rq);
 		if (!rq->nr_running) {
 			next = rq->idle;
-			rq->expired_timestamp = 0;
+			rq->switch_timestamp = jiffies;
 			goto switch_tasks;
 		}
 	}
@@ -3370,7 +3728,8 @@ need_resched_nonpreemptible:
 		rq->active = rq->expired;
 		rq->expired = array;
 		array = rq->active;
-		rq->expired_timestamp = 0;
+		array->interactive_ticks = 0;
+		rq->switch_timestamp = jiffies;
 		rq->best_expired_prio = MAX_PRIO;
 	}
 
@@ -3380,6 +3739,8 @@ need_resched_nonpreemptible:
 
 	if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
 		unsigned long long delta = now - next->timestamp;
+		int next_interactive = TASK_INTERACTIVE(next);
+
 		if (unlikely((long long)(now - next->timestamp) < 0))
 			delta = 0;
 
@@ -3389,14 +3750,33 @@ need_resched_nonpreemptible:
 		array = next->array;
 		new_prio = recalc_task_prio(next, next->timestamp + delta);
 
+		/*
+		 * If INTERACTIVE_LIMIT is exceeded, do not promote
+		 * tasks which already have interactive status.  This
+		 * can only make things worse if the load isn't truly
+		 * interactive, so let them decay.  We also don't want
+		 * a task which has been promoted while waiting to
+		 * get CPU after wakeup to be demoted, and thus end
+		 * up being preempted immediately by a task waking
+		 * at the priority it has just reached.  Tasks which
+		 * miss the tick frequently also get caught here, so
+		 * care has to be taken to not help them along. Since
+		 * these are very likely to have interactive status,
+		 * don't ever demote a non-interactive task here, and
+		 * always consider interactive tasks to be fair game.
+		 */
+		if ((throttle && next_interactive && new_prio < next->prio) ||
+			(!next_interactive && new_prio > next->prio))
+			goto switch_tasks;
+
 		if (unlikely(next->prio != new_prio)) {
 			dequeue_task(next, array);
 			next->prio = new_prio;
 			enqueue_task(next, array);
 		}
 	}
-	next->sleep_type = SLEEP_NORMAL;
 switch_tasks:
+	next->sleep_type = SLEEP_NORMAL;
 	if (next == rq->idle)
 		schedstat_inc(rq, sched_goidle);
 	prefetch(next);
@@ -3411,6 +3791,14 @@ switch_tasks:
 		prev->sleep_avg = 0;
 	prev->timestamp = prev->last_ran = now;
 
+	/*
+	 * Tag start of execution of a new timeslice.
+	 */
+	if (unlikely(slice_is_new(next))) {
+		next->last_slice = jiffies;
+		clr_slice_is_new(next);
+	}
+
 	sched_info_switch(prev, next);
 	if (likely(prev != next)) {
 		next->timestamp = next->last_ran = now;
@@ -4081,7 +4469,8 @@ recheck:
 	if (policy < 0)
 		policy = oldpolicy = p->policy;
 	else if (policy != SCHED_FIFO && policy != SCHED_RR &&
-			policy != SCHED_NORMAL && policy != SCHED_BATCH)
+			policy != SCHED_NORMAL && policy != SCHED_BATCH &&
+			policy != SCHED_INTERACTIVE)
 		return -EINVAL;
 	/*
 	 * Valid priorities for SCHED_FIFO and SCHED_RR are
@@ -4619,6 +5008,7 @@ asmlinkage long sys_sched_get_priority_m
 		break;
 	case SCHED_NORMAL:
 	case SCHED_BATCH:
+	case SCHED_INTERACTIVE:
 		ret = 0;
 		break;
 	}
@@ -4643,6 +5033,7 @@ asmlinkage long sys_sched_get_priority_m
 		break;
 	case SCHED_NORMAL:
 	case SCHED_BATCH:
+	case SCHED_INTERACTIVE:
 		ret = 0;
 	}
 	return ret;
@@ -6772,6 +7163,7 @@ void __init sched_init(void)
 		rq->active = rq->arrays;
 		rq->expired = rq->arrays + 1;
 		rq->best_expired_prio = MAX_PRIO;
+		rq->slice_avg = STARVATION_LIMIT;
 
 #ifdef CONFIG_SMP
 		rq->sd = NULL;
--- linux-2.6.21-rc5-x/kernel/sysctl.c.org	2007-03-31 12:54:06.000000000 +0200
+++ linux-2.6.21-rc5-x/kernel/sysctl.c	2007-04-01 08:04:02.000000000 +0200
@@ -76,6 +76,9 @@ extern int pid_max_min, pid_max_max;
 extern int sysctl_drop_caches;
 extern int percpu_pagelist_fraction;
 extern int compat_log;
+extern int credit_c1;
+extern int credit_c2;
+extern int credit_max;
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
@@ -204,6 +207,13 @@ static ctl_table root_table[] = {
 	{ .ctl_name = 0 }
 };
 
+/*
+ * Constants for minimum and maximum testing in vm_table and
+ * kern_table.  We use these as one-element integer vectors.
+ */
+static int zero;
+static int one_hundred = 100;
+
 static ctl_table kern_table[] = {
 	{
 		.ctl_name	= KERN_PANIC,
@@ -603,16 +613,31 @@ static ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
-
+	{
+		.ctl_name	= KERN_SCHED_THROTTLE1,
+		.procname	= "credit_c1",
+		.data		= &credit_c1,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &credit_max,
+	},
+	{
+		.ctl_name	= KERN_SCHED_THROTTLE2,
+		.procname	= "credit_c2",
+		.data		= &credit_c2,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &credit_max,
+	},
 	{ .ctl_name = 0 }
 };
 
-/* Constants for minimum and maximum testing in vm_table.
-   We use these as one-element integer vectors. */
-static int zero;
-static int one_hundred = 100;
-
-
 static ctl_table vm_table[] = {
 	{
 		.ctl_name	= VM_OVERCOMMIT_MEMORY,
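
Aside, for readers tracing the slice_info bit-packing earlier in this patch:
the masks pack five fields plus spare bits into one 32-bit word (AVG:7,
LTS:10, SPA:12, INT:1, NEW:1, FTS:1, from the LSB up).  Below is a minimal
userspace sketch of the last-slice round-trip, with plain C stand-ins for
the kernel macros; the values in main() are assumptions for illustration,
not taken from the patch.

#include <stdio.h>

#define INFO_MASK(x)	((1U << (x)) - 1)
#define LTS_BITS	10
#define LTS_SHIFT	7	/* AVG occupies bits 0-6 */

static unsigned int set_last_slice(unsigned int info, unsigned int n)
{
	info &= ~(INFO_MASK(LTS_BITS) << LTS_SHIFT);
	info |= (n << LTS_SHIFT) & (INFO_MASK(LTS_BITS) << LTS_SHIFT);
	return info;
}

static unsigned int last_slice(unsigned int info)
{
	return (info >> LTS_SHIFT) & INFO_MASK(LTS_BITS);
}

int main(void)
{
	unsigned int info = 0;

	info = set_last_slice(info, 137);	/* a slice in jiffies */
	printf("last_slice = %u\n", last_slice(info));	/* prints 137 */
	/* 10 bits store at most 1023; anything larger silently truncates. */
	info = set_last_slice(info, 1024);
	printf("truncated  = %u\n", last_slice(info));	/* prints 0 */
	return 0;
}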



^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [PATCH] sched: staircase deadline misc fixes
  2007-04-05 11:02             ` Mike Galbraith
@ 2007-04-05 11:09               ` Ingo Molnar
  2007-04-05 11:12                 ` Mike Galbraith
  2007-04-05 11:54               ` [test] sched: SD-latest versus Mike's latest Ingo Molnar
  1 sibling, 1 reply; 92+ messages in thread
From: Ingo Molnar @ 2007-04-05 11:09 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Con Kolivas, linux list, Andrew Morton, ck list


find a whitespace fix below.

	Ingo

Index: linux/kernel/sched.c
===================================================================
--- linux.orig/kernel/sched.c
+++ linux/kernel/sched.c
@@ -1034,7 +1034,7 @@ static int recalc_task_prio(struct task_
 	/*
 	 * Migration timestamp adjustment may induce negative time.
 	 * Ignore unquantifiable values as well as SCHED_BATCH tasks.
-	 */ 
+	 */
 	if (now < p->timestamp || batch_task(p))
 		sleep_time = 0;
 

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [PATCH] sched: staircase deadline misc fixes
  2007-04-05 11:09               ` Ingo Molnar
@ 2007-04-05 11:12                 ` Mike Galbraith
  2007-04-05 11:15                   ` Ingo Molnar
  2007-04-05 13:18                   ` Johannes Stezenbach
  0 siblings, 2 replies; 92+ messages in thread
From: Mike Galbraith @ 2007-04-05 11:12 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Con Kolivas, linux list, Andrew Morton, ck list

On Thu, 2007-04-05 at 13:09 +0200, Ingo Molnar wrote:
> find a whitespace fix below.
> 
> 	Ingo
> 
> Index: linux/kernel/sched.c
> ===================================================================
> --- linux.orig/kernel/sched.c
> +++ linux/kernel/sched.c
> @@ -1034,7 +1034,7 @@ static int recalc_task_prio(struct task_
>  	/*
>  	 * Migration timestamp adjustment may induce negative time.
>  	 * Ignore unquantifiable values as well as SCHED_BATCH tasks.
> -	 */ 
> +	 */
>  	if (now < p->timestamp || batch_task(p))
>  		sleep_time = 0;
>  

Thanks.

(dang, i need to find that fifty "make it red" thingie for vi again)

	-Mike


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [PATCH] sched: staircase deadline misc fixes
  2007-04-05 11:12                 ` Mike Galbraith
@ 2007-04-05 11:15                   ` Ingo Molnar
  2007-04-05 13:18                   ` Johannes Stezenbach
  1 sibling, 0 replies; 92+ messages in thread
From: Ingo Molnar @ 2007-04-05 11:15 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Con Kolivas, linux list, Andrew Morton, ck list


* Mike Galbraith <efault@gmx.de> wrote:

> > -	 */ 
> > +	 */
> >  	if (now < p->timestamp || batch_task(p))
> >  		sleep_time = 0;
> >  
> 
> Thanks.
> 
> (dang, i need to find that fifty "make it red" thingie for vi again)

or just start using quilt, which warns about this :)

	Ingo

^ permalink raw reply	[flat|nested] 92+ messages in thread

* [test] sched: SD-latest versus Mike's latest
  2007-04-05 11:02             ` Mike Galbraith
  2007-04-05 11:09               ` Ingo Molnar
@ 2007-04-05 11:54               ` Ingo Molnar
  2007-04-05 12:10                 ` Mike Galbraith
                                   ` (2 more replies)
  1 sibling, 3 replies; 92+ messages in thread
From: Ingo Molnar @ 2007-04-05 11:54 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Con Kolivas, linux list, Andrew Morton, ck list


* Mike Galbraith <efault@gmx.de> wrote:

> On Tue, 2007-04-03 at 08:01 +0200, Ingo Molnar wrote:
> 
> > looks interesting - could you send the patch?
> 
> Ok, this is looking/feeling pretty good in testing.  Comments on 
> fugliness etc much appreciated.
>
> Below the numbers is a snapshot of my experimental tree.  It's a 
> mixture of my old throttling/anti-starvation tree and the task 
> promotion patch, with the addition of a scheduling class for 
> interactive tasks to dish out some of that targeted unfairness I 
> mentioned.

here's some test results, comparing SD-latest to Mike's-latest:

re-testing the weak points of the vanilla scheduler + Mike's:

 - thud.c:    this workload has almost unnoticeable effect
 - fiftyp.c:  noticeable, but a lot better than previously!

re-testing the weak points of SD:

 - hackbench: still unusable under this type of high load - no improvement.
 - make -j:   still less interactive than Mike's - no improvement.

	Ingo

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [test] sched: SD-latest versus Mike's latest
  2007-04-05 11:54               ` [test] sched: SD-latest versus Mike's latest Ingo Molnar
@ 2007-04-05 12:10                 ` Mike Galbraith
  2007-04-05 12:12                   ` Ingo Molnar
  2007-04-05 16:08                 ` Con Kolivas
  2007-04-06  1:03                 ` Ten percent test Con Kolivas
  2 siblings, 1 reply; 92+ messages in thread
From: Mike Galbraith @ 2007-04-05 12:10 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Con Kolivas, linux list, Andrew Morton, ck list

On Thu, 2007-04-05 at 13:54 +0200, Ingo Molnar wrote:

> here's some test results, comparing SD-latest to Mike's-latest:
> 
> re-testing the weak points of the vanilla scheduler + Mike's:
> 
>  - thud.c:    this workload has almost unnoticeable effect
>  - fiftyp.c:  noticeable, but a lot better than previously!

Hmm.  Here fiftyp.c is utterly harmless.  If you have a second, can you
send me a top snapshot?  If you're running many of them, it can take a
bit for the throttle to catch them all.

	-Mike


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [test] sched: SD-latest versus Mike's latest
  2007-04-05 12:10                 ` Mike Galbraith
@ 2007-04-05 12:12                   ` Ingo Molnar
  2007-04-05 12:24                     ` Mike Galbraith
  0 siblings, 1 reply; 92+ messages in thread
From: Ingo Molnar @ 2007-04-05 12:12 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Con Kolivas, linux list, Andrew Morton, ck list


* Mike Galbraith <efault@gmx.de> wrote:

> > re-testing the weak points of the vanilla scheduler + Mike's:
> > 
> >  - thud.c:    this workload has almost unnoticeable effect
> >  - fiftyp.c:  noticeable, but a lot better than previously!
> 
> Hmm.  Here fiftyp.c is utterly harmless.  If you have a second, can 
> you send me a top snapshot?  If you're running many of them, it can 
> take a bit for the throttle to catch them all.

ah, indeed - i ran 10 of them and letting them run for a bit smoothes 
things out.

	Ingo

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [test] sched: SD-latest versus Mike's latest
  2007-04-05 12:12                   ` Ingo Molnar
@ 2007-04-05 12:24                     ` Mike Galbraith
  0 siblings, 0 replies; 92+ messages in thread
From: Mike Galbraith @ 2007-04-05 12:24 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Con Kolivas, linux list, Andrew Morton, ck list

On Thu, 2007-04-05 at 14:12 +0200, Ingo Molnar wrote:
> * Mike Galbraith <efault@gmx.de> wrote:
> 
> > > re-testing the weak points of the vanilla scheduler + Mike's:
> > > 
> > >  - thud.c:    this workload has almost unnoticeable effect
> > >  - fiftyp.c:  noticeable, but a lot better than previously!
> > 
> > Hmm.  Here fiftyp.c is utterly harmless.  If you have a second, can 
> > you send me a top snapshot?  If you're running many of them, it can 
> > take a bit for the throttle to catch them all.
> 
> ah, indeed - i ran 10 of them and letting them run for a bit smoothes 
> things out.

Ok, I didn't try 10 of them.  It can still get a bit ragged here, so I
may have to latch the throttle for a bit to make sure they have to
maintain improved behavior to get unleashed.  5 of them get instantly
nailed, and stay nailed.

	-Mike


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [PATCH] sched: staircase deadline misc fixes
  2007-04-05 11:12                 ` Mike Galbraith
  2007-04-05 11:15                   ` Ingo Molnar
@ 2007-04-05 13:18                   ` Johannes Stezenbach
  2007-04-05 15:28                     ` Mike Galbraith
  1 sibling, 1 reply; 92+ messages in thread
From: Johannes Stezenbach @ 2007-04-05 13:18 UTC (permalink / raw)
  To: Mike Galbraith
  Cc: Ingo Molnar, Con Kolivas, linux list, Andrew Morton, ck list

On Thu, Apr 05, 2007, Mike Galbraith wrote:
> 
> (dang, i need to find that fifty "make it red" thingie for vi again)

put "let c_space_errors=1" in .vimrc

HTH,
Johannes

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [PATCH] sched: staircase deadline misc fixes
  2007-04-05 13:18                   ` Johannes Stezenbach
@ 2007-04-05 15:28                     ` Mike Galbraith
  0 siblings, 0 replies; 92+ messages in thread
From: Mike Galbraith @ 2007-04-05 15:28 UTC (permalink / raw)
  To: Johannes Stezenbach
  Cc: Ingo Molnar, Con Kolivas, linux list, Andrew Morton, ck list

On Thu, 2007-04-05 at 15:18 +0200, Johannes Stezenbach wrote:
> On Thu, Apr 05, 2007, Mike Galbraith wrote:
> > 
> > (dang, i need to find that fifty "make it red" thingie for vi again)
                              ^(spiffy;)
> 
> put "let c_space_errors=1" in .vimrc

Thanks.

I received this link via private mail, and think it's worth posting.
Who knows, it may save Maintainers an antacid tablet or two.

http://www.pixelbeat.org/settings/.vimrc

	-Mike

(may eventually get tired of the colors, but for now they're cooler than
the plain black and white i'm used to, _and_ it has a "make it glow" feature)


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [test] sched: SD-latest versus Mike's latest
  2007-04-05 11:54               ` [test] sched: SD-latest versus Mike's latest Ingo Molnar
  2007-04-05 12:10                 ` Mike Galbraith
@ 2007-04-05 16:08                 ` Con Kolivas
  2007-04-05 19:05                   ` Ingo Molnar
  2007-04-05 20:29                   ` Mike Galbraith
  2007-04-06  1:03                 ` Ten percent test Con Kolivas
  2 siblings, 2 replies; 92+ messages in thread
From: Con Kolivas @ 2007-04-05 16:08 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Mike Galbraith, linux list, Andrew Morton, ck list

On Thursday 05 April 2007 21:54, Ingo Molnar wrote:
> * Mike Galbraith <efault@gmx.de> wrote:
> > On Tue, 2007-04-03 at 08:01 +0200, Ingo Molnar wrote:
> > > looks interesting - could you send the patch?
> >
> > Ok, this is looking/feeling pretty good in testing.  Comments on
> > fugliness etc much appreciated.
> >
> > Below the numbers is a snapshot of my experimental tree.  It's a
> > mixture of my old throttling/anti-starvation tree and the task

Throttling to try to get to SD fairness? The mainline state machine becomes 
more complex than ever and fluctuates from interactive to fair by an as-yet 
unchosen magic number timeframe which ebbs and flows.

> > promotion patch, with the addition of a scheduling class for
> > interactive tasks to dish out some of that targeted unfairness I
> > mentioned.

Nice -10 on mainline ruins the latency of nice 0 tasks unlike SD. New 
scheduling class just for X? Sounds like a very complicated 
userspace-changing way to just do the equivalent of "nice -n -10" obfuscated. 

> here's some test results, comparing SD-latest to Mike's-latest:
>
> re-testing the weak points of the vanilla scheduler + Mike's:
>
>  - thud.c:    this workload has almost unnoticeable effect
>  - fiftyp.c:  noticeable, but a lot better than previously!

Load of 1.5 makes mainline a doorstop without throttling.

> re-testing the weak points of SD:
>
>  - hackbench: still unusable under this type of high load - no improvement.

Load of 160. Is proportional slowdown bad?

>  - make -j:   still less interactive than Mike's - no improvement.

Depends on how big your job number is versus your cpu count. The better the throttling gets 
with mainline the better SD gets in this comparison. At equal fairness 
mainline does not have the low latency interactivity SD has.

Nice -10 X with SD is a far better solution than an ever increasing complexity 
state machine and a userspace-changing scheduling policy just for X. Half 
decent graphics cards get good interactivity with SD even without renicing.

> 	Ingo

-- 
-ck

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [test] sched: SD-latest versus Mike's latest
  2007-04-05 16:08                 ` Con Kolivas
@ 2007-04-05 19:05                   ` Ingo Molnar
  2007-04-05 20:29                   ` Mike Galbraith
  1 sibling, 0 replies; 92+ messages in thread
From: Ingo Molnar @ 2007-04-05 19:05 UTC (permalink / raw)
  To: Con Kolivas; +Cc: Mike Galbraith, linux list, Andrew Morton, ck list


* Con Kolivas <kernel@kolivas.org> wrote:

> Nice -10 on mainline ruins the latency of nice 0 tasks unlike SD. New 
> scheduling class just for X? Sounds like a very complicated 
> userspace-changing way to just do the equivalent of "nice -n -10" 
> obfuscated.

i think you are missing the point. We _do not know in advance_ whether X 
should be prioritized or not. It's the behavior of X that determines it. 
When X is reniced to -10 it fixes a few corner cases, but it breaks many 
other cases. We found that out time and time again.

btw., the tests i've done were not with X but using a shell prompt.

> > re-testing the weak points of SD:
> >
> >  - hackbench: still unusable under this type of high load - no 
> >  improvement.
> 
> Load of 160. Is proportional slowdown bad?

this is relative to how mainline+Mike's handles it. Users won't really 
care about the whys, they'll only see the slowdown.

> >  - make -j: still less interactive than Mike's - no improvement.
> 
> Depends on how big your job number vs cpu is. The better the 
> throttling gets with mainline the better SD gets in this comparison. 
> At equal fairness mainline does not have the low latency interactivity 
> SD has.

i often run make jobs with -j200 or larger, and SD gets worse than even 
mainline much sooner than that.

	Ingo

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [test] sched: SD-latest versus Mike's latest
  2007-04-05 16:08                 ` Con Kolivas
  2007-04-05 19:05                   ` Ingo Molnar
@ 2007-04-05 20:29                   ` Mike Galbraith
  1 sibling, 0 replies; 92+ messages in thread
From: Mike Galbraith @ 2007-04-05 20:29 UTC (permalink / raw)
  To: Con Kolivas; +Cc: Ingo Molnar, linux list, Andrew Morton, ck list

On Fri, 2007-04-06 at 02:08 +1000, Con Kolivas wrote:
> On Thursday 05 April 2007 21:54, Ingo Molnar wrote:
> > * Mike Galbraith <efault@gmx.de> wrote:
> > > On Tue, 2007-04-03 at 08:01 +0200, Ingo Molnar wrote:
> > > > looks interesting - could you send the patch?
> > >
> > > Ok, this is looking/feeling pretty good in testing.  Comments on
> > > fugliness etc much appreciated.
> > >
> > > Below the numbers is a snapshot of my experimental tree.  It's a
> > > mixture of my old throttling/anti-starvation tree and the task
> 
> Throttling to try to get to SD fairness? The mainline state machine becomes 
> more complex than ever and fluctuates from interactive to fair by an as-yet 
> unchosen magic number timeframe which ebbs and flows.

I believe I've already met and surpassed SD fairness.  Bold statement,
but I believe it's true.  I'm more worried about becoming _too_ fair.

Show me your numbers.  I showed you mine with both SD and my patches.

WRT magic and state machine complexity:  If you read the patch, there is
nothing "magical" about it.  It doesn't do anything but monitor CPU
usage and move a marker.  It does nothing the least bit complicated, and
what it does, it does in the slow path.  The only thing it does in the
fast path is to move the marker, and perhaps tag a targeted task.  State
machine?  There is nothing there that resembles a state machine to me,
the heuristic is just add sleep time, burn on use.
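
In isolation, that heuristic is only a few lines.  A minimal sketch,
assuming "the marker" is the p->throttle jiffies timestamp from the patch
above (the values in main() are made up for illustration):

#include <stdio.h>

/* Sleeping moves the marker forward; cpu use drags it back.  A task is
 * throttled once `now` runs more than `limit` ahead of its marker.
 */
static unsigned long marker_sleep(unsigned long marker, unsigned long slept)
{
	return marker + slept;
}

static unsigned long marker_burn(unsigned long marker, unsigned long ran)
{
	return marker - ran;
}

static int throttled(unsigned long now, unsigned long marker,
		     unsigned long limit)
{
	return now >= marker + limit;
}

int main(void)
{
	unsigned long now = 1000, marker = 1000, limit = 50;

	marker = marker_burn(marker, 100);	/* heavy cpu use */
	printf("throttled: %d\n", throttled(now, marker, limit));	/* 1 */
	marker = marker_sleep(marker, 200);	/* a long sleep earns it back */
	printf("throttled: %d\n", throttled(now, marker, limit));	/* 0 */
	return 0;
}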

> > > promotion patch, with the addition of a scheduling class for
> > > interactive tasks to dish out some of that targeted unfairness I
> > > mentioned.
> 
> Nice -10 on mainline ruins the latency of nice 0 tasks unlike SD. New 
> scheduling class just for X? Sounds like a very complicated 
> userspace-changing way to just do the equivalent of "nice -n -10" obfuscated.

This patch makes the massive nice -10 vs nice 0 latency a thing of the past, I believe.
Testing welcome.  WRT "nice -10 obfuscated", that's a load of high grade
horse-hockey.  There were very good reasons posted here as to why that is
a very bad idea; perhaps you haven't read them.  (you can find them if
you choose)

Your criticism of SCHED_INTERACTIVE leaves me dumbfounded, since you were,
and still are, specifically telling me that I should tell the scheduler
that X is special.  I did precisely that, and am also trying to tell it
that its clients are special too, _without_ having to start each and
every client at nice -10 or whatever static number of the day.

> > here's some test results, comparing SD-latest to Mike's-latest:
> >
> > re-testing the weak points of the vanilla scheduler + Mike's:
> >
> >  - thud.c:    this workload has almost unnoticeable effect
> >  - fiftyp.c:  noticeable, but a lot better than previously!
> 
> Load of 1.5 makes mainline a doorstop without throttling.

Where does that come from?  Doesn't jibe with my experience at all.

> > re-testing the weak points of SD:
> >
> >  - hackbench: still unusable under this type of high load - no improvement.
> 
> Load of 160. Is proportional slowdown bad?
> 
> >  - make -j:   still less interactive than Mike's - no improvement.
> 
> Depends on how big your job number is versus your cpu count. The better the throttling gets 
> with mainline the better SD gets in this comparison. At equal fairness 
> mainline does not have the low latency interactivity SD has.

So we should do 8ms slices too?  I don't think that's necessary.

> Nice -10 X with SD is a far better solution than an ever increasing complexity 
> state machine and a userspace-changing scheduling policy just for X. Half 
> decent graphics cards get good interactivity with SD even without renicing.

For one, SD does not retain interactivity under any appreciable load;
secondly, I'm getting interactivity that SD cannot even get close to
without renicing, and without any patches - in mainline right now.

(Speaking of low latency, how long can tasks forking off sleepers who
overlap their wake times prevent an array switch with SD?  Forever?)

I posted numbers that demonstrate the improvement in fairness while
maintaining interactivity, and I'm not finished.  I've solved the
multiple fiftyp.c thing Ingo noticed, and in fact, I had 10 copies
running that I had forgotten to terminate while I was working, and I
didn't even notice until I finished, and saw my top window.  Patch to
follow as soon as I test some more (that's what takes much time, not
creating the diff.  this isn't rocket science.)

Maybe I'll succeed, maybe I won't.

	-Mike


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Ten percent test
  2007-04-05 11:54               ` [test] sched: SD-latest versus Mike's latest Ingo Molnar
  2007-04-05 12:10                 ` Mike Galbraith
  2007-04-05 16:08                 ` Con Kolivas
@ 2007-04-06  1:03                 ` Con Kolivas
  2007-04-06  9:07                   ` Mike Galbraith
  2 siblings, 1 reply; 92+ messages in thread
From: Con Kolivas @ 2007-04-06  1:03 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Mike Galbraith, linux list, Andrew Morton, ck list

[-- Attachment #1: Type: text/plain, Size: 599 bytes --]

On Thursday 05 April 2007 21:54, Ingo Molnar wrote:
>  - fiftyp.c:  noticeable, but a lot better than previously!

fiftyp.c seems to have been stumbled across by accident as having an effect 
when Xenofon was trying to recreate Mike's 50% x 3 test case. I suggest a ten 
percent version like the following would be more useful as a test for the 
harmful effect discovered in fiftyp.c. (/me throws in obligatory code style 
change).

Starts 15 processes that sleep ten times longer than they run. Change forks to 
15 times the number of cpus you have and it should work on any size hardware.

-- 
-ck

[-- Attachment #2: tenp.c --]
[-- Type: text/x-csrc, Size: 3784 bytes --]

// gcc -O2 -o tenp tenp.c -lrt
// code from interbench.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <errno.h>
#include <sys/types.h>
/*
 * Start $forks processes that run for 10% cpu time each. Set this to
 * 15 * number of cpus for best effect.
 */
int forks = 15;

unsigned long run_us = 1000000000, sleep_us;
unsigned long loops_per_ms;

void terminal_error(const char *name)
{
	fprintf(stderr, "\n");
	perror(name);
	exit (1);
}

unsigned long long get_nsecs(struct timespec *myts)
{
	if (clock_gettime(CLOCK_REALTIME, myts))
		terminal_error("clock_gettime");
	/* cast first: tv_sec * 1e9 would overflow a 32-bit long */
	return ((unsigned long long)myts->tv_sec * 1000000000 + myts->tv_nsec);
}

void burn_loops(unsigned long loops)
{
	unsigned long i;

	/*
	 * We need some magic here to prevent the compiler from optimising
	 * this loop away. Otherwise trying to emulate a fixed cpu load
	 * with this loop will not work.
	 */
	for (i = 0 ; i < loops ; i++)
	     asm volatile("" : : : "memory");
}

/* Use this many usecs of cpu time */
void burn_usecs(unsigned long usecs)
{
	unsigned long ms_loops;

	ms_loops = loops_per_ms / 1000 * usecs;
	burn_loops(ms_loops);
}

void microsleep(unsigned long long usecs)
{
	struct timespec req, rem;

	rem.tv_sec = rem.tv_nsec = 0;

	req.tv_sec = usecs / 1000000;
	req.tv_nsec = (usecs - (req.tv_sec * 1000000)) * 1000;
continue_sleep:
	if ((nanosleep(&req, &rem)) == -1) {
		if (errno == EINTR) {
			if (rem.tv_sec || rem.tv_nsec) {
				req.tv_sec = rem.tv_sec;
				req.tv_nsec = rem.tv_nsec;
				goto continue_sleep;
			}
			goto out;
		}
		terminal_error("nanosleep");
	}
out:
	return;
}

/*
 * In an unoptimised loop we try to benchmark how many meaningless loops
 * per second we can perform on this hardware to fairly accurately
 * reproduce certain percentage cpu usage
 */
void calibrate_loop(void)
{
	unsigned long long start_time, loops_per_msec, run_time = 0,
		min_run_us = run_us;
	unsigned long loops;
	struct timespec myts;
	int i;

	printf("Calibrating loop\n");
	loops_per_msec = 1000000;
redo:
	/* Calibrate to within 1% accuracy */
	while (run_time > 1010000 || run_time < 990000) {
		loops = loops_per_msec;
		start_time = get_nsecs(&myts);
		burn_loops(loops);
		run_time = get_nsecs(&myts) - start_time;
		loops_per_msec = (1000000 * loops_per_msec / run_time ? :
			loops_per_msec);
	}

	/* Rechecking after a pause increases reproducibility */
	microsleep(1);
	loops = loops_per_msec;
	start_time = get_nsecs(&myts);
	burn_loops(loops);
	run_time = get_nsecs(&myts) - start_time;

	/* Tolerate 5% difference on checking */
	if (run_time > 1050000 || run_time < 950000)
		goto redo;
	loops_per_ms = loops_per_msec;
	printf("Calibrating sleep interval\n");
	microsleep(1);
	/* Find the smallest time interval close to 1ms that we can sleep */
	for (i = 0; i < 100; i++) {
		start_time = get_nsecs(&myts);
		microsleep(1000);
		run_time = get_nsecs(&myts) - start_time;
		run_time /= 1000;
		if (run_time < run_us && run_us > 1000)
			run_us = run_time;
	}
	/* Then set run_us to that duration and sleep_us to 9 x that */
	sleep_us = run_us * 9;
	printf("Calibrating run interval\n");
	microsleep(1);
	/* Do a few runs to see what really gets us run_us runtime */
	for (i = 0; i < 100; i++) {
		start_time = get_nsecs(&myts);
		burn_usecs(run_us);
		run_time = get_nsecs(&myts) - start_time;
		run_time /= 1000;
		if (run_time < min_run_us && run_time > run_us)
			min_run_us = run_time;
	}
	if (min_run_us < run_us)
		run_us = run_us * run_us / min_run_us;
	printf("Each fork will run for %lu usecs and sleep for %lu usecs\n",
		run_us, sleep_us);
}

int main(void)
{
	int i;

	calibrate_loop();
	printf("starting %d forks\n", forks);
	/* parent plus (forks - 1) children = forks workers in total */
	for (i = 1; i < forks; i++) {
		if (!fork())
			break;
	}
	/* each worker burns cpu for run_us then sleeps for 9 x run_us */
	while (1) {
		burn_usecs(run_us);
		microsleep(sleep_us);
	}
	return 0;
}

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-06  1:03                 ` Ten percent test Con Kolivas
@ 2007-04-06  9:07                   ` Mike Galbraith
  2007-04-06  9:28                     ` Con Kolivas
  0 siblings, 1 reply; 92+ messages in thread
From: Mike Galbraith @ 2007-04-06  9:07 UTC (permalink / raw)
  To: Con Kolivas; +Cc: Ingo Molnar, linux list, Andrew Morton, ck list

On Fri, 2007-04-06 at 11:03 +1000, Con Kolivas wrote:
> On Thursday 05 April 2007 21:54, Ingo Molnar wrote:
> >  - fiftyp.c:  noticeable, but a lot better than previously!
> 
> fiftyp.c seems to have been stumbled across by accident as having an effect 
> when Xenofon was trying to recreate Mike's 50% x 3 test case. I suggest a ten 
> percent version like the following would be more useful as a test for the 
> harmful effect discovered in fiftyp.c. (/me throws in obligatory code style 
> change).
> 
> Starts 15 processes that sleep ten times longer than they run. Change forks to 
> 15 times the number of cpus you have and it should work on any size hardware.

I was more focused on the general case, but all I should have to do to
de-claw all of these sleep exploits is account rr time (only a couple of
lines, done and building now).  It's only a couple of lines.
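
To make the exploit mechanics concrete, here is a toy model of one
reading of that idea; this is not Mike's patch, and the sleep-fraction
estimator and the per-cycle numbers below are assumptions.  If time a
task spends runnable on the queue ("rr time") is credited as sleep, a
tenp-style task under load looks like a near-total sleeper; accounting
rr time separately collapses the apparent bonus.

#include <stdio.h>

int main(void)
{
	/* assumed split of one 10ms cycle for one of 15 tenp tasks */
	double run_ms = 1.0;	/* on the cpu */
	double rr_ms = 4.0;	/* runnable, waiting on the runqueue */
	double sleep_ms = 5.0;	/* genuinely asleep */
	double cycle = run_ms + rr_ms + sleep_ms;

	/* queue wait credited as sleep: the task looks 90% idle */
	printf("rr time counted as sleep: %2.0f%% apparent sleeper\n",
	       (sleep_ms + rr_ms) / cycle * 100);
	/* rr time accounted separately: the bonus collapses to 50% */
	printf("rr time accounted:        %2.0f%% apparent sleeper\n",
	       sleep_ms / cycle * 100);
	return 0;
}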

	-Mike


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-06  9:07                   ` Mike Galbraith
@ 2007-04-06  9:28                     ` Con Kolivas
  2007-04-06 10:03                       ` Ingo Molnar
  2007-04-06 10:48                       ` Mike Galbraith
  0 siblings, 2 replies; 92+ messages in thread
From: Con Kolivas @ 2007-04-06  9:28 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Ingo Molnar, linux list, Andrew Morton, ck list

On Friday 06 April 2007 19:07, Mike Galbraith wrote:
> On Fri, 2007-04-06 at 11:03 +1000, Con Kolivas wrote:
> > On Thursday 05 April 2007 21:54, Ingo Molnar wrote:
> > >  - fiftyp.c:  noticeable, but a lot better than previously!
> >
> > fiftyp.c seems to have been stumbled across by accident as having an
> > effect when Xenofon was trying to recreate Mike's 50% x 3 test case. I
> > suggest a ten percent version like the following would be more useful as
> > a test for the harmful effect discovered in fiftyp.c. (/me throws in
> > obligatory code style change).
> >
> > Starts 15 processes that sleep ten times longer than they run. Change
> > forks to 15 times the number of cpus you have and it should work on any
> > size hardware.
>
> I was more focused on the general case, but all I should have to do to
> de-claw all of these sleep exploits is account rr time (only a couple of
> lines, done and building now).  It's only a couple of lines.

The more you try to "de-claw" these sleep exploits the less effective you make 
your precious interactive estimator. Feel free to keep adding endless tweaks 
to undo the other tweaks in order to try and achieve what SD has by design. 
You'll end up with an increasingly complex state machine design of 
interactivity tweaks and interactivity throttlers all fighting each other to 
the point where the interactivity estimator doesn't do anything. What's the 
point in that? Eventually you'll have an estimator throttled to the point it 
does nothing and you end up with something far less interactive than SD, 
which is as interactive as fairness allows, unlike mainline.

-- 
-ck

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-06  9:28                     ` Con Kolivas
@ 2007-04-06 10:03                       ` Ingo Molnar
  2007-04-06 10:40                         ` Mike Galbraith
  2007-04-07  6:50                         ` Con Kolivas
  2007-04-06 10:48                       ` Mike Galbraith
  1 sibling, 2 replies; 92+ messages in thread
From: Ingo Molnar @ 2007-04-06 10:03 UTC (permalink / raw)
  To: Con Kolivas; +Cc: Mike Galbraith, linux list, Andrew Morton, ck list


* Con Kolivas <kernel@kolivas.org> wrote:

> > I was more focused on the general case, but all I should have to do 
> > to de-claw all of these sleep exploits is account rr time (only a 
> > couple of lines, done and building now).  It's only a couple of 
> > lines.
> 
> The more you try to "de-claw" these sleep exploits the less effective 
> you make your precious interactive estimator. Feel free to keep adding 
> endless tweaks to undo the other tweaks in order to try and achieve 
> what SD has by design.

firstly, testing on various workloads Mike's tweaks work pretty well, 
while SD still doesnt handle the high-load case all that well. Note that 
it was you who raised this whole issue to begin with: everything was 
pretty quiet in scheduling interactivity land. (There was one person who 
reported wide-scale interactivity regressions against mainline but he 
didnt answer my followup posts to trace/debug the scenario.)

SD has a built-in "interactivity estimator" as well, but hardcoded into 
its design. SD has its own set of ugly-looking tweaks as well - for 
example the prio_matrix. So it all comes down to 'what interactivity 
heuristics is enough', and which one is more tweakable. So far i've yet 
to see SD address the hackbench and make -j interactivity 
problems/regression for example, while Mike has been busy addressing the 
'exploits' reported against mainline.

> You'll end up with an increasingly complex state machine design of 
> interactivity tweaks and interactivity throttlers all fighting each 
> other to the point where the interactivity estimator doesn't do 
> anything. [...]

It comes down to defining interactivity by scheduling behavior, and 
making that definition flexible. SD's definition of interactivity is 
rigid (but it's still behavior-based, so not fundamentally different 
from an explicit 'interactivity estimator'), and currently it does not 
work well under high load. But ... i'm still entertaining the notion 
that it might be good enough, but you've got to demonstrate the design's 
flexibility.

furthermore, your description does not match my experience when using 
Mike's tweaks and comparing it to SD on the same hardware. According to 
your claim i should have seen regressions popping up in various, 
already-fixed corners, but it didnt happen in practice. But ... i'm 
awaiting further SD and Mike tweaks, the race certainly looks 
interesting ;)

	Ingo

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-06 10:03                       ` Ingo Molnar
@ 2007-04-06 10:40                         ` Mike Galbraith
  2007-04-07  6:50                         ` Con Kolivas
  1 sibling, 0 replies; 92+ messages in thread
From: Mike Galbraith @ 2007-04-06 10:40 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Con Kolivas, linux list, Andrew Morton, ck list

On Fri, 2007-04-06 at 12:03 +0200, Ingo Molnar wrote:

> already-fixed corners, but it didnt happen in practice. But ... i'm 
> awaiting further SD and Mike tweaks, the race certainly looks 
> interesting ;)

<g> I think I lapped him, but since we're running in opposite
directions, it's hard to tell.

	-Mike


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-06  9:28                     ` Con Kolivas
  2007-04-06 10:03                       ` Ingo Molnar
@ 2007-04-06 10:48                       ` Mike Galbraith
  1 sibling, 0 replies; 92+ messages in thread
From: Mike Galbraith @ 2007-04-06 10:48 UTC (permalink / raw)
  To: Con Kolivas; +Cc: Ingo Molnar, linux list, Andrew Morton, ck list

On Fri, 2007-04-06 at 19:28 +1000, Con Kolivas wrote:
> On Friday 06 April 2007 19:07, Mike Galbraith wrote:
> > On Fri, 2007-04-06 at 11:03 +1000, Con Kolivas wrote:
> > > On Thursday 05 April 2007 21:54, Ingo Molnar wrote:
> > > >  - fiftyp.c:  noticeable, but a lot better than previously!
> > >
> > > fiftyp.c seems to have been stumbled across by accident as having an
> > > effect when Xenofon was trying to recreate Mike's 50% x 3 test case. I
> > > suggest a ten percent version like the following would be more useful as
> > > a test for the harmful effect discovered in fiftyp.c. (/me throws in
> > > obligatory code style change).
> > >
> > > Starts 15 processes that sleep ten times longer than they run. Change
> > > forks to 15 times the number of cpus you have and it should work on any
> > > size hardware.
> >
> > I was more focused on the general case, but all I should have to do to
> > de-claw all of these sleep exploits is account rr time (only a couple of
> > lines, done and building now).  It's only a couple of lines.
> 
> The more you try to "de-claw" these sleep exploits the less effective you make 
> your precious interactive estimator. Feel free to keep adding endless tweaks 
> to undo the other tweaks in order to try and achieve what SD has by design.

I haven't seen SD achieve what its design docs claim yet, so yup, I'm
going to keep right on trying to fix the corner cases in what we have
that _does_ give me the interactivity I want.

	-Mike


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-06 10:03                       ` Ingo Molnar
  2007-04-06 10:40                         ` Mike Galbraith
@ 2007-04-07  6:50                         ` Con Kolivas
  2007-04-07 16:12                           ` Gene Heskett
                                             ` (2 more replies)
  1 sibling, 3 replies; 92+ messages in thread
From: Con Kolivas @ 2007-04-07  6:50 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Mike Galbraith, linux list, Andrew Morton, ck list

On Friday 06 April 2007 20:03, Ingo Molnar wrote:
> * Con Kolivas <kernel@kolivas.org> wrote:
> > > I was more focused on the general case, but all I should have to do
> > > to de-claw all of these sleep exploits is account rr time (only a
> > > couple of lines, done and building now).  It's only a couple of
> > > lines.
> >
> > The more you try to "de-claw" these sleep exploits the less effective
> > you make your precious interactive estimator. Feel free to keep adding
> > endless tweaks to undo the other tweaks in order to try and achieve
> > what SD has by design.
>
> firstly, testing on various workloads Mike's tweaks work pretty well,
> while SD still doesnt handle the high-load case all that well. Note that
> it was you who raised this whole issue to begin with: everything was
> pretty quiet in scheduling interactivity land.

I'm terribly sorry but you have completely missed my intentions then. I was 
_not_ trying to improve mainline's interactivity at all. My desire was to fix 
the unfairness that mainline has, across the board without compromising 
fairness. You said yourself that an approach that fixed a lot and had a small 
number of regressions would be worth it. In a surprisingly ironic turnaround 
two bizarre things happened. People found SD fixed a lot of their 
interactivity corner cases which were showstoppers. That didn't surprise me 
because any unfair design will by its nature get it wrong sometimes. The even 
_more_ surprising thing is that you're now using interactivity as the 
argument against SD. I did not set out to create better interactivity, I set 
out to create widespread fairness without too much compromise to 
interactivity. As I said from the _very first email_, there would be cases of 
interactivity in mainline that performed better.

> (There was one person who 
> reported wide-scale interactivity regressions against mainline but he
> didnt answer my followup posts to trace/debug the scenario.)

That was one user. As I mentioned in an earlier thread, the problem with email 
threads on drawn out issues on lkml is that all that people remember is the 
last one creating noise, and that has only been the noise from Mike for 2 
weeks now. Has everyone forgotten the many many users who reported the 
advantages first up which generated the interest in the first place? Why have 
they stopped reporting? Well the answer is obvious; all the signs suggest 
that SD is slated for mainline. It is on the path, Linus has suggested it and 
now akpm is asking if it's ready for 2.6.22. So they figure there is no point 
testing and replying any further. SD is ready for prime time, finalised and 
does everything I intended it to. This is where I have to reveal to them the 
horrible truth. This is no guarantee it will go in. In fact, this one point 
that you (Ingo) go on and on about is not only a quibble, but you will call 
it an absolute showstopper. As maintainer of the cpu scheduler, in its 
current form you will flatly refuse to let it go to mainline, citing the 5% of 
cases where interactivity has regressed. So people will tell me to fix it, 
right?... Read on for this to unfold.

> SD has a built-in "interactivity estimator" as well, but hardcoded into
> its design. SD has its own set of ugly-looking tweaks as well - for
> example the prio_matrix.

I'm sorry but this is a misrepresentation to me, as I suggested in an earlier 
thread where I disagree about what an interactivity estimator is. The idea of 
fence posts in a clock that are passed as a way of metering out 
earliest-deadline-first in a design is well established. The matrix is simply 
an array designed for O(1) lookups of the fence posts. That is not the same 
as "oh how much have we slept in the last $magic_number period and how much 
extra time should we get for that".
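
For illustration, a minimal sketch of the fence-post idea in user space;
it is not the SD code, and the number of levels, the quantum, and the
quota formula are made-up values.  Each priority level gets a
precomputed runtime fence post, looked up with a single O(1) array
read; a task that burns past its post drops one level, and no sleep
history is ever consulted.

#include <stdio.h>

#define PRIO_LEVELS	8
#define RR_MS		6	/* assumed round-robin quantum */

/* posts[p]: total ms a task may consume before leaving priority p */
static unsigned int posts[PRIO_LEVELS];

static void init_posts(void)
{
	unsigned int p, total = 0;

	for (p = 0; p < PRIO_LEVELS; p++) {
		/* better priorities earn proportionally larger quotas */
		total += RR_MS * (PRIO_LEVELS - p);
		posts[p] = total;
	}
}

/* O(1): one array read says whether the task crossed its fence post */
static int crossed_post(unsigned int prio, unsigned int used_ms)
{
	return used_ms >= posts[prio];
}

int main(void)
{
	unsigned int prio = 0, used = 0;

	init_posts();
	while (prio < PRIO_LEVELS) {
		used += RR_MS;		/* the task burns one quantum */
		if (crossed_post(prio, used)) {
			prio++;		/* rotate down; no sleep credit */
			printf("after %3u ms: priority %u\n", used, prio);
		}
	}
	return 0;
}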

> So it all comes down to 'what interactivity 
> heuristics is enough', and which one is more tweakable. So far i've yet
> to see SD address the hackbench and make -j interactivity
> problems/regression for example, while Mike has been busy addressing the
> 'exploits' reported against mainline.

And BANG there is the bullet you will use against SD from here to eternity. SD 
obeys fairness at all costs. Your interactivity regression is that SD causes 
progressive slowdown with load which by definition is fairness. You 
repeatedly ask me to address it and there is one unfailing truth; the only way 
to address it is to add unfairness to the design. So why don't I? Because the 
simple fact is that any unfairness no matter how carefully administered or 
metered will always have cases where it's wrong. Look at the title of this 
email for example - it's yet another exploit for the mainline sleep/run 
mechanism. This does _not_ mean I'm implying people are logging into servers 
and running ./tenp to hang the machine. What it demonstrates is a way of 
reproducing the scenario which is biting people with real world loads. It's 
entirely believable that a simple p2p app could be behaving like tenp, only 
generating a small load and it could take ages to log in and use the console. 
Willy has complained this is why people stick to 2.4. Sure I can create 
interactivity tweaks worse than anyone else. I will not, though, because that 
precisely undoes what is special about SD. It never looks backwards, and is 
predictable to absurdity. So you'll argue that mainline can manage it 
below...
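
To make the arithmetic of that slowdown concrete (this is just the
consequence of fair sharing, not anything SD-specific): an ideally fair
scheduler gives each of N runnable tasks 1/N of the cpu, so per-task
throughput halves with every doubling of load.

#include <stdio.h>

int main(void)
{
	int n;

	/* ideal fair share of one cpu as runnable tasks pile up */
	for (n = 1; n <= 16; n *= 2)
		printf("%2d runnable tasks: %5.1f%% cpu each\n",
		       n, 100.0 / n);
	return 0;
}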

> > You'll end up with an increasingly complex state machine design of
> > interactivity tweaks and interactivity throttlers all fighting each
> > other to the point where the interactivity estimator doesn't do
> > anything. [...]
>
> It comes down to defining interactivity by scheduling behavior, and
> making that definition flexible. SD's definition of interactivity is
> rigid (but it's still behavior-based, so not fundamentally different
> from an explicit 'interactivity estimator'), and currently it does not
> work well under high load. But ... i'm still entertaining the notion
> that it might be good enough, but you've got to demonstrate the design's
> flexibility.

I have yet to see someone find an "exploit" for SD's current design. Mainline 
is all about continually patching up the intrinsic design (and fixing this 
one test case is not the be all and end all).

> furthermore, your description does not match my experience when using
> Mike's tweaks and comparing it to SD on the same hardware. According to
> your claim i should have seen regressions popping up in various,
> already-fixed corners, but it didnt happen in practice. But ... i'm
> awaiting further SD and Mike tweaks, the race certainly looks
> interesting ;)

Well you see a race. I do not. I see a flat predictable performance from SD 
where there will always be slowdown with load. I have no intention of 
changing that. Mike is making an admirable attempt to fix issues as they are 
pointed out. You say there are no regressions but I see absolutely no testers 
of his patches besides himself and you. If I introduce any unfairness based 
on sleep behaviour into SD I'll be undoing the whole point of the design and 
end up chasing new regressions. So I won't quibble over the numbers. SD has 
produced a lot of improvements and fairness that mainline struggles with ever 
increasing patches to emulate, but SD does so at the expense of proportional 
slowdown with load. At least I accept that and will no longer put my health 
at risk trying to "fix" it by "breaking" it. SD is done.

I feel sorry for the many users out there who are simply "waiting for it to 
end up in mainline" who just discovered you will veto it on that basis. 
lwn.net had it wrong; this was far more painful than any previous attempt to 
get anything into mainline.

My health has been so badly affected by this that I've been given an ultimatum 
and must turn my computer off till I get well, which may take weeks. I 
already know the massive flameage and last-word comments that are likely to 
be fired off before the inevitable decision to veto it.

> 	Ingo

さようなら (goodbye)

-- 
-ck

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-07  6:50                         ` Con Kolivas
@ 2007-04-07 16:12                           ` Gene Heskett
  2007-04-07 18:08                             ` Ingo Molnar
  2007-04-07 16:32                           ` Mike Galbraith
  2007-04-08 13:08                           ` Ed Tomlinson
  2 siblings, 1 reply; 92+ messages in thread
From: Gene Heskett @ 2007-04-07 16:12 UTC (permalink / raw)
  To: linux-kernel
  Cc: Con Kolivas, Ingo Molnar, Mike Galbraith, Andrew Morton, ck list

On Saturday 07 April 2007, Con Kolivas wrote:
>On Friday 06 April 2007 20:03, Ingo Molnar wrote:
>> * Con Kolivas <kernel@kolivas.org> wrote:
>[...]
>> 
>> firstly, testing on various workloads Mike's tweaks work pretty well,
>> while SD still doesnt handle the high-load case all that well. Note
>> that it was you who raised this whole issue to begin with: everything
>> was pretty quiet in scheduling interactivity land.

Con was scratching an itch, one we desktop users all have in a place we 
can't quite reach to scratch because we aren't quite the coding gods we 
should be.  Con at least has the coding knowledge to walk in and start 
shoveling, which is more than the efforts to derail the SD scheduler 
have demonstrated to this user.

>I'm terribly sorry but you have completely missed my intentions then. I
> was _not_ trying to improve mainline's interactivity at all. My desire
> was to fix the unfairness that mainline has, across the board without
> compromising fairness. You said yourself that an approach that fixed a
> lot and had a small number of regressions would be worth it. In a
> surprisingly ironic turnaround two bizarre things happened. People
> found SD fixed a lot of their interactivity corner cases which were
> showstoppers. That didn't surprise me because any unfair design will by
> its nature get it wrong sometimes. The even _more_ surprising thing is
> that you're now using interactivity as the argument against SD. I did
> not set out to create better interactivity, I set out to create
> widespread fairness without too much compromise to interactivity. As I
> said from the _very first email_, there would be cases of interactivity
> in mainline that performed better.
>
>> (There was one person who
>> reported wide-scale interactivity regressions against mainline but he
>> didnt answer my followup posts to trace/debug the scenario.)
>
>That was one user. As I mentioned in an earlier thread, the problem with
> email threads on drawn out issues on lkml is that all that people
> remember is the last one creating noise, and that has only been the
> noise from Mike for 2 weeks now. Has everyone forgotten the many many
> users who reported the advantages first up which generated the interest
> in the first place? Why have they stopped reporting? Well the answer is
> obvious; all the signs suggest that SD is slated for mainline. It is on
> the path, Linus has suggested it and now akpm is asking if it's ready
> for 2.6.22. So they figure there is no point testing and replying any
> further. SD is ready for prime time, finalised and does everything I
> intended it to. This is where I have to reveal to them the horrible
> truth. This is no guarantee it will go in. In fact, this one point that
> you (Ingo) go on and on about is not only a quibble, but you will call
> it an absolute showstopper. As maintainer of the cpu scheduler, in its
> current form you will flatly refuse to let it go to mainline, citing the 5%
> of cases where interactivity has regressed. So people will tell me to
> fix it, right?... Read on for this to unfold.

Sorry, this user got quiet to watch the cat fight.  Obviously I should 
have been throwing messages wrapped around rocks (or something).

>> SD has a built-in "interactivity estimator" as well, but hardcoded
>> into its design. SD has its own set of ugly-looking tweaks as well -
>> for example the prio_matrix.
>
>I'm sorry but this is a misrepresentation to me, as I suggested in an
> earlier thread where I disagree about what an interactivity estimator
> is. The idea of fence posts in a clock that are passed as a way of
> metering out earliest-deadline-first in a design is well established.
> The matrix is simply an array designed for O(1) lookups of the fence
> posts. That is not the same as "oh how much have we slept in the last
> $magic_number period and how much extra time should we get for that".
>
>> So it all comes down to 'what interactivity
>> heuristics is enough', and which one is more tweakable. So far i've
>> yet to see SD address the hackbench and make -j interactivity
>> problems/regression for example, while Mike has been busy addressing
>> the 'exploits' reported against mainline.

Who gives a s*** about hackbench or a make -j 200?!  Those are NOT, and 
NEVER WILL BE, REAL WORLD LOADS for the vast majority of us.  For us SD 
Just Worked(TM).

>And BANG there is the bullet you will use against SD from here to
> eternity. SD obeys fairness at all costs. Your interactivity regression
> is that SD causes progressive slowdown with load which by definition is
> fairness. You repeatedly ask me to address it and there is one unfailing
> truth; the only way to address it is to add unfairness to the design.
> So why don't I? Because the simple fact is that any unfairness no
> matter how carefully administered or metered will always have cases
> where it's wrong. Look at the title of this email for example - it's
> yet another exploit for the mainline sleep/run mechanism. This does
> _not_ mean I'm implying people are logging into servers and running
> ./tenp to hang the machine. What it demonstrates is a way of
> reproducing the scenario which is biting people with real world loads.
> It's entirely believable that a simple p2p app could be behaving like
> tenp, only generating a small load and it could take ages to log in and
> use the console. Willy has complained this is why people stick to 2.4.
> Sure I can create interactivity tweaks worse than anyone else. I will
> not, though, because that precisely undoes what is special about SD. It
> never looks backwards, and is predictable to absurdity. So you'll argue
> that mainline can manage it below...
>
>> > You'll end up with an increasingly complex state machine design of
>> > interactivity tweaks and interactivity throttlers all fighting each
>> > other to the point where the interactivity estimator doesn't do
>> > anything. [...]
>>
>> It comes down to defining interactivity by scheduling behavior, and
>> making that definition flexible. SD's definition of interactivity is
>> rigid (but it's still behavior-based, so not fundamentally different
>> from an explicit 'interactivity estimator'), and currently it does not
>> work well under high load. But ... i'm still entertaining the notion
>> that it might be good enough, but you've got to demonstrate the
>> design's flexibility.
>
>I have yet to see someone find an "exploit" for SD's current design.
> Mainline is all about continually patching up the intrinsic design (and
> fixing this one test case is not the be all and end all).
>
>> furthermore, your description does not match my experience when using
>> Mike's tweaks and comparing it to SD on the same hardware. According
>> to your claim i should have seen regressions popping up in various,
>> already-fixed corners, but it didnt happen in practice. But ... i'm
>> awaiting further SD and Mike tweaks, the race certainly looks
>> interesting ;)
>
>Well you see a race. I do not. I see a flat predictable performance from
> SD where there will always be slowdown with load. I have no intention
> of changing that. Mike is making an admirable attempt to fix issues as
> they are pointed out. You say there are no regressions but I see
> absolutely no testers of his patches besides himself and you. If I
> introduce any unfairness based on sleep behaviour into SD I'll be
> undoing the whole point of the design and end up chasing new
> regressions. So I won't quibble over the numbers. SD has produced a lot
> of improvements and fairness that mainline struggles with ever
> increasing patches to emulate, but SD does so at the expense of
> proportional slowdown with load.

To be expected; there are, after all, only so many cpu cycles to go around.  
Here I sit, running 2.6.21-rc6 ATM, and since there is not an SD patch 
that applies cleanly to rc6, I am back to typing half or more of a 
sentence blind while I answer a posting such as this because of x 
starvation while kmail is sorting incoming stuff.

All this while gkrellm, sitting on the right edge of my screen, is showing 
a 0 to 2% cpu usage in its graphic display!  FWIW, it also isn't suffering 
the same display update problems, nor is the system clock down on the 
kickstart bar.  If that isn't prima facie evidence of an unfair scheduler, 
I don't know what is. With the SD patch applied to a working kernel, I've 
pretty well got my machine back and I'm in command again, just as if I 
was running nitros9 on my trs-80 Color Computer while it was compiling a 
program in the background, or back when I was doing all this on an amiga.

Both of these had, by their simplistic designs, schedulers that were fair, 
with (nitr)os9 having the ability to schedule the order that IRQs were 
serviced with a priority setting on a per-IRQ basis.  If Amigados ever 
had the ability to fiddle with the scheduler other than nicing the 
process, it wasn't important enough for me to see if I could tweak it 
because generally it simply worked.

Con's earlier patches worked very well for this desktop user, but as Mike 
kept bitching about "production", (who the hell runs a 'make -j 200' or 
50 while(1)'s in the real world? Certainly not this user, who would like 
to think he has more sense than that) and to heck with the user's 
experience. Con kept trying to satisfy Mike, so the more recent ones 
(that I was able to apply & test since I was also fighting with the 
device-mapper change) weren't quite the night and day difference the 
earlier ones were for a desktop user.

I thought I made enough noise in favor of Con's approach early on, but you 
two got into what can only be described as a cat fight, with Ingo 
apparently siding with Mike, and Con apparently isn't up to that sort of 
thing, either physically or mentally, after the seemingly endless 
criticism from Mike.  And I hate to say it, but Ingo, you weren't a lot 
of help either: the setbatch utility you had me using in my scripts 
didn't work all that well, returning an error which was its process 
number when incorporated into a script, and it still left me without a 
machine for 5 seconds at a time when gzip --best was running in the 
background.  I didn't report all this very noisily because of the 
unrelated amanda problems I was having post 2.6.20.3.  The SD 
patches, generally speaking, brought this lag into the less than 1 second 
range 90% of the time.

Maybe I wasn't able to give the SD vs mainline aspect my full attention 
either; you'll recall that I was, and am, still bouncing around from the 
in-out status of the device-mapper patch.  But now that I know how to 
handle it (as a startup from square one for amanda) and the reason for 
the patch, I am more than willing to tolerate it if it's a one-time thing, 
and the amanda users have been advised.  But now it's been reverted for a 
bit, and we will have to go through this particular bit of hassle again 
later.  I disagree with the reversion of that patch on that basis alone.

> At least I accept that and will no 
> longer put my health at risk trying to "fix" it by "breaking" it. SD is
> done.
>
>I feel sorry for the many users out there who are simply "waiting for it
> to end up in mainline" who just discovered you will veto it on that
> basis. lwn.net had it wrong; this was far more painful than any
> previous attempt to get anything into mainline.
>
>My health has been so badly affected by this that I've been given an
> ultimatum and must turn my computer off till I get well now which may
> be weeks. I already know the massive flameage and last-word comments
> that are likely to be fired off before the inevitable decision to veto
> it.

In this regard, my sympathies, Con.  Do get well, and come back re-armed for this 
battle.  It is a worthwhile battle, and many thanks from this user to you 
for having fought it.  And I apologize for not supporting your efforts a 
lot more vocally.

-- 
Cheers, Gene
"There are four boxes to be used in defense of liberty:
 soap, ballot, jury, and ammo. Please use in that order."
-Ed Howdershelt (Author)
Is this going to involve RAW human ecstasy?

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-07  6:50                         ` Con Kolivas
  2007-04-07 16:12                           ` Gene Heskett
@ 2007-04-07 16:32                           ` Mike Galbraith
  2007-04-08 13:08                           ` Ed Tomlinson
  2 siblings, 0 replies; 92+ messages in thread
From: Mike Galbraith @ 2007-04-07 16:32 UTC (permalink / raw)
  To: Con Kolivas; +Cc: Ingo Molnar, linux list, Andrew Morton, ck list

On Sat, 2007-04-07 at 16:50 +1000, Con Kolivas wrote:
> On Friday 06 April 2007 20:03, Ingo Molnar wrote:

> > (There was one person who 
> > reported wide-scale interactivity regressions against mainline but he
> > didnt answer my followup posts to trace/debug the scenario.)
> 
> That was one user. As I mentioned in an earlier thread, the problem with email 
> threads on drawn out issues on lkml is that all that people remember is the 
> last one creating noise, and that has only been the noise from Mike for 2 
> weeks now.

This doesn't even deserve a reply, so I'll just say "get well soon".

	-Mike


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-07 16:12                           ` Gene Heskett
@ 2007-04-07 18:08                             ` Ingo Molnar
  2007-04-07 18:23                               ` Gene Heskett
  2007-04-07 19:14                               ` Mike Galbraith
  0 siblings, 2 replies; 92+ messages in thread
From: Ingo Molnar @ 2007-04-07 18:08 UTC (permalink / raw)
  To: Gene Heskett
  Cc: linux-kernel, Con Kolivas, Mike Galbraith, Andrew Morton, ck list


* Gene Heskett <gene.heskett@gmail.com> wrote:

> To be expected, there are after all, only so many cpu cycles to go 
> around.  Here I sit, running 2.6.21-rc6 ATM, and since there is not an 
> SD patch that applies cleanly to rc6, I am back to typing half or more 
> of a sentence blind while I answer a posting such as this because of x 
> starvation while kmail is sorting incoming stuff.

it would be really nice to analyze this. Does the latest -rt patch boot 
on your box so that we could trace this regression? (I can send you a 
standalone tracing patch if it doesnt.) IIRC you reported that one of 
the early patches from Mike made your system behave well (but still not 
as good as SD) - it would be nice to try a later patch too.

basically, the current unfairness in the scheduler should be solved, one 
way or another. Good testcases were posted and there's progress.

> (who the hell runs a 'make -j 200' or 50 while(1)'s in the real world?

not many - and i dont think Mike tested any of these - Mike tested 
pretty low make -j values (Mike, can you confirm?).

(I personally routinely run 'make -j 200' build jobs on my box [because
 it's the central server of a build cluster and high parallelism is
 needed to overcome network latencies], but i'm pretty special in that
 regard and i didnt use that workload as a test against any of these
 schedulers.)

	Ingo

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-07 18:08                             ` Ingo Molnar
@ 2007-04-07 18:23                               ` Gene Heskett
  2007-04-07 18:52                                 ` Ingo Molnar
  2007-04-07 19:14                               ` Mike Galbraith
  1 sibling, 1 reply; 92+ messages in thread
From: Gene Heskett @ 2007-04-07 18:23 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Con Kolivas, Mike Galbraith, Andrew Morton, ck list

On Saturday 07 April 2007, Ingo Molnar wrote:
>* Gene Heskett <gene.heskett@gmail.com> wrote:
>> To be expected, there are after all, only so many cpu cycles to go
>> around.  Here I sit, running 2.6.21-rc6 ATM, and since there is not an
>> SD patch that applies cleanly to rc6, I am back to typing half or more
>> of a sentence blind while I answer a posting such as this because of x
>> starvation while kmail is sorting incoming stuff.
>
>it would be really nice to analyze this. Does the latest -rt patch boot
>on your box so that we could trace this regression? (I can send you a
>standalone tracing patch if it doesnt.) IIRC you reported that one of
>the early patches from Mike made your system behave well (but still not
>as good as SD) - it would be nice to try a later patch too.

Yes it would be Ingo, but so far, none of the recent -rt patches has 
booted on this machine, the last one I tried a few days ago failing to 
find /dev/root, whatever the heck that is.

FWIW, I gave up on the rt stuff 6 months or more ago when the regressions 
I was reporting weren't ever acknowledged.  I don't enjoy sitting through 
all these e2fsck's during the reboot just to have things I normally run in 
the background die, like tvtime, sitting there with some news channel 
muttering along in the background.  I was even ignored when I suggested 
it might be a dma problem, which I still think it could be.

Nevertheless, the patch you sent is building as I type, intermittently 
when the screen deigns to update so I can fix the spelling etc.

>basically, the current unfairness in the scheduler should be solved, one
>way or another. Good testcases were posted and there's progress.
>
>> (who the hell runs a 'make -j 200' or 50 while(1)'s in the real world?
>
>not many - and i dont think Mike tested any of these - Mike tested
>pretty low make -j values (Mike, can you confirm?).
>
>(I personally routinely run 'make -j 200' build jobs on my box [because
> it's the central server of a build cluster and high parallelism is
> needed to overcome network latencies], but i'm pretty special in that
> regard and i didnt use that workload as a test against any of these
> schedulers.)

And I'd wager a cool one that you don't gain more than a second or so in 
compile time between a make -j8 and a make -j200 unless your network is a 
pair of tomato juice cans & some string.  Again, to me, the network thing 
is not something that's present in an everyday user's environment.  My 
drives are all here and now, on pata-133 interfaces.

>	Ingo

-- 
Cheers, Gene
"There are four boxes to be used in defense of liberty:
 soap, ballot, jury, and ammo. Please use in that order."
-Ed Howdershelt (Author)
If you would keep a secret from an enemy, tell it not to a friend.

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-07 18:23                               ` Gene Heskett
@ 2007-04-07 18:52                                 ` Ingo Molnar
  2007-04-07 20:30                                   ` Gene Heskett
  0 siblings, 1 reply; 92+ messages in thread
From: Ingo Molnar @ 2007-04-07 18:52 UTC (permalink / raw)
  To: Gene Heskett
  Cc: linux-kernel, Con Kolivas, Mike Galbraith, Andrew Morton, ck list


* Gene Heskett <gene.heskett@gmail.com> wrote:

> Yes it would be Ingo, but so far, none of the recent -rt patches has 
> booted on this machine, the last one I tried a few days ago failing to 
> find /dev/root, whatever the heck that is.

did you have a chance to try the yum kernel by any chance? The -testing 
one you can try on Fedora with little hassle, by doing this as root:

cat > /etc/yum.repos.d/rt-testing.repo
[rt-testing]
name=Ingo's Real-Time (-rt) test-kernel for FC6
baseurl=http://people.redhat.com/mingo/realtime-preempt/yum-testing/yum/
enabled=1
gpgcheck=0
<Ctrl-D>

and "yum install kernel-rt" and a reboot should get you going.

> [...]  I don't enjoy sitting through all these e2fsk's during the 
> reboot just to have things I normally run in the background die, like 
> tvtime, sitting there with some news channel muttering along in the 
> background.  I was even ignored when I suggested it might be a dma 
> problem, which I still think it could be.

i did spend quite some time to debug your tv-tuner problem back then, 
and for that purpose alone i bought a tv tuner card to test this myself. 
(but it worked on my testbox)

	Ingo

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-07 18:08                             ` Ingo Molnar
  2007-04-07 18:23                               ` Gene Heskett
@ 2007-04-07 19:14                               ` Mike Galbraith
  2007-04-07 20:31                                 ` Gene Heskett
  2007-04-09 17:51                                 ` William Lee Irwin III
  1 sibling, 2 replies; 92+ messages in thread
From: Mike Galbraith @ 2007-04-07 19:14 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Gene Heskett, linux-kernel, Con Kolivas, Andrew Morton, ck list

On Sat, 2007-04-07 at 20:08 +0200, Ingo Molnar wrote:
> * Gene Heskett <gene.heskett@gmail.com> wrote:

> > (who the hell runs a 'make -j 200' or 50 while(1)'s in the real world?
> 
> not many - and i dont think Mike tested any of these - Mike tested 
> pretty low make -j values (Mike, can you confirm?).

Yes.  I don't test anything more than make -j5 when looking at
interactivity, and make -j nr_cpus+1 is my must-have yardstick.

	-Mike


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-07 18:52                                 ` Ingo Molnar
@ 2007-04-07 20:30                                   ` Gene Heskett
  2007-04-08 10:41                                     ` Ingo Molnar
  0 siblings, 1 reply; 92+ messages in thread
From: Gene Heskett @ 2007-04-07 20:30 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Con Kolivas, Mike Galbraith, Andrew Morton, ck list

On Saturday 07 April 2007, Ingo Molnar wrote:
>* Gene Heskett <gene.heskett@gmail.com> wrote:
>> Yes it would be Ingo, but so far, none of the recent -rt patches has
>> booted on this machine, the last one I tried a few days ago failing to
>> find /dev/root, whatever the heck that is.
>
>did you have a chance to try the yum kernel by any chance? The -testing
>one you can try on Fedora with little hassle, by doing this as root:
>
>cat > /etc/yum.repos.d/rt-testing.repo
>[rt-testing]
>name=Ingo's Real-Time (-rt) test-kernel for FC6
>baseurl=http://people.redhat.com/mingo/realtime-preempt/yum-testing/yum/
>enabled=1
>gpgcheck=0
><Ctrl-D>
>
>and "yum install kernel-rt" and a reboot should get you going.

No, I couldn't seem to get that to show up in a yumex display, and I'm 
partial to smart anyway.

>> [...]  I don't enjoy sitting through all these e2fsk's during the
>> reboot just to have things I normally run in the background die, like
>> tvtime, sitting there with some news channel muttering along in the
>> background.  I was even ignored when I suggested it might be a dma
>> problem, which I still think it could be.
>
>i did spend quite some time to debug your tv-tuner problem back then,
>and for that purpose alone i bought a tv tuner card to test this myself.
>(but it worked on my testbox)
>
>	Ingo
You didn't tell me this.

That said, I am booted to the patch you sent me now, and this also is a 
very obvious improvement, one I could easily live with on a long term 
basis.  I haven't tried a kernel build in the background yet, but I have 
sat here and played patience for about an hour, looking for the little 
stutters, but never saw them.  So I could just as easily recommend this 
one for desktop use, it seems to be working.  tvtime hasn't had any audio 
or video glitches that I've noted when I was on that screen to check on 
an interesting story, like the 102 year old lady who finally got her hole 
in one, on a very short hole, but after 90 years of golfing, she was 
beginning to wonder if she would ever get one.  Not sure who bought at 
the 19th hole, HNN didn't cover that traditional part.

So this patch also works.  And if it gets into mainline, at least Con's 
efforts at prodding the fixes needed will not have been in vain.

My question then, is why did it take a very public cat-fight to get this 
looked at and the code adjusted?  Its been what, nearly 2 years since 
Linus himself made a comment that this thing needed fixed.  The fixes 
then done were of very little actual effectiveness and the situation then 
has gradually deteriorated since.

It's on the desktop that linux will win or lose the public's market share.  
After all, there are only so many 'servers' on the planet, a market in which 
linux has pretty well demoed its superiority, if not in terms of speed, 
at least in security.

To qualify that, I currently have 2 of yahoo's machines in 
my .procmailrc's /dev/null list as they are a source of a large number of 
little 1 to 3 line spams.  I assume they are IIS machines, but the emails' 
headers aren't that explicit to my relatively untrained eyeballs.

And I'd like to see korea put on a permanent rbl black hole.  I'm less 
than amused at watching the log coming out of my router as first one 
shithead and then the next makes a 100,000 word dictionary attack against 
it.  One has even found a way too cause a tcp reset about every 10 words 
tried.  But nobody has gotten any farther than that.  That knocking 
sound?  Guess.

-- 
Cheers, Gene
"There are four boxes to be used in defense of liberty:
 soap, ballot, jury, and ammo. Please use in that order."
-Ed Howdershelt (Author)
You are magnetic in your bearing.

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-07 19:14                               ` Mike Galbraith
@ 2007-04-07 20:31                                 ` Gene Heskett
  2007-04-09 17:51                                 ` William Lee Irwin III
  1 sibling, 0 replies; 92+ messages in thread
From: Gene Heskett @ 2007-04-07 20:31 UTC (permalink / raw)
  To: linux-kernel
  Cc: Mike Galbraith, Ingo Molnar, Con Kolivas, Andrew Morton, ck list

On Saturday 07 April 2007, Mike Galbraith wrote:
>On Sat, 2007-04-07 at 20:08 +0200, Ingo Molnar wrote:
>> * Gene Heskett <gene.heskett@gmail.com> wrote:
>> > (who the hell runs a 'make -j 200' or 50 while(1)'s in the real
>> > world?
>>
>> not many - and i dont think Mike tested any of these - Mike tested
>> pretty low make -j values (Mike, can you confirm?).
>
>Yes.  I don't test anything more than make -j5 when looking at
>interactivity, and make -j nr_cpus+1 is my must have yardstick.
>
>	-Mike

Somebody made that remark, maybe not you, and maybe they were being funny, 
but I didn't, at the time, see any smileys.

-- 
Cheers, Gene
"There are four boxes to be used in defense of liberty:
 soap, ballot, jury, and ammo. Please use in that order."
-Ed Howdershelt (Author)
Please remain calm, it's no use both of us being hysterical at the same 
time.

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-07 20:30                                   ` Gene Heskett
@ 2007-04-08 10:41                                     ` Ingo Molnar
  2007-04-08 10:58                                       ` Ingo Molnar
                                                         ` (2 more replies)
  0 siblings, 3 replies; 92+ messages in thread
From: Ingo Molnar @ 2007-04-08 10:41 UTC (permalink / raw)
  To: Gene Heskett
  Cc: linux-kernel, Con Kolivas, Mike Galbraith, Andrew Morton, ck list


* Gene Heskett <gene.heskett@gmail.com> wrote:

> That said, I am booted to the patch you sent me now, and this also is 
> a very obvious improvement, one I could easily live with on a long 
> term basis.  I haven't tried a kernel build in the background yet, but 
> I have sat here and played patience for about an hour, looking for the 
> little stutters, but never saw them.  So I could just as easily 
> recommend this one for desktop use, it seems to be working.  tvtime 
> hasn't had any audio or video glitches that I've noted when I was on 
> that screen to check on an interesting story, like the 102 year old 
> lady who finally got her hole in one, on a very short hole, but after 
> 90 years of golfing, she was beginning to wonder if she would ever get 
> one.  Not sure who bought at the 19th hole, HNN didn't cover that 
> traditional part.
> 
> So this patch also works.  And if it gets into mainline, at least 
> Con's efforts at prodding the fixes needed will not have been in vain.

thanks for testing it! (for the record, Gene tested sched-mike-4.patch, 
which is Mike's patch from 4 days ago.)

> My question then, is why did it take a very public cat-fight to get 
> this looked at and the code adjusted?  Its been what, nearly 2 years 
> since Linus himself made a comment that this thing needed fixed.  The 
> fixes then done were of very little actual effectiveness and the 
> situation then has gradually deteriorated since.

this is pretty hard to get right, and the most objective way to change 
it is to do it testcase-driven. FYI, interactivity tweaking has been 
gradual, the last bigger round of interactivity changes were done a year 
ago:

 commit 5ce74abe788a26698876e66b9c9ce7e7acc25413
 Author: Mike Galbraith <efault@gmx.de>
 Date:   Mon Apr 10 22:52:44 2006 -0700

     [PATCH] sched: fix interactive task starvation

(and a few smaller tweaks since then too.)

and that change from Mike responded to a testcase. Mike's latest changes 
(the ones you just tested) were mostly driven by actual testcases too, 
which measured long-term timeslice distribution fairness.

It's really hard to judge interactivity subjectively, so we rely on 
things like interbench (written by Con) - in which testsuite the 
upstream scheduler didnt fare all that badly, plus other testcases 
(thud.c, game_sim.c, now massive_inter.c, fiftyp.c and chew.c) and all 
the usual test-workloads. This is admittedly a slow process, but it 
seems to be working too and it also ensures that we dont regress in the 
future. (because testcases stick around and do get re-tested)
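
As a sketch of what such a fairness testcase boils down to (an
illustration, not one of the programs named above): fork a few
identical cpu hogs, let them compete for a few seconds, and have each
report the cpu time it actually received; under a fair scheduler the
figures should come out nearly equal.

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <signal.h>
#include <sys/wait.h>
#include <sys/resource.h>

#define HOGS		4
#define RUN_SECS	5

static void report(int sig)
{
	struct rusage ru;

	(void)sig;
	getrusage(RUSAGE_SELF, &ru);
	/* printf in a handler is fine for a throwaway test program */
	printf("pid %d received %ld.%02ld seconds of cpu\n", (int)getpid(),
	       (long)ru.ru_utime.tv_sec, (long)ru.ru_utime.tv_usec / 10000);
	exit(0);
}

int main(void)
{
	int i;

	for (i = 0; i < HOGS; i++) {
		if (!fork()) {
			signal(SIGALRM, report);
			alarm(RUN_SECS);
			for (;;)	/* pure cpu hog until the alarm */
				asm volatile("" : : : "memory");
		}
	}
	for (i = 0; i < HOGS; i++)
		wait(NULL);
	return 0;
}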

your system seems to also be a bit special because you 1) drive it to 
the absolute max on the desktop but you do not overload it in obvious 
ways (i.e. your workloads are pretty fairly structured) 2) it's a bit 
under-powered (single-CPU 800 MHz CPU, right?) but not _too_ 
underpowered - so i think you /just/ managed to hit 'the worst' of the 
current interactivity estimator: with important tasks both being just 
above and just below 50%. Believe me, on all ~10 systems i use 
regularly, Linux interactivity of the vanilla scheduler is stellar. (And 
that includes a really old 500 MHz one too with FC6 on it.)

	Ingo

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-08 10:41                                     ` Ingo Molnar
@ 2007-04-08 10:58                                       ` Ingo Molnar
  2007-04-08 17:04                                         ` Gene Heskett
  2007-04-08 11:33                                       ` Gene Heskett
  2007-04-08 18:51                                       ` Rene Herman
  2 siblings, 1 reply; 92+ messages in thread
From: Ingo Molnar @ 2007-04-08 10:58 UTC (permalink / raw)
  To: Gene Heskett
  Cc: linux-kernel, Con Kolivas, Mike Galbraith, Andrew Morton, ck list


* Ingo Molnar <mingo@elte.hu> wrote:

> > My question then, is why did it take a very public cat-fight to get 
> > this looked at and the code adjusted?  Its been what, nearly 2 years 
> > since Linus himself made a comment that this thing needed fixed.  
> > The fixes then done were of very little actual effectiveness and the 
> > situation then has gradually deteriorated since.
> 
> this is pretty hard to get right, and the most objective way to change 
> it is to do it testcase-driven. FYI, interactivity tweaking has been 
> gradual, the last bigger round of interactivity changes were done a 
> year ago:

and note that a year ago Mike did a larger patch too, not unlike his 
current patch - but we hoped that his smaller change would be sufficient 
- and nobody came along and said "i tested Mike's and the difference is 
significant on my system". Which seems to suggest that the number of 
problem-systems and worried users/developers isnt particularly large.

	Ingo

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-08 10:41                                     ` Ingo Molnar
  2007-04-08 10:58                                       ` Ingo Molnar
@ 2007-04-08 11:33                                       ` Gene Heskett
  2007-04-08 11:40                                         ` Mike Galbraith
  2007-04-08 18:51                                       ` Rene Herman
  2 siblings, 1 reply; 92+ messages in thread
From: Gene Heskett @ 2007-04-08 11:33 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Con Kolivas, Mike Galbraith, Andrew Morton, ck list

On Sunday 08 April 2007, Ingo Molnar wrote:
>* Gene Heskett <gene.heskett@gmail.com> wrote:
>> That said, I am booted to the patch you sent me now, and this also is
>> a very obvious improvement, one I could easily live with on a long
>> term basis.  I haven't tried a kernel build in the background yet, but
>> I have sat here and played patience for about an hour, looking for the
>> little stutters, but never saw them.  So I could just as easily
>> recommend this one for desktop use, it seems to be working.  tvtime
>> hasn't had any audio or video glitches that I've noted when I was on
>> that screen to check on an interesting story, like the 102 year old
>> lady who finally got her hole in one, on a very short hole, but after
>> 90 years of golfing, she was beginning to wonder if she would ever get
>> one.  Not sure who bought at the 19th hole, HNN didn't cover that
>> traditional part.
>>
>> So this patch also works.  And if it gets into mainline, at least
>> Con's efforts at prodding the fixes needed will not have been in vain.
>
>thanks for testing it! (for the record, Gene tested sched-mike-4.patch,
>which is Mike's patch from 4 days ago.)
>
>> My question then, is why did it take a very public cat-fight to get
>> this looked at and the code adjusted?  Its been what, nearly 2 years
>> since Linus himself made a comment that this thing needed fixed.  The
>> fixes then done were of very little actual effectiveness and the
>> situation then has gradually deteriorated since.
>
>this is pretty hard to get right, and the most objective way to change
>it is to do it testcase-driven. FYI, interactivity tweaking has been
>gradual, the last bigger round of interactivity changes were done a year
>ago:
>
> commit 5ce74abe788a26698876e66b9c9ce7e7acc25413
> Author: Mike Galbraith <efault@gmx.de>
> Date:   Mon Apr 10 22:52:44 2006 -0700
>
>     [PATCH] sched: fix interactive task starvation
>
>(and a few smaller tweaks since then too.)
>
>and that change from Mike responded to a testcase. Mike's latest changes
>(the ones you just tested) were mostly driven by actual testcases too,
>which measured long-term timeslice distribution fairness.
>
>It's really hard to judge interactivity subjectively, so we rely on
>things like interbench (written by Con) - in which testsuite the
>upstream scheduler didnt fare all that badly, plus other testcases
>(thud.c, game_sim.c, now massive_inter.c, fiftyp.c and chew.c) and all
>the usual test-workloads. This is admittedly a slow process, but it
>seems to be working too and it also ensures that we dont regress in the
>future. (because testcases stick around and do get re-tested)
>
>your system seems to also be a bit special because you 1) drive it to
>the absolute max on the desktop but you do not overload it in obvious
>ways (i.e. your workloads are pretty fairly structured) 2) it's a bit
>under-powered (single-CPU 800 MHz CPU, right?) but not _too_
>underpowered - so i think you /just/ managed to hit 'the worst' of the
>current interactivity estimator: with important tasks both being just
>above and just below 50%. Believe me, on all ~10 systems i use
>regularly, Linux interactivity of the vanilla scheduler is stellar. (And
>that includes a really old 500 MHz one too with FC6 on it.)

Actually, it's an XP2800 Athlon, 333 fsb, a gig of memory.  And I was all 
enthusiastic about this until amanda's nightly run started, at which 
point I started losing control for quite long periods, 30+ seconds at a 
time.  Up till then I thought we had it made.  In this regard, Con's 
patches were enough better to notice right away; lags were 1-2 seconds 
max.

That seems to be the killer load here; building a kernel (make -j3) 
doesn't seem to lag it all that badly.  One session of gzip --best makes it 
fall plumb over though, which was a disappointment.

But, I could live with this.

Now if only I could figure out a way to nail dm_mod down to a fixed 
LANANA-approved address.  I just got bit again: enabling pktcdvd caused a 
MAJOR switch, only from 253 to 252, but tar thinks the whole 45GB is all 
new again.  So since dm_mod no longer carries the experimental 
label, let's put that patch back in and be done with this particular 
hassle once and for all.  If I had known that using LVM2 was going to be 
such a pain in the ass just with this item alone, I wouldn't have touched 
it with a 50 foot fiberglass pole.  Or does this SOB affect normal 
partition mountings too?  I don't know, and the suggested fixes from 
David Dillow that I put in /etc/modprobe.conf are ignored for dm_mod and, 
when extended to pktcdvd, cause pktcdvd to fail totally.

Mmm??, can I pass an 'option dm_mod major=238' as a kernel argument & make 
it work that way?  This is extremely frustrating as it is now.

>	Ingo

-- 
Cheers, Gene
"There are four boxes to be used in defense of liberty:
 soap, ballot, jury, and ammo. Please use in that order."
-Ed Howdershelt (Author)
Real Programmers don't write in PL/I.  PL/I is for programmers who can't
decide whether to write in COBOL or FORTRAN.

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-08 11:33                                       ` Gene Heskett
@ 2007-04-08 11:40                                         ` Mike Galbraith
  2007-04-08 12:02                                           ` Mike Galbraith
  2007-04-08 17:56                                           ` Gene Heskett
  0 siblings, 2 replies; 92+ messages in thread
From: Mike Galbraith @ 2007-04-08 11:40 UTC (permalink / raw)
  To: Gene Heskett
  Cc: linux-kernel, Ingo Molnar, Con Kolivas, Andrew Morton, ck list

On Sun, 2007-04-08 at 07:33 -0400, Gene Heskett wrote:

> That seems to be the killer loading here; building a kernel (make -j3) 
> doesn't seem to lag it all that badly.  One session of gzip --best makes it 
> fall plumb over though, which was a disappointment.

Can you make a testcase that doesn't require amanda?

	-Mike


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-08 11:40                                         ` Mike Galbraith
@ 2007-04-08 12:02                                           ` Mike Galbraith
  2007-04-08 17:57                                             ` Gene Heskett
  2007-04-08 17:56                                           ` Gene Heskett
  1 sibling, 1 reply; 92+ messages in thread
From: Mike Galbraith @ 2007-04-08 12:02 UTC (permalink / raw)
  To: Gene Heskett
  Cc: linux-kernel, Ingo Molnar, Con Kolivas, Andrew Morton, ck list

On Sun, 2007-04-08 at 13:40 +0200, Mike Galbraith wrote:
> On Sun, 2007-04-08 at 07:33 -0400, Gene Heskett wrote:
> 
> > That seems to be the killer loading here; building a kernel (make -j3) 
> > doesn't seem to lag it all that badly.  One session of gzip --best makes it 
> > fall plumb over though, which was a disappointment.
> 
> Can you make a testcase that doesn't require amanda?

Or at least send me a couple of 5 or 10 second top snapshots (which also
show CPU usage of sleeping tasks) while the system is misbehaving?

	-Mike


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-07  6:50                         ` Con Kolivas
  2007-04-07 16:12                           ` Gene Heskett
  2007-04-07 16:32                           ` Mike Galbraith
@ 2007-04-08 13:08                           ` Ed Tomlinson
  2007-04-09  5:38                             ` Mike Galbraith
  2 siblings, 1 reply; 92+ messages in thread
From: Ed Tomlinson @ 2007-04-08 13:08 UTC (permalink / raw)
  To: Con Kolivas
  Cc: Ingo Molnar, Mike Galbraith, linux list, Andrew Morton, ck list

Hi,

I am one of those who have been happily testing Con's patches.  

They work better than mainline here.

There seems to be a disconnect on what Con is trying to achieve with SD.
They do not improve interactivity per se.  Instead they make the scheduler 
predictable by removing the alchemy used by the interactivity estimator.   
Mike's patches may be better alchemy, but they continue down the same 
path - from prior experience we can say with fairly good confidence that
there will be new corner cases that trigger problems.

With SD, if you ask too much of the machine it slows down.  You can fix this,
if required, by renicing some tasks - or by reducing the load on the box.

If one really needs some sort of interactivity booster (I do not with SD), why
not move it into user space?  With SD it would be simple enough to export
some info on estimated latency.  With this, user space could make a good
attempt to keep latency within bounds for a set of tasks just by renicing.... 

Thanks
Ed Tomlinson

PS.  Get well soon Con.

On Saturday 07 April 2007 02:50, Con Kolivas wrote:
> On Friday 06 April 2007 20:03, Ingo Molnar wrote:
> > * Con Kolivas <kernel@kolivas.org> wrote:
> > > > I was more focused on the general case, but all I should have to do
> > > > to de-claw all of these sleep exploits is account rr time (only a
> > > > couple of lines, done and building now).
> > >
> > > The more you try to "de-claw" these sleep exploits the less effective
> > > you make your precious interactive estimator. Feel free to keep adding
> > > endless tweaks to undo the other tweaks in order to try and achieve
> > > what SD has by design.
> >
> > firstly, testing on various workloads, Mike's tweaks work pretty well,
> > while SD still doesn't handle the high-load case all that well. Note that
> > it was you who raised this whole issue to begin with: everything was
> > pretty quiet in scheduling interactivity land.
> 
> I'm terribly sorry, but you have completely missed my intentions then. I was 
> _not_ trying to improve mainline's interactivity at all. My desire was to fix 
> the unfairness that mainline has, across the board, without too much 
> compromise to interactivity. You said yourself that an approach that fixed a 
> lot and had a small 
> number of regressions would be worth it. In a surprisingly ironic turnaround 
> two bizarre things happened. People found SD fixed a lot of their 
> interactivity corner cases which were showstoppers. That didn't surprise me 
> because any unfair design will by its nature get it wrong sometimes. The even 
> _more_ surprising thing is that you're now using interactivity as the 
> argument against SD. I did not set out to create better interactivity, I set 
> out to create widespread fairness without too much compromise to 
> interactivity. As I said from the _very first email_, there would be cases of 
> interactivity in mainline that performed better.
> 
> > (There was one person who 
> > reported wide-scale interactivity regressions against mainline but he
> > didnt answer my followup posts to trace/debug the scenario.)
> 
> That was one user. As I mentioned in an earlier thread, the problem with email 
> threads on drawn-out issues on lkml is that all that people remember is the 
> last one creating noise, and that has only been the noise from Mike for 2 
> weeks now. Has everyone forgotten the many, many users who reported the 
> advantages first up which generated the interest in the first place? Why have 
> they stopped reporting? Well the answer is obvious; all the signs suggest 
> that SD is slated for mainline. It is on the path, Linus has suggested it and 
> now akpm is asking if it's ready for 2.6.22. So they figure there is no point 
> testing and replying any further. SD is ready for prime time, finalised and 
> does everything I intended it to. This is where I have to reveal to them the 
> horrible truth: there is no guarantee it will go in. In fact, this one point 
> that you (Ingo) go on and on about is not only a quibble, but you will call 
> it an absolute showstopper. As maintainer of the cpu scheduler, in its 
> current form you will flatly refuse it goes to mainline citing the 5% of 
> cases where interactivity has regressed. So people will tell me to fix it, 
> right?... Read on for this to unfold.
> 
> > SD has a built-in "interactivity estimator" as well, but hardcoded into
> > its design. SD has its own set of ugly-looking tweaks as well - for
> > example the prio_matrix.
> 
> I'm sorry, but this is a misrepresentation to me, as I suggested in an earlier 
> thread where I disagree about what an interactivity estimator is. The idea of 
> fence posts in a clock that are passed as a way of metering out 
> earliest-deadline-first in a design is well established. The matrix is simply 
> an array designed for O(1) lookups of the fence posts. That is not the same 
> as "oh how much have we slept in the last $magic_number period and how much 
> extra time should we get for that".
> 
> > So it all comes down to 'what interactivity 
> > heuristic is enough', and which one is more tweakable. So far i've yet
> > to see SD address the hackbench and make -j interactivity
> > problems/regressions, for example, while Mike has been busy addressing the
> > 'exploits' reported against mainline.
> 
> And BANG there is the bullet you will use against SD from here to eternity. SD 
> obeys fairness at all costs. Your interactivity regression is that SD causes 
> progressive slowdown with load, which by definition is fairness. You 
> repeatedly ask me to address it, and there is one unfailing truth: the only 
> way to address it is to add unfairness to the design. So why don't I? Because the 
> simple fact is that any unfairness no matter how carefully administered or 
> metered will always have cases where it's wrong. Look at the title of this 
> email for example - it's yet another exploit for the mainline sleep/run 
> mechanism. This does _not_ mean I'm implying people are logging into servers 
> and running ./tenp to hang the machine. What it demonstrates is a way of 
> reproducing the scenario which is biting people with real world loads. It's 
> entirely believable that a simple p2p app could be behaving like tenp, only 
> generating a small load and it could take ages to log in and use the console. 
> Willy has complained this is why people stick to 2.4. Sure I can create 
> interactivity tweaks worse than anyone else. I will not, though, because that 
> precisely undoes what is special about SD. It never looks backwards, and is 
> predictable to absurdity. So you'll argue that mainline can manage it 
> below...
> 
> > > You'll end up with an increasingly complex state-machine design of
> > > interactivity tweaks and interactivity throttlers all fighting each
> > > other to the point where the interactivity estimator doesn't do
> > > anything. [...]
> >
> > It comes down to defining interactivity by scheduling behavior, and
> > making that definition flexible. SD's definition of interactivity is
> > rigid (but it's still behavior-based, so not fundamentally different
> > from an explicit 'interactivity estimator'), and currently it does not
> > work well under high load. But ... i'm still entertaining the notion
> > that it might be good enough, but you've got to demonstrate the design's
> > flexibility.
> 
> I have yet to see someone find an "exploit" for SD's current design. Mainline 
> is all about continually patching up the intrinsic design (and fixing this 
> one test case is not the be-all and end-all).
> 
> > furthermore, your description does not match my experience when using
> > Mike's tweaks and comparing it to SD on the same hardware. According to
> > your claim i should have seen regressions popping up in various,
> already-fixed corners, but it didn't happen in practice. But ... i'm
> > awaiting further SD and Mike tweaks, the race certainly looks
> > interesting ;)
> 
> Well, you see a race. I do not. I see flat, predictable performance from SD 
> where there will always be slowdown with load. I have no intention of 
> changing that. Mike is making an admirable attempt to fix issues as they are 
> pointed out. You say there are no regressions but I see absolutely no testers 
> of his patches besides himself and you. If I introduce any unfairness based 
> on sleep behaviour into SD I'll be undoing the whole point of the design and 
> end up chasing new regressions. So I won't quibble over the numbers. SD has 
> produced a lot of improvements and fairness that mainline struggles, with 
> ever-increasing patches, to emulate, but SD does so at the expense of proportional 
> slowdown with load. At least I accept that and will no longer put my health 
> at risk trying to "fix" it by "breaking" it. SD is done.
> 
> I feel sorry for the many users out there who are simply "waiting for it to 
> end up in mainline" who just discovered you will veto it on that basis. 
> lwn.net had it wrong; this was far more painful than any previous attempt to 
> get anything into mainline.
> 
> My health has been so badly affected by this that I've been given an ultimatum 
> and must now turn my computer off till I get well, which may be weeks. I 
> already know the massive flameage and last-word comments that are likely to 
> be fired off before the inevitable decision to veto it.
> 
> > 	Ingo
> 
> さようなら (goodbye)
> 

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-08 10:58                                       ` Ingo Molnar
@ 2007-04-08 17:04                                         ` Gene Heskett
  2007-04-09  4:03                                           ` Mike Galbraith
  0 siblings, 1 reply; 92+ messages in thread
From: Gene Heskett @ 2007-04-08 17:04 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Con Kolivas, Mike Galbraith, Andrew Morton, ck list

On Sunday 08 April 2007, Ingo Molnar wrote:
>* Ingo Molnar <mingo@elte.hu> wrote:
>> > My question, then, is why did it take a very public cat-fight to get
>> > this looked at and the code adjusted?  It's been what, nearly 2 years
>> > since Linus himself made a comment that this thing needed fixing.
>> > The fixes done then were of very little actual effectiveness, and the
>> > situation has gradually deteriorated since.
>>
>> this is pretty hard to get right, and the most objective way to change
>> it is to do it testcase-driven. FYI, interactivity tweaking has been
>> gradual; the last bigger round of interactivity changes was done a
>> year ago:
>
>and note that a year ago Mike did a larger patch too, not unlike his
>current patch - but we hoped that his smaller change would be sufficient
>- and nobody came along and said "i tested Mike's and the difference is
>significant on my system".

May I suggest that while it may have been noticeable, it was 
not 'significant', so we didn't sing praises and bow to Mecca at the 
time.  I just thought that this was the way it was, till Con's patches 
proved otherwise for this 'desktop' user.  We were then, and still are, 
looking for the magic that lets it all load up and slow down in a 
linear-feeling fashion.  Only those IRQs that are fleeting and need 
servicing NOW should be exceptions to that rule.  AFAIAC, gzip can take 
its turn in the queue, getting no more time in proportion than any other 
process that wakes up in its slice and finds it has something to do; if 
it has nothing to do, it should yield the floor immediately, and in any 
event be put back at the far end of the queue when its timeslice is 
over.  gzip in particular seems very reluctant to give up the CPU at 
what should be the end of its timeslice.  As it is, the IRQs are being 
serviced, so no keystrokes are being lost, or very few, unlike the 
situation 2 years ago when whole sentences typed blind went missing when 
X finally did get a chance to play catch-up.

As a desktop user, I fail to understand any good reason why a typed 
keystroke can't be echoed to the screen within 200 milliseconds, 
regardless of how many gzip --best runs amdump may have going in the 
background.

I have a CoCo3, running NitrOS-9 at a CPU clock rate of 1.79 MHz with a 
1/10th second context switch, in the basement that CAN do that while 
assembling an executable with a separate process printing the listing of 
that assembly as it progresses.

Why can't Linux?

>Which seems to suggest that the number of 
>problem systems and worried users/developers isn't particularly large.

Again, may I suggest that this sort of behavior on the desktop is a 
contributing factor to that relative scarcity?

>	Ingo



-- 
Cheers, Gene
"There are four boxes to be used in defense of liberty:
 soap, ballot, jury, and ammo. Please use in that order."
-Ed Howdershelt (Author)
The meek will inherit the earth -- if that's OK with you.

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-08 11:40                                         ` Mike Galbraith
  2007-04-08 12:02                                           ` Mike Galbraith
@ 2007-04-08 17:56                                           ` Gene Heskett
  2007-04-09  4:17                                             ` Mike Galbraith
  1 sibling, 1 reply; 92+ messages in thread
From: Gene Heskett @ 2007-04-08 17:56 UTC (permalink / raw)
  To: linux-kernel
  Cc: Mike Galbraith, Ingo Molnar, Con Kolivas, Andrew Morton, ck list

On Sunday 08 April 2007, Mike Galbraith wrote:
>On Sun, 2007-04-08 at 07:33 -0400, Gene Heskett wrote:
>> That seems to be the killer loading here; building a kernel (make -j3)
>> doesn't seem to lag it all that badly.  One session of gzip --best makes
>> it fall plumb over though, which was a disappointment.
>
>Can you make a testcase that doesn't require amanda?
>
>	-Mike

Sure.  Try 'tar czf nameofarchive.tar.gz /path/to-dir-to-be-backed-up'

Or, from the runtar log from this morning, and this is all one line:

runtar.20070408022016.debug:running: /bin/tar: 'gtar' '--create' '--file' '-' '--directory' '/usr/dlds-rpms' '--one-file-system' '--listed-incremental' '/usr/local/var/amanda/gnutar-lists/coyote_usr_dlds-rpms_1.new' '--sparse' '--ignore-failed-read' '--totals' '--exclude-from' '/tmp/amanda/sendbackup._usr_dlds-rpms.20070408022016.exclude' '.'

and amanda will, if requested, pipe that output through gzip --best, and 
it's this process that brings the machine to the table begging for scraps 
like a puppy.  Tar by itself can be felt but isn't bad.

Even without the --best switch in effect, I'm sure you'll see the machine 
slow considerably.

Please don't try to call amanda an unusual load, as amanda itself is 
nothing but an intelligent manager, constructing the command lines passed 
to tar or dump, and gzip, which do the real work.  Amdump, the manager my 
scripts wrap around, and my scripts themselves, will not use more 
than 0.01% of the CPU when averaged over the whole backup session.

-- 
Cheers, Gene
"There are four boxes to be used in defense of liberty:
 soap, ballot, jury, and ammo. Please use in that order."
-Ed Howdershelt (Author)
We are Microsoft.  What you are experiencing is not a problem; it is an 
undocumented feature.

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-08 12:02                                           ` Mike Galbraith
@ 2007-04-08 17:57                                             ` Gene Heskett
  2007-04-09  4:19                                               ` Mike Galbraith
  0 siblings, 1 reply; 92+ messages in thread
From: Gene Heskett @ 2007-04-08 17:57 UTC (permalink / raw)
  To: linux-kernel
  Cc: Mike Galbraith, Ingo Molnar, Con Kolivas, Andrew Morton, ck list

On Sunday 08 April 2007, Mike Galbraith wrote:
>On Sun, 2007-04-08 at 13:40 +0200, Mike Galbraith wrote:
>> On Sun, 2007-04-08 at 07:33 -0400, Gene Heskett wrote:
>> > That seems to be the killer loading here; building a kernel (make
>> > -j3) doesn't seem to lag it all that badly.  One session of gzip --best
>> > makes it fall plumb over though, which was a disappointment.
>>
>> Can you make a testcase that doesn't require amanda?
>
>Or at least send me a couple of 5 or 10 second top snapshots (which also
>show CPU usage of sleeping tasks) while the system is misbehaving?
>
>	-Mike

With what monitor utility?

-- 
Cheers, Gene
"There are four boxes to be used in defense of liberty:
 soap, ballot, jury, and ammo. Please use in that order."
-Ed Howdershelt (Author)
"Microsoft technology" -- isn't that an oxymoron? 

   -- Gareth Barnard

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-08 10:41                                     ` Ingo Molnar
  2007-04-08 10:58                                       ` Ingo Molnar
  2007-04-08 11:33                                       ` Gene Heskett
@ 2007-04-08 18:51                                       ` Rene Herman
  2007-04-09  4:23                                         ` Mike Galbraith
  2007-04-09 13:53                                         ` Ingo Molnar
  2 siblings, 2 replies; 92+ messages in thread
From: Rene Herman @ 2007-04-08 18:51 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Gene Heskett, linux-kernel, Con Kolivas, Mike Galbraith,
	Andrew Morton, ck list

On 04/08/2007 12:41 PM, Ingo Molnar wrote:

> this is pretty hard to get right, and the most objective way to change 
> it is to do it testcase-driven. FYI, interactivity tweaking has been 
> gradual; the last bigger round of interactivity changes was done a year 
> ago:
> 
>  commit 5ce74abe788a26698876e66b9c9ce7e7acc25413
>  Author: Mike Galbraith <efault@gmx.de>
>  Date:   Mon Apr 10 22:52:44 2006 -0700
> 
>      [PATCH] sched: fix interactive task starvation
> 
> (and a few smaller tweaks since then too.)
> 
> and that change from Mike responded to a testcase. Mike's latest changes 
> (the ones you just tested) were mostly driven by actual testcases too, 
> which measured long-term timeslice distribution fairness.

Ah yes, that one. Here's the next one in that series:

commit f1adad78dd2fc8edaa513e0bde92b4c64340245c
Author: Linus Torvalds <torvalds@g5.osdl.org>
Date:   Sun May 21 18:54:09 2006 -0700

     Revert "[PATCH] sched: fix interactive task starvation"

It personally had me wonder if _anyone_ was testing this stuff...

Rene.


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-08 17:04                                         ` Gene Heskett
@ 2007-04-09  4:03                                           ` Mike Galbraith
  2007-04-09  4:08                                             ` Gene Heskett
  0 siblings, 1 reply; 92+ messages in thread
From: Mike Galbraith @ 2007-04-09  4:03 UTC (permalink / raw)
  To: Gene Heskett
  Cc: linux-kernel, Ingo Molnar, Con Kolivas, Andrew Morton, ck list

On Sun, 2007-04-08 at 13:04 -0400, Gene Heskett wrote:
> On Sunday 08 April 2007, Ingo Molnar wrote:

> >and note that a year ago Mike did a larger patch too, not unlike his
> >current patch - but we hoped that his smaller change would be sufficient
> >- and nobody came along and said "i tested Mike's and the difference is
> >significant on my system".
> 
> May I suggest that while it may have been noticeable, it was 
> not 'significant', so we didn't sing praises and bow to Mecca at the 
> time.

Actually, there was practically nil interest in testing.  We made a
couple of minor adjustments to the interactivity logic, and all went
quiet, so I didn't think it was enough of a problem to require more
intrusive countermeasures.

	-Mike


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-09  4:03                                           ` Mike Galbraith
@ 2007-04-09  4:08                                             ` Gene Heskett
  2007-04-09  5:59                                               ` Mike Galbraith
  0 siblings, 1 reply; 92+ messages in thread
From: Gene Heskett @ 2007-04-09  4:08 UTC (permalink / raw)
  To: linux-kernel
  Cc: Mike Galbraith, Ingo Molnar, Con Kolivas, Andrew Morton, ck list

On Monday 09 April 2007, Mike Galbraith wrote:
>On Sun, 2007-04-08 at 13:04 -0400, Gene Heskett wrote:
>> On Sunday 08 April 2007, Ingo Molnar wrote:
>> >and note that a year ago Mike did a larger patch too, not unlike his
>> >current patch - but we hoped that his smaller change would be
>> > sufficient - and nobody came along and said "i tested Mike's and the
>> > difference is significant on my system".
>>
>> May I suggest that while it may have been noticeable, it was
>> not 'significant', so we didn't sing praises and bow to Mecca at the
>> time.
>
>Actually, there was practically nil interest in testing.  We made a
>couple of minor adjustments to the interactivity logic, and all went
>quiet, so I didn't think it was enough of a problem to require more
>intrusive countermeasures.
>
>	-Mike

Does one of these messages have a URL so I can test the latest of your 
patches for -rc6?  Or was the one Ingo sent the most recent?

Putting that URL in your sig would be nice, and might result in its 
getting a lot more exercise, which should mean more feedback.

-- 
Cheers, Gene
"There are four boxes to be used in defense of liberty:
 soap, ballot, jury, and ammo. Please use in that order."
-Ed Howdershelt (Author)
Got a complaint about the Internal Revenue Service?  
Call the convenient toll-free "IRS Taxpayer Complaint Hot Line Number":

	1-800-AUDITME

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-08 17:56                                           ` Gene Heskett
@ 2007-04-09  4:17                                             ` Mike Galbraith
  2007-04-09  5:16                                               ` Gene Heskett
  0 siblings, 1 reply; 92+ messages in thread
From: Mike Galbraith @ 2007-04-09  4:17 UTC (permalink / raw)
  To: Gene Heskett
  Cc: linux-kernel, Ingo Molnar, Con Kolivas, Andrew Morton, ck list

On Sun, 2007-04-08 at 13:56 -0400, Gene Heskett wrote:
> On Sunday 08 April 2007, Mike Galbraith wrote:
> >On Sun, 2007-04-08 at 07:33 -0400, Gene Heskett wrote:
> >> That seems to be the killer loading here; building a kernel (make -j3)
> >> doesn't seem to lag it all that badly.  One session of gzip --best makes
> >> it fall plumb over though, which was a disappointment.
> >
> >Can you make a testcase that doesn't require amanda?
> >
> >	-Mike
> 
> Sure.  Try 'tar czf nameofarchive.tar.gz /path/to-dir-to-be-backed-up'
> 
> Or, from the runtar log from this morning, and this is all one line:
> 
> runtar.20070408022016.debug:running: /bin/tar: 'gtar' '--create' '--file' '-' '--directory' '/usr/dlds-rpms' '--one-file-system' '--listed-incremental' '/usr/local/var/amanda/gnutar-lists/coyote_usr_dlds-rpms_1.new' '--sparse' '--ignore-failed-read' '--totals' '--exclude-from' '/tmp/amanda/sendbackup._usr_dlds-rpms.20070408022016.exclude' '.'
> 
> and amanda will, if requested, pipe that output through gzip --best, and 
> it's this process that brings the machine to the table begging for scraps 
> like a puppy.  Tar by itself can be felt but isn't bad.

So tar -cvf - / | gzip --best | tar -tvzf - should reproduce the
problem?

	-Mike


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-08 17:57                                             ` Gene Heskett
@ 2007-04-09  4:19                                               ` Mike Galbraith
  2007-04-09  5:23                                                 ` Gene Heskett
  0 siblings, 1 reply; 92+ messages in thread
From: Mike Galbraith @ 2007-04-09  4:19 UTC (permalink / raw)
  To: Gene Heskett
  Cc: linux-kernel, Ingo Molnar, Con Kolivas, Andrew Morton, ck list

On Sun, 2007-04-08 at 13:57 -0400, Gene Heskett wrote:
> On Sunday 08 April 2007, Mike Galbraith wrote:
> >On Sun, 2007-04-08 at 13:40 +0200, Mike Galbraith wrote:
> >> On Sun, 2007-04-08 at 07:33 -0400, Gene Heskett wrote:
> >> > That seems to be the killer loading here; building a kernel (make
> >> > -j3) doesn't seem to lag it all that badly.  One session of gzip --best
> >> > makes it fall plumb over though, which was a
> >> > disappointment.
> >> Can you make a testcase that doesn't require amanda?
> >
> >Or at least send me a couple of 5 or 10 second top snapshots (which also
> >show CPU usage of sleeping tasks) while the system is misbehaving?
> >
> >	-Mike
> 
> With what monitor utility?

Top.

	-Mike


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-08 18:51                                       ` Rene Herman
@ 2007-04-09  4:23                                         ` Mike Galbraith
  2007-04-09 12:14                                           ` Rene Herman
  2007-04-09 13:53                                         ` Ingo Molnar
  1 sibling, 1 reply; 92+ messages in thread
From: Mike Galbraith @ 2007-04-09  4:23 UTC (permalink / raw)
  To: Rene Herman
  Cc: Ingo Molnar, Gene Heskett, linux-kernel, Con Kolivas,
	Andrew Morton, ck list

On Sun, 2007-04-08 at 20:51 +0200, Rene Herman wrote:
> On 04/08/2007 12:41 PM, Ingo Molnar wrote:
> 
> > this is pretty hard to get right, and the most objective way to change 
> > it is to do it testcase-driven. FYI, interactivity tweaking has been 
> > gradual; the last bigger round of interactivity changes was done a year 
> > ago:
> > 
> >  commit 5ce74abe788a26698876e66b9c9ce7e7acc25413
> >  Author: Mike Galbraith <efault@gmx.de>
> >  Date:   Mon Apr 10 22:52:44 2006 -0700
> > 
> >      [PATCH] sched: fix interactive task starvation
> > 
> > (and a few smaller tweaks since then too.)
> > 
> > and that change from Mike responded to a testcase. Mike's latest changes 
> > (the ones you just tested) were mostly driven by actual testcases too, 
> > which measured long-term timeslice distribution fairness.
> 
> Ah yes, that one. Here's the next one in that series:
> 
> commit f1adad78dd2fc8edaa513e0bde92b4c64340245c
> Author: Linus Torvalds <torvalds@g5.osdl.org>
> Date:   Sun May 21 18:54:09 2006 -0700
> 
>      Revert "[PATCH] sched: fix interactive task starvation"
> 
> It personally had me wonder if _anyone_ was testing this stuff...

Well of course not.  Making random untested changes, and reverting them
later is half the fun of kernel development.

	-Mike


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-09  4:17                                             ` Mike Galbraith
@ 2007-04-09  5:16                                               ` Gene Heskett
  2007-04-09  6:06                                                 ` Mike Galbraith
  2007-04-09  8:24                                                 ` Mike Galbraith
  0 siblings, 2 replies; 92+ messages in thread
From: Gene Heskett @ 2007-04-09  5:16 UTC (permalink / raw)
  To: linux-kernel
  Cc: Mike Galbraith, Ingo Molnar, Con Kolivas, Andrew Morton, ck list

On Monday 09 April 2007, Mike Galbraith wrote:
>On Sun, 2007-04-08 at 13:56 -0400, Gene Heskett wrote:
>> On Sunday 08 April 2007, Mike Galbraith wrote:
>> >On Sun, 2007-04-08 at 07:33 -0400, Gene Heskett wrote:
>> >> That seems to be the killer loading here; building a kernel (make
>> >> -j3) doesn't seem to lag it all that badly.  One session of gzip
>> >> --best makes it fall plumb over though, which was a disappointment.
>> >
>> >Can you make a testcase that doesn't require amanda?
>> >
>> >	-Mike
>>
>> Sure.  Try 'tar czf nameofarchive.tar.gz /path/to-dir-to-be-backed-up'
>>
>> Or, from the runtar log from this morning, and this is all one line:
>>
>> runtar.20070408022016.debug:running: /bin/tar: 'gtar' '--create'
>> '--file' '-' '--directory' '/usr/dlds-rpms' '--one-file-system'
>> '--listed-incremental'
>> '/usr/local/var/amanda/gnutar-lists/coyote_usr_dlds-rpms_1.new'
>> '--sparse' '--ignore-failed-read' '--totals' '--exclude-from'
>> '/tmp/amanda/sendbackup._usr_dlds-rpms.20070408022016.exclude' '.'
>>
>> and amanda will, if requested, pipe that output through gzip --best,
>> and it's this process that brings the machine to the table begging for
>> scraps like a puppy.  Tar by itself can be felt but isn't bad.
>
>So tar -cvf - / | gzip --best | tar -tvzf - should reproduce the
>problem?
>
>	-Mike

That looks as if it should demo it pretty well if I understand correctly 
everything you're doing there.

-- 
Cheers, Gene
"There are four boxes to be used in defense of liberty:
 soap, ballot, jury, and ammo. Please use in that order."
-Ed Howdershelt (Author)
In /users3 did Kubla Kahn
A stately pleasure dome decree,
Where /bin, the sacred river ran
Through Test Suites measureless to Man
Down to a sunless C.

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-09  4:19                                               ` Mike Galbraith
@ 2007-04-09  5:23                                                 ` Gene Heskett
  2007-04-09  6:09                                                   ` Mike Galbraith
  0 siblings, 1 reply; 92+ messages in thread
From: Gene Heskett @ 2007-04-09  5:23 UTC (permalink / raw)
  To: linux-kernel
  Cc: Mike Galbraith, Ingo Molnar, Con Kolivas, Andrew Morton, ck list

On Monday 09 April 2007, Mike Galbraith wrote:
>On Sun, 2007-04-08 at 13:57 -0400, Gene Heskett wrote:
>> On Sunday 08 April 2007, Mike Galbraith wrote:
>> >On Sun, 2007-04-08 at 13:40 +0200, Mike Galbraith wrote:
>> >> On Sun, 2007-04-08 at 07:33 -0400, Gene Heskett wrote:
>> >> > That seems to be the killer loading here; building a kernel (make
>> >> > -j3) doesn't seem to lag it all that badly.  One session of gzip
>> >> > --best makes it fall plumb over though, which was a
>> >> > disappointment.
>> >>
>> >> Can you make a testcase that doesn't require amanda?
>> >
>> >Or at least send me a couple of 5 or 10 second top snapshots (which
>> > also show CPU usage of sleeping tasks) while the system is
>> > misbehaving?
>> >
>> >	-Mike
>>
>> With what monitor utility?
>
>Top.
>
>	-Mike

This may not be so informative; it's almost behaving ATM.

29252 amanda    22   0  1856  572  220 R 76.4  0.1   1:07.24 gzip
29235 amanda    15   0  2992 1224  888 S  5.6  0.1   0:02.80 chunker
29500 root      18   0  2996 1164  788 S  4.0  0.1   0:02.40 tar
10459 amanda    15   0  3340 1052  832 S  3.0  0.1   0:49.04 amandad
10536 amanda    15   0  3276 1308 1004 S  2.3  0.1   0:40.92 dumper
29496 amanda    18   0  2808  472  280 S  2.0  0.0   0:01.73 sendbackup
 4057 gkrellmd  15   0 11568 1172  896 S  1.3  0.1   7:45.82 gkrellmd
29498 amanda    18   0  2396  780  656 S  1.0  0.1   0:00.60 tar
19183 root      15   0     0    0    0 S  0.7  0.0   0:01.92 pdflush

I also note with some disdain that I'm half a megabyte into swap, but 
I've had FF-2.0.0.3 busy for the last hour while amanda was trying to 
find a few cycles at the same time.  I've been looking at a bunch of 
PDFs of circuit boards to see if I wanna build them for my milling 
machine.

-- 
Cheers, Gene
"There are four boxes to be used in defense of liberty:
 soap, ballot, jury, and ammo. Please use in that order."
-Ed Howdershelt (Author)
Fatal Error: Found MS-Windows System -> Repartitioning Disk for Linux...

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-08 13:08                           ` Ed Tomlinson
@ 2007-04-09  5:38                             ` Mike Galbraith
  2007-04-09 11:26                               ` Ed Tomlinson
  2007-04-10  2:39                               ` Mike Galbraith
  0 siblings, 2 replies; 92+ messages in thread
From: Mike Galbraith @ 2007-04-09  5:38 UTC (permalink / raw)
  To: Ed Tomlinson; +Cc: Con Kolivas, Ingo Molnar, linux list, Andrew Morton, ck list

On Sun, 2007-04-08 at 09:08 -0400, Ed Tomlinson wrote:
> Hi,
> 
> I am one of those who have been happily testing Con's patches.  
> 
> They work better than mainline here.

(I tried a UP kernel yesterday, and even a single kernel build would
cause noticeable hitches if I moved a window around. YMMV etc.)

> If one really needs some sort of interactivity booster (I do not with SD), why
> not move it into user space?  With SD it would be simple enough to export
> some info on estimated latency.  With this user space could make a good
> attempt to keep latency within bounds for a set of tasks just by renicing.... 

I don't think you can have very much effect on latency using nice with
SD once the CPU is fully utilized.  See below.

/*
 * This contains a bitmap for each dynamic priority level with empty slots
 * for the valid priorities each different nice level can have. It allows
 * us to stagger the slots where differing priorities run in a way that
 * keeps latency differences between different nice levels at a minimum.
 * ie, where 0 means a slot for that priority, priority running from left to
 * right:
 * nice -20 0000000000000000000000000000000000000000
 * nice -10 1001000100100010001001000100010010001000
 * nice   0 0101010101010101010101010101010101010101
 * nice   5 1101011010110101101011010110101101011011
 * nice  10 0110111011011101110110111011101101110111
 * nice  15 0111110111111011111101111101111110111111
 * nice  19 1111111111111111111011111111111111111111
 */

Nice allocates bandwidth, but as long as the CPU is busy, tasks always
proceed downward in priority until they hit the expired array.  That's
the design.  If X gets busy and expires, and a nice 20 CPU hog wakes up
after its previous rotation has ended, but before the current rotation
has ended (ie there is 1 task running at wakeup time), X will take a
guaranteed minimum 160ms latency hit (quite noticeable) independent of
nice level.  The only way to avoid it is to use a realtime class.

A nice -20 task has maximum bandwidth allocated, but that also makes it
a bigger target for preemption from tasks at all nice levels as it
proceeds downward toward expiration.  AFAICT, low latency scheduling
just isn't possible once the CPU becomes 100% utilized, but it is
bounded by runqueue length.  In mainline OTOH, a nice -20 task will
always preempt a nice 0 task, giving it instant gratification, and
latency of lower priority tasks is bounded by the EXPIRED_STARVING(rq)
safety net.
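
Latency of this sort is what chew.c (one of the testcases mentioned
earlier in the thread) makes visible: it spins on the CPU, timestamps
every pass, and logs whenever the gap between two timestamps shows it
was scheduled out.  Here is a minimal sketch of the idea - an
approximation only, not Con's actual chew.c; the 2ms report threshold
and the output format are assumptions modeled on the log excerpt that
appears later in this thread:

/*
 * A chew-style latency probe (a sketch, not Con Kolivas' chew.c):
 * spin, timestamp each pass, and report whenever the gap between
 * two successive timestamps says we were scheduled out.
 */
#include <stdio.h>
#include <sys/time.h>

static long long now_usecs(void)
{
	struct timeval tv;

	gettimeofday(&tv, NULL);
	return tv.tv_sec * 1000000LL + tv.tv_usec;
}

int main(void)
{
	long long run_start = now_usecs(), last = run_start;

	for (;;) {
		long long now = now_usecs();
		long long out = now - last;

		if (out > 2000) {	/* >2ms gap: we were preempted */
			long long ran = last - run_start;

			printf("out for %4lld ms, ran for %4lld ms, load %3lld%%\n",
			       out / 1000, ran / 1000,
			       ran * 100 / (ran + out));
			run_start = now;
		}
		last = now;
	}
	return 0;
}

Pinned to one CPU (e.g. via taskset) under a fiftypercent-style load,
the out-for/ran-for pairs give a direct picture of how each scheduler
treats a task once the machine is saturated.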

	-Mike


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-09  4:08                                             ` Gene Heskett
@ 2007-04-09  5:59                                               ` Mike Galbraith
  2007-04-09 13:01                                                 ` Gene Heskett
  0 siblings, 1 reply; 92+ messages in thread
From: Mike Galbraith @ 2007-04-09  5:59 UTC (permalink / raw)
  To: Gene Heskett
  Cc: linux-kernel, Ingo Molnar, Con Kolivas, Andrew Morton, ck list

On Mon, 2007-04-09 at 00:08 -0400, Gene Heskett wrote:
> On Monday 09 April 2007, Mike Galbraith wrote:
> >
> >
> >Actually, there was practically nil interest in testing.  We made a
> >couple of minor adjustments to the interactivity logic, and all went
> >quiet, so I didn't think it was enough of a problem to require more
> >intrusive countermeasures.
> >
> >	-Mike
> 
> Does one of these messages have a URL so I can test the latest of your 
> patches for -rc6?  Or was the one Ingo sent the most recent?

No, my tree has a bugfix and some other adjustments that try to move the
balance closer to fair without sacrificing interactivity.

> Putting that URL in your sig would be nice, and might result in its 
> getting a lot more exercise, which should mean more feedback.

When I get it cleaned up and better tested, I'll post again.  If you
want, I'll CC you... willing victims are a highly valued commodity :)

	-Mike


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-09  5:16                                               ` Gene Heskett
@ 2007-04-09  6:06                                                 ` Mike Galbraith
  2007-04-09  8:24                                                 ` Mike Galbraith
  1 sibling, 0 replies; 92+ messages in thread
From: Mike Galbraith @ 2007-04-09  6:06 UTC (permalink / raw)
  To: Gene Heskett
  Cc: linux-kernel, Ingo Molnar, Con Kolivas, Andrew Morton, ck list

On Mon, 2007-04-09 at 01:16 -0400, Gene Heskett wrote:
> On Monday 09 April 2007, Mike Galbraith wrote:
> >So tar -cvf - / | gzip --best | tar -tvzf - should reproduce the
> >problem?
> >
> >	-Mike
> 
> That looks as if it should demo it pretty well if I understand correctly 
> everything you're doing there.

Well, I let it process my ~250GB of data with my current tree, and it
looked utterly harmless (and since I'm running SMP, it of course was).
I'll try building UP to make sure, and check mainline as well.

	-Mike


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-09  5:23                                                 ` Gene Heskett
@ 2007-04-09  6:09                                                   ` Mike Galbraith
  0 siblings, 0 replies; 92+ messages in thread
From: Mike Galbraith @ 2007-04-09  6:09 UTC (permalink / raw)
  To: Gene Heskett
  Cc: linux-kernel, Ingo Molnar, Con Kolivas, Andrew Morton, ck list

On Mon, 2007-04-09 at 01:23 -0400, Gene Heskett wrote:

> This may not be so informative; it's almost behaving ATM.
> 
> 29252 amanda    22   0  1856  572  220 R 76.4  0.1   1:07.24 gzip
> 29235 amanda    15   0  2992 1224  888 S  5.6  0.1   0:02.80 chunker
> 29500 root      18   0  2996 1164  788 S  4.0  0.1   0:02.40 tar
> 10459 amanda    15   0  3340 1052  832 S  3.0  0.1   0:49.04 amandad
> 10536 amanda    15   0  3276 1308 1004 S  2.3  0.1   0:40.92 dumper
> 29496 amanda    18   0  2808  472  280 S  2.0  0.0   0:01.73 sendbackup
>  4057 gkrellmd  15   0 11568 1172  896 S  1.3  0.1   7:45.82 gkrellmd
> 29498 amanda    18   0  2396  780  656 S  1.0  0.1   0:00.60 tar
> 19183 root      15   0     0    0    0 S  0.7  0.0   0:01.92 pdflush
> 

Yeah, this is showing the scheduler behaving properly.

	-Mike


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-09  5:16                                               ` Gene Heskett
  2007-04-09  6:06                                                 ` Mike Galbraith
@ 2007-04-09  8:24                                                 ` Mike Galbraith
  1 sibling, 0 replies; 92+ messages in thread
From: Mike Galbraith @ 2007-04-09  8:24 UTC (permalink / raw)
  To: Gene Heskett
  Cc: linux-kernel, Ingo Molnar, Con Kolivas, Andrew Morton, ck list

On Mon, 2007-04-09 at 01:16 -0400, Gene Heskett wrote:
> On Monday 09 April 2007, Mike Galbraith wrote:
> >
> >So tar -cvf - / | gzip --best | tar -tvzf - should reproduce the
> >problem?
> >
> That looks as if it should demo it pretty well if I understand correctly 
> everything you're doing there.

Ok, I can't reproduce any bad interactivity here with that workload,
either with an SMP or UP kernel.  That said, however, gzip does attain
interactive status, which it really should not - that gives it an unfair
advantage over its peers.

With my throttled tree, it gets pushed back down to where it belongs.
I'm going to try to tighten the tolerance on behavior to evict the
riffraff who don't really belong in the elite interactive club sooner,
and guarantee that even fast/light tasks can't dominate the CPU without
paying heavily.

(to close the many-fast/light-tasks wakeup scenario that the "untested"
patch someone mentioned closed, but which was shown to be too painful to bear).

	-Mike


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-09  5:38                             ` Mike Galbraith
@ 2007-04-09 11:26                               ` Ed Tomlinson
  2007-04-09 16:50                                 ` Mike Galbraith
  2007-04-10  2:39                               ` Mike Galbraith
  1 sibling, 1 reply; 92+ messages in thread
From: Ed Tomlinson @ 2007-04-09 11:26 UTC (permalink / raw)
  To: Mike Galbraith
  Cc: Con Kolivas, Ingo Molnar, linux list, Andrew Morton, ck list

On Monday 09 April 2007 01:38, Mike Galbraith wrote:
> On Sun, 2007-04-08 at 09:08 -0400, Ed Tomlinson wrote:
> > Hi,
> > 
> > I am one of those who have been happily testing Con's patches.  
> > 
> > They work better than mainline here.
> 
> (I tried a UP kernel yesterday, and even a single kernel build would
> make noticeable hitches if I move a window around. YMMV etc.)

Interesting.  I run UP amd64, 1000 Hz, 1.25G, preempt off (on causes kernel 
stalls with no messages - but that is another story).  I do not notice a single 
make.  When several are running, the desktop slows down a bit.  I do not have 
X niced.  Wonder why we see such different results? 

I am not saying that SD is perfect - I fully expect that more bugs will turn up
in its code (some will affect mainline too).  I do, however, like the idea of a 
scheduler that does not need alchemy to achieve good results.  Nor do I
necessarily expect it to be 100% transparent.  If one changes something
as basic as the scheduler, some tweaking should be expected.  IMO this
is fine as long as we get consistent results.

> > If one really needs some sort of interactivity booster (I do not with SD), why
> > not move it into user space?  With SD it would be simple enough to export
> > some info on estimated latency.  With this user space could make a good
> > attempt to keep latency within bounds for a set of tasks just by renicing.... 
> 
> I don't think you can have very much effect on latency using nice with
> SD once the CPU is fully utilized.  See below.
> 
> /*
>  * This contains a bitmap for each dynamic priority level with empty slots
>  * for the valid priorities each different nice level can have. It allows
>  * us to stagger the slots where differing priorities run in a way that
>  * keeps latency differences between different nice levels at a minimum.
>  * ie, where 0 means a slot for that priority, priority running from left to
>  * right:
>  * nice -20 0000000000000000000000000000000000000000
>  * nice -10 1001000100100010001001000100010010001000
>  * nice   0 0101010101010101010101010101010101010101
>  * nice   5 1101011010110101101011010110101101011011
>  * nice  10 0110111011011101110110111011101101110111
>  * nice  15 0111110111111011111101111101111110111111
>  * nice  19 1111111111111111111011111111111111111111
>  */
> 
> Nice allocates bandwidth, but as long as the CPU is busy, tasks always
> proceed downward in priority until they hit the expired array.  That's
> the design.  If X gets busy and expires, and a nice 20 CPU hog wakes up
> after it's previous rotation has ended, but before the current rotation
> is ended (ie there is 1 task running at wakeup time), X will take a
> guaranteed minimum 160ms latency hit (quite noticeable) independent of
> nice level.  The only way to avoid it is to use a realtime class.
> 
> A nice -20 task has maximum bandwidth allocated, but that also makes it
> a bigger target for preemption from tasks at all nice levels as it
> proceeds downward toward expiration.  AFAIKT, low latency scheduling
> just isn't possible once the CPU becomes 100% utilized, but it is
> bounded to runqueue length.  In mainline OTOH, a nice -20 task will
> always preempt a nice 0 task, giving it instant gratification, and
> latency of lower priority tasks is bounded by the EXPIRED_STARVING(rq)
> safety net.

Mike, I made no mention of low latency.  I did mention predictable latency.  If
you are 100% utilized and have a nice -20 CPU hog, I would expect it to run, 
and it _should_ affect other tasks - that's why it runs at -20...

This is why I suggest that user space may be a better place to boost interactive
tasks.  A daemon that posted a message telling me that the nice -20 CPU hog
is causing 300ms delays for X would, IMHO, be a good thing.  That same daemon
could then propose a fix, telling me the expected latencies, and let me decide 
if I want to change priorities.  It could also be set to automatically adjust 
nice levels...
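
A back-of-the-envelope sketch of what such a daemon's core loop might
look like.  Everything here is hypothetical: the estimated-latency
export Ed proposes does not exist in SD or mainline, so
read_estimated_latency_ms() below is only a stub standing in for it;
only the renice half uses real interfaces:

/*
 * Sketch of a user-space latency-policing daemon.  Hypothetical: it
 * assumes the kernel exported a per-task estimated-latency figure,
 * which it does not actually do.
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/resource.h>

#define LATENCY_BOUND_MS	300
#define POLL_SECONDS		5

/* Stub for the estimated-latency export SD could, in theory, provide. */
static int read_estimated_latency_ms(pid_t pid)
{
	(void)pid;
	return 0;		/* pretend all is well */
}

int main(int argc, char **argv)
{
	pid_t watched, hog;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <watched-pid> <hog-pid>\n", argv[0]);
		return 1;
	}
	watched = atoi(argv[1]);
	hog = atoi(argv[2]);

	for (;;) {
		if (read_estimated_latency_ms(watched) > LATENCY_BOUND_MS) {
			/* -1 can be a legal return; a real tool would
			 * check errno here */
			int nice = getpriority(PRIO_PROCESS, hog);

			/* A real daemon would ask first; this one just
			 * demotes the hog one notch per polling interval. */
			fprintf(stderr,
				"pid %d over the %dms bound, renicing pid %d to %d\n",
				(int)watched, LATENCY_BOUND_MS, (int)hog, nice + 1);
			setpriority(PRIO_PROCESS, hog, nice + 1);
		}
		sleep(POLL_SECONDS);
	}
}

The asking-first, multi-task bookkeeping, and automatic mode Ed
describes would all layer on top of this loop.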

Thanks
Ed

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-09  4:23                                         ` Mike Galbraith
@ 2007-04-09 12:14                                           ` Rene Herman
  2007-04-09 13:27                                             ` Andreas Mohr
                                                               ` (2 more replies)
  0 siblings, 3 replies; 92+ messages in thread
From: Rene Herman @ 2007-04-09 12:14 UTC (permalink / raw)
  To: Mike Galbraith
  Cc: Ingo Molnar, Gene Heskett, linux-kernel, Con Kolivas,
	Andrew Morton, ck list

On 04/09/2007 06:23 AM, Mike Galbraith wrote:

> On Sun, 2007-04-08 at 20:51 +0200, Rene Herman wrote:

>> On 04/08/2007 12:41 PM, Ingo Molnar wrote:

>>>  commit 5ce74abe788a26698876e66b9c9ce7e7acc25413
>>>  Author: Mike Galbraith <efault@gmx.de>
>>>  Date:   Mon Apr 10 22:52:44 2006 -0700
>>>
>>>      [PATCH] sched: fix interactive task starvation
>>>
>>> (and a few smaller tweaks since then too.)
>>>
>>> and that change from Mike responded to a testcase. Mike's latest changes 
>>> (the ones you just tested) were mostly driven by actual testcases too, 
>>> which measured long-term timeslice distribution fairness.
>> 
>> Ah yes, that one. Here's the next one in that series:
>>
>> commit f1adad78dd2fc8edaa513e0bde92b4c64340245c
>> Author: Linus Torvalds <torvalds@g5.osdl.org>
>> Date:   Sun May 21 18:54:09 2006 -0700
>>
>>      Revert "[PATCH] sched: fix interactive task starvation"
>>
>> It personally had me wonder if _anyone_ was testing this stuff...
> 
> Well of course not.  Making random untested changes, and reverting
> them later is half the fun of kernel development.

The point of course is that the very example Molnar quoted as an example 
of responsible, testcase-driven development was in fact hugely broken 
and sat in the tree that way for four rcs.

To me, the example rather serves as confirmation of what Kolivas has 
been saying; endlessly tweaking the tweaks isn't going anywhere. The 
minute you tweak A, tweak B over there in corner C-Sharp falls flat on 
its face.

Computers are horribly stupid and tend to fail in most situations their 
smart human programmers didn't specifically tell them about. If, as in 
the case of a scheduler, the real-world demands on a piece of software 
are so diverse that you cannot tell it about all possible situations 
specifically, the only workable solution is to make it _predictable_, 
so that when one of those special situations is hit, the smart human 
using the computer at least knows how to intervene if he feels 
inclined to do so.

This turned into an interactivity thing, and while interactivity is in 
fact better for a large majority of testers, that isn't what Kolivas' 
scheduler is about. It's about predictability and leaving the dead-end 
road of these endless tweaks, which then break previous tweaks, rinse, 
repeat.

It's unfortunate that Kolivas is having health problems currently, but I 
certainly do hope that his scheduler finds its way into _a_ -rc1. He 
said it was done...

Rene.


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-09  5:59                                               ` Mike Galbraith
@ 2007-04-09 13:01                                                 ` Gene Heskett
  0 siblings, 0 replies; 92+ messages in thread
From: Gene Heskett @ 2007-04-09 13:01 UTC (permalink / raw)
  To: linux-kernel
  Cc: Mike Galbraith, Ingo Molnar, Con Kolivas, Andrew Morton, ck list

On Monday 09 April 2007, Mike Galbraith wrote:
>On Mon, 2007-04-09 at 00:08 -0400, Gene Heskett wrote:
>> On Monday 09 April 2007, Mike Galbraith wrote:
>> >Actually, there was practically nil interest in testing.  We made a
>> >couple of minor adjustments to the interactivity logic, and all went
>> >quiet, so I didn't think it was enough of a problem to require more
>> >intrusive countermeasures.
>> >
>> >	-Mike
>>
>> Does one of these messages have a URL so I can test the latest of your
>> patches for -rc6?  Or was the one Ingo sent the most recent?
>
>No, my tree has a bugfix and some other adjustments that try to move the
>balance closer to fair without sacrificing interactivity.
>
>> Putting that URL in your sig would be nice, and might result in its
>> getting a lot more exercise, which should mean more feedback.
>
>When I get it cleaned up and better tested, I'll post again.  If you
>want, I'll CC you... willing victims are a highly valued commodity :)
>
>	-Mike

:) Put me on that list, Mike.

-- 
Cheers, Gene
"There are four boxes to be used in defense of liberty:
 soap, ballot, jury, and ammo. Please use in that order."
-Ed Howdershelt (Author)
The box said "Requires Windows 95 or better."  I can't understand    
why it won't work on my Linux computer.

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-09 12:14                                           ` Rene Herman
@ 2007-04-09 13:27                                             ` Andreas Mohr
  2007-04-09 19:54                                               ` Rene Herman
  2007-04-09 14:15                                             ` Ingo Molnar
  2007-04-09 17:10                                             ` Mike Galbraith
  2 siblings, 1 reply; 92+ messages in thread
From: Andreas Mohr @ 2007-04-09 13:27 UTC (permalink / raw)
  To: Rene Herman
  Cc: Mike Galbraith, Ingo Molnar, Gene Heskett, linux-kernel,
	Con Kolivas, Andrew Morton, ck list

Hi,

On Mon, Apr 09, 2007 at 02:14:49PM +0200, Rene Herman wrote:
> This turned into an interactivity thing, and while interactivity is in 
> fact better for a large majority of testers, that isn't what Kolivas' 
> scheduler is about. It's about predictability and leaving the dead-end 
> road of these endless tweaks, which then break previous tweaks, rinse, 
> repeat.
> 
> It's unfortunate that Kolivas is having health problems currently, but I 
> certainly do hope that his scheduler finds its way into _a_ -rc1. He 
> said it was done...

The whole recent discussion/flamefest/... here makes me think that we're
still heading towards actually introducing plugsched (most preferably
by making the mainline scheduler the built-in default and optionally
building a plugsched kernel which then allows selection).
There are fundamental behavioural differences between the various
CPU scheduler types developed; while some people want a very interactive
system with good latency in most(!) cases and exploit-less operation,
several others want a scheduler which provides very predictable latency,
low overhead and additionally as much interactivity as this strict
model can provide for. And then there are people who have very specific
SMP requirements which both characteristic scheduler types may have trouble
satisfying properly.

And I really don't see much difference whatsoever from the I/O scheduler
area: some people want predictable latency, while others want maximum
throughput or fastest operation for seek-less flash devices (noop).
Hardware varies similarly greatly as well:
Some people have huge disk arrays or NAS, others have a single flash disk.
Some people have a decaying UP machine, others have huge SMP farms.

IMHO both areas are too varied, thus runtime or compile-time selection
is justified for both, not for I/O schedulers only.
I don't think anybody would want to introduce new very similar scheduler types
just for the fun of it; development would center around improving the at
most 3 or 4 different scheduler implementations (as is the case with I/O
schedulers, BTW: there hasn't been an explosion of different variants
either!).

I think the whole discussion went on the wrong track when people somehow
had the notion of making RSDL (and its later variants) the main scheduler
for desktop machines, not just server operation. And this target of course
(and rightfully so) prompted people to ask for interactivity similar
to what the current scheduler achieves - which RSDL, however, cannot
fully provide within its strict design.

However, having mainline remain the only scheduler doesn't seem to be such
an attractive option either, e.g. due to its non-predictability, when there
exist several alternatives with rather nice behaviour.

Thus I'd still tend towards making things runtime-selectable; scheduler goals
are just too varied to ever sufficiently achieve Best Results
In Every Area (tm).

Not to mention that making schedulers runtime-selectable would enable
uncovering various application timing bugs much faster (e.g. the RPM timing
issues that -ck managed to hit).

Oh, and get well very soon, Con - Linux needs you, a lot :)

Andreas Mohr

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-08 18:51                                       ` Rene Herman
  2007-04-09  4:23                                         ` Mike Galbraith
@ 2007-04-09 13:53                                         ` Ingo Molnar
  2007-04-09 15:37                                           ` Rene Herman
  1 sibling, 1 reply; 92+ messages in thread
From: Ingo Molnar @ 2007-04-09 13:53 UTC (permalink / raw)
  To: Rene Herman
  Cc: Gene Heskett, linux-kernel, Con Kolivas, Mike Galbraith,
	Andrew Morton, ck list

[-- Attachment #1: Type: text/plain, Size: 2249 bytes --]


* Rene Herman <rene.herman@gmail.com> wrote:

> > and that change from Mike responded to a testcase. Mike's latest 
> > changes (the ones you just tested) were mostly driven by actual 
> > testcases too, which measured long-term timeslice distribution 
> > fairness.
> 
> Ah yes, that one. Here's the next one in that series:
> 
> commit f1adad78dd2fc8edaa513e0bde92b4c64340245c
> Author: Linus Torvalds <torvalds@g5.osdl.org>
> Date:   Sun May 21 18:54:09 2006 -0700
> 
>     Revert "[PATCH] sched: fix interactive task starvation"

yes - in hindsight i regret having asked Mike for a "simpler" patch, 
which turned out to be rushed and plain broke your setup: my bad. And i 
completely forgot about that episode; Mike did a stream of changes in 
that timeframe and this one was indeed reverted :-/

> It personally had me wonder if _anyone_ was testing this stuff...

yes, i certainly tried it and it broke nothing, and it was in fact acked 
by Con too:

>    Signed-off-by: Mike Galbraith <efault@gmx.de>
>    Acked-by: Ingo Molnar <mingo@elte.hu>
>    Cc: Nick Piggin <nickpiggin@yahoo.com.au>
>    Acked-by: Con Kolivas <kernel@kolivas.org>
>    Signed-off-by: Andrew Morton <akpm@osdl.org>
>    Signed-off-by: Linus Torvalds <torvalds@osdl.org>

but it broke your setup:

>    This reverts commit 5ce74abe788a26698876e66b9c9ce7e7acc25413 (and 
>    its dependent commit 8a5bc075b8d8cf7a87b3f08fad2fba0f5d13295e), 
>    because of audio underruns.
>
>    Reported by Rene Herman <rene.herman@keyaccess.nl>, who also 
>    pinpointed the exact cause of the underruns:
>
>      "Audio underruns galore, with only ogg123 and firefox (browsing 
>      the GIT tree online is also a nice trigger by the way).
>
>      If I back it out, everything is fine for me again."

so reverting it was justified. Basically, the approach was that the 
vanilla scheduler is working reasonably well, and that any improvement 
to it must not cause regressions in areas where it already works well. 
(it obviously must have been working on your audio setup to a certain 
degree if reverting Mike's patch made the underruns go away)

In any case, it would be very nice if you could try Mike's latest patch - 
how does it work on your setup? (i've attached it)

	Ingo

[-- Attachment #2: sched-mike-4.patch --]
[-- Type: text/plain, Size: 31773 bytes --]

On Tue, 2007-04-03 at 08:01 +0200, Ingo Molnar wrote:

> looks interesting - could you send the patch?

Ok, this is looking/feeling pretty good in testing.  Comments on
fugliness etc much appreciated.

Below the numbers is a snapshot of my experimental tree.  It's a mixture
of my old throttling/anti-starvation tree and the task promotion patch,
with the addition of a scheduling class for interactive tasks to dish
out some of that targeted unfairness I mentioned.  SCHED_INTERACTIVE is
also targeted at the scenario where X or one of its clients uses enough
CPU to end up in the expired array.
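
As a usage sketch: opting a task in to the new policy would look roughly
like the snippet below.  Illustrative only - it assumes the
SCHED_INTERACTIVE value of 4 that the patch below defines, which exists
in no released header.

	#include <sched.h>
	#include <stdio.h>

	#ifndef SCHED_INTERACTIVE
	#define SCHED_INTERACTIVE 4	/* from the patch below; assumed */
	#endif

	int main(void)
	{
		/* non-RT policies require a static priority of 0 */
		struct sched_param sp = { 0 };

		/* pid 0 means the calling task itself */
		if (sched_setscheduler(0, SCHED_INTERACTIVE, &sp) < 0)
			perror("sched_setscheduler");
		return 0;
	}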

(note:  Xorg was not set SCHED_INTERACTIVE during the test runs below)

	-Mike

top - 12:31:34 up 16 min, 13 users,  load average: 7.37, 8.74, 6.58

  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  P COMMAND
 6542 root      15   0  1568  108   24 S   43  0.0   0:58.98 1 fiftypercent
 6540 root      17   0  1568  440  356 R   30  0.0   1:00.04 0 fiftypercent
 6544 root      18   0  1568  108   24 R   28  0.0   0:58.36 0 fiftypercent
 6541 root      20   0  1568  108   24 R   26  0.0   0:57.70 1 fiftypercent
 6536 root      25   0  1436  356  296 R   24  0.0   0:45.76 1 chew
 6538 root      25   0  1436  356  296 R   20  0.0   0:49.73 0 chew
 6543 root      19   0  1568  108   24 R   19  0.0   0:58.04 1 fiftypercent
 6409 root      15   0  154m  63m  27m R    2  6.3   0:13.09 0 amarokapp
 6410 root      15   0  154m  63m  27m S    2  6.3   0:14.36 0 amarokapp
 6376 root      15   0  2380 1092  764 R    2  0.1   0:15.63 0 top
 5591 root      18   0  4736 1036  736 S    1  0.1   0:00.14 1 smpppd
 5678 root      15   0  167m  24m 4848 S    1  2.4   0:19.37 0 Xorg
 6202 root      15   0 32364  18m  12m S    1  1.8   0:04.25 1 konsole

50 lines from the center of the log of chew nailed to cpu0

pid 6538, prio   0, out for   27 ms, ran for    1 ms, load   6%
pid 6538, prio   0, out for   26 ms, ran for    4 ms, load  14%
pid 6538, prio   0, out for   27 ms, ran for    7 ms, load  20%
pid 6538, prio   0, out for   13 ms, ran for    5 ms, load  27%
pid 6538, prio   0, out for    8 ms, ran for    7 ms, load  49%
pid 6538, prio   0, out for   10 ms, ran for    7 ms, load  43%
pid 6538, prio   0, out for    9 ms, ran for    6 ms, load  42%
pid 6538, prio   0, out for    9 ms, ran for    7 ms, load  46%
pid 6538, prio   0, out for    9 ms, ran for    7 ms, load  43%
pid 6538, prio   0, out for    9 ms, ran for    7 ms, load  43%
pid 6538, prio   0, out for    8 ms, ran for    7 ms, load  48%
pid 6538, prio   0, out for    9 ms, ran for   27 ms, load  74%
pid 6538, prio   0, out for   27 ms, ran for    4 ms, load  13%
pid 6538, prio   0, out for   26 ms, ran for    5 ms, load  17%
pid 6538, prio   0, out for   27 ms, ran for    5 ms, load  17%
pid 6538, prio   0, out for   28 ms, ran for    6 ms, load  18%
pid 6538, prio   0, out for   30 ms, ran for    4 ms, load  14%
pid 6538, prio   0, out for   18 ms, ran for    5 ms, load  24%
pid 6538, prio   0, out for    9 ms, ran for    7 ms, load  42%
pid 6538, prio   0, out for    8 ms, ran for    7 ms, load  45%
pid 6538, prio   0, out for    8 ms, ran for    7 ms, load  45%
pid 6538, prio   0, out for    9 ms, ran for    7 ms, load  44%
pid 6538, prio   0, out for    9 ms, ran for    7 ms, load  43%
pid 6538, prio   0, out for    2 ms, ran for    7 ms, load  78%
pid 6538, prio   0, out for   45 ms, ran for   22 ms, load  33%
pid 6538, prio   0, out for   31 ms, ran for    2 ms, load   7%
pid 6538, prio   0, out for   62 ms, ran for    1 ms, load   3%
pid 6538, prio   0, out for   29 ms, ran for    3 ms, load  11%
pid 6538, prio   0, out for   26 ms, ran for    4 ms, load  13%
pid 6538, prio   0, out for  134 ms, ran for    5 ms, load   4%
pid 6538, prio   0, out for   78 ms, ran for    2 ms, load   3%
pid 6538, prio   0, out for    9 ms, ran for    3 ms, load  28%
pid 6538, prio   0, out for   10 ms, ran for    7 ms, load  42%
pid 6538, prio   0, out for   10 ms, ran for    7 ms, load  42%
pid 6538, prio   0, out for    8 ms, ran for    7 ms, load  48%
pid 6538, prio   0, out for    8 ms, ran for    7 ms, load  46%
pid 6538, prio   0, out for    9 ms, ran for    7 ms, load  43%
pid 6538, prio   0, out for   10 ms, ran for    7 ms, load  43%
pid 6538, prio   0, out for    9 ms, ran for    6 ms, load  39%
pid 6538, prio   0, out for    9 ms, ran for    7 ms, load  42%
pid 6538, prio   0, out for    8 ms, ran for    7 ms, load  46%
pid 6538, prio   0, out for   14 ms, ran for    6 ms, load  30%
pid 6538, prio   0, out for   27 ms, ran for    3 ms, load  12%
pid 6538, prio   0, out for   29 ms, ran for    4 ms, load  12%
pid 6538, prio   0, out for   29 ms, ran for    4 ms, load  13%
pid 6538, prio   0, out for   26 ms, ran for    4 ms, load  14%
pid 6538, prio   0, out for   29 ms, ran for    5 ms, load  14%
pid 6538, prio   0, out for   27 ms, ran for    4 ms, load  14%
pid 6538, prio   0, out for   26 ms, ran for    5 ms, load  16%
pid 6538, prio   0, out for   24 ms, ran for    6 ms, load  20%
pid 6538, prio   0, out for    7 ms, ran for    7 ms, load  49%
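
For reference: a chew-style probe just spins reading the clock and treats
any gap between consecutive reads that exceeds some threshold as time
spent scheduled out; "load" above is ran / (ran + out).  A minimal sketch
of such a probe (the actual chew.c differs in detail, e.g. it also
reports the prio seen above):

	#include <stdio.h>
	#include <time.h>

	static long long now_ns(void)
	{
		struct timespec ts;

		clock_gettime(CLOCK_MONOTONIC, &ts);
		return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
	}

	int main(void)
	{
		const long long threshold = 2000000;	/* 2 ms; arbitrary */
		long long last = now_ns(), start = last;

		for (;;) {
			long long t = now_ns();

			if (t - last > threshold) {
				long long ran = last - start;	/* ran until preempted */
				long long out = t - last;	/* time scheduled out */

				printf("out for %4lld ms, ran for %4lld ms, "
				       "load %3lld%%\n", out / 1000000,
				       ran / 1000000, 100 * ran / (ran + out));
				start = t;
			}
			last = t;
		}
	}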


root@Homer: ./massive_intr 30 180
006502	00002373
006495	00002687
006518	00002417
006490	00002544
006500	00002417
006494	00002427
006498	00003032
006517	00003060
006505	00002401
006507	00002375
006514	00002398
006497	00002483
006506	00002388
006504	00002415
006510	00002472
006516	00002365
006509	00002441
006503	00002498
006512	00002930
006496	00002565
006492	00002389
006501	00002337
006508	00002395
006491	00002486
006499	00002394
006493	00002667
006515	00002569
006511	00002555
006513	00002637
006519	00002556
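
The columns above are each child's pid and the number of work periods it
completed; roughly equal counts mean roughly fair CPU distribution.  A
massive_intr-like load generator can be sketched as below - illustrative
only, the real massive_intr.c differs in its exact timing and reporting,
and the 8 ms burn / 1 ms sleep duty cycle here is an assumption:

	#include <stdio.h>
	#include <stdlib.h>
	#include <time.h>
	#include <unistd.h>
	#include <sys/wait.h>

	static long long now_ns(void)
	{
		struct timespec ts;

		clock_gettime(CLOCK_MONOTONIC, &ts);
		return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
	}

	static void worker(int secs)
	{
		struct timespec nap = { 0, 1000000 };	/* 1 ms sleep */
		long long end = now_ns() + (long long)secs * 1000000000LL;
		unsigned long count = 0;

		while (now_ns() < end) {
			long long burst = now_ns() + 8000000;	/* 8 ms busy */

			while (now_ns() < burst)
				;
			nanosleep(&nap, NULL);
			count++;
		}
		printf("%06d\t%08lu\n", (int)getpid(), count);
		exit(0);
	}

	int main(int argc, char **argv)
	{
		int nproc = argc > 1 ? atoi(argv[1]) : 30;
		int secs = argc > 2 ? atoi(argv[2]) : 180;
		int i;

		for (i = 0; i < nproc; i++)
			if (fork() == 0)
				worker(secs);
		while (wait(NULL) > 0)
			;
		return 0;
	}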

---
 include/linux/sched.h  |    7 
 include/linux/sysctl.h |    2 
 kernel/sched.c         |  450 +++++++++++++++++++++++++++++++++++++++++++++----
 kernel/sysctl.c        |   39 +++-
 4 files changed, 459 insertions(+), 39 deletions(-)

Index: linux/include/linux/sched.h
===================================================================
--- linux.orig/include/linux/sched.h
+++ linux/include/linux/sched.h
@@ -34,6 +34,7 @@
 #define SCHED_FIFO		1
 #define SCHED_RR		2
 #define SCHED_BATCH		3
+#define SCHED_INTERACTIVE	4
 
 #ifdef __KERNEL__
 
@@ -528,7 +529,7 @@ struct signal_struct {
 #define rt_prio(prio)		unlikely((prio) < MAX_RT_PRIO)
 #define rt_task(p)		rt_prio((p)->prio)
 #define batch_task(p)		(unlikely((p)->policy == SCHED_BATCH))
-#define is_rt_policy(p)		((p) != SCHED_NORMAL && (p) != SCHED_BATCH)
+#define is_rt_policy(p)		((p) == SCHED_RR || (p) == SCHED_FIFO)
 #define has_rt_policy(p)	unlikely(is_rt_policy((p)->policy))
 
 /*
@@ -820,14 +821,14 @@ struct task_struct {
 #ifdef CONFIG_BLK_DEV_IO_TRACE
 	unsigned int btrace_seq;
 #endif
-	unsigned long sleep_avg;
+	unsigned long sleep_avg, last_slice, throttle;
 	unsigned long long timestamp, last_ran;
 	unsigned long long sched_time; /* sched_clock time spent running */
 	enum sleep_type sleep_type;
 
 	unsigned long policy;
 	cpumask_t cpus_allowed;
-	unsigned int time_slice, first_time_slice;
+	unsigned int time_slice, slice_info;
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	struct sched_info sched_info;
Index: linux/include/linux/sysctl.h
===================================================================
--- linux.orig/include/linux/sysctl.h
+++ linux/include/linux/sysctl.h
@@ -165,6 +165,8 @@ enum
 	KERN_MAX_LOCK_DEPTH=74,
 	KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */
 	KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */
+	KERN_SCHED_THROTTLE1=77,  /* int: throttling credit period 1 in secs */
+	KERN_SCHED_THROTTLE2=78,  /* int: throttling credit period 2 in secs */
 };
 
 
Index: linux/kernel/sched.c
===================================================================
--- linux.orig/kernel/sched.c
+++ linux/kernel/sched.c
@@ -90,6 +90,20 @@ unsigned long long __attribute__((weak))
 #define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
 #define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
 
+#if (BITS_PER_LONG < 64)
+#define JIFFIES_TO_NS64(TIME) \
+	((unsigned long long)(TIME) * ((unsigned long) (1000000000 / HZ)))
+
+#define NS64_TO_JIFFIES(TIME) \
+	((((unsigned long long)((TIME)) >> BITS_PER_LONG) * \
+	(1 + NS_TO_JIFFIES(~0UL))) + NS_TO_JIFFIES((unsigned long)(TIME)))
+#else /* BITS_PER_LONG < 64 */
+
+#define NS64_TO_JIFFIES(TIME) NS_TO_JIFFIES(TIME)
+#define JIFFIES_TO_NS64(TIME) JIFFIES_TO_NS(TIME)
+
+#endif /* BITS_PER_LONG < 64 */
+
 /*
  * These are the 'tuning knobs' of the scheduler:
  *
@@ -109,6 +123,8 @@ unsigned long long __attribute__((weak))
 #define MAX_SLEEP_AVG		(DEF_TIMESLICE * MAX_BONUS)
 #define STARVATION_LIMIT	(MAX_SLEEP_AVG)
 #define NS_MAX_SLEEP_AVG	(JIFFIES_TO_NS(MAX_SLEEP_AVG))
+#define PCNT_PER_DYNPRIO	(100 / MAX_BONUS)
+#define INTERACTIVE_LIMIT	(DEF_TIMESLICE * 4)
 
 /*
  * If a task is 'interactive' then we reinsert it in the active
@@ -167,6 +183,133 @@ unsigned long long __attribute__((weak))
 	(JIFFIES_TO_NS(MAX_SLEEP_AVG * \
 		(MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
 
+#define INTERACTIVE_LIMIT_EXCEEDED(rq) \
+	((rq)->active->interactive_ticks + (rq)->expired->interactive_ticks > \
+		INTERACTIVE_LIMIT)
+
+/*
+ * Interactive boost can lead to starvation if the decision to
+ * boost a task turns out to be a bad one.  To combat this, we
+ * compute the sane upper limit for cpu usage 'slice_avg' based
+ * upon a task's sleep_avg, and use this information combined
+ * with a timer to determine when intervention is required.
+ *
+ * When a task is behaving as its sleep_avg indicates it should,
+ * its throttle is moved forward, otherwise it will time out, and
+ * its priority will be lowered.
+ *
+ * Throttling tunables.
+ *
+ * CREDIT_C1: The amount of cpu time in seconds that a new task
+ *           will run completely free, ie the head start a task
+ *           has before it has to push its timer forward to avoid
+ *           being throttled.  Each conforming slice thereafter
+ *           increases its stored credit, and vice versa.
+ *
+ * CREDIT_C2: The maximum amount of CPU time in seconds a task
+ *           can store for later use.  When a task has no stored
+ *           credit left, now is time C2.  Tasks begin life with
+ *           C1 seconds credit, ie C2 is C1 seconds in front of
+ *           them, and the 'buffer' will grow in front of them
+ *           if they perform in a conformant manner.  The maximum
+ *           credit that fits in 32 bits jiffies is 42949 seconds.
+ */
+
+int credit_c1 = 0;
+int credit_c2 = 14400;
+int credit_max = 42949;
+
+#define C1 (credit_c1 * MAX_BONUS * HZ)
+#define C2 (credit_c2 * MAX_BONUS * HZ + C1)
+#define C3 (MAX_BONUS * C2)
+
+#define credit_exhausted(p, credit) \
+	(time_after_eq(jiffies, (p)->throttle + (credit)))
+
+/*
+ * Masks for p->slice_info, formerly p->first_time_slice.
+ * SLICE_FTS:   0x80000000  Task is in its first ever timeslice.
+ * SLICE_NEW:   0x40000000  Slice refreshed.
+ * SLICE_INT:   0x20000000  Task is a SCHED_INTERACTIVE task partner.
+ * SLICE_SPA:   0x1FFE0000  Spare bits.
+ * SLICE_LTS:   0x0001FF80  Last time slice
+ * SLICE_AVG:   0x0000007F  Task slice_avg stored as percentage.
+ */
+#define SLICE_AVG_BITS    7
+#define SLICE_LTS_BITS   10
+#define SLICE_SPA_BITS   12
+#define SLICE_INT_BITS    1
+#define SLICE_NEW_BITS    1
+#define SLICE_FTS_BITS    1
+
+#define SLICE_AVG_SHIFT   0
+#define SLICE_LTS_SHIFT   (SLICE_AVG_SHIFT + SLICE_AVG_BITS)
+#define SLICE_SPA_SHIFT   (SLICE_LTS_SHIFT + SLICE_LTS_BITS)
+#define SLICE_INT_SHIFT   (SLICE_SPA_SHIFT + SLICE_SPA_BITS)
+#define SLICE_NEW_SHIFT   (SLICE_INT_SHIFT + SLICE_INT_BITS)
+#define SLICE_FTS_SHIFT   (SLICE_NEW_SHIFT + SLICE_NEW_BITS)
+
+#define INFO_MASK(x)      ((1U << (x))-1)
+#define SLICE_AVG_MASK    (INFO_MASK(SLICE_AVG_BITS) << SLICE_AVG_SHIFT)
+#define SLICE_LTS_MASK    (INFO_MASK(SLICE_LTS_BITS) << SLICE_LTS_SHIFT)
+#define SLICE_SPA_MASK    (INFO_MASK(SLICE_SPA_BITS) << SLICE_SPA_SHIFT)
+#define SLICE_INT_MASK    (INFO_MASK(SLICE_INT_BITS) << SLICE_INT_SHIFT)
+#define SLICE_NEW_MASK    (INFO_MASK(SLICE_NEW_BITS) << SLICE_NEW_SHIFT)
+#define SLICE_FTS_MASK    (INFO_MASK(SLICE_FTS_BITS) << SLICE_FTS_SHIFT)
+
+/* p->slice_info access macros. */
+#define first_time_slice(p) ((p)->slice_info & SLICE_FTS_MASK)
+#define set_first_time_slice(p) ((p)->slice_info |= SLICE_FTS_MASK)
+#define clr_first_time_slice(p) ((p)->slice_info &= ~SLICE_FTS_MASK)
+
+#define slice_is_new(p) ((p)->slice_info & SLICE_NEW_MASK)
+#define set_slice_is_new(p) ((p)->slice_info |= SLICE_NEW_MASK)
+#define clr_slice_is_new(p) ((p)->slice_info &= ~SLICE_NEW_MASK)
+
+#define task_is_interactive(p) ((p)->slice_info & SLICE_INT_MASK)
+#define set_task_is_interactive(p) ((p)->slice_info |= SLICE_INT_MASK)
+#define clr_task_is_interactive(p) ((p)->slice_info &= ~SLICE_INT_MASK)
+
+#define last_slice(p) (((p)->slice_info & SLICE_LTS_MASK) >> SLICE_LTS_SHIFT)
+#define set_last_slice(p, n) ((p)->slice_info = (((p)->slice_info & \
+	~SLICE_LTS_MASK) | (((n) << SLICE_LTS_SHIFT) & SLICE_LTS_MASK)))
+
+#define NS_SLEEP_AVG_PCNT (NS_MAX_SLEEP_AVG / 100)
+
+/* Note: raw storage format of slice_avg is %cpu. */
+#define slice_avg(p) ((typeof((p)->sleep_avg)) \
+	((((p)->slice_info & SLICE_AVG_MASK) >> SLICE_AVG_SHIFT) * \
+	NS_SLEEP_AVG_PCNT))
+#define set_slice_avg(p, n) ((p)->slice_info = (((p)->slice_info & \
+	~SLICE_AVG_MASK) | ((((n) / NS_SLEEP_AVG_PCNT) \
+	<< SLICE_AVG_SHIFT) & SLICE_AVG_MASK)))
+#define slice_avg_raw(p)  \
+	(((p)->slice_info & SLICE_AVG_MASK) >> SLICE_AVG_SHIFT)
+#define set_slice_avg_raw(p, n) ((p)->slice_info = (((p)->slice_info & \
+	~SLICE_AVG_MASK) | (((n) << SLICE_AVG_SHIFT) & SLICE_AVG_MASK)))
+
+/* cpu usage macros. */
+#define cpu_avg(p) \
+	(100 - slice_avg_raw(p))
+
+#define cpu_max(p) \
+	(100 - ((p)->sleep_avg / NS_SLEEP_AVG_PCNT))
+
+#define time_this_slice(p) \
+	(jiffies - (p)->last_slice)
+
+#define cpu_this_slice(p) \
+	(100 * last_slice(p) / max((unsigned) time_this_slice(p), \
+	(unsigned) last_slice(p)))
+
+#define cpu_avg_rq(rq) \
+	(100 * DEF_TIMESLICE / max((unsigned) (rq)->slice_avg, \
+		(unsigned) DEF_TIMESLICE))
+
+/* Positively identified interactive tasks. */
+#define task_interactive(p) \
+	((p)->policy == SCHED_INTERACTIVE || task_is_interactive(p))
+
 #define TASK_PREEMPTS_CURR(p, rq) \
 	((p)->prio < (rq)->curr->prio)
 
@@ -201,6 +344,7 @@ static inline unsigned int task_timeslic
 
 struct prio_array {
 	unsigned int nr_active;
+	int interactive_ticks;
 	DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
 	struct list_head queue[MAX_PRIO];
 };
@@ -234,7 +378,8 @@ struct rq {
 	 */
 	unsigned long nr_uninterruptible;
 
-	unsigned long expired_timestamp;
+	unsigned long switch_timestamp;
+	unsigned long slice_avg;
 	/* Cached timestamp set by update_cpu_clock() */
 	unsigned long long most_recent_timestamp;
 	struct task_struct *curr, *idle;
@@ -691,6 +836,8 @@ static void dequeue_task(struct task_str
 	list_del(&p->run_list);
 	if (list_empty(array->queue + p->prio))
 		__clear_bit(p->prio, array->bitmap);
+	if (TASK_INTERACTIVE(p))
+		array->interactive_ticks -= p->time_slice;
 }
 
 static void enqueue_task(struct task_struct *p, struct prio_array *array)
@@ -700,6 +847,8 @@ static void enqueue_task(struct task_str
 	__set_bit(p->prio, array->bitmap);
 	array->nr_active++;
 	p->array = array;
+	if (TASK_INTERACTIVE(p))
+		array->interactive_ticks += p->time_slice;
 }
 
 /*
@@ -882,7 +1031,11 @@ static int recalc_task_prio(struct task_
 	/* Caller must always ensure 'now >= p->timestamp' */
 	unsigned long sleep_time = now - p->timestamp;
 
-	if (batch_task(p))
+	/*
+	 * Migration timestamp adjustment may induce negative time.
+	 * Ignore unquantifiable values as well as SCHED_BATCH tasks.
+	 */
+	if (now < p->timestamp || batch_task(p))
 		sleep_time = 0;
 
 	if (likely(sleep_time > 0)) {
@@ -893,7 +1046,14 @@ static int recalc_task_prio(struct task_
 		 */
 		unsigned long ceiling = INTERACTIVE_SLEEP(p);
 
-		if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
+	 	/*
+		 * Update throttle position.
+		 */
+		p->throttle += NS64_TO_JIFFIES(sleep_time);
+		if (time_before(jiffies, p->throttle))
+			p->throttle = jiffies;
+
+		if (sleep_time > ceiling && p->sleep_avg < ceiling) {
 			/*
 			 * Prevents user tasks from achieving best priority
 			 * with one single large enough sleep.
@@ -915,7 +1075,7 @@ static int recalc_task_prio(struct task_
 			 * limited in their sleep_avg rise as they
 			 * are likely to be waiting on I/O
 			 */
-			if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
+			if (p->sleep_type == SLEEP_NONINTERACTIVE) {
 				if (p->sleep_avg >= ceiling)
 					sleep_time = 0;
 				else if (p->sleep_avg + sleep_time >=
@@ -1531,16 +1691,23 @@ out_activate:
 		 * sleep_avg beyond just interactive state.
 		 */
 		p->sleep_type = SLEEP_NONINTERACTIVE;
-	} else
+	} else if (task_interactive(current)) {
+		/*
+		 * Tasks tagged as being truly interactive
+		 * pass temporary interactive status on to
+		 * the task they are waking.
+		 */
+		set_task_is_interactive(p);
+		p->sleep_type = SLEEP_INTERACTIVE;
+	}
 
 	/*
 	 * Tasks that have marked their sleep as noninteractive get
 	 * woken up with their sleep average not weighted in an
 	 * interactive way.
 	 */
-		if (old_state & TASK_NONINTERACTIVE)
-			p->sleep_type = SLEEP_NONINTERACTIVE;
-
+	else if (old_state & TASK_NONINTERACTIVE)
+		p->sleep_type = SLEEP_NONINTERACTIVE;
 
 	activate_task(p, rq, cpu == this_cpu);
 	/*
@@ -1628,9 +1795,24 @@ void fastcall sched_fork(struct task_str
 	 * The remainder of the first timeslice might be recovered by
 	 * the parent if the child exits early enough.
 	 */
-	p->first_time_slice = 1;
 	current->time_slice >>= 1;
 	p->timestamp = sched_clock();
+
+	/*
+	 * Set up slice_info and initial throttle position for the child.
+	 */
+	set_slice_avg(p, p->sleep_avg);
+	set_last_slice(p, p->time_slice);
+	set_slice_is_new(p);
+	set_first_time_slice(p);
+	p->last_slice = jiffies;
+	p->throttle = jiffies - C2 + C1;
+	/*
+	 * SCHED_INTERACTIVE policy cannot be inherited.
+	 */
+	if (unlikely(current->policy == SCHED_INTERACTIVE))
+		p->policy = SCHED_NORMAL;
+
 	if (unlikely(!current->time_slice)) {
 		/*
 		 * This case is rare, it happens when the parent has only
@@ -1745,7 +1927,7 @@ void fastcall sched_exit(struct task_str
 	 * the sleep_avg of the parent as well.
 	 */
 	rq = task_rq_lock(p->parent, &flags);
-	if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
+	if (first_time_slice(p) && task_cpu(p) == task_cpu(p->parent)) {
 		p->parent->time_slice += p->time_slice;
 		if (unlikely(p->parent->time_slice > task_timeslice(p)))
 			p->parent->time_slice = task_timeslice(p);
@@ -3051,9 +3233,10 @@ static inline int expired_starving(struc
 {
 	if (rq->curr->static_prio > rq->best_expired_prio)
 		return 1;
-	if (!STARVATION_LIMIT || !rq->expired_timestamp)
+	if (!STARVATION_LIMIT)
 		return 0;
-	if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
+	if (jiffies - rq->switch_timestamp > rq->nr_running * DEF_TIMESLICE +
+			STARVATION_LIMIT)
 		return 1;
 	return 0;
 }
@@ -3131,8 +3314,165 @@ void account_steal_time(struct task_stru
 		cpustat->steal = cputime64_add(cpustat->steal, tmp);
 }
 
+/*
+ * Promote and requeue the next lower priority task.  If no task
+ * is available in the active array, switch to the expired array.
+ * @rq: runqueue to search.
+ * @prio: priority at which to begin search.
+ */
+static inline void promote_next_lower(struct rq *rq, int prio)
+{
+	struct prio_array *array = rq->active;
+	struct task_struct *p = NULL;
+	unsigned long long now = rq->most_recent_timestamp;
+	unsigned long *bitmap;
+	unsigned long starving = JIFFIES_TO_NS(rq->slice_avg);
+	int idx = prio + 1, found_noninteractive = 0;
+	int ticks = rq->active->interactive_ticks + rq->expired->interactive_ticks;
+
+repeat:
+	bitmap = array->bitmap;
+	idx = find_next_bit(bitmap, MAX_PRIO, idx);
+	if (idx < MAX_PRIO) {
+		struct list_head *queue = array->queue + idx;
+
+		p = list_entry(queue->next, struct task_struct, run_list);
+		if (!TASK_INTERACTIVE(p))
+			found_noninteractive = 1;
+
+		/* Skip non-starved queues. */
+		if (now < p->last_ran + starving) {
+			idx++;
+			p = NULL;
+			goto repeat;
+		}
+	} else if (!found_noninteractive && array == rq->active) {
+		/* Nobody home, check the expired array. */
+		array = rq->expired;
+		idx = prio;
+		p = NULL;
+		goto repeat;
+	}
+
+	/* Found one, requeue it. */
+	if (p) {
+		dequeue_task(p, p->array);
+		if (array == rq->active)
+			p->prio--;
+		/*
+		 * If we pulled a task from the expired array, correct
+		 * expired array info.  We can't afford a full search
+		 * for best_expired_prio, but do the best we can.
+		 */
+		else {
+			idx = sched_find_first_bit(array->bitmap);
+			if (idx < MAX_PRIO) {
+				if (rq->best_expired_prio > idx)
+					rq->best_expired_prio = idx;
+			} else {
+				/* We emptied the array */
+				rq->best_expired_prio = MAX_PRIO;
+				/*
+				 * If we have excessive interactive load,
+				 * do not inhibit forced array switching.
+				 */
+				if (ticks < INTERACTIVE_LIMIT)
+					rq->switch_timestamp = jiffies;
+			}
+		}
+		enqueue_task(p, rq->active);
+	}
+}
+
+/*
+ * Refresh timeslice and associated slice information.
+ * @p: the process to refresh.
+ */
+static void refresh_timeslice(struct task_struct *p)
+{
+	struct rq *rq = task_rq(p);
+	unsigned long slice_time = jiffies - p->last_slice;
+	int idle, cpu, cpu_avg, slice = last_slice(p);
+	int w = MAX_BONUS, delta, bonus;
+
+	if (unlikely(slice_time < slice))
+		slice_time = slice;
+
+	/* Update task's CPU usage. */
+	cpu_avg = slice_avg_raw(p);
+	cpu = cpu_this_slice(p);
+	idle = 100 - cpu;
+	delta = max(cpu_avg, idle) - min(cpu_avg, idle);
+	w = 1 + (delta / w);
+	cpu_avg = (w * cpu_avg + idle) / (w + 1);
+	set_slice_avg_raw(p, cpu_avg);
+
+	/*
+	 * If we've hit the throttle timeout, we aren't draining enough
+	 * sleep_avg to keep up with the task's cpu usage.  Up the ante
+	 * to bring the task back toward balance.
+	 */
+	if (credit_exhausted(p, C2) && p->sleep_avg > slice_avg(p)) {
+		unsigned long run_time = p->sleep_avg - slice_avg(p);
+		run_time /= w;
+		if (p->sleep_avg >= run_time)
+			p->sleep_avg -= run_time;
+	}
+
+	/*
+	 * Update throttle position and sanity check it.
+	 */
+	if (task_is_interactive(p))
+		p->throttle += slice_time - slice;
+	else if (INTERACTIVE_LIMIT_EXCEEDED(rq) &&
+			cpu_avg - cpu_avg_rq(rq) >= PCNT_PER_DYNPRIO) {
+		bonus = (cpu_avg - cpu_avg_rq(rq)) / PCNT_PER_DYNPRIO;
+		p->throttle -= slice_time * bonus;
+	} else if (cpu < cpu_max(p) + PCNT_PER_DYNPRIO) {
+		bonus = idle * PCNT_PER_DYNPRIO / 100;
+		p->throttle += (slice_time - slice) * bonus;
+	} else  if (cpu >= cpu_max(p) + PCNT_PER_DYNPRIO) {
+		bonus = (cpu - cpu_max(p)) / PCNT_PER_DYNPRIO;
+		p->throttle -= slice_time * bonus;
+	}
+
+	if (time_before(jiffies, p->throttle))
+		p->throttle = jiffies;
+	else if (credit_exhausted(p, C3))
+		p->throttle = jiffies - C3;
+
+	/* Add our slice time to the runqueue average. */
+	if (slice_time < HZ || slice_time < rq->nr_running * DEF_TIMESLICE) {
+		rq->slice_avg <<= 4;
+		rq->slice_avg += slice_time;
+		rq->slice_avg >>= 4;
+	}
+
+	/*
+	 * Ensure that SCHED_INTERACTIVE tasks and their partners will
+	 * always be classified correctly by TASK_INTERACTIVE(). Clear
+	 * propagated interactive task status.  Propagated status is
+	 * inherited from the parent, but is good for only one slice.
+	 */
+	if (task_is_interactive(p) && p->sleep_avg < INTERACTIVE_SLEEP(p))
+		p->sleep_avg = INTERACTIVE_SLEEP(p);
+	clr_task_is_interactive(p);
+
+	/* Update dynamic priority and time slice. */
+	p->prio = effective_prio(p);
+	p->time_slice = task_timeslice(p);
+	set_last_slice(p, p->time_slice);
+
+	/* And finally, stamp and flag the new slice. */
+	clr_first_time_slice(p);
+	set_slice_is_new(p);
+	p->last_slice = jiffies;
+}
+
 static void task_running_tick(struct rq *rq, struct task_struct *p)
 {
+	int task_was_interactive;
+
 	if (p->array != rq->active) {
 		/* Task has expired but was not scheduled yet */
 		set_tsk_need_resched(p);
@@ -3152,8 +3492,7 @@ static void task_running_tick(struct rq 
 		 * FIFO tasks have no timeslices.
 		 */
 		if ((p->policy == SCHED_RR) && !--p->time_slice) {
-			p->time_slice = task_timeslice(p);
-			p->first_time_slice = 0;
+			refresh_timeslice(p);
 			set_tsk_need_resched(p);
 
 			/* put it at the end of the queue: */
@@ -3161,21 +3500,36 @@ static void task_running_tick(struct rq 
 		}
 		goto out_unlock;
 	}
+
+	/*
+	 * Tick off interactive task ticks from the active array.
+	 */
+	task_was_interactive = TASK_INTERACTIVE(p);
+	if (task_was_interactive && --rq->active->interactive_ticks < 0)
+		rq->active->interactive_ticks = 0;
+
 	if (!--p->time_slice) {
 		dequeue_task(p, rq->active);
+		refresh_timeslice(p);
 		set_tsk_need_resched(p);
-		p->prio = effective_prio(p);
-		p->time_slice = task_timeslice(p);
-		p->first_time_slice = 0;
-
-		if (!rq->expired_timestamp)
-			rq->expired_timestamp = jiffies;
-		if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
+
+		if (!TASK_INTERACTIVE(p) || expired_starving(rq) ||
+				credit_exhausted(p, C2)) {
 			enqueue_task(p, rq->expired);
 			if (p->static_prio < rq->best_expired_prio)
 				rq->best_expired_prio = p->static_prio;
 		} else
 			enqueue_task(p, rq->active);
+
+		/*
+		 * Always look to see if any queue under you is starving,
+		 * and requeue a task if that is the case.  This prevents
+		 * things like multiple tasks at any priority waking in
+		 * streams and starving their less fortunate peers via
+		 * preemption, ie it ensures that the less fortunate will have
+		 * bounded latency.
+		 */
+		promote_next_lower(rq, p->prio);
 	} else {
 		/*
 		 * Prevent a too long timeslice allowing a task to monopolize
@@ -3285,7 +3639,7 @@ asmlinkage void __sched schedule(void)
 	struct list_head *queue;
 	unsigned long long now;
 	unsigned long run_time;
-	int cpu, idx, new_prio;
+	int cpu, idx, new_prio, throttle;
 	long *switch_count;
 	struct rq *rq;
 
@@ -3332,9 +3686,13 @@ need_resched_nonpreemptible:
 
 	/*
 	 * Tasks charged proportionately less run_time at high sleep_avg to
-	 * delay them losing their interactive status
-	 */
-	run_time /= (CURRENT_BONUS(prev) ? : 1);
+	 * delay them losing their interactive status.  If we have too many
+	 * interactive ticks queued or this task is being throttled, switch
+	 * behavior to linear decay.
+	 */
+	throttle = INTERACTIVE_LIMIT_EXCEEDED(rq) || credit_exhausted(prev, C2);
+	if (!throttle)
+		run_time /= 1 + CURRENT_BONUS(prev);
 
 	spin_lock_irq(&rq->lock);
 
@@ -3356,7 +3714,7 @@ need_resched_nonpreemptible:
 		idle_balance(cpu, rq);
 		if (!rq->nr_running) {
 			next = rq->idle;
-			rq->expired_timestamp = 0;
+			rq->switch_timestamp = jiffies;
 			goto switch_tasks;
 		}
 	}
@@ -3370,7 +3728,8 @@ need_resched_nonpreemptible:
 		rq->active = rq->expired;
 		rq->expired = array;
 		array = rq->active;
-		rq->expired_timestamp = 0;
+		array->interactive_ticks = 0;
+		rq->switch_timestamp = jiffies;
 		rq->best_expired_prio = MAX_PRIO;
 	}
 
@@ -3380,6 +3739,8 @@ need_resched_nonpreemptible:
 
 	if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
 		unsigned long long delta = now - next->timestamp;
+		int next_interactive = TASK_INTERACTIVE(next);
+
 		if (unlikely((long long)(now - next->timestamp) < 0))
 			delta = 0;
 
@@ -3389,14 +3750,33 @@ need_resched_nonpreemptible:
 		array = next->array;
 		new_prio = recalc_task_prio(next, next->timestamp + delta);
 
+		/*
+		 * If INTERACTIVE_LIMIT is exceeded, do not promote
+		 * tasks which already have interactive status.  This
+		 * can only make things worse if the load isn't truly
+		 * interactive, so let them decay.  We also don't want
+		 * a task which has been promoted while waiting to
+		 * get CPU after wakeup to be demoted, and thus end
+		 * up being preempted immediately by a task waking
+		 * at the priority it has just reached.  Tasks which
+		 * miss the tick frequently also get caught here, so
+		 * care has to be taken to not help them along. Since
+		 * these are very likely to have interactive status,
+		 * don't ever demote a non-interactive task here, and
+		 * always consider interactive tasks to be fair game.
+		 */
+		if ((throttle && next_interactive && new_prio < next->prio) ||
+			(!next_interactive && new_prio > next->prio))
+			goto switch_tasks;
+
 		if (unlikely(next->prio != new_prio)) {
 			dequeue_task(next, array);
 			next->prio = new_prio;
 			enqueue_task(next, array);
 		}
 	}
-	next->sleep_type = SLEEP_NORMAL;
 switch_tasks:
+	next->sleep_type = SLEEP_NORMAL;
 	if (next == rq->idle)
 		schedstat_inc(rq, sched_goidle);
 	prefetch(next);
@@ -3411,6 +3791,14 @@ switch_tasks:
 		prev->sleep_avg = 0;
 	prev->timestamp = prev->last_ran = now;
 
+	/*
+	 * Tag start of execution of a new timeslice.
+	 */
+	if (unlikely(slice_is_new(next))) {
+		next->last_slice = jiffies;
+		clr_slice_is_new(next);
+	}
+
 	sched_info_switch(prev, next);
 	if (likely(prev != next)) {
 		next->timestamp = next->last_ran = now;
@@ -4081,7 +4469,8 @@ recheck:
 	if (policy < 0)
 		policy = oldpolicy = p->policy;
 	else if (policy != SCHED_FIFO && policy != SCHED_RR &&
-			policy != SCHED_NORMAL && policy != SCHED_BATCH)
+			policy != SCHED_NORMAL && policy != SCHED_BATCH &&
+			policy != SCHED_INTERACTIVE)
 		return -EINVAL;
 	/*
 	 * Valid priorities for SCHED_FIFO and SCHED_RR are
@@ -4619,6 +5008,7 @@ asmlinkage long sys_sched_get_priority_m
 		break;
 	case SCHED_NORMAL:
 	case SCHED_BATCH:
+	case SCHED_INTERACTIVE:
 		ret = 0;
 		break;
 	}
@@ -4643,6 +5033,7 @@ asmlinkage long sys_sched_get_priority_m
 		break;
 	case SCHED_NORMAL:
 	case SCHED_BATCH:
+	case SCHED_INTERACTIVE:
 		ret = 0;
 	}
 	return ret;
@@ -6739,6 +7130,7 @@ void __init sched_init(void)
 		rq->active = rq->arrays;
 		rq->expired = rq->arrays + 1;
 		rq->best_expired_prio = MAX_PRIO;
+		rq->slice_avg = STARVATION_LIMIT;
 
 #ifdef CONFIG_SMP
 		rq->sd = NULL;
Index: linux/kernel/sysctl.c
===================================================================
--- linux.orig/kernel/sysctl.c
+++ linux/kernel/sysctl.c
@@ -76,6 +76,9 @@ extern int pid_max_min, pid_max_max;
 extern int sysctl_drop_caches;
 extern int percpu_pagelist_fraction;
 extern int compat_log;
+extern int credit_c1;
+extern int credit_c2;
+extern int credit_max;
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
@@ -204,6 +207,13 @@ static ctl_table root_table[] = {
 	{ .ctl_name = 0 }
 };
 
+/*
+ * Constants for minimum and maximum testing in vm_table and
+ * kern_table.  We use these as one-element integer vectors.
+ */
+static int zero;
+static int one_hundred = 100;
+
 static ctl_table kern_table[] = {
 	{
 		.ctl_name	= KERN_PANIC,
@@ -611,16 +621,31 @@ static ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
-
+	{
+		.ctl_name	= KERN_SCHED_THROTTLE1,
+		.procname	= "credit_c1",
+		.data		= &credit_c1,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &credit_max,
+	},
+	{
+		.ctl_name	= KERN_SCHED_THROTTLE2,
+		.procname	= "credit_c2",
+		.data		= &credit_c2,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &credit_max,
+	},
 	{ .ctl_name = 0 }
 };
 
-/* Constants for minimum and maximum testing in vm_table.
-   We use these as one-element integer vectors. */
-static int zero;
-static int one_hundred = 100;
-
-
 static ctl_table vm_table[] = {
 	{
 		.ctl_name	= VM_OVERCOMMIT_MEMORY,

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-09 12:14                                           ` Rene Herman
  2007-04-09 13:27                                             ` Andreas Mohr
@ 2007-04-09 14:15                                             ` Ingo Molnar
  2007-04-09 17:05                                               ` Rene Herman
  2007-04-09 17:10                                             ` Mike Galbraith
  2 siblings, 1 reply; 92+ messages in thread
From: Ingo Molnar @ 2007-04-09 14:15 UTC (permalink / raw)
  To: Rene Herman
  Cc: Mike Galbraith, Gene Heskett, linux-kernel, Con Kolivas, Andrew Morton


* Rene Herman <rene.herman@gmail.com> wrote:

> To me, the example rather serves as confirmation of what Kolivas has 
> been saying; endlessly tweaking the tweaks isn't going anywhere.

but ... SD clearly regresses in some areas, so by that logic SD isn't 
going anywhere either?

note that i still like the basic idea behind SD: that it is an experiment 
testing whether, if the only conceptual focus is on "scheduling fairness", 
we'll get a better scheduler. But for that to work out, two things have 
to be done i think:

 - the code actually has to match that stated goal. Right now it
   diverges from it (it is not a "fair" scheduler), and it's not clear
   why.

note that SD at the moment produces ~10% more code in sched.o, and the 
reason is that SD is more complex than the vanilla scheduler. People 
tend to get the impression that SD is simpler, partly because it is a 
net linecount win in sched.c, but many of the removed lines are 
comments.

this "provide fairness" goal is quite important, because if SD's code is 
not only about providing fairness, what is the rest of the logic doing? 
Are they "tweaks", to achieve interactivity? If yes, why are they not 
marked as such? I.e. will we go down the _same_ road again, but this 
time with a much less clearly defined rule for what a "tweak" is?

note that under the interactivity estimator it is not that hard to 
achieve forced "fairness".

So _if_ we accept that scheduling must include a fair dose of heuristics 
(which i tend to think it has to), we are perhaps better off with an 
interactivity design that _accepts_ this fundamental fact and separates 
heuristics from core scheduling. Right now i don't see the SD proponents 
even _accepting_ that the current SD code does include heuristics.

the other one is:

 - the code has to demonstrate that it can flexibly react to various 
   complaints of regressions.

(I identified a few problem workloads that we tend to care about and i 
haven't seen much progress with them - but i really reserve judgement 
about that, given Con's medical condition.)

	Ingo

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-09 13:53                                         ` Ingo Molnar
@ 2007-04-09 15:37                                           ` Rene Herman
  0 siblings, 0 replies; 92+ messages in thread
From: Rene Herman @ 2007-04-09 15:37 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Gene Heskett, linux-kernel, Con Kolivas, Mike Galbraith,
	Andrew Morton, ck list

On 04/09/2007 03:53 PM, Ingo Molnar wrote:

> In any case, it would be very nice if you could try Mike's latest
> patch, how does it work on your setup? (i've attached it)

Can do. Note that "my setup" in that case consisted of browsing around 
eBay in firefox with ogg123 playing audio directly to ALSA in an xterm 
as the only other thing running. That is, just about as basic a Linux 
desktop as imaginable.

Testing Mike's latest will have to wait a bit though; I'm currently 
testing the latest incarnation of SD (against 2.6.20.6). For people 
who've lost track of what and where, it's available as:

http://ck.kolivas.org/patches/staircase-deadline/2.6.20.5-sd-0.39.patch

and versus 2.6.21-rc5 as:

http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc5-sd-0.39.patch

For the moment it is giving me a snappy feeling desktop on this Duron 
1300, with ogg123 playing in an xterm without audio underruns, with a 
make -j2 kernel compile running (not niced) and me browsing around in 
firefox.

Mike's latest would probably also support this load without much problem. 
Given that I feel the basic idea of SD is better than mainline though, 
I'll be concentrating on using SD for a bit for now.

Rene.


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-09 11:26                               ` Ed Tomlinson
@ 2007-04-09 16:50                                 ` Mike Galbraith
  2007-04-22 10:48                                   ` [ck] " Martin Steigerwald
  0 siblings, 1 reply; 92+ messages in thread
From: Mike Galbraith @ 2007-04-09 16:50 UTC (permalink / raw)
  To: Ed Tomlinson; +Cc: Con Kolivas, Ingo Molnar, linux list, Andrew Morton, ck list

On Mon, 2007-04-09 at 07:26 -0400, Ed Tomlinson wrote:
> On Monday 09 April 2007 01:38, Mike Galbraith wrote:
> > On Sun, 2007-04-08 at 09:08 -0400, Ed Tomlinson wrote:
> > > Hi,
> > > 
> > > I am one of those who have been happily testing Con's patches.  
> > > 
> > > They work better than mainline here.
> > 
> > (I tried a UP kernel yesterday, and even a single kernel build would
> > make noticeable hitches if I move a window around. YMMV etc.)
> 
> Interesting.  I run UP amd64, 1000HZ, 1.25G, preempt off (on causes kernel 
> stalls with no messages - but that is another story).  I do not notice a single 
> make.   When several are running the desktop slows down a bit.  I do not have 
> X niced.  Wonder why we see such different results?

Probably because with your processor, in general cc1 can get the job
done faster, as can X.  The big latency hit happens when you hit the end
of the rotation.  You simply don't hit it as often as I do.  Anyone with
an old PIII box should hit the wall very quickly indeed.  I haven't had
time to try it here.
 
> I am not saying that SD is perfect - I fully expect that more bugs will turn up
> in its code (some will affect mainline too).  I do however like the idea of a 
> scheduler that does not need alchemy to achieve good results.  Nor do I
> necessarily expect it to be 100% transparent.  If one changes something
> as basic as the scheduler some tweaking should be expected.  IMO this
> is fine as long as we get consistent results.

Alchemy is a rather colorful word for arithmetic, but I see your point.

> > > If one really needs some sort of interactivity booster (I do not with SD), why
> > > not move it into user space?  With SD it would be simple enough to export
> > > some info on estimated latency.  With this user space could make a good
> > > attempt to keep latency within bounds for a set of tasks just by renicing.... 
> > 
> > I don't think you can have very much effect on latency using nice with
> > SD once the CPU is fully utilized.  See below.
> > 
> > /*
> >  * This contains a bitmap for each dynamic priority level with empty slots
> >  * for the valid priorities each different nice level can have. It allows
> >  * us to stagger the slots where differing priorities run in a way that
> >  * keeps latency differences between different nice levels at a minimum.
> >  * ie, where 0 means a slot for that priority, priority running from left to
> >  * right:
> >  * nice -20 0000000000000000000000000000000000000000
> >  * nice -10 1001000100100010001001000100010010001000
> >  * nice   0 0101010101010101010101010101010101010101
> >  * nice   5 1101011010110101101011010110101101011011
> >  * nice  10 0110111011011101110110111011101101110111
> >  * nice  15 0111110111111011111101111101111110111111
> >  * nice  19 1111111111111111111011111111111111111111
> >  */
> > 
> > Nice allocates bandwidth, but as long as the CPU is busy, tasks always
> > proceed downward in priority until they hit the expired array.  That's
> > the design.  If X gets busy and expires, and a nice 20 CPU hog wakes up
> > after its previous rotation has ended, but before the current rotation
> > is ended (ie there is 1 task running at wakeup time), X will take a
> > guaranteed minimum 160ms latency hit (quite noticeable) independent of
> > nice level.  The only way to avoid it is to use a realtime class.
> > 
> > A nice -20 task has maximum bandwidth allocated, but that also makes it
> > a bigger target for preemption from tasks at all nice levels as it
> > proceeds downward toward expiration.  AFAICT, low latency scheduling
> > just isn't possible once the CPU becomes 100% utilized, but it is
> > bounded to runqueue length.  In mainline OTOH, a nice -20 task will
> > always preempt a nice 0 task, giving it instant gratification, and
> > latency of lower priority tasks is bounded by the EXPIRED_STARVING(rq)
> > safety net.
> 
> Mike I made no mention of low latency.

You did say that Con's patch works better than mainline, and you seemed
very much to be talking about the desktop.  X very definitely is a
latency sensitive application, and often a CPU hog to boot.  The point I
illustrated above is a salient point.

If you don't want to hear about anything other than this idea about
using nice from userland, skip to my last sentence :)

>   I did mention predictable latency.  If
> you are 100% utilized, and have a nice -20 task cpu hog, I would expect it to run 
> and that it _should_ affect other tasks - that's why it runs with -20...

:-/  It does the user of X absolutely no good to be able to predict, as
I did above, that we are absolutely _going_ to take a 160ms + remaining
task ticks latency hit.

Nice -20 was used only to show clearly what SD trades away, and it's not
only the desktop it's trading for mundane latency; it's trading away any
possibility of low latency, and dismissing burst loads as if they don't
even exist.  The current scheduler is dynamic.  SD is utterly rigid.

Apply what I wrote to X at the recommended nice -10.  It makes no
difference what bandwidth you allocate if the latency sensitive
application _will_ take a very major latency hit if it uses it.  X does
do that, so it will take those hits by design.

> This is why I suggest that user space may be a better place to boost interactive
> tasks.  A daemon that posted a message telling me that the nice -20 cpu hog
> is causing 300ms delays for X would, IMHO, be a good thing.  That same daemon
> could then propose a fix telling me the expected latencies and let me decide if 
> I want to change priorities.  It could also be set to automatically adjust nice levels...

Re-read what I wrote.  You simply can't get there from here, by design.

If I'm wrong, someone please show me where.

	-Mike


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-09 14:15                                             ` Ingo Molnar
@ 2007-04-09 17:05                                               ` Rene Herman
  2007-04-09 17:48                                                 ` Ingo Molnar
  0 siblings, 1 reply; 92+ messages in thread
From: Rene Herman @ 2007-04-09 17:05 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Mike Galbraith, Gene Heskett, linux-kernel, Con Kolivas, Andrew Morton

On 04/09/2007 04:15 PM, Ingo Molnar wrote:

> * Rene Herman <rene.herman@gmail.com> wrote:
> 
>> To me, the example rather serves as confirmation of what Kolivas
>> has been saying; endlessly tweaking the tweaks isn't going
>> anywhere.
> 
> but ... SD clearly regresses in some areas, so by that logic SD isnt
> going anywhere either?

No. The logic isn't that (performance and other) characteristics must 
always be exactly the same between two schedulers; the logic is that 
having one of them turn into a contrived heap of heuristics, where every 
progression on one front turns into a regression on another, means that 
one is on a dead-end road.

Now of course, while it need not behave the same in all conceivable 
situations, any alternative like SD needs to behave _well_, and for me 
it currently does, in everyday use.

> note that i still like the basic idea about SD, that it is an
> experiment that if the only conceptual focus is on "scheduling
> fairness", we'll get a better scheduler. But for that to work out two
> things have to be done i think:
> 
> - the code actually has to match that stated goal. Right now it 
>   diverges from it (it is not a "fair" scheduler), and it's not clear 
>   why.

I read most of the discussion centering around that specific point as 
well, and frankly, I mostly came away from it thinking "so what?". It 
seems this is largely an issue of you and Kolivas disagreeing on what 
needs to be called design and what needs to be called implementation, 
but more importantly I feel a solution is to just shy away from the 
inherently subjective word "fair". If you feel that some of the things 
SD does need to be called "unfair" as much as mainline, so be it, but do 
you think that SD is less _predictably_ fair or unfair than mainline?

This is what I consider to be very important; if my retarded kid brother 
sometimes walks left and sometimes right when I tell him to walk forward, 
I can't go stand to the right and say "nono, forward I said". If on the 
right there's a highway, you can imagine what that means... All software 
is stupid, but the one that's predictably so allows you to compensate.

> this "provide fairness" goal is quite important, because if SD's code
> is not only about providing fairness, what is the rest of the logic
> doing? Are they "tweaks", to achieve interactivity? If yes, why are
> they not marked as such? I.e. will we go down the _same_ road again,
> but this time with a much less clearly defined rule for what a
> "tweak" is?

One answer to that is that it's much less important what a tweak is as 
long as it's always the same. If I then don't like the definition I'll 
just define it the other way around privately and be done with it. I do 
believe that SD's objective is not fairness as such, but predictability. 
Being "fair" was postulated as a condition for being so, but let's not 
put too much focus on that one point; it's a matter of definitions (and 
taste) and secondary.

> So _if_ we accept that scheduling must include a fair dose of
> heuristics (which i tend to think it has to), we are perhaps better
> off with an interactivity design that _accepts_ this fundamental fact
> and separates heuristics from core scheduling.

I agree that the demands on a (one) general purpose scheduler are so 
diverse that it's impossible to have one that doesn't break down under 
some set of conditions. The mainline scheduler does so, and SD does so. 
What SD does is take some of the guesswork out of it. I haven't needed 
anything like it yet, but I wouldn't feel particularly bad about, say, 
renicing a kernel compile upon having audio stutter while I'm browsing eBay.

The "I haven't needed anything like it" is important; I ofcourse only 
wouldn't mind it under the condition that what I consider loads that my 
desktop should be able to handle without problem don't need anything 
special. If I'd transpose this load onto the Pentium 1 that's sitting at 
my feet, I wouldn't mind at all though.

> Right now i dont see the SD proponents even _accepting_ that even the
> current SD code does include heuristics.
> 
> the other one is:
> 
>  - the code has to demonstrate that it can flexibly react to various 
>    complaints of regressions.

With one important point -- if every single _change_ in behaviour is 
going to be defined as a regression, then obviously no one will ever again 
be able to change anything fundamental. Behaviour is being changed since 
people see current behaviour as not being desirable. Predictability for 
one is in my opinion a strong enough "progression" that I'm willing to 
mark off a few "regressions" against it.

> (I identified a few problem workloads that we tend to care about and
> i havent seen much progress with them - but i really reserve
> judgement about that, given Con's medical condition.)

Indeed. Going forward with it while its main developer is out might be 
unwise of course. From his emails I gather he'll be out for some time, 
but hey, after kernel N+1, there'll probably be a kernel N+2...

I'd just hate to see this being blocked outright. It seems to be 
performing so nicely for me.

Rene.


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-09 12:14                                           ` Rene Herman
  2007-04-09 13:27                                             ` Andreas Mohr
  2007-04-09 14:15                                             ` Ingo Molnar
@ 2007-04-09 17:10                                             ` Mike Galbraith
  2 siblings, 0 replies; 92+ messages in thread
From: Mike Galbraith @ 2007-04-09 17:10 UTC (permalink / raw)
  To: Rene Herman
  Cc: Ingo Molnar, Gene Heskett, linux-kernel, Con Kolivas,
	Andrew Morton, ck list

On Mon, 2007-04-09 at 14:14 +0200, Rene Herman wrote:

> This turned into an interactivity thing, and while interactivity is in 
> fact better for a large majority of testers, that isn't what Kolivas' 
> scheduler is about. It's about predictability and leaving the dead-end 
> road of these endlesss tweaks, which then break previous tweaks, rinse, 
> repeat.

To me, it's more than an interactivity thing.  It is also about reacting
to a dynamic environment, which the desktop is.  SD is not dynamic.

> It's unfortunate that Kolivas is having health problems currently, but I 
> certainly do hope that his scheduler finds its way into _a_ -rc1. He 
> said it was done...

Well, there I disagree with him quite strongly, but it's not my decision
what gets integrated into any tree but my own ;-)

	-Mike


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-09 17:05                                               ` Rene Herman
@ 2007-04-09 17:48                                                 ` Ingo Molnar
  2007-04-09 19:09                                                   ` Rene Herman
  2007-04-09 19:56                                                   ` Gene Heskett
  0 siblings, 2 replies; 92+ messages in thread
From: Ingo Molnar @ 2007-04-09 17:48 UTC (permalink / raw)
  To: Rene Herman
  Cc: Mike Galbraith, Gene Heskett, linux-kernel, Con Kolivas, Andrew Morton


* Rene Herman <rene.herman@gmail.com> wrote:

> > - the code actually has to match that stated goal. Right now it
> >   diverges from it (it is not a "fair" scheduler), and it's not 
> >   clear why.
> 
> I read most of the discussion centering around that specific point as 
> well, and frankly, I mostly came away from it thinking "so what?". 
> [...]

it's important due to what Mike mentioned in the previous mail too: SD 
seems to be quite rigid in certain aspects. So if we end up with that 
fundamental rigidity we had better be _very_ sure that it makes 
sense. Because otherwise there might be no other way out but to "revert 
the whole thing again". Today we always have the "tweak the 
interactivity estimator" route, because that code is not rigid at the 
core of the scheduler.

> [...] one of them turn into a contrived heap of heuristics, where every 
> progression on one front turns into a regression on another, means that 
> one is on a dead-end road.

that's not what i found when testing Mike's latest patches - they 
visibly improved those testcases, some of which were written to 
"exploit" heuristics, without regressing others. Several people reported 
improvements with those patches.

Why was that possible without spending years on writing a new scheduler? 
Because the interactivity estimator is fundamentally _tweakable_. What 
you flag with sometimes derogatory sentences as a weakness of the 
interactivity estimator is also its strength: tweakability is 
flexibility. And no, despite what you claim to be a "patchwork" it makes 
quite some sense: reward certain scheduling behavior and punish other 
types of behavior. That's what SD does too in the end. Sure, if your 
"reward" fights against the "punishment", they cancel out each other, or 
if the metrics used are just arbitrary and make no independent sense 
it's bad, but that's just plain bad engineering.

Why didn't much happen in the past year or so? Frankly, due to lack of 
demand for change - because most people were just happy with it, or 
just not upset enough. And i know the types of complaints first-hand; 
the -rt tree is a _direct answer_ to desktop-space complaints of Linux 
and it includes a fair bit of scheduler changes too. Now that we have 
actual new testcases and people with complaints and their willingness to 
try patches, we can do something about it.

> > the other one is:
> >
> > - the code has to demonstrate that it can flexibly react to various 
> >   complaints of regressions.
> 
> With one important point -- if every single _change_ in behaviour is 
> going to be defined as a regression, then obviously no one will ever again 
> be able to change anything fundamental. [...]

i didn't say that - in fact my first comment about RSDL on lkml was 
the exact opposite - but you SD advocates are _still_ bickering about 
(and not accepting) fundamental things like Mike's make -j5 workload and 
flagging it as unrealistic, so while there's so much reality disconnect 
there's not much chance for this issue to progress, i'm afraid.

	Ingo

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: Ten percent test
  2007-04-07 19:14                               ` Mike Galbraith
  2007-04-07 20:31                                 ` Gene Heskett
@ 2007-04-09 17:51                                 ` William Lee Irwin III
  2007-04-09 18:03                                   ` Ingo Molnar
  1 sibling, 1 reply; 92+ messages in thread
From: William Lee Irwin III @ 2007-04-09 17:51 UTC (permalink / raw)
  To: Mike Galbraith
  Cc: Ingo Molnar, Gene Heskett, linux-kernel, Con Kolivas,
	Andrew Morton, ck list

On Sat, 2007-04-07 at 20:08 +0200, Ingo Molnar wrote:
>> not many - and i dont think Mike tested any of these - Mike tested 
>> pretty low make -j values (Mike, can you confirm?).

On Sat, Apr 07, 2007 at 09:14:21PM +0200, Mike Galbraith wrote:
> Yes.  I don't test anything more than make -j5 when looking at
> interactivity, and make -j nr_cpus+1 is my must have yardstick.

I strongly suggest assembling a battery of cleanly and properly written,
configurable testcases, and scripting a series of regression tests as
opposed to just randomly running kernel compiles and relying on Braille.
For instance, a program that spawns a set of tasks with some spectrum
of interactive vs. noninteractive behaviors and maybe priorities too
according to command-line flags and then measures and reports the
distribution of CPU bandwidth between them, with some notion of success
or failure and performance within the realm of success reported would
be something to include in such a battery of testcases. Different sorts
of cooperating processes attempting to defeat whatever sorts of
guarantees the scheduler is intended to provide would also be good
testcases, particularly if they're arranged so as to automatically
report success or failure in their attempts to defeat the scheduler
(which even irman2.c, while quite good otherwise, fails to do).
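
As a sketch of such a self-reporting testcase (illustrative only; the
pass/fail threshold is arbitrary): two cooperating tasks ping-pong a
byte over pipes, burning a short CPU burst per wakeup so they look
interactive to sleep-based estimators, while a plain hog spins
alongside; the parent then reports whether the pair managed to starve
the hog.

	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>
	#include <time.h>
	#include <sys/mman.h>
	#include <sys/wait.h>

	static long long now_ns(void)
	{
		struct timespec ts;

		clock_gettime(CLOCK_MONOTONIC, &ts);
		return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
	}

	static void burn(void)	/* ~1 ms of pure spinning */
	{
		long long stop = now_ns() + 1000000;

		while (now_ns() < stop)
			;
	}

	int main(int argc, char **argv)
	{
		int secs = argc > 1 ? atoi(argv[1]) : 30;
		long long end = now_ns() + (long long)secs * 1000000000LL;
		unsigned long *count = mmap(NULL, 3 * sizeof(*count),
					    PROT_READ | PROT_WRITE,
					    MAP_SHARED | MAP_ANONYMOUS, -1, 0);
		int pp[2][2], i;
		char c = 0;

		pipe(pp[0]);
		pipe(pp[1]);

		for (i = 0; i < 2; i++)		/* the cooperating pair */
			if (fork() == 0) {
				while (now_ns() < end) {
					read(pp[i][0], &c, 1);	/* block: looks like a sleep */
					burn();
					count[i]++;
					write(pp[1 - i][1], &c, 1);	/* wake partner */
				}
				write(pp[1 - i][1], &c, 1);	/* let partner exit, too */
				exit(0);
			}
		if (fork() == 0) {		/* the plain hog */
			while (now_ns() < end) {
				burn();
				count[2]++;
			}
			exit(0);
		}
		write(pp[0][1], &c, 1);		/* kick off the ping-pong */
		while (wait(NULL) > 0)
			;
		printf("pair %lu+%lu work units vs hog %lu: %s\n",
		       count[0], count[1], count[2],
		       count[0] + count[1] > 4 * count[2] ?
		       "FAIL (pair starved the hog)" : "PASS");
		return 0;
	}

A scripted battery could run this and its siblings on each candidate
scheduler and simply diff the PASS/FAIL lines.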

IMHO the failure of these threads to converge to some clear conclusion
is in part due to the lack of an agreed-upon set of standards for what
the scheduler should achieve and overreliance on subjective criteria.
The testcase code going around is also somewhat embarrassing.

From the point of view of someone wondering what these schedulers solve,
how any of this is to be demonstrated, and what the status of various
pathological cases are, these threads are a nightmare of subjective
squishiness and a tug-of-war between testcases only ever considered one
at a time needing Lindent to read that furthermore have all their
parameters hardcoded. Scripting edits and recompiles is awkward. Just
finding the testcases is also awkward; con has a collection of a few,
but they've got the aforementioned flaws and others also go around
that can only be dredged up from mailing list archive searches, plus
there's nothing like LTP where they can be run in a script with
pass/fail reports and/or performance metrics for each. One patch goes
through for one testcase and regressions against the others are open
questions.

Scheduling does have a strong subjective component, but this is too
disorganized to be allowed to pass without comment. Some minimum bar
must be set for schedulers to pass before they're considered correct.
Some method of regression testing must be arranged. And the code to
do such testing should not be complete crap with hardcoded parameters.


-- wli


* Re: Ten percent test
  2007-04-09 17:51                                 ` William Lee Irwin III
@ 2007-04-09 18:03                                   ` Ingo Molnar
  2007-04-09 18:44                                     ` William Lee Irwin III
  0 siblings, 1 reply; 92+ messages in thread
From: Ingo Molnar @ 2007-04-09 18:03 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Mike Galbraith, Gene Heskett, linux-kernel, Con Kolivas, Andrew Morton


* William Lee Irwin III <wli@holomorphy.com> wrote:

> I strongly suggest assembling a battery of cleanly and properly 
> written, configurable testcases, and scripting a series of regression 
> tests as opposed to just randomly running kernel compiles and relying 
> on Braille.

there's interbench, written by Con (with the purpose of improving 
RSDL/SD), which does exactly that, but vanilla and SD perform much the 
same in those tests.

it's quite hard to test interactivity, both because it's subjective and 
because even for objective workloads, things depend so much on exact 
circumstances. So the best way is to wait for actual complaints, and/or 
actual testcases that trigger badness, and victims^H^H^H^H^H testers.

(also note that it often takes _that precise_ a workload to trigger some 
badness. For example make -j depends on the kind of X terminal that is 
used - gterm behaves differently from xterm, etc.)

	Ingo


* Re: Ten percent test
  2007-04-09 18:03                                   ` Ingo Molnar
@ 2007-04-09 18:44                                     ` William Lee Irwin III
  0 siblings, 0 replies; 92+ messages in thread
From: William Lee Irwin III @ 2007-04-09 18:44 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Mike Galbraith, Gene Heskett, linux-kernel, Con Kolivas, Andrew Morton

* William Lee Irwin III <wli@holomorphy.com> wrote:
>> I strongly suggest assembling a battery of cleanly and properly 
>> written, configurable testcases, and scripting a series of regression 
>> tests as opposed to just randomly running kernel compiles and relying 
>> on Braille.

On Mon, Apr 09, 2007 at 08:03:56PM +0200, Ingo Molnar wrote:
> there's interbench, written by Con (with the purpose of improving 
> RSDL/SD), which does exactly that, but vanilla and SD perform much the 
> same in those tests.
> it's quite hard to test interactivity, both because it's subjective and 
> because even for objective workloads, things depend so much on exact 
> circumstances. So the best way is to wait for actual complaints, and/or 
> actual testcases that trigger badness, and victims^H^H^H^H^H testers.
> (also note that it often takes _that precise_ a workload to trigger some 
> badness. For example make -j depends on the kind of X terminal 
> that is used - gterm behaves differently from xterm, etc.)

Interactivity will probably have to stay squishy. The DoS affairs like
fiftyp.c, tenp.c, etc. are more of what I had in mind. There are also
a number of instances where the CPU bandwidth distribution of
noninteractive tests is gauged by top(1); that is where the scriptable
testcase affair should be coming into play.

There are other, relatively obvious testcases for basic functionality
missing, too. For instance, where is the testcase to prove that nice
levels have the intended effect upon CPU bandwidth distribution between
sets of CPU-bound tasks? Or one that gauges the CPU bandwidth
distribution between a task that sleeps some (command-line configurable)
percentage of the time and some (command-line configurable) number of
competing CPU-bound tasks? Or one that gauges the CPU bandwidth
distribution between sets of cooperating processes competing with
ordinary CPU-bound processes? Can it be proven that any of this is
staying constant across interactivity or other changes? Is any of it
being changed as an unintended side-effect? Are the CPU bandwidth
distributions among such sets of competing tasks even consciously decided?

There should be readily-available answers to these questions, but there
are none.
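
The first of those, at least, would take almost no code. A sketch (the
name niceshare.c and the 30-second run time are arbitrary, the expected
shares are deliberately left open, and on SMP you would pin everything
to one CPU first for meaningful numbers):

/*
 * niceshare.c - sketch: does nice do what we think it does?
 * Forks one pure CPU hog per nice level given on the command line,
 * lets them compete for 30 wall seconds, then reports each hog's
 * share of the consumed CPU time via wait4()'s rusage.
 *
 * usage: niceshare 0 0 10 19
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <signal.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/wait.h>

int main(int argc, char **argv)
{
        int i, n = argc - 1;
        pid_t pid[n > 0 ? n : 1];
        double used[n > 0 ? n : 1], total = 0;

        if (n < 1)
                return 1;
        for (i = 0; i < n; i++) {
                if ((pid[i] = fork()) == 0) {
                        setpriority(PRIO_PROCESS, 0, atoi(argv[i + 1]));
                        for (;;)
                                ;       /* pure CPU hog */
                }
        }
        sleep(30);
        for (i = 0; i < n; i++)
                kill(pid[i], SIGKILL);
        for (i = 0; i < n; i++) {
                struct rusage ru;

                wait4(pid[i], NULL, 0, &ru);
                used[i] = ru.ru_utime.tv_sec + ru.ru_utime.tv_usec / 1e6;
                total += used[i];
        }
        for (i = 0; i < n; i++)
                printf("nice %3d: %5.1f%% of consumed CPU\n",
                       atoi(argv[i + 1]), 100.0 * used[i] / total);
        return 0;
}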


-- wli


* Re: Ten percent test
  2007-04-09 17:48                                                 ` Ingo Molnar
@ 2007-04-09 19:09                                                   ` Rene Herman
  2007-04-09 19:56                                                   ` Gene Heskett
  1 sibling, 0 replies; 92+ messages in thread
From: Rene Herman @ 2007-04-09 19:09 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Mike Galbraith, Gene Heskett, linux-kernel, Con Kolivas, Andrew Morton

On 04/09/2007 07:48 PM, Ingo Molnar wrote:

> i didn't say that; in fact my first lkml comment about RSDL
> was the exact opposite. But you SD advocates are _still_ bickering
> about (and not accepting) fundamental things like Mike's make -j5
> workload and flagging it as unrealistic, so as long as there's such
> a reality disconnect there's not much chance for this issue to
> progress, i'm afraid.

I suppose I'm lumped in with the "SD advocates" now, but you will note 
that I haven't been bickering about make -j5 loads. You cut away the 
entire meat of my reply, which was all that predictability harping.

What I did say about make -j5 loads is that I do not think that they, 
under all circumstances, on all machines and at all costs, need to 
perform the same as they currently do if other situations improve. Do I 
want heuristics? Sure; I'm just saying the kernel is fundamentally 
incapable of getting it right all of the time, and as such it should 
give me as many opportunities as possible to step in. That is, let me 
understand what it is doing and is going to be doing, and then listen 
to me.

I agree not a lot of progress is to be made if people keep ignoring each 
other like that, but also not while SD's author is offline. Let's just 
shelve it until he's back. Not bury it, though...

Rene.



* Re: Ten percent test
  2007-04-09 13:27                                             ` Andreas Mohr
@ 2007-04-09 19:54                                               ` Rene Herman
  0 siblings, 0 replies; 92+ messages in thread
From: Rene Herman @ 2007-04-09 19:54 UTC (permalink / raw)
  To: Andreas Mohr
  Cc: Mike Galbraith, Ingo Molnar, Gene Heskett, linux-kernel,
	Con Kolivas, Andrew Morton, ck list

On 04/09/2007 03:27 PM, Andreas Mohr wrote:

> And I really don't see much difference whatsoever from the I/O scheduler
> area: some people want predictable latency, while others want maximum
> throughput or fastest operation for seek-less flash devices (noop).
> Hardware varies just as greatly as well:
> Some people have huge disk arrays or NAS, others have a single flash disk.
> Some people have a decaying UP machine, others have huge SMP farms.

I do agree, and yes, I/O scheduling seems not to have suffered from the 
choice, although I must say I'm not sure how much use each I/O scheduler 
individually sees.

If one CPU scheduler can be good enough then it would be better to just 
have that one, but well, yes, maybe it can't. I certainly believe any 
one scheduler can't avoid breaking down under some condition. Demand is 
just too varied.

I find it interesting that you see SD as a server scheduler, and I guess 
deterministic behaviour does point in that direction somewhat. I would 
be enabling it on the desktop though, which probably is _some_ argument 
for having multiple schedulers.

Rene.



* Re: Ten percent test
  2007-04-09 17:48                                                 ` Ingo Molnar
  2007-04-09 19:09                                                   ` Rene Herman
@ 2007-04-09 19:56                                                   ` Gene Heskett
  1 sibling, 0 replies; 92+ messages in thread
From: Gene Heskett @ 2007-04-09 19:56 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Rene Herman, Mike Galbraith, Con Kolivas, Andrew Morton

On Monday 09 April 2007, Ingo Molnar wrote:[...]
>
>i didn't say that; in fact my first lkml comment about RSDL was the
>exact opposite. But you SD advocates are _still_ bickering about
>(and not accepting) fundamental things like Mike's make -j5 workload and
>flagging it as unrealistic, so as long as there's such a reality
>disconnect there's not much chance for this issue to progress, i'm afraid.

Mike's -j5 workload is, AFAIAC, a very realistic workload for building a 
kernel.  I just discovered that my own script was using -j8, and that was 
noticeable, but by no means a killing hit on my poor old XP2800 Athlon.  
I pulled it back to 4 for this morning's build and the hit, while less, is 
still noticeable.  Killer hit?  No way.  This was using what I think was 
called Mike's v4 patch.

>	Ingo



-- 
Cheers, Gene
"There are four boxes to be used in defense of liberty:
 soap, ballot, jury, and ammo. Please use in that order."
-Ed Howdershelt (Author)
When you jump for joy, beware that no-one moves the ground from beneath
your feet.
		-- Stanislaw Lem, "Unkempt Thoughts"


* Re: Ten percent test
  2007-04-09  5:38                             ` Mike Galbraith
  2007-04-09 11:26                               ` Ed Tomlinson
@ 2007-04-10  2:39                               ` Mike Galbraith
  2007-04-10 11:23                                 ` Ed Tomlinson
  1 sibling, 1 reply; 92+ messages in thread
From: Mike Galbraith @ 2007-04-10  2:39 UTC (permalink / raw)
  To: LKML; +Cc: Con Kolivas, Ingo Molnar, Andrew Morton, ck list

On Mon, 2007-04-09 at 07:38 +0200, Mike Galbraith wrote:

> I don't think you can have very much effect on latency using nice with
> SD once the CPU is fully utilized.  See below.
> 
> /*
>  * This contains a bitmap for each dynamic priority level with empty slots
>  * for the valid priorities each different nice level can have. It allows
>  * us to stagger the slots where differing priorities run in a way that
>  * keeps latency differences between different nice levels at a minimum.
>  * ie, where 0 means a slot for that priority, priority running from left to
>  * right:
>  * nice -20 0000000000000000000000000000000000000000
>  * nice -10 1001000100100010001001000100010010001000
>  * nice   0 0101010101010101010101010101010101010101
>  * nice   5 1101011010110101101011010110101101011011
>  * nice  10 0110111011011101110110111011101101110111
>  * nice  15 0111110111111011111101111101111110111111
>  * nice  19 1111111111111111111011111111111111111111
>  */
> 
> Nice allocates bandwidth, but as long as the CPU is busy, tasks always
> proceed downward in priority until they hit the expired array.  That's
> the design.
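
(As an aside, the split those bitmaps imply can be read straight off by
counting the zero slots out of 40 at each level.  A throwaway snippet,
with the strings copied verbatim from the comment above:)

#include <stdio.h>

int main(void)
{
        static const struct { const char *nice, *map; } level[] = {
                { "-20", "0000000000000000000000000000000000000000" },
                { "-10", "1001000100100010001001000100010010001000" },
                { "  0", "0101010101010101010101010101010101010101" },
                { "  5", "1101011010110101101011010110101101011011" },
                { " 10", "0110111011011101110110111011101101110111" },
                { " 15", "0111110111111011111101111101111110111111" },
                { " 19", "1111111111111111111011111111111111111111" },
        };
        int i;

        for (i = 0; i < 7; i++) {
                const char *p;
                int slots = 0;

                for (p = level[i].map; *p; p++)
                        slots += (*p == '0');   /* '0' == runnable slot */
                printf("nice %s: %2d/40 slots (%4.1f%% of the rotation)\n",
                       level[i].nice, slots, slots * 100.0 / 40);
        }
        return 0;
}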

There's another aspect of this that may require some thought - kernel
threads.  As load increases, so does rotation length.  Would you really
want CPU hogs routinely preempting house-keepers under load?

	-Mike



* Re: Ten percent test
  2007-04-10  2:39                               ` Mike Galbraith
@ 2007-04-10 11:23                                 ` Ed Tomlinson
  2007-04-10 12:04                                   ` Mike Galbraith
  0 siblings, 1 reply; 92+ messages in thread
From: Ed Tomlinson @ 2007-04-10 11:23 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: LKML, Con Kolivas, Ingo Molnar, Andrew Morton, ck list

On Monday 09 April 2007 22:39, Mike Galbraith wrote:
> On Mon, 2007-04-09 at 07:38 +0200, Mike Galbraith wrote:
> 
> > I don't think you can have very much effect on latency using nice with
> > SD once the CPU is fully utilized.  See below.
> > 
> > /*
> >  * This contains a bitmap for each dynamic priority level with empty slots
> >  * for the valid priorities each different nice level can have. It allows
> >  * us to stagger the slots where differing priorities run in a way that
> >  * keeps latency differences between different nice levels at a minimum.
> >  * ie, where 0 means a slot for that priority, priority running from left to
> >  * right:
> >  * nice -20 0000000000000000000000000000000000000000
> >  * nice -10 1001000100100010001001000100010010001000
> >  * nice   0 0101010101010101010101010101010101010101
> >  * nice   5 1101011010110101101011010110101101011011
> >  * nice  10 0110111011011101110110111011101101110111
> >  * nice  15 0111110111111011111101111101111110111111
> >  * nice  19 1111111111111111111011111111111111111111
> >  */
> > 
> > Nice allocates bandwidth, but as long as the CPU is busy, tasks always
> > proceed downward in priority until they hit the expired array.  That's
> > the design.
> 
> There's another aspect of this that may require some thought - kernel
> threads.  As load increases, so does rotation length.  Would you really
> want CPU hogs routinely preempting house-keepers under load?

SD has a schedule-batch nice level.  This is good for tasks that want lots
of CPU when they can get it.  If you overload your CPU, I expect the box
to slow down - including kernel threads.  If really required, they can be
started at a higher priority...

Ed


* Re: Ten percent test
  2007-04-10 11:23                                 ` Ed Tomlinson
@ 2007-04-10 12:04                                   ` Mike Galbraith
  0 siblings, 0 replies; 92+ messages in thread
From: Mike Galbraith @ 2007-04-10 12:04 UTC (permalink / raw)
  To: Ed Tomlinson; +Cc: LKML, Con Kolivas, Ingo Molnar, Andrew Morton, ck list

On Tue, 2007-04-10 at 07:23 -0400, Ed Tomlinson wrote:
> On Monday 09 April 2007 22:39, Mike Galbraith wrote:
> > On Mon, 2007-04-09 at 07:38 +0200, Mike Galbraith wrote:
> > 
> > > I don't think you can have very much effect on latency using nice with
> > > SD once the CPU is fully utilized.  See below.
> > > 
> > > /*
> > >  * This contains a bitmap for each dynamic priority level with empty slots
> > >  * for the valid priorities each different nice level can have. It allows
> > >  * us to stagger the slots where differing priorities run in a way that
> > >  * keeps latency differences between different nice levels at a minimum.
> > >  * ie, where 0 means a slot for that priority, priority running from left to
> > >  * right:
> > >  * nice -20 0000000000000000000000000000000000000000
> > >  * nice -10 1001000100100010001001000100010010001000
> > >  * nice   0 0101010101010101010101010101010101010101
> > >  * nice   5 1101011010110101101011010110101101011011
> > >  * nice  10 0110111011011101110110111011101101110111
> > >  * nice  15 0111110111111011111101111101111110111111
> > >  * nice  19 1111111111111111111011111111111111111111
> > >  */
> > > 
> > > Nice allocates bandwidth, but as long as the CPU is busy, tasks always
> > > proceed downward in priority until they hit the expired array.  That's
> > > the design.
> > 
> > There's another aspect of this that may require some thought - kernel
> > threads.  As load increases, so does rotation length.  Would you really
> > want CPU hogs routinely preempting house-keepers under load?
> 
> SD has a schedule-batch nice level.  This is good for tasks that want lots
> of CPU when they can get it.  If you overload your CPU, I expect the box
> to slow down - including kernel threads.  If really required, they can be
> started at a higher priority...

Sure.  Anything that is latency-sensitive, and those kernel threads that
are necessary for system function, can be made RT to bypass the designed-in
latency.  It's just another thing that should be considered before
integration.  Now if burst loads (only one of which is the desktop)
would just cease to exist...
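
(Mechanically that is just a sched_setscheduler() call, by the way.  A
sketch, with the FIFO priority of 1 picked arbitrarily and root or
CAP_SYS_NICE assumed:)

/*
 * Sketch: move a latency-sensitive helper to SCHED_FIFO so it bypasses
 * the designed-in rotation latency entirely.  Conversely, SCHED_BATCH
 * with sched_priority 0 marks a hog as batch, per Ed's point above.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sched.h>
#include <sys/types.h>

int main(int argc, char **argv)
{
        struct sched_param sp = { .sched_priority = 1 };
        pid_t pid = argc > 1 ? atoi(argv[1]) : 0;       /* 0 means "self" */

        if (sched_setscheduler(pid, SCHED_FIFO, &sp) < 0) {
                perror("sched_setscheduler");
                return 1;
        }
        return 0;
}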

	-Mike



* Re: [ck] Re: Ten percent test
  2007-04-09 16:50                                 ` Mike Galbraith
@ 2007-04-22 10:48                                   ` Martin Steigerwald
  2007-04-22 11:15                                     ` Con Kolivas
  0 siblings, 1 reply; 92+ messages in thread
From: Martin Steigerwald @ 2007-04-22 10:48 UTC (permalink / raw)
  To: ck; +Cc: Mike Galbraith, Ed Tomlinson, Andrew Morton, linux list

On Monday 09 April 2007, Mike Galbraith wrote:
> On Mon, 2007-04-09 at 07:26 -0400, Ed Tomlinson wrote:
> > On Monday 09 April 2007 01:38, Mike Galbraith wrote:
> > > On Sun, 2007-04-08 at 09:08 -0400, Ed Tomlinson wrote:
> > > > Hi,
> > > >
> > > > I am one of those who have been happily testing Con's patches.
> > > >
> > > > They work better than mainline here.
> > >
> > > (I tried a UP kernel yesterday, and even a single kernel build
> > > would make noticeable hitches if I move a window around. YMMV etc.)
> >
> > Interesting.  I run UP amd64, 1000HZ, 1.25G, preempt off (on causes
> > kernel stalls with no messages - but that is another story).  I do
> > not notice a single make.   When several are running the desktop
> > slows down a bit.  I do not have X niced.  Wonder why we see such
> > different results?
>
> Probably because with your processor, in general cc1 can get the job
> > done faster, as can X.  The big latency hit happens when you hit the
> end of the rotation.  You simply don't hit it as often as I do.  Anyone
> with an old PIII box should hit the wall very quickly indeed.  I
> haven't had time to try it here.

Hi!

I am running 2.6.20.7 + sd-0.44 on an IBM ThinkPad T23 that I use as my 
Amarok machine[1]. It has a 1.13 GHz Pentium 3 using ondemand frequency 
scaling, with XFS as the filesystem.

So far music playback has been perfect, even when I had it building kernel 
packages while wildly clicking around starting apps and then moving the 
Amarok window like mad with solid window moving enabled. Amarok / xine 
continued to play the music, totally unimpressed by all of that.

So for me, from the point of view of a user who wants good music playback 
*no matter what*, this is already perfect. Also the desktop feels quite 
snappy to me. It was only slow on anything I/O bound, but that's 
understandable IMHO when make-kpkg tar-bzips the kernel source while 20 
KDE applications are starting and Amarok plays music.

Should I try any specific tests? This also goes out to anybody else, 
especially to you, Con.  So if you want me to run some benchmarks, please 
tell me. I am not experienced in benchmarking, but if you tell me what to 
do, I can try it out. I prefer benchmarks that do not disrupt music 
playback, but I can run more aggressive benchmarks overnight. I think it 
might be good to use a benchmark that isn't I/O bound to really test the 
scheduler... but as I said I am no expert on that, and real-life loads 
usually are I/O bound as well.

Have to keep a careful eye on the hard disk though...

Apr 22 11:51:06 deepdance smartd[3116]: Device: /dev/sda, SMART Prefailure 
Attribute: 3 Spin_Up_Time changed from 154 to 150

(well threshold is at 033, so still plenty to go, hope it will take some 
time till the next change)

[1] http://martin-steigerwald.de/amarok-machine/ ;)

Regards,
-- 
Martin 'Helios' Steigerwald - http://www.Lichtvoll.de
GPG: 03B0 0D6C 0040 0710 4AFA  B82F 991B EAAC A599 84C7


* Re: [ck] Re: Ten percent test
  2007-04-22 10:48                                   ` [ck] " Martin Steigerwald
@ 2007-04-22 11:15                                     ` Con Kolivas
  0 siblings, 0 replies; 92+ messages in thread
From: Con Kolivas @ 2007-04-22 11:15 UTC (permalink / raw)
  To: ck
  Cc: Martin Steigerwald, Ed Tomlinson, Mike Galbraith, Andrew Morton,
	linux list

On Sunday 22 April 2007 20:48, Martin Steigerwald wrote:
> On Monday 09 April 2007, Mike Galbraith wrote:
> > On Mon, 2007-04-09 at 07:26 -0400, Ed Tomlinson wrote:
> > > On Monday 09 April 2007 01:38, Mike Galbraith wrote:
> > > > On Sun, 2007-04-08 at 09:08 -0400, Ed Tomlinson wrote:
> > > > > Hi,
> > > > >
> > > > > I am one of those who have been happily testing Con's patches.
> > > > >
> > > > > They work better than mainline here.
> > > >
> > > > (I tried a UP kernel yesterday, and even a single kernel build
> > > > would make noticeable hitches if I move a window around. YMMV etc.)
> > >
> > > Interesting.  I run UP amd64, 1000HZ, 1.25G, preempt off (on causes
> > > kernel stalls with no messages - but that is another story).  I do
> > > not notice a single make.   When several are running the desktop
> > > slows down a bit.  I do not have X niced.  Wonder why we see such
> > > different results?
> >
> > Probably because with your processor, in general cc1 can get the job
> > done faster, as can X.  The big latency hit happens when you hit the
> > end of the rotation.  You simply don't hit it as often as I do.  Anyone
> > with an old PIII box should hit the wall very quickly indeed.  I
> > haven't had time to try it here.
>
> Hi!
>
> I am running 2.6.20.7 + sd-0.44 on an IBM ThinkPad T23 that I use as my
> Amarok machine[1]. It has a 1.13 GHz Pentium 3 using ondemand frequency
> scaling, with XFS as the filesystem.
>
> So far music playback has been perfect, even when I had it building kernel
> packages while wildly clicking around starting apps and then moving the
> Amarok window like mad with solid window moving enabled. Amarok / xine
> continued to play the music, totally unimpressed by all of that.
>
> So for me, from the point of view of a user who wants good music playback
> *no matter what*, this is already perfect. Also the desktop feels quite
> snappy to me. It was only slow on anything I/O bound, but that's
> understandable IMHO when make-kpkg tar-bzips the kernel source while 20
> KDE applications are starting and Amarok plays music.
>
> Should I try any specific tests? This also goes out to anybody else,
> especially to you, Con.  So if you want me to run some benchmarks, please
> tell me. I am not experienced in benchmarking, but if you tell me what to
> do, I can try it out. I prefer benchmarks that do not disrupt music
> playback, but I can run more aggressive benchmarks overnight. I think it
> might be good to use a benchmark that isn't I/O bound to really test the
> scheduler... but as I said I am no expert on that, and real-life loads
> usually are I/O bound as well.
>
> Have to keep a careful eye on the hard disk though...
>
> Apr 22 11:51:06 deepdance smartd[3116]: Device: /dev/sda, SMART Prefailure
> Attribute: 3 Spin_Up_Time changed from 154 to 150
>
> (well threshold is at 033, so still plenty to go, hope it will take some
> time till the next change)
>
> [1] http://martin-steigerwald.de/amarok-machine/ ;)

Thanks for the report. In your case, you've done the testing I require: 
for your workloads, everything works as you'd desire without obvious 
problems. Keeping an eye on newer versions, if you have the time and 
inclination, and making sure that everything stays as you expect would 
be the most helpful thing you can do.

Thanks!

-- 
-ck


* Re: [PATCH] sched: staircase deadline misc fixes
  2007-03-28 16:37 [PATCH] sched: staircase deadline misc fixes Con Kolivas
                   ` (2 preceding siblings ...)
  2007-03-29  6:36 ` Con Kolivas
@ 2007-04-23  8:58 ` Andrew Morton
  3 siblings, 0 replies; 92+ messages in thread
From: Andrew Morton @ 2007-04-23  8:58 UTC (permalink / raw)
  To: Con Kolivas; +Cc: linux list, Ingo Molnar, Andy Whitcroft, ck list

On Thu, 29 Mar 2007 02:37:38 +1000 Con Kolivas <kernel@kolivas.org> wrote:

> test.kernel.org found some idle time regressions in the latest update to the
> staircase deadline scheduler and Andy Whitcroft helped me track down the 
> offending problem which was present in all previous RSDL schedulers but
> previously wouldn't be manifest without changes in nice. So here is a bugfix
> for the set_load_weight being incorrectly set and a few other minor 
> improvements. Thanks Andy!
> 
> I'm cautiously optimistic that we're at the thin edge of the bugfix wedge now.
> 
> ---
> set_load_weight() should be performed after p->quota is set. This fixes a
> large SMP performance regression.
> 
> Make sure rr_interval is never set to less than one jiffy.
> 
> Some sanity checking in update_cpu_clock will prevent bogus sched_clock
> values.
> 
> SCHED_BATCH tasks should not set the rq->best_static_prio field.
> 
> Correct sysctl rr_interval description to describe the value in milliseconds.
> 
> Style fixes.
> 
> Signed-off-by: Con Kolivas <kernel@kolivas.org>
> 
> ---
>  Documentation/sysctl/kernel.txt |    8 ++--
>  kernel/sched.c                  |   73 +++++++++++++++++++++++++++++-----------

OK, this is bizarre.  I'm getting this:

[   52.754522] RTNL: assertion failed at net/ipv4/devinet.c (1055)
[   52.758258]  [<c02cb6f7>] inetdev_event+0x46/0x2d8
[   52.762041]  [<c01049c9>] show_trace_log_lvl+0x28/0x2c
[   52.765887]  [<c0105482>] show_trace+0xf/0x13
[   52.769627]  [<c01054d7>] dump_stack+0x14/0x18
[   52.773320]  [<c029b22e>] rtnl_unlock+0xd/0x2f
[   52.776999]  [<c029f410>] fib_rules_event+0x3a/0xeb
[   52.780678]  [<c01236aa>] notifier_call_chain+0x2c/0x55
[   52.784339]  [<c012371a>] raw_notifier_call_chain+0x17/0x1b
[   52.787975]  [<c0295984>] dev_open+0x63/0x6b
[   52.791587]  [<c02944fd>] dev_change_flags+0x50/0x104
[   52.795201]  [<c02cbcf4>] devinet_ioctl+0x259/0x57b
[   52.798798]  [<c02955b2>] dev_ifsioc+0x113/0x3a0
[   52.802408]  [<c028b127>] sock_ioctl+0x1a1/0x1c4
[   52.805966]  [<c028af86>] sock_ioctl+0x0/0x1c4
[   52.809475]  [<c0165969>] do_ioctl+0x19/0x4d
[   52.812977]  [<c0165b99>] vfs_ioctl+0x1fc/0x216
[   52.816478]  [<c0165bff>] sys_ioctl+0x4c/0x65
[   52.819944]  [<c0103b68>] syscall_call+0x7/0xb
[   52.823395]  =======================
[   52.826923] RTNL: assertion failed at net/ipv4/igmp.c (1358)
[   52.830485]  [<c02cf545>] ip_mc_up+0x35/0x59
[   52.834034]  [<c029b22e>] rtnl_unlock+0xd/0x2f
[   52.837569]  [<c02cb7ed>] inetdev_event+0x13c/0x2d8
[   52.841123]  [<c01049c9>] show_trace_log_lvl+0x28/0x2c
[   52.844682]  [<c0105482>] show_trace+0xf/0x13
[   52.848227]  [<c01054d7>] dump_stack+0x14/0x18
[   52.851752]  [<c029b22e>] rtnl_unlock+0xd/0x2f
[   52.855242]  [<c029f410>] fib_rules_event+0x3a/0xeb
[   52.858734]  [<c01236aa>] notifier_call_chain+0x2c/0x55
[   52.862241]  [<c012371a>] raw_notifier_call_chain+0x17/0x1b
[   52.865759]  [<c0295984>] dev_open+0x63/0x6b
[   52.869191]  [<c02944fd>] dev_change_flags+0x50/0x104
[   52.872571]  [<c02cbcf4>] devinet_ioctl+0x259/0x57b
[   52.875998]  [<c02955b2>] dev_ifsioc+0x113/0x3a0
[   52.879399]  [<c028b127>] sock_ioctl+0x1a1/0x1c4
[   52.882741]  [<c028af86>] sock_ioctl+0x0/0x1c4
[   52.886025]  [<c0165969>] do_ioctl+0x19/0x4d
[   52.889292]  [<c0165b99>] vfs_ioctl+0x1fc/0x216
[   52.892534]  [<c0165bff>] sys_ioctl+0x4c/0x65
[   52.895760]  [<c0103b68>] syscall_call+0x7/0xb
[   52.898982]  =======================
[   52.907714] RTNL: assertion failed at net/ipv4/igmp.c (1205)
[   52.910229]  [<c02cf3b7>] ip_mc_inc_group+0x3c/0x195
[   52.912771]  [<c01054d7>] dump_stack+0x14/0x18
[   52.915314]  [<c02cf551>] ip_mc_up+0x41/0x59
[   52.917856]  [<c029b22e>] rtnl_unlock+0xd/0x2f
[   52.920411]  [<c02cb7ed>] inetdev_event+0x13c/0x2d8
[   52.922990]  [<c01049c9>] show_trace_log_lvl+0x28/0x2c
[   52.925568]  [<c0105482>] show_trace+0xf/0x13
[   52.928101]  [<c01054d7>] dump_stack+0x14/0x18
[   52.930591]  [<c029b22e>] rtnl_unlock+0xd/0x2f
[   52.933061]  [<c029f410>] fib_rules_event+0x3a/0xeb
[   52.935551]  [<c01236aa>] notifier_call_chain+0x2c/0x55
[   52.938071]  [<c012371a>] raw_notifier_call_chain+0x17/0x1b
[   52.940605]  [<c0295984>] dev_open+0x63/0x6b
[   52.943141]  [<c02944fd>] dev_change_flags+0x50/0x104
[   52.945670]  [<c02cbcf4>] devinet_ioctl+0x259/0x57b
[   52.948191]  [<c02955b2>] dev_ifsioc+0x113/0x3a0
[   52.950698]  [<c028b127>] sock_ioctl+0x1a1/0x1c4
[   52.953185]  [<c028af86>] sock_ioctl+0x0/0x1c4
[   52.955656]  [<c0165969>] do_ioctl+0x19/0x4d
[   52.958122]  [<c0165b99>] vfs_ioctl+0x1fc/0x216
[   52.960590]  [<c0165bff>] sys_ioctl+0x4c/0x65
[   52.963058]  [<c0103b68>] syscall_call+0x7/0xb
[   52.965523]  =======================

and bisection shows that this patch is where it starts happening.

I see no way in which this patch can cause ASSERT_RTNL to start triggering. 
Could be that there are dynamic changes which are triggering some problem in
the networking tree, but the net code looks straightforward enough.

Anyway, after a few such traces things seem to settle down and there are no
apparent problems, so I guess I'll just ship it as-is.

Config is http://userweb.kernel.org/~akpm/config-sony.txt


end of thread

Thread overview: 92+ messages
2007-03-28 16:37 [PATCH] sched: staircase deadline misc fixes Con Kolivas
2007-03-28 17:34 ` [ck] " Prakash Punnoor
2007-04-01  6:40   ` Prakash Punnoor
     [not found]     ` <b14e81f00704010724i3155a16en91074ab789416f3d@mail.gmail.com>
2007-04-01 20:03       ` Prakash Punnoor
2007-03-28 18:48 ` Ingo Molnar
2007-03-28 23:44   ` Con Kolivas
2007-03-29  5:50     ` Mike Galbraith
2007-03-29  6:29       ` Mike Galbraith
2007-03-29  6:54         ` Mike Galbraith
2007-03-29  8:18       ` Mike Galbraith
2007-03-29 12:55         ` [ck] " michael chang
2007-04-03  2:35         ` Con Kolivas
2007-04-03  2:37       ` Con Kolivas
2007-04-03  5:31         ` Mike Galbraith
2007-04-03  6:00           ` Mike Galbraith
2007-04-03  6:01           ` Ingo Molnar
2007-04-03  6:11             ` Mike Galbraith
2007-04-05 11:02             ` Mike Galbraith
2007-04-05 11:09               ` Ingo Molnar
2007-04-05 11:12                 ` Mike Galbraith
2007-04-05 11:15                   ` Ingo Molnar
2007-04-05 13:18                   ` Johannes Stezenbach
2007-04-05 15:28                     ` Mike Galbraith
2007-04-05 11:54               ` [test] sched: SD-latest versus Mike's latest Ingo Molnar
2007-04-05 12:10                 ` Mike Galbraith
2007-04-05 12:12                   ` Ingo Molnar
2007-04-05 12:24                     ` Mike Galbraith
2007-04-05 16:08                 ` Con Kolivas
2007-04-05 19:05                   ` Ingo Molnar
2007-04-05 20:29                   ` Mike Galbraith
2007-04-06  1:03                 ` Ten percent test Con Kolivas
2007-04-06  9:07                   ` Mike Galbraith
2007-04-06  9:28                     ` Con Kolivas
2007-04-06 10:03                       ` Ingo Molnar
2007-04-06 10:40                         ` Mike Galbraith
2007-04-07  6:50                         ` Con Kolivas
2007-04-07 16:12                           ` Gene Heskett
2007-04-07 18:08                             ` Ingo Molnar
2007-04-07 18:23                               ` Gene Heskett
2007-04-07 18:52                                 ` Ingo Molnar
2007-04-07 20:30                                   ` Gene Heskett
2007-04-08 10:41                                     ` Ingo Molnar
2007-04-08 10:58                                       ` Ingo Molnar
2007-04-08 17:04                                         ` Gene Heskett
2007-04-09  4:03                                           ` Mike Galbraith
2007-04-09  4:08                                             ` Gene Heskett
2007-04-09  5:59                                               ` Mike Galbraith
2007-04-09 13:01                                                 ` Gene Heskett
2007-04-08 11:33                                       ` Gene Heskett
2007-04-08 11:40                                         ` Mike Galbraith
2007-04-08 12:02                                           ` Mike Galbraith
2007-04-08 17:57                                             ` Gene Heskett
2007-04-09  4:19                                               ` Mike Galbraith
2007-04-09  5:23                                                 ` Gene Heskett
2007-04-09  6:09                                                   ` Mike Galbraith
2007-04-08 17:56                                           ` Gene Heskett
2007-04-09  4:17                                             ` Mike Galbraith
2007-04-09  5:16                                               ` Gene Heskett
2007-04-09  6:06                                                 ` Mike Galbraith
2007-04-09  8:24                                                 ` Mike Galbraith
2007-04-08 18:51                                       ` Rene Herman
2007-04-09  4:23                                         ` Mike Galbraith
2007-04-09 12:14                                           ` Rene Herman
2007-04-09 13:27                                             ` Andreas Mohr
2007-04-09 19:54                                               ` Rene Herman
2007-04-09 14:15                                             ` Ingo Molnar
2007-04-09 17:05                                               ` Rene Herman
2007-04-09 17:48                                                 ` Ingo Molnar
2007-04-09 19:09                                                   ` Rene Herman
2007-04-09 19:56                                                   ` Gene Heskett
2007-04-09 17:10                                             ` Mike Galbraith
2007-04-09 13:53                                         ` Ingo Molnar
2007-04-09 15:37                                           ` Rene Herman
2007-04-07 19:14                               ` Mike Galbraith
2007-04-07 20:31                                 ` Gene Heskett
2007-04-09 17:51                                 ` William Lee Irwin III
2007-04-09 18:03                                   ` Ingo Molnar
2007-04-09 18:44                                     ` William Lee Irwin III
2007-04-07 16:32                           ` Mike Galbraith
2007-04-08 13:08                           ` Ed Tomlinson
2007-04-09  5:38                             ` Mike Galbraith
2007-04-09 11:26                               ` Ed Tomlinson
2007-04-09 16:50                                 ` Mike Galbraith
2007-04-22 10:48                                   ` [ck] " Martin Steigerwald
2007-04-22 11:15                                     ` Con Kolivas
2007-04-10  2:39                               ` Mike Galbraith
2007-04-10 11:23                                 ` Ed Tomlinson
2007-04-10 12:04                                   ` Mike Galbraith
2007-04-06 10:48                       ` Mike Galbraith
2007-04-03 10:57           ` [PATCH] sched: staircase deadline misc fixes Mike Galbraith
2007-03-29  6:36 ` Con Kolivas
2007-04-23  8:58 ` Andrew Morton
