LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [git pull] scheduler updates
@ 2008-02-29 18:04 Ingo Molnar
  0 siblings, 0 replies; 24+ messages in thread
From: Ingo Molnar @ 2008-02-29 18:04 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, Andrew Morton


Linus, please pull the latest scheduler fixes git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched.git

warning: the rcupreempt patch is a bit larger than what i'd like to do 
for -rc4 but rcupreempt is a new feature, the fix has been in the queue 
for ages (with no changes to it) - and half of the patch's size is 
comments. It has no effect on !RCUPREEMPT || !NOHZ kernels. [this fix 
became more urgent because on larger SMP it's rather easy to trigger a 
boot time hang.]

Thanks,

	Ingo

------------------>
Dmitry Adamushko (1):
      softlockup: fix task state setting

Steven Rostedt (1):
      rcu: add support for dynamic ticks and preempt rcu

 include/linux/hardirq.h    |   10 ++
 include/linux/rcuclassic.h |    3 +
 include/linux/rcupreempt.h |   22 +++++
 kernel/rcupreempt.c        |  224 +++++++++++++++++++++++++++++++++++++++++++-
 kernel/softirq.c           |    1 +
 kernel/softlockup.c        |   13 ++-
 kernel/time/tick-sched.c   |    3 +
 7 files changed, 266 insertions(+), 10 deletions(-)

diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 2961ec7..4982998 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -109,6 +109,14 @@ static inline void account_system_vtime(struct task_struct *tsk)
 }
 #endif
 
+#if defined(CONFIG_PREEMPT_RCU) && defined(CONFIG_NO_HZ)
+extern void rcu_irq_enter(void);
+extern void rcu_irq_exit(void);
+#else
+# define rcu_irq_enter() do { } while (0)
+# define rcu_irq_exit() do { } while (0)
+#endif /* CONFIG_PREEMPT_RCU */
+
 /*
  * It is safe to do non-atomic ops on ->hardirq_context,
  * because NMI handlers may not preempt and the ops are
@@ -117,6 +125,7 @@ static inline void account_system_vtime(struct task_struct *tsk)
  */
 #define __irq_enter()					\
 	do {						\
+		rcu_irq_enter();			\
 		account_system_vtime(current);		\
 		add_preempt_count(HARDIRQ_OFFSET);	\
 		trace_hardirq_enter();			\
@@ -135,6 +144,7 @@ extern void irq_enter(void);
 		trace_hardirq_exit();			\
 		account_system_vtime(current);		\
 		sub_preempt_count(HARDIRQ_OFFSET);	\
+		rcu_irq_exit();				\
 	} while (0)
 
 /*
diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
index 4d66242..b3dccd6 100644
--- a/include/linux/rcuclassic.h
+++ b/include/linux/rcuclassic.h
@@ -160,5 +160,8 @@ extern void rcu_restart_cpu(int cpu);
 extern long rcu_batches_completed(void);
 extern long rcu_batches_completed_bh(void);
 
+#define rcu_enter_nohz()	do { } while (0)
+#define rcu_exit_nohz()		do { } while (0)
+
 #endif /* __KERNEL__ */
 #endif /* __LINUX_RCUCLASSIC_H */
diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h
index 60c2a03..01152ed 100644
--- a/include/linux/rcupreempt.h
+++ b/include/linux/rcupreempt.h
@@ -82,5 +82,27 @@ extern struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu);
 
 struct softirq_action;
 
+#ifdef CONFIG_NO_HZ
+DECLARE_PER_CPU(long, dynticks_progress_counter);
+
+static inline void rcu_enter_nohz(void)
+{
+	__get_cpu_var(dynticks_progress_counter)++;
+	WARN_ON(__get_cpu_var(dynticks_progress_counter) & 0x1);
+	mb();
+}
+
+static inline void rcu_exit_nohz(void)
+{
+	mb();
+	__get_cpu_var(dynticks_progress_counter)++;
+	WARN_ON(!(__get_cpu_var(dynticks_progress_counter) & 0x1));
+}
+
+#else /* CONFIG_NO_HZ */
+#define rcu_enter_nohz()	do { } while (0)
+#define rcu_exit_nohz()		do { } while (0)
+#endif /* CONFIG_NO_HZ */
+
 #endif /* __KERNEL__ */
 #endif /* __LINUX_RCUPREEMPT_H */
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 987cfb7..c7c5209 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -23,6 +23,10 @@
  *		to Suparna Bhattacharya for pushing me completely away
  *		from atomic instructions on the read side.
  *
+ *  - Added handling of Dynamic Ticks
+ *      Copyright 2007 - Paul E. Mckenney <paulmck@us.ibm.com>
+ *                     - Steven Rostedt <srostedt@redhat.com>
+ *
  * Papers:  http://www.rdrop.com/users/paulmck/RCU
  *
  * Design Document: http://lwn.net/Articles/253651/
@@ -409,6 +413,212 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp)
 	}
 }
 
+#ifdef CONFIG_NO_HZ
+
+DEFINE_PER_CPU(long, dynticks_progress_counter) = 1;
+static DEFINE_PER_CPU(long, rcu_dyntick_snapshot);
+static DEFINE_PER_CPU(int, rcu_update_flag);
+
+/**
+ * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
+ *
+ * If the CPU was idle with dynamic ticks active, this updates the
+ * dynticks_progress_counter to let the RCU handling know that the
+ * CPU is active.
+ */
+void rcu_irq_enter(void)
+{
+	int cpu = smp_processor_id();
+
+	if (per_cpu(rcu_update_flag, cpu))
+		per_cpu(rcu_update_flag, cpu)++;
+
+	/*
+	 * Only update if we are coming from a stopped ticks mode
+	 * (dynticks_progress_counter is even).
+	 */
+	if (!in_interrupt() &&
+	    (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) {
+		/*
+		 * The following might seem like we could have a race
+		 * with NMI/SMIs. But this really isn't a problem.
+		 * Here we do a read/modify/write, and the race happens
+		 * when an NMI/SMI comes in after the read and before
+		 * the write. But NMI/SMIs will increment this counter
+		 * twice before returning, so the zero bit will not
+		 * be corrupted by the NMI/SMI which is the most important
+		 * part.
+		 *
+		 * The only thing is that we would bring back the counter
+		 * to a postion that it was in during the NMI/SMI.
+		 * But the zero bit would be set, so the rest of the
+		 * counter would again be ignored.
+		 *
+		 * On return from the IRQ, the counter may have the zero
+		 * bit be 0 and the counter the same as the return from
+		 * the NMI/SMI. If the state machine was so unlucky to
+		 * see that, it still doesn't matter, since all
+		 * RCU read-side critical sections on this CPU would
+		 * have already completed.
+		 */
+		per_cpu(dynticks_progress_counter, cpu)++;
+		/*
+		 * The following memory barrier ensures that any
+		 * rcu_read_lock() primitives in the irq handler
+		 * are seen by other CPUs to follow the above
+		 * increment to dynticks_progress_counter. This is
+		 * required in order for other CPUs to correctly
+		 * determine when it is safe to advance the RCU
+		 * grace-period state machine.
+		 */
+		smp_mb(); /* see above block comment. */
+		/*
+		 * Since we can't determine the dynamic tick mode from
+		 * the dynticks_progress_counter after this routine,
+		 * we use a second flag to acknowledge that we came
+		 * from an idle state with ticks stopped.
+		 */
+		per_cpu(rcu_update_flag, cpu)++;
+		/*
+		 * If we take an NMI/SMI now, they will also increment
+		 * the rcu_update_flag, and will not update the
+		 * dynticks_progress_counter on exit. That is for
+		 * this IRQ to do.
+		 */
+	}
+}
+
+/**
+ * rcu_irq_exit - Called from exiting Hard irq context.
+ *
+ * If the CPU was idle with dynamic ticks active, update the
+ * dynticks_progress_counter to put let the RCU handling be
+ * aware that the CPU is going back to idle with no ticks.
+ */
+void rcu_irq_exit(void)
+{
+	int cpu = smp_processor_id();
+
+	/*
+	 * rcu_update_flag is set if we interrupted the CPU
+	 * when it was idle with ticks stopped.
+	 * Once this occurs, we keep track of interrupt nesting
+	 * because a NMI/SMI could also come in, and we still
+	 * only want the IRQ that started the increment of the
+	 * dynticks_progress_counter to be the one that modifies
+	 * it on exit.
+	 */
+	if (per_cpu(rcu_update_flag, cpu)) {
+		if (--per_cpu(rcu_update_flag, cpu))
+			return;
+
+		/* This must match the interrupt nesting */
+		WARN_ON(in_interrupt());
+
+		/*
+		 * If an NMI/SMI happens now we are still
+		 * protected by the dynticks_progress_counter being odd.
+		 */
+
+		/*
+		 * The following memory barrier ensures that any
+		 * rcu_read_unlock() primitives in the irq handler
+		 * are seen by other CPUs to preceed the following
+		 * increment to dynticks_progress_counter. This
+		 * is required in order for other CPUs to determine
+		 * when it is safe to advance the RCU grace-period
+		 * state machine.
+		 */
+		smp_mb(); /* see above block comment. */
+		per_cpu(dynticks_progress_counter, cpu)++;
+		WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1);
+	}
+}
+
+static void dyntick_save_progress_counter(int cpu)
+{
+	per_cpu(rcu_dyntick_snapshot, cpu) =
+		per_cpu(dynticks_progress_counter, cpu);
+}
+
+static inline int
+rcu_try_flip_waitack_needed(int cpu)
+{
+	long curr;
+	long snap;
+
+	curr = per_cpu(dynticks_progress_counter, cpu);
+	snap = per_cpu(rcu_dyntick_snapshot, cpu);
+	smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
+
+	/*
+	 * If the CPU remained in dynticks mode for the entire time
+	 * and didn't take any interrupts, NMIs, SMIs, or whatever,
+	 * then it cannot be in the middle of an rcu_read_lock(), so
+	 * the next rcu_read_lock() it executes must use the new value
+	 * of the counter.  So we can safely pretend that this CPU
+	 * already acknowledged the counter.
+	 */
+
+	if ((curr == snap) && ((curr & 0x1) == 0))
+		return 0;
+
+	/*
+	 * If the CPU passed through or entered a dynticks idle phase with
+	 * no active irq handlers, then, as above, we can safely pretend
+	 * that this CPU already acknowledged the counter.
+	 */
+
+	if ((curr - snap) > 2 || (snap & 0x1) == 0)
+		return 0;
+
+	/* We need this CPU to explicitly acknowledge the counter flip. */
+
+	return 1;
+}
+
+static inline int
+rcu_try_flip_waitmb_needed(int cpu)
+{
+	long curr;
+	long snap;
+
+	curr = per_cpu(dynticks_progress_counter, cpu);
+	snap = per_cpu(rcu_dyntick_snapshot, cpu);
+	smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
+
+	/*
+	 * If the CPU remained in dynticks mode for the entire time
+	 * and didn't take any interrupts, NMIs, SMIs, or whatever,
+	 * then it cannot have executed an RCU read-side critical section
+	 * during that time, so there is no need for it to execute a
+	 * memory barrier.
+	 */
+
+	if ((curr == snap) && ((curr & 0x1) == 0))
+		return 0;
+
+	/*
+	 * If the CPU either entered or exited an outermost interrupt,
+	 * SMI, NMI, or whatever handler, then we know that it executed
+	 * a memory barrier when doing so.  So we don't need another one.
+	 */
+	if (curr != snap)
+		return 0;
+
+	/* We need the CPU to execute a memory barrier. */
+
+	return 1;
+}
+
+#else /* !CONFIG_NO_HZ */
+
+# define dyntick_save_progress_counter(cpu)	do { } while (0)
+# define rcu_try_flip_waitack_needed(cpu)	(1)
+# define rcu_try_flip_waitmb_needed(cpu)	(1)
+
+#endif /* CONFIG_NO_HZ */
+
 /*
  * Get here when RCU is idle.  Decide whether we need to
  * move out of idle state, and return non-zero if so.
@@ -447,8 +657,10 @@ rcu_try_flip_idle(void)
 
 	/* Now ask each CPU for acknowledgement of the flip. */
 
-	for_each_cpu_mask(cpu, rcu_cpu_online_map)
+	for_each_cpu_mask(cpu, rcu_cpu_online_map) {
 		per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
+		dyntick_save_progress_counter(cpu);
+	}
 
 	return 1;
 }
@@ -464,7 +676,8 @@ rcu_try_flip_waitack(void)
 
 	RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
 	for_each_cpu_mask(cpu, rcu_cpu_online_map)
-		if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
+		if (rcu_try_flip_waitack_needed(cpu) &&
+		    per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
 			RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
 			return 0;
 		}
@@ -509,8 +722,10 @@ rcu_try_flip_waitzero(void)
 	smp_mb();  /*  ^^^^^^^^^^^^ */
 
 	/* Call for a memory barrier from each CPU. */
-	for_each_cpu_mask(cpu, rcu_cpu_online_map)
+	for_each_cpu_mask(cpu, rcu_cpu_online_map) {
 		per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
+		dyntick_save_progress_counter(cpu);
+	}
 
 	RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
 	return 1;
@@ -528,7 +743,8 @@ rcu_try_flip_waitmb(void)
 
 	RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
 	for_each_cpu_mask(cpu, rcu_cpu_online_map)
-		if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
+		if (rcu_try_flip_waitmb_needed(cpu) &&
+		    per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
 			RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
 			return 0;
 		}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 5b3aea5..31e9f2a 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -313,6 +313,7 @@ void irq_exit(void)
 	/* Make sure that timer wheel updates are propagated */
 	if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
 		tick_nohz_stop_sched_tick();
+	rcu_irq_exit();
 #endif
 	preempt_enable_no_resched();
 }
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 7c2da88..01b6522 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -216,26 +216,27 @@ static int watchdog(void *__bind_cpu)
 	/* initialize timestamp */
 	touch_softlockup_watchdog();
 
+	set_current_state(TASK_INTERRUPTIBLE);
 	/*
 	 * Run briefly once per second to reset the softlockup timestamp.
 	 * If this gets delayed for more than 60 seconds then the
 	 * debug-printout triggers in softlockup_tick().
 	 */
 	while (!kthread_should_stop()) {
-		set_current_state(TASK_INTERRUPTIBLE);
 		touch_softlockup_watchdog();
 		schedule();
 
 		if (kthread_should_stop())
 			break;
 
-		if (this_cpu != check_cpu)
-			continue;
-
-		if (sysctl_hung_task_timeout_secs)
-			check_hung_uninterruptible_tasks(this_cpu);
+		if (this_cpu == check_cpu) {
+			if (sysctl_hung_task_timeout_secs)
+				check_hung_uninterruptible_tasks(this_cpu);
+		}
 
+		set_current_state(TASK_INTERRUPTIBLE);
 	}
+	__set_current_state(TASK_RUNNING);
 
 	return 0;
 }
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index fa9bb73..2968298 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -282,6 +282,7 @@ void tick_nohz_stop_sched_tick(void)
 			ts->idle_tick = ts->sched_timer.expires;
 			ts->tick_stopped = 1;
 			ts->idle_jiffies = last_jiffies;
+			rcu_enter_nohz();
 		}
 
 		/*
@@ -375,6 +376,8 @@ void tick_nohz_restart_sched_tick(void)
 		return;
 	}
 
+	rcu_exit_nohz();
+
 	/* Update jiffies first */
 	select_nohz_load_balancer(0);
 	now = ktime_get();


^ permalink raw reply	[flat|nested] 24+ messages in thread

* [GIT PULL] scheduler updates
@ 2018-02-06 21:38 Ingo Molnar
  0 siblings, 0 replies; 24+ messages in thread
From: Ingo Molnar @ 2018-02-06 21:38 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Mike Galbraith,
	Paul E. McKenney, Mathieu Desnoyers, Andrew Morton

Linus,

Please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus

   # HEAD: 82845079160817cc6ac64e5321bbd935e0a47b3a Merge branch 'linus' into sched/urgent, to resolve conflicts

This tree includes the following changes:

 - membarrier updates (Mathieu Desnoyers)

 - SMP balancing optimizations (Mel Gorman)

 - stats update optimizations (Peter Zijlstra)

 - RT scheduler race fixes (Steven Rostedt)

 - misc fixes and updates

Note: I merged in a (tested) version of your tree to resolve three separate 
conflicts, two trivial, the third one a bit more complex - but feel free to pull 
the original sha1 before the merge as well.

 Thanks,

	Ingo

------------------>
Mathieu Desnoyers (11):
      membarrier/selftest: Test private expedited command
      powerpc, membarrier: Skip memory barrier in switch_mm()
      membarrier: Document scheduler barrier requirements
      membarrier: Provide GLOBAL_EXPEDITED command
      membarrier/selftest: Test global expedited command
      locking: Introduce sync_core_before_usermode()
      lockin/x86: Implement sync_core_before_usermode()
      membarrier: Provide core serializing command, *_SYNC_CORE
      membarrier/x86: Provide core serializing command
      membarrier/arm64: Provide core serializing command
      membarrier/selftest: Test private expedited sync core command

Mel Gorman (4):
      sched/fair: Remove unnecessary parameters from wake_affine_idle()
      sched/fair: Restructure wake_affine*() to return a CPU id
      sched/fair: Do not migrate if the prev_cpu is idle
      sched/fair: Use a recently used CPU as an idle candidate and the basis for SIS

Peter Zijlstra (2):
      sched/core: Optimize ttwu_stat()
      sched/core: Optimize update_stats_*()

Steven Rostedt (VMware) (2):
      sched/rt: Use container_of() to get root domain in rto_push_irq_work_func()
      sched/rt: Up the root domain ref count when passing it around via IPIs

Wen Yang (1):
      sched/rt: Make update_curr_rt() more accurate


 MAINTAINERS                                        |   1 +
 arch/arm64/Kconfig                                 |   1 +
 arch/arm64/kernel/entry.S                          |   4 +
 arch/powerpc/Kconfig                               |   1 +
 arch/powerpc/include/asm/membarrier.h              |  27 +++
 arch/powerpc/mm/mmu_context.c                      |   7 +
 arch/x86/Kconfig                                   |   2 +
 arch/x86/entry/entry_32.S                          |   5 +
 arch/x86/entry/entry_64.S                          |   4 +
 arch/x86/include/asm/sync_core.h                   |  28 +++
 arch/x86/mm/tlb.c                                  |   6 +
 include/linux/sched.h                              |   8 +
 include/linux/sched/mm.h                           |  35 ++-
 include/linux/sync_core.h                          |  21 ++
 include/uapi/linux/membarrier.h                    |  74 ++++++-
 init/Kconfig                                       |   9 +
 kernel/fork.c                                      |   5 +
 kernel/sched/core.c                                |  74 ++++---
 kernel/sched/fair.c                                | 101 +++++----
 kernel/sched/membarrier.c                          | 177 +++++++++++++--
 kernel/sched/rt.c                                  |  29 ++-
 kernel/sched/sched.h                               |   2 +
 kernel/sched/stats.h                               |   6 +
 kernel/sched/topology.c                            |  13 ++
 .../testing/selftests/membarrier/membarrier_test.c | 237 +++++++++++++++++++--
 25 files changed, 750 insertions(+), 127 deletions(-)
 create mode 100644 arch/powerpc/include/asm/membarrier.h
 create mode 100644 arch/x86/include/asm/sync_core.h
 create mode 100644 include/linux/sync_core.h

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [git pull] scheduler updates
  2008-11-17 23:04                 ` Venki Pallipadi
@ 2008-11-17 23:13                   ` Ingo Molnar
  0 siblings, 0 replies; 24+ messages in thread
From: Ingo Molnar @ 2008-11-17 23:13 UTC (permalink / raw)
  To: Venki Pallipadi
  Cc: Linus Torvalds, Arjan van de Ven, Linux Kernel Mailing List,
	Andrew Morton, Peter Zijlstra, Mike Galbraith


* Venki Pallipadi <venkatesh.pallipadi@intel.com> wrote:

> On Mon, Nov 17, 2008 at 02:50:18PM -0800, Ingo Molnar wrote:
> > 
> > * Venki Pallipadi <venkatesh.pallipadi@intel.com> wrote:
> > 
> > > Patch being discussed on this thread (commit 0d12cdd) has a
> > > regression on one of the test systems here.
> > >
> > > With the patch, I see
> > >
> > > checking TSC synchronization [CPU#0 -> CPU#1]:
> > > Measured 28 cycles TSC warp between CPUs, turning off TSC clock.
> > > Marking TSC unstable due to check_tsc_sync_source failed
> > >
> > > Whereas, without the patch syncs pass fine on all CPUs
> > >
> > > checking TSC synchronization [CPU#0 -> CPU#1]: passed.
> > >
> > > Due to this, TSC is marked unstable, when it is not actually unstable.
> > > This is because syncs in check_tsc_warp() go away due to this commit.
> > >
> > > As per the discussion on this thread, correct way to fix this is to add
> > > explicit syncs as below?
> > 
> > ah. Yes.
> > 
> > Could you please check whether:
> > 
> > > +     rdtsc_barrier();
> > >       start = get_cycles();
> > > +     rdtsc_barrier();
> > >       /*
> > >        * The measurement runs for 20 msecs:
> > >        */
> > > @@ -61,7 +63,9 @@ static __cpuinit void check_tsc_warp(voi
> > >                */
> > >               __raw_spin_lock(&sync_lock);
> > >               prev = last_tsc;
> > > +             rdtsc_barrier();
> > >               now = get_cycles();
> > > +             rdtsc_barrier();
> > 
> > adding the barrier just _after_ the get_cycles() call (but not before
> > it) does the trick too? That should be enough in this case.
> >
> 
> With barrier only after get_cycles, I do see syncs across first few 
> CPUs passing. But later I see:
> 
> checking TSC synchronization [CPU#0 -> CPU#13]: Measured 4 cycles 
> TSC warp between CPUs, turning off TSC clock. Marking TSC unstable 
> due to check_tsc_sync_source failed

yeah - has to be surrounded, to make sure our last_tsc observation 
does not happen after the RDTSC.

I have applied your patch to tip/x86/urgent, thanks!

	Ingo

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [git pull] scheduler updates
  2008-11-17 22:50               ` Ingo Molnar
@ 2008-11-17 23:04                 ` Venki Pallipadi
  2008-11-17 23:13                   ` Ingo Molnar
  0 siblings, 1 reply; 24+ messages in thread
From: Venki Pallipadi @ 2008-11-17 23:04 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Pallipadi, Venkatesh, Linus Torvalds, Arjan van de Ven,
	Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra,
	Mike Galbraith

On Mon, Nov 17, 2008 at 02:50:18PM -0800, Ingo Molnar wrote:
> 
> * Venki Pallipadi <venkatesh.pallipadi@intel.com> wrote:
> 
> > Patch being discussed on this thread (commit 0d12cdd) has a
> > regression on one of the test systems here.
> >
> > With the patch, I see
> >
> > checking TSC synchronization [CPU#0 -> CPU#1]:
> > Measured 28 cycles TSC warp between CPUs, turning off TSC clock.
> > Marking TSC unstable due to check_tsc_sync_source failed
> >
> > Whereas, without the patch syncs pass fine on all CPUs
> >
> > checking TSC synchronization [CPU#0 -> CPU#1]: passed.
> >
> > Due to this, TSC is marked unstable, when it is not actually unstable.
> > This is because syncs in check_tsc_warp() go away due to this commit.
> >
> > As per the discussion on this thread, correct way to fix this is to add
> > explicit syncs as below?
> 
> ah. Yes.
> 
> Could you please check whether:
> 
> > +     rdtsc_barrier();
> >       start = get_cycles();
> > +     rdtsc_barrier();
> >       /*
> >        * The measurement runs for 20 msecs:
> >        */
> > @@ -61,7 +63,9 @@ static __cpuinit void check_tsc_warp(voi
> >                */
> >               __raw_spin_lock(&sync_lock);
> >               prev = last_tsc;
> > +             rdtsc_barrier();
> >               now = get_cycles();
> > +             rdtsc_barrier();
> 
> adding the barrier just _after_ the get_cycles() call (but not before
> it) does the trick too? That should be enough in this case.
>

With barrier only after get_cycles, I do see syncs across first few CPUs
passing. But later I see:

checking TSC synchronization [CPU#0 -> CPU#13]:
Measured 4 cycles TSC warp between CPUs, turning off TSC clock.
Marking TSC unstable due to check_tsc_sync_source failed


Thanks,
Venki

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [git pull] scheduler updates
  2008-11-17 22:43             ` Venki Pallipadi
@ 2008-11-17 22:50               ` Ingo Molnar
  2008-11-17 23:04                 ` Venki Pallipadi
  0 siblings, 1 reply; 24+ messages in thread
From: Ingo Molnar @ 2008-11-17 22:50 UTC (permalink / raw)
  To: Venki Pallipadi
  Cc: Linus Torvalds, Arjan van de Ven, Linux Kernel Mailing List,
	Andrew Morton, Peter Zijlstra, Mike Galbraith


* Venki Pallipadi <venkatesh.pallipadi@intel.com> wrote:

> Patch being discussed on this thread (commit 0d12cdd) has a 
> regression on one of the test systems here.
> 
> With the patch, I see
> 
> checking TSC synchronization [CPU#0 -> CPU#1]:
> Measured 28 cycles TSC warp between CPUs, turning off TSC clock.
> Marking TSC unstable due to check_tsc_sync_source failed
> 
> Whereas, without the patch syncs pass fine on all CPUs
> 
> checking TSC synchronization [CPU#0 -> CPU#1]: passed.
> 
> Due to this, TSC is marked unstable, when it is not actually unstable.
> This is because syncs in check_tsc_warp() go away due to this commit.
> 
> As per the discussion on this thread, correct way to fix this is to add
> explicit syncs as below?

ah. Yes.

Could you please check whether:

> +	rdtsc_barrier();
>  	start = get_cycles();
> +	rdtsc_barrier();
>  	/*
>  	 * The measurement runs for 20 msecs:
>  	 */
> @@ -61,7 +63,9 @@ static __cpuinit void check_tsc_warp(voi
>  		 */
>  		__raw_spin_lock(&sync_lock);
>  		prev = last_tsc;
> +		rdtsc_barrier();
>  		now = get_cycles();
> +		rdtsc_barrier();

adding the barrier just _after_ the get_cycles() call (but not before 
it) does the trick too? That should be enough in this case.

	Ingo

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [git pull] scheduler updates
  2008-11-08 19:29           ` Ingo Molnar
@ 2008-11-17 22:43             ` Venki Pallipadi
  2008-11-17 22:50               ` Ingo Molnar
  0 siblings, 1 reply; 24+ messages in thread
From: Venki Pallipadi @ 2008-11-17 22:43 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Arjan van de Ven, Linux Kernel Mailing List,
	Andrew Morton, Peter Zijlstra, Mike Galbraith

On Sat, Nov 08, 2008 at 11:29:57AM -0800, Ingo Molnar wrote:
> 
> * Linus Torvalds <torvalds@linux-foundation.org> wrote:
> 
> > On Sat, 8 Nov 2008, Ingo Molnar wrote:
> > > 
> > > So that's why my change moves it from the __native_read_tsc() over to 
> > > _only_ the vget_cycles().
> > 
> > Ahh. I was looking at native_read_tscp(). Which has no barriers. But then 
> > we don't actually save the actual TSC, we only end up using the "p" part, 
> > so we don't care..
> > 
> > Anyway, even for the vget_cycles(), is there really any reason to 
> > have _two_ barriers? Also, I still think it would be a hell of a lot 
> > more readable and logical to put the barriers in the _caller_, so 
> > that people actually see what the barriers are there for.
> > 
> > When they are hidden, they make no sense. The helper function just 
> > has two insane barriers without explanation, and the caller doesn't 
> > know that the code is serialized wrt something random.
> 
> ok, fully agreed, i've queued up the cleanup for that, see it below.
> 
> sidenote: i still kept the get_cycles() versus vget_cycles() 
> distinction, to preserve the explicit marker that vget_cycles() is 
> used in user-space mode code. We periodically forgot about that in the 
> past. But otherwise, the two inline functions are now identical. 
(except for the asymmetry of its inlining, and the comment about the 
> boot_cpu_data use of the has_tsc check)
> 


Patch being discussed on this thread (commit 0d12cdd) has a regression on
one of the test systems here.

With the patch, I see

checking TSC synchronization [CPU#0 -> CPU#1]:
Measured 28 cycles TSC warp between CPUs, turning off TSC clock.
Marking TSC unstable due to check_tsc_sync_source failed

Whereas, without the patch syncs pass fine on all CPUs

checking TSC synchronization [CPU#0 -> CPU#1]: passed.

Due to this, TSC is marked unstable, when it is not actually unstable.
This is because syncs in check_tsc_warp() go away due to this commit.

As per the discussion on this thread, correct way to fix this is to add
explicit syncs as below?

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>

---
 arch/x86/kernel/tsc_sync.c |    4 ++++
 1 file changed, 4 insertions(+)

Index: linux-2.6/arch/x86/kernel/tsc_sync.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/tsc_sync.c	2008-11-10 15:27:12.000000000 -0800
+++ linux-2.6/arch/x86/kernel/tsc_sync.c	2008-11-17 14:13:17.000000000 -0800
@@ -46,7 +46,9 @@ static __cpuinit void check_tsc_warp(voi
 	cycles_t start, now, prev, end;
 	int i;
 
+	rdtsc_barrier();
 	start = get_cycles();
+	rdtsc_barrier();
 	/*
 	 * The measurement runs for 20 msecs:
 	 */
@@ -61,7 +63,9 @@ static __cpuinit void check_tsc_warp(voi
 		 */
 		__raw_spin_lock(&sync_lock);
 		prev = last_tsc;
+		rdtsc_barrier();
 		now = get_cycles();
+		rdtsc_barrier();
 		last_tsc = now;
 		__raw_spin_unlock(&sync_lock);
 

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [git pull] scheduler updates
  2008-11-08 19:20         ` Linus Torvalds
  2008-11-08 19:29           ` Ingo Molnar
@ 2008-11-08 19:40           ` Ingo Molnar
  1 sibling, 0 replies; 24+ messages in thread
From: Ingo Molnar @ 2008-11-08 19:40 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Arjan van de Ven, Linux Kernel Mailing List, Andrew Morton,
	Peter Zijlstra, Mike Galbraith


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Sat, 8 Nov 2008, Ingo Molnar wrote:
> > 
> > So that's why my change moves it from the __native_read_tsc() over 
> > to _only_ the vget_cycles().
> 
> Ahh. I was looking at native_read_tscp(). Which has no barriers. But 
> then we don't actually save the actual TSC, we only end up using the 
> "p" part, so we don't care..

sidenote #3: RDTSCP is a relatively new instruction, and has implicit 
barrier properties: it is a serializing instruction.

	Ingo

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [git pull] scheduler updates
  2008-11-08 18:52   ` Ingo Molnar
  2008-11-08 18:57     ` Ingo Molnar
@ 2008-11-08 19:32     ` Ingo Molnar
  1 sibling, 0 replies; 24+ messages in thread
From: Ingo Molnar @ 2008-11-08 19:32 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: linux-kernel, Andrew Morton, Peter Zijlstra, Mike Galbraith,
	Thomas Gleixner, H. Peter Anvin


* Ingo Molnar <mingo@elte.hu> wrote:

> But you are right, and i've queued up the full fix below as well, as 
> a reminder.

... i zapped this, as it's wrong to remove the barrier use from 
vsyscall_64.c.

	Ingo

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [git pull] scheduler updates
  2008-11-08 19:20         ` Linus Torvalds
@ 2008-11-08 19:29           ` Ingo Molnar
  2008-11-17 22:43             ` Venki Pallipadi
  2008-11-08 19:40           ` Ingo Molnar
  1 sibling, 1 reply; 24+ messages in thread
From: Ingo Molnar @ 2008-11-08 19:29 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Arjan van de Ven, Linux Kernel Mailing List, Andrew Morton,
	Peter Zijlstra, Mike Galbraith


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Sat, 8 Nov 2008, Ingo Molnar wrote:
> > 
> > So that's why my change moves it from the __native_read_tsc() over to 
> > _only_ the vget_cycles().
> 
> Ahh. I was looking at native_read_tscp(). Which has no barriers. But then 
> we don't actually save the actual TSC, we only end up using the "p" part, 
> so we don't care..
> 
> Anyway, even for the vget_cycles(), is there really any reason to 
> have _two_ barriers? Also, I still think it would be a hell of a lot 
> more readable and logical to put the barriers in the _caller_, so 
> that people actually see what the barriers are there for.
> 
> When they are hidden, they make no sense. The helper function just 
> has two insane barriers without explanation, and the caller doesn't 
> know that the code is serialized wrt something random.

ok, fully agreed, i've queued up the cleanup for that, see it below.

sidenote: i still kept the get_cycles() versus vget_cycles() 
distinction, to preserve the explicit marker that vget_cycles() is 
used in user-space mode code. We periodically forgot about that in the 
past. But otherwise, the two inline functions are now identical. 
(except for the asymmetry of its inlining, and the comment about the 
boot_cpu_data use of the has_tsc check)

	Ingo

--------------->
>From cb9e35dce94a1b9c59d46224e8a94377d673e204 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 8 Nov 2008 20:27:00 +0100
Subject: [PATCH] x86: clean up rdtsc_barrier() use

Impact: cleanup

Move rdtsc_barrier() use to vsyscall_64.c where it's relied on,
and point out its role in the context of its use.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/tsc.h    |    6 +-----
 arch/x86/kernel/vsyscall_64.c |    9 +++++++++
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 9cd83a8..700aeb8 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -44,11 +44,7 @@ static __always_inline cycles_t vget_cycles(void)
 	if (!cpu_has_tsc)
 		return 0;
 #endif
-	rdtsc_barrier();
-	cycles = (cycles_t)__native_read_tsc();
-	rdtsc_barrier();
-
-	return cycles;
+	return (cycles_t)__native_read_tsc();
 }
 
 extern void tsc_init(void);
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 0b8b669..ebf2f12 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -128,7 +128,16 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
 			gettimeofday(tv,NULL);
 			return;
 		}
+
+		/*
+		 * Surround the RDTSC by barriers, to make sure it's not
+		 * speculated to outside the seqlock critical section and
+		 * does not cause time warps:
+		 */
+		rdtsc_barrier();
 		now = vread();
+		rdtsc_barrier();
+
 		base = __vsyscall_gtod_data.clock.cycle_last;
 		mask = __vsyscall_gtod_data.clock.mask;
 		mult = __vsyscall_gtod_data.clock.mult;


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [git pull] scheduler updates
  2008-11-08 19:05       ` Ingo Molnar
@ 2008-11-08 19:20         ` Linus Torvalds
  2008-11-08 19:29           ` Ingo Molnar
  2008-11-08 19:40           ` Ingo Molnar
  0 siblings, 2 replies; 24+ messages in thread
From: Linus Torvalds @ 2008-11-08 19:20 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Arjan van de Ven, Linux Kernel Mailing List, Andrew Morton,
	Peter Zijlstra, Mike Galbraith



On Sat, 8 Nov 2008, Ingo Molnar wrote:
> 
> So that's why my change moves it from the __native_read_tsc() over to 
> _only_ the vget_cycles().

Ahh. I was looking at native_read_tscp(). Which has no barriers. But then 
we don't actually save the actual TSC, we only end up using the "p" part, 
so we don't care..

Anyway, even for the vget_cycles(), is there really any reason to have 
_two_ barriers? Also, I still think it would be a hell of a lot more 
readable and logical to put the barriers in the _caller_, so that people 
actually see what the barriers are there for.

When they are hidden, they make no sense. The helper function just has two 
insane barriers without explanation, and the caller doesn't know that the 
code is serialized wrt something random.

		Linus

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [git pull] scheduler updates
  2008-11-08 19:00     ` Linus Torvalds
  2008-11-08 19:05       ` Ingo Molnar
@ 2008-11-08 19:10       ` Ingo Molnar
  1 sibling, 0 replies; 24+ messages in thread
From: Ingo Molnar @ 2008-11-08 19:10 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Arjan van de Ven, linux-kernel, Andrew Morton, Peter Zijlstra,
	Mike Galbraith


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Sat, 8 Nov 2008, Arjan van de Ven wrote:
> > 
> > historically it was for early AMD cpus (K7, not sure if early K8 did
> > this) where 2 consecutive rdtsc's in the same codestream would get
> > reordered compared to eachother, so you could observe the tsc go
> > backwards...
> 
> .. but this only happens with two _consecutive_ ones.
> 
> The thing is, nobody sane does that in generic code. The scheduler wants 
> to have cycles, yes, but two consecutive scheduler invocations will have 
> spinlocks etc in between. That's true of _all_ sane uses of a TSC.
> 
> I don't see that there is ever any reason to do the barriers for any 
> normal case. And the cases where it does matter would actually be worth 
> pointing out (ie making the barriers explicit in those cases, and those 
> cases only).
> 
> Doing it in get_cycles() and "forgetting about it" may sound like a simple 
> solution, but it's likely wrong. For example, one of the few cases where 
> we really care about time going backwards is gettimeofday() - which uses 
> tsc, but which also has tons of serializing instructions on its own. 
> EXCEPT WHEN IT IS a vsyscall!
> 
> But in that case, we don't even have the barrier, because we put it in the 
> wrong function and 'forgot about it'. Of course, we may not need it 
> (rdtscp maybe always serializes, I didn't check), but the point is, an 
> explicit barrier is actually better than one that is hidden.
> 
> So who _really_ needs it? And why not just do it there?

i think, the tree as offered to you, intends to do just that, unless i 
made some grave (and unintended) mistake somewhere.

The barrier is only present in the vread function: which is the 
vsyscall-read function, to be used from user-space.

Even in the past, nothing was actually forgotten or put in the wrong 
function as far as i can see because previously _everything_ 
(including the vread method) had the barrier.

The change from me simply removes the barrier from the places that 
dont need it - exactly for the reason you outlined: the scheduler is 
both imprecise and has a ton of natural serialization anyway, so it's 
a non-issue there.

Hm?

	Ingo

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [git pull] scheduler updates
  2008-11-08 19:00     ` Linus Torvalds
@ 2008-11-08 19:05       ` Ingo Molnar
  2008-11-08 19:20         ` Linus Torvalds
  2008-11-08 19:10       ` Ingo Molnar
  1 sibling, 1 reply; 24+ messages in thread
From: Ingo Molnar @ 2008-11-08 19:05 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Arjan van de Ven, linux-kernel, Andrew Morton, Peter Zijlstra,
	Mike Galbraith


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> Doing it in get_cycles() and "forgetting about it" may sound like a 
> simple solution, but it's likely wrong. For example, one of the few 
> cases where we really care about time going backwards is 
> gettimeofday() - which uses tsc, but which also has tons of 
> serializing instructions on its own. EXCEPT WHEN IT IS a vsyscall!
> 
> But in that case, we don't even have the barrier, because we put it 
> in the wrong function and 'forgot about it'. Of course, we may not 
> need it (rdtscp maybe always serializes, I didn't check), but the 
> point is, an explicit barrier is actually better than one that is 
> hidden.

no, we really had it in the vsyscall case: which uses vread, which 
uses __native_read_tsc(), which had the barriers.

And i think that's the _only_ valid place to have it.

So that's why my change moves it from the __native_read_tsc() over to 
_only_ the vget_cycles().

am i missing something on such a nice Saturday evening? :)

	Ingo

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [git pull] scheduler updates
  2008-11-08 18:41   ` Arjan van de Ven
@ 2008-11-08 19:00     ` Linus Torvalds
  2008-11-08 19:05       ` Ingo Molnar
  2008-11-08 19:10       ` Ingo Molnar
  0 siblings, 2 replies; 24+ messages in thread
From: Linus Torvalds @ 2008-11-08 19:00 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Ingo Molnar, linux-kernel, Andrew Morton, Peter Zijlstra, Mike Galbraith



On Sat, 8 Nov 2008, Arjan van de Ven wrote:
> 
> historically it was for early AMD cpus (K7, not sure if early K8 did
> this) where 2 consecutive rdtsc's in the same codestream would get
> reordered compared to each other, so you could observe the tsc go
> backwards...

.. but this only happens with two _consecutive_ ones.

The thing is, nobody sane does that in generic code. The scheduler wants 
to have cycles, yes, but two consecutive scheduler invocations will have 
spinlocks etc in between. That's true of _all_ sane uses of a TSC.

I don't see that there is ever any reason to do the barriers for any 
normal case. And the cases where it does matter would actually be worth 
pointing out (ie making the barriers explicit in those cases, and those 
cases only).

Doing it in get_cycles() and "forgetting about it" may sound like a simple 
solution, but it's likely wrong. For example, one of the few cases where 
we really care about time going backwards is gettimeofday() - which uses 
tsc, but which also has tons of serializing instructions on its own. 
EXCEPT WHEN IT IS a vsyscall!

But in that case, we don't even have the barrier, because we put it in the 
wrong function and 'forgot about it'. Of course, we may not need it 
(rdtscp maybe always serializes, I didn't check), but the point is, an 
explicit barrier is actually better than one that is hidden.

So who _really_ needs it? And why not just do it there?

			Linus

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [git pull] scheduler updates
  2008-11-08 18:52   ` Ingo Molnar
@ 2008-11-08 18:57     ` Ingo Molnar
  2008-11-08 19:32     ` Ingo Molnar
  1 sibling, 0 replies; 24+ messages in thread
From: Ingo Molnar @ 2008-11-08 18:57 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: linux-kernel, Andrew Morton, Peter Zijlstra, Mike Galbraith,
	Thomas Gleixner, H. Peter Anvin


* Ingo Molnar <mingo@elte.hu> wrote:

> For that one, i chickened out, because we have this use in 
> arch/x86/kernel/vsyscall_64.c:
> 
>                 now = vread();
>                 base = __vsyscall_gtod_data.clock.cycle_last;
>                 mask = __vsyscall_gtod_data.clock.mask;
>                 mult = __vsyscall_gtod_data.clock.mult;
>                 shift = __vsyscall_gtod_data.clock.shift;
> 
> which can be triggered by gettimeofday() on certain systems.
>
> And i couldnt convince myself that this sequence couldnt result in 
> userspace-observable GTOD time warps there, so i went for the 
> obvious fix first.
> 
> If the "now = vread()"'s RDTSC instruction is speculated to after it 
> reads cycle_last, and another vdso call shortly after this does 
> another RDTSC in this same sequence, the two RDTSC's could be mixed 
> up in theory, resulting in negative time?

the fuller sequence is:

                now = vread();
                base = __vsyscall_gtod_data.clock.cycle_last;
                mask = __vsyscall_gtod_data.clock.mask;
                mult = __vsyscall_gtod_data.clock.mult;
                shift = __vsyscall_gtod_data.clock.shift;

                tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
                nsec = __vsyscall_gtod_data.wall_time_nsec;
        } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));

now here we could have another race as well: on another CPU we have a 
timer IRQ running, which updates 
__vsyscall_gtod_data.wall_time_[n]sec.

now __vsyscall_gtod_data updates are protected via the 
__vsyscall_gtod_data.lock seqlock, but that assumes that all 
instructions within that sequence listen to the barriers.

Except for RDTSC, which can be speculated to outside that region of 
code.

RDTSC has no 'explicit' data dependency - there's no MESI-alike 
coherency guarantee for stuffing a cycle counter into a register and 
then putting that into __vsyscall_gtod_data.clock.cycle_last. So we 
create one, by using the combination of LFENCE and SFENCE. (because 
RDTSC implementations on Intel and AMD CPUs listen to different 
sequences.)

all in one, i think it's still needed to avoid negative GTOD jumps. 

	Ingo

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [git pull] scheduler updates
  2008-11-08 18:28 ` Linus Torvalds
  2008-11-08 18:38   ` Linus Torvalds
  2008-11-08 18:41   ` Arjan van de Ven
@ 2008-11-08 18:52   ` Ingo Molnar
  2008-11-08 18:57     ` Ingo Molnar
  2008-11-08 19:32     ` Ingo Molnar
  2 siblings, 2 replies; 24+ messages in thread
From: Ingo Molnar @ 2008-11-08 18:52 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: linux-kernel, Andrew Morton, Peter Zijlstra, Mike Galbraith,
	Thomas Gleixner, H. Peter Anvin


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Sat, 8 Nov 2008, Ingo Molnar wrote:
> >
> > Ingo Molnar (2):
> >       sched: improve sched_clock() performance
> >       sched: optimize sched_clock() a bit
> 
> Btw, why do we do that _idiotic_ rdtsc_barrier() AT ALL?
> 
> No sane user can possibly want it. If you do 'rdtsc', there's 
> nothing you can do about a few cycles difference due to OoO 
> _anyway_. Adding barriers is entirely meaningless - it's not going 
> to make the return value mean anything else.
> 
> Can we please just remove that idiocy? Or can somebody give a _sane_ 
> argument for it?

yeah, i had the same thinking, so i zapped it for everything but the 
vdso driven vgettimeofday GTOD method.

For that one, i chickened out, because we have this use in 
arch/x86/kernel/vsyscall_64.c:

                now = vread();
                base = __vsyscall_gtod_data.clock.cycle_last;
                mask = __vsyscall_gtod_data.clock.mask;
                mult = __vsyscall_gtod_data.clock.mult;
                shift = __vsyscall_gtod_data.clock.shift;

which can be triggered by gettimeofday() on certain systems.

And i couldnt convince myself that this sequence couldnt result in 
userspace-observable GTOD time warps there, so i went for the obvious 
fix first.

If the "now = vread()"'s RDTSC instruction is speculated to after it 
reads cycle_last, and another vdso call shortly after this does 
another RDTSC in this same sequence, the two RDTSC's could be mixed up 
in theory, resulting in negative time?

I _think_ i heard some noises in the past that this could indeed 
happen (and have vague memories that this was the justification for 
the barrier's introduction), but have to check the old emails to 
figure out what exactly the issue was and on what CPUs.

It's not completely impossible for this to happen, as the vdso calls 
are really just simple function calls, so not nearly as strongly 
serialized as say a real syscall based gettimeofday() call.

In any case, it failed my "it must be obvious within 1 minute for it 
to be eligible for sched/urgent" threshold and i didnt want to 
introduce a time warp.

But you are right, and i've queued up the full fix below as well, as a 
reminder.

	Ingo

----------------->
>From 2efe2c42e008a80ebe1992db63749386778f7df8 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 8 Nov 2008 19:46:48 +0100
Subject: [PATCH] x86, time: remove rdtsc_barrier()

Linus pointed out that even for vread() rdtsc_barrier() is pointless
overhead - as due to speculative instructions there's no such thing
as reliable cycle count anyway.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/system.h |   13 -------------
 arch/x86/include/asm/tsc.h    |    6 +-----
 2 files changed, 1 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index 2ed3f0f..1a1d45e 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -409,17 +409,4 @@ void default_idle(void);
 #define set_mb(var, value) do { var = value; barrier(); } while (0)
 #endif
 
-/*
- * Stop RDTSC speculation. This is needed when you need to use RDTSC
- * (or get_cycles or vread that possibly accesses the TSC) in a defined
- * code region.
- *
- * (Could use an alternative three way for this if there was one.)
- */
-static inline void rdtsc_barrier(void)
-{
-	alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
-	alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
-}
-
 #endif /* _ASM_X86_SYSTEM_H */
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 9cd83a8..700aeb8 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -44,11 +44,7 @@ static __always_inline cycles_t vget_cycles(void)
 	if (!cpu_has_tsc)
 		return 0;
 #endif
-	rdtsc_barrier();
-	cycles = (cycles_t)__native_read_tsc();
-	rdtsc_barrier();
-
-	return cycles;
+	return (cycles_t)__native_read_tsc();
 }
 
 extern void tsc_init(void);

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [git pull] scheduler updates
  2008-11-08 18:28 ` Linus Torvalds
  2008-11-08 18:38   ` Linus Torvalds
@ 2008-11-08 18:41   ` Arjan van de Ven
  2008-11-08 19:00     ` Linus Torvalds
  2008-11-08 18:52   ` Ingo Molnar
  2 siblings, 1 reply; 24+ messages in thread
From: Arjan van de Ven @ 2008-11-08 18:41 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ingo Molnar, linux-kernel, Andrew Morton, Peter Zijlstra, Mike Galbraith

On Sat, 8 Nov 2008 10:28:21 -0800 (PST)
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> 
> 
> On Sat, 8 Nov 2008, Ingo Molnar wrote:
> >
> > Ingo Molnar (2):
> >       sched: improve sched_clock() performance
> >       sched: optimize sched_clock() a bit
> 
> Btw, why do we do that _idiotic_ rdtsc_barrier() AT ALL?
> 
> No sane user can possibly want it. If you do 'rdtsc', there's nothing
> you can do about a few cycles difference due to OoO _anyway_. Adding
> barriers is entirely meaningless - it's not going to make the return
> value mean anything else.
> 
> Can we please just remove that idiocy? Or can somebody give a _sane_ 
> argument for it?

historically it was for early AMD cpus (K7, not sure if early K8 did
this) where 2 consecutive rdtsc's in the same codestream would get
reordered compared to each other, so you could observe the tsc go
backwards...



-- 
Arjan van de Ven 	Intel Open Source Technology Centre
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [git pull] scheduler updates
  2008-11-08 18:28 ` Linus Torvalds
@ 2008-11-08 18:38   ` Linus Torvalds
  2008-11-08 18:41   ` Arjan van de Ven
  2008-11-08 18:52   ` Ingo Molnar
  2 siblings, 0 replies; 24+ messages in thread
From: Linus Torvalds @ 2008-11-08 18:38 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: linux-kernel, Andrew Morton, Peter Zijlstra, Mike Galbraith



On Sat, 8 Nov 2008, Linus Torvalds wrote:
> 
> Can we please just remove that idiocy? Or can somebody give a _sane_ 
> argument for it?

Btw, the only _possible_ sane argument I see is 

 - two consecutive rdtsc calls
 - timing the code in between
 - the code in between is not self-serializing

and quite frankly, if that's the case, then it's _that_ code that should 
have the barriers, not some generic "[v]get_cycles()". 

IOW, the rdtsc_barrier may make sense when you're synchronizing the TSC to 
some other hardware event (eg the "tie the TSC to the HPET" kind of 
code), but then the barriers are about the code, not about the TSC access 
itself.

			Linus

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [git pull] scheduler updates
  2008-11-08 17:02 [git pull] " Ingo Molnar
@ 2008-11-08 18:28 ` Linus Torvalds
  2008-11-08 18:38   ` Linus Torvalds
                     ` (2 more replies)
  0 siblings, 3 replies; 24+ messages in thread
From: Linus Torvalds @ 2008-11-08 18:28 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: linux-kernel, Andrew Morton, Peter Zijlstra, Mike Galbraith



On Sat, 8 Nov 2008, Ingo Molnar wrote:
>
> Ingo Molnar (2):
>       sched: improve sched_clock() performance
>       sched: optimize sched_clock() a bit

Btw, why do we do that _idiotic_ rdtsc_barrier() AT ALL?

No sane user can possibly want it. If you do 'rdtsc', there's nothing you 
can do about a few cycles difference due to OoO _anyway_. Adding barriers 
is entirely meaningless - it's not going to make the return value mean 
anything else.

Can we please just remove that idiocy? Or can somebody give a _sane_ 
argument for it?

		Linus

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [git pull] scheduler updates
@ 2008-11-08 17:02 Ingo Molnar
  2008-11-08 18:28 ` Linus Torvalds
  0 siblings, 1 reply; 24+ messages in thread
From: Ingo Molnar @ 2008-11-08 17:02 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: linux-kernel, Andrew Morton, Peter Zijlstra, Mike Galbraith

Linus,

Please pull the latest scheduler updates git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git sched-fixes-for-linus

 Thanks,

	Ingo

------------------>
Ingo Molnar (2):
      sched: improve sched_clock() performance
      sched: optimize sched_clock() a bit


 arch/x86/include/asm/msr.h |    2 --
 arch/x86/include/asm/tsc.h |    8 +++++++-
 arch/x86/kernel/tsc.c      |    2 +-
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 46be2fa..c2a812e 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -108,9 +108,7 @@ static __always_inline unsigned long long __native_read_tsc(void)
 {
 	DECLARE_ARGS(val, low, high);
 
-	rdtsc_barrier();
 	asm volatile("rdtsc" : EAX_EDX_RET(val, low, high));
-	rdtsc_barrier();
 
 	return EAX_EDX_VAL(val, low, high);
 }
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 38ae163..9cd83a8 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -34,6 +34,8 @@ static inline cycles_t get_cycles(void)
 
 static __always_inline cycles_t vget_cycles(void)
 {
+	cycles_t cycles;
+
 	/*
 	 * We only do VDSOs on TSC capable CPUs, so this shouldnt
 	 * access boot_cpu_data (which is not VDSO-safe):
@@ -42,7 +44,11 @@ static __always_inline cycles_t vget_cycles(void)
 	if (!cpu_has_tsc)
 		return 0;
 #endif
-	return (cycles_t)__native_read_tsc();
+	rdtsc_barrier();
+	cycles = (cycles_t)__native_read_tsc();
+	rdtsc_barrier();
+
+	return cycles;
 }
 
 extern void tsc_init(void);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 2ef80e3..424093b 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -55,7 +55,7 @@ u64 native_sched_clock(void)
 	rdtscll(this_offset);
 
 	/* return the value in ns */
-	return cycles_2_ns(this_offset);
+	return __cycles_2_ns(this_offset);
 }
 
 /* We need to define a real function for sched_clock, to override the

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [git pull] scheduler updates
@ 2008-03-21 16:23 Ingo Molnar
  0 siblings, 0 replies; 24+ messages in thread
From: Ingo Molnar @ 2008-03-21 16:23 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, Andrew Morton


Linus, please pull the latest scheduler git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched-devel.git for-linus

(the topology.h change is a NOP for upstream, to make it easier for s390 
to work on their topology changes.)

Thanks,

	Ingo

------------------>
Heiko Carstens (2):
      sched: add exported arch_reinit_sched_domains() to header file.
      sched: add arch_update_cpu_topology hook.

Peter Zijlstra (1):
      sched: cleanup old and rarely used 'debug' features.

Roel Kluin (1):
      sched: remove double unlikely from schedule()

 include/linux/sched.h    |    1 +
 include/linux/topology.h |    2 ++
 kernel/sched.c           |   17 +++++++++--------
 kernel/sched_fair.c      |   14 --------------
 4 files changed, 12 insertions(+), 22 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3625fca..fed07d0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -790,6 +790,7 @@ struct sched_domain {
 };
 
 extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new);
+extern int arch_reinit_sched_domains(void);
 
 #endif	/* CONFIG_SMP */
 
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 2d8dac8..bd14f8b 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -50,6 +50,8 @@
 	for_each_online_node(node)						\
 		if (nr_cpus_node(node))
 
+void arch_update_cpu_topology(void);
+
 /* Conform to ACPI 2.0 SLIT distance definitions */
 #define LOCAL_DISTANCE		10
 #define REMOTE_DISTANCE		20
diff --git a/kernel/sched.c b/kernel/sched.c
index 3f7c5eb..28c73f0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -594,18 +594,14 @@ enum {
 	SCHED_FEAT_NEW_FAIR_SLEEPERS	= 1,
 	SCHED_FEAT_WAKEUP_PREEMPT	= 2,
 	SCHED_FEAT_START_DEBIT		= 4,
-	SCHED_FEAT_TREE_AVG		= 8,
-	SCHED_FEAT_APPROX_AVG		= 16,
-	SCHED_FEAT_HRTICK		= 32,
-	SCHED_FEAT_DOUBLE_TICK		= 64,
+	SCHED_FEAT_HRTICK		= 8,
+	SCHED_FEAT_DOUBLE_TICK		= 16,
 };
 
 const_debug unsigned int sysctl_sched_features =
 		SCHED_FEAT_NEW_FAIR_SLEEPERS	* 1 |
 		SCHED_FEAT_WAKEUP_PREEMPT	* 1 |
 		SCHED_FEAT_START_DEBIT		* 1 |
-		SCHED_FEAT_TREE_AVG		* 0 |
-		SCHED_FEAT_APPROX_AVG		* 0 |
 		SCHED_FEAT_HRTICK		* 1 |
 		SCHED_FEAT_DOUBLE_TICK		* 0;
 
@@ -3886,7 +3882,7 @@ need_resched_nonpreemptible:
 
 	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
 		if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
-				unlikely(signal_pending(prev)))) {
+				signal_pending(prev))) {
 			prev->state = TASK_RUNNING;
 		} else {
 			deactivate_task(rq, prev, 1);
@@ -6811,6 +6807,10 @@ static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
  */
 static cpumask_t fallback_doms;
 
+void __attribute__((weak)) arch_update_cpu_topology(void)
+{
+}
+
 /*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
  * For now this just excludes isolated cpus, but could be used to
@@ -6820,6 +6820,7 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map)
 {
 	int err;
 
+	arch_update_cpu_topology();
 	ndoms_cur = 1;
 	doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
 	if (!doms_cur)
@@ -6924,7 +6925,7 @@ match2:
 }
 
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-static int arch_reinit_sched_domains(void)
+int arch_reinit_sched_domains(void)
 {
 	int err;
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index b85cac4..86a9337 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -302,11 +302,6 @@ static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running)
 	return vslice;
 }
 
-static u64 sched_vslice(struct cfs_rq *cfs_rq)
-{
-	return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running);
-}
-
 static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	return __sched_vslice(cfs_rq->load.weight + se->load.weight,
@@ -504,15 +499,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 	} else
 		vruntime = cfs_rq->min_vruntime;
 
-	if (sched_feat(TREE_AVG)) {
-		struct sched_entity *last = __pick_last_entity(cfs_rq);
-		if (last) {
-			vruntime += last->vruntime;
-			vruntime >>= 1;
-		}
-	} else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running)
-		vruntime += sched_vslice(cfs_rq)/2;
-
 	/*
 	 * The 'current' period is already promised to the current tasks,
 	 * however the extra weight of the new task will slow them down a

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [git pull] scheduler updates
@ 2008-02-13 15:58 Ingo Molnar
  0 siblings, 0 replies; 24+ messages in thread
From: Ingo Molnar @ 2008-02-13 15:58 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, Andrew Morton, Peter Zijlstra


Linus, please pull the latest scheduler git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched.git

Find the shortlog and diffstat below.

The main body of this tree is about resolving interactions between the 
group scheduler and the new rt_ratio API. This restructuring also fixes 
an -rc1 bug where tasks would try to gain RT privileges that were in an 
UID group by default which had no RT runtime assigned to them - 
resulting in those tasks not running at all.

Furthermore, Peter found that the "ratio" API was a bit unfortunate 
choice (which causes some arbitrary granularity to be chosen) so it's 
microseconds based now. Also, RT group scheduling is now selectable in 
the .config (off by default). These all got a bit intrusive [and had a 
bug that took some time to find] so i kept testing it for a while so it 
missed -rc1.

It's good IMO and most of the code is default-disabled - but if this 
looks too complex our other option would be to disable the new RT bits 
in .25 and do it in .26.

	Ingo

------------------>
Peter Zijlstra (8):
      hrtimer: more hrtimer_init_sleeper() fallout.
      sched: fair-group: separate tg->shares from task_group_lock
      sched: fix incorrect irq lock usage in normalize_rt_tasks()
      sched: rt-group: deal with PI
      sched: rt-group: interface
      sched: rt-group: make rt groups scheduling configurable
      sched: rt-group: clean up the ifdeffery
      sched: rt-group: refure unrunnable tasks

 Documentation/sched-rt-group.txt |   59 +++++
 include/linux/cgroup_subsys.h    |    2 +-
 include/linux/sched.h            |   18 +-
 init/Kconfig                     |   23 ++-
 kernel/rtmutex.c                 |    5 +-
 kernel/sched.c                   |  494 +++++++++++++++++++++++++++----------
 kernel/sched_rt.c                |  102 +++++---
 kernel/sysctl.c                  |   32 ++--
 kernel/user.c                    |   50 ++++-
 9 files changed, 576 insertions(+), 209 deletions(-)
 create mode 100644 Documentation/sched-rt-group.txt


^ permalink raw reply	[flat|nested] 24+ messages in thread

* [git pull] scheduler updates
@ 2008-01-31 21:54 Ingo Molnar
  0 siblings, 0 replies; 24+ messages in thread
From: Ingo Molnar @ 2008-01-31 21:54 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel


Linus, please pull the latest scheduler-fixes git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched.git

Thanks! The only behavioral changes are two fixes for interactivity 
problems reported recently. (they may also be backport candidates for 
v2.6.24)

	Ingo

------------------>
Gerald Stralko (1):
      sched: remove unused params

Ingo Molnar (1):
      debug: turn ignore_loglevel into an early param

Paul E. McKenney (1):
      RCU: add help text for "RCU implementation type"

Peter Zijlstra (1):
      sched: let +nice tasks have smaller impact

Srivatsa Vaddagiri (1):
      sched: fix high wake up latencies with FAIR_USER_SCHED

 init/Kconfig        |    8 ++++++++
 kernel/printk.c     |    4 ++--
 kernel/sched.c      |   10 +++++-----
 kernel/sched_fair.c |    8 ++++++--
 4 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index 0d0bbf2..dcc96a8 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -775,6 +775,14 @@ config PREEMPT_NOTIFIERS
 choice
 	prompt "RCU implementation type:"
 	default CLASSIC_RCU
+	help
+	  This allows you to choose either the classic RCU implementation
+	  that is designed for best read-side performance on non-realtime
+	  systems, or the preemptible RCU implementation for best latency
+	  on realtime systems.  Note that some kernel preemption modes
+	  will restrict your choice.
+
+	  Select the default if you are unsure.
 
 config CLASSIC_RCU
 	bool "Classic RCU"
diff --git a/kernel/printk.c b/kernel/printk.c
index 58bbec6..29ae1e9 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -455,10 +455,10 @@ static int __init ignore_loglevel_setup(char *str)
 	ignore_loglevel = 1;
 	printk(KERN_INFO "debug: ignoring loglevel setting.\n");
 
-	return 1;
+	return 0;
 }
 
-__setup("ignore_loglevel", ignore_loglevel_setup);
+early_param("ignore_loglevel", ignore_loglevel_setup);
 
 /*
  * Write out chars from start to end - 1 inclusive
diff --git a/kernel/sched.c b/kernel/sched.c
index ba4c880..8355e00 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1255,12 +1255,12 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 
 #define sched_class_highest (&rt_sched_class)
 
-static void inc_nr_running(struct task_struct *p, struct rq *rq)
+static void inc_nr_running(struct rq *rq)
 {
 	rq->nr_running++;
 }
 
-static void dec_nr_running(struct task_struct *p, struct rq *rq)
+static void dec_nr_running(struct rq *rq)
 {
 	rq->nr_running--;
 }
@@ -1354,7 +1354,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
 		rq->nr_uninterruptible--;
 
 	enqueue_task(rq, p, wakeup);
-	inc_nr_running(p, rq);
+	inc_nr_running(rq);
 }
 
 /*
@@ -1366,7 +1366,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
 		rq->nr_uninterruptible++;
 
 	dequeue_task(rq, p, sleep);
-	dec_nr_running(p, rq);
+	dec_nr_running(rq);
 }
 
 /**
@@ -2006,7 +2006,7 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		 * management (if any):
 		 */
 		p->sched_class->task_new(rq, p);
-		inc_nr_running(p, rq);
+		inc_nr_running(rq);
 	}
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 72e25c7..6c091d6 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -520,7 +520,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
 	if (!initial) {
 		/* sleeps upto a single latency don't count. */
-		if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se))
+		if (sched_feat(NEW_FAIR_SLEEPERS))
 			vruntime -= sysctl_sched_latency;
 
 		/* ensure we never gain time by being placed backwards. */
@@ -1106,7 +1106,11 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 	}
 
 	gran = sysctl_sched_wakeup_granularity;
-	if (unlikely(se->load.weight != NICE_0_LOAD))
+	/*
+	 * More easily preempt - nice tasks, while not making
+	 * it harder for + nice tasks.
+	 */
+	if (unlikely(se->load.weight > NICE_0_LOAD))
 		gran = calc_delta_fair(gran, &se->load);
 
 	if (pse->vruntime + gran < se->vruntime)

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [git pull] scheduler updates
@ 2007-12-30 16:45 Ingo Molnar
  0 siblings, 0 replies; 24+ messages in thread
From: Ingo Molnar @ 2007-12-30 16:45 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, Andrew Morton


Linus, please pull the latest scheduler git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched.git

Warning fix for sparc64 - no code changed. Thanks!

	Ingo

------------------>
Ingo Molnar (1):
      sched: fix gcc warnings

 sched_debug.c |    8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index d30467b..80fbbfc 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -31,9 +31,9 @@
 /*
  * Ease the printing of nsec fields:
  */
-static long long nsec_high(long long nsec)
+static long long nsec_high(unsigned long long nsec)
 {
-	if (nsec < 0) {
+	if ((long long)nsec < 0) {
 		nsec = -nsec;
 		do_div(nsec, 1000000);
 		return -nsec;
@@ -43,9 +43,9 @@ static long long nsec_high(long long nsec)
 	return nsec;
 }
 
-static unsigned long nsec_low(long long nsec)
+static unsigned long nsec_low(unsigned long long nsec)
 {
-	if (nsec < 0)
+	if ((long long)nsec < 0)
 		nsec = -nsec;
 
 	return do_div(nsec, 1000000);


^ permalink raw reply	[flat|nested] 24+ messages in thread

* [git pull] scheduler updates
@ 2007-10-24 16:39 Ingo Molnar
  0 siblings, 0 replies; 24+ messages in thread
From: Ingo Molnar @ 2007-10-24 16:39 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel


Linus, please pull the latest scheduler git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched.git

It includes 11 commits: 3 (low-key) fixlets, 6 low-impact cleanups, the 
marking of CONFIG_FAIR_GROUP_SCHED as !EXPERIMENTAL and a debug-printout 
improvement.

The "reduce balance-tasks overhead" commit is admittedly borderline for 
a post-rc1 pull because it is primarily a speedup and only secondarily a 
cleanup - but it's been in my queue for some time with no problems, and a 
follow-up cleanup depended on it too, so I included it.

Build and boot tested on x86 32-bit and 64-bit.

	Ingo

------------------>
Adrian Bunk (1):
      sched: make cpu_shares_{show,store}() static

Ingo Molnar (3):
      sched: fix fastcall mismatch in completion APIs
      sched: clean up sched_domain_debug()
      sched: mark CONFIG_FAIR_GROUP_SCHED as !EXPERIMENTAL

Joe Perches (1):
      sched: constify sched.h

Mel Gorman (1):
      sched: document profile=sleep requiring CONFIG_SCHEDSTATS

Milton Miller (1):
      sched: fix sched_domain sysctl registration again

Paul Menage (1):
      sched: clean up some control group code

Peter Williams (2):
      sched: reduce balance-tasks overhead
      sched: isolate SMP balancing code a bit more

Satyam Sharma (1):
      sched: use show_regs() to improve __schedule_bug() output

 Documentation/kernel-parameters.txt |    3 
 include/linux/completion.h          |   18 -
 include/linux/sched.h               |   37 ++--
 init/Kconfig                        |    1 
 kernel/profile.c                    |    5 
 kernel/sched.c                      |  330 +++++++++++++++++++-----------------
 kernel/sched_fair.c                 |   48 ++++-
 kernel/sched_idletask.c             |   18 +
 kernel/sched_rt.c                   |   32 ++-
 kernel/user.c                       |    5 
 10 files changed, 292 insertions(+), 205 deletions(-)

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2018-02-06 21:38 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-02-29 18:04 [git pull] scheduler updates Ingo Molnar
  -- strict thread matches above, loose matches on Subject: below --
2018-02-06 21:38 [GIT PULL] " Ingo Molnar
2008-11-08 17:02 [git pull] " Ingo Molnar
2008-11-08 18:28 ` Linus Torvalds
2008-11-08 18:38   ` Linus Torvalds
2008-11-08 18:41   ` Arjan van de Ven
2008-11-08 19:00     ` Linus Torvalds
2008-11-08 19:05       ` Ingo Molnar
2008-11-08 19:20         ` Linus Torvalds
2008-11-08 19:29           ` Ingo Molnar
2008-11-17 22:43             ` Venki Pallipadi
2008-11-17 22:50               ` Ingo Molnar
2008-11-17 23:04                 ` Venki Pallipadi
2008-11-17 23:13                   ` Ingo Molnar
2008-11-08 19:40           ` Ingo Molnar
2008-11-08 19:10       ` Ingo Molnar
2008-11-08 18:52   ` Ingo Molnar
2008-11-08 18:57     ` Ingo Molnar
2008-11-08 19:32     ` Ingo Molnar
2008-03-21 16:23 Ingo Molnar
2008-02-13 15:58 Ingo Molnar
2008-01-31 21:54 Ingo Molnar
2007-12-30 16:45 Ingo Molnar
2007-10-24 16:39 Ingo Molnar

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).