LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH]: Fix bogus softlockup warning with sysrq-t
@ 2007-03-15 13:22 Prarit Bhargava
  2007-03-23 23:46 ` Rick Lindsley
  2007-03-27  5:43 ` Jeremy Fitzhardinge
  0 siblings, 2 replies; 8+ messages in thread
From: Prarit Bhargava @ 2007-03-15 13:22 UTC (permalink / raw)
  To: linux-kernel, akpm, clalance, mingo, davej; +Cc: Prarit Bhargava

There are some situations when soft lockup warnings are expected in the
kernel.  For example, when doing an alt-sysrq-t on a large number of processes,
the dump to console can take a long time and the tasklist_lock is held over
that period.  This results in a bogus soft lockup warning.

This patch reworks touch_softlockup_watchdog to touch ALL cpu's
touch_timestamp.  It also introduces touch_cpu_softlockup_watchdog to touch
a single cpu's touch_timestamp.  This makes it functionally equivalent to
touch_nmi_watchdog.

touch_nmi_watchdog is not modified -- AFAICT it was attempting to touch all
cpu's softlockup watchdogs, not just a specific cpu.

/drivers/ide/ide-iops.c does not need to call touch_softlockup_watchdog as it
is done in the call to touch_nmi_watchdog.

The EXPORT_SYMBOL for touch_softlockup_watchdog is needed by
drivers/scsi/ips.ko

Acked-by: Chris Lalancette <clalance@redhat.com>
Signed-off-by: Prarit Bhargava <prarit@redhat.com>

diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c
index c58e933..db514bb 100644
--- a/arch/ia64/kernel/uncached.c
+++ b/arch/ia64/kernel/uncached.c
@@ -254,7 +254,7 @@ static int __init uncached_build_memmap(unsigned long uc_start,
 	struct gen_pool *pool = uncached_pools[nid].pool;
 	size_t size = uc_end - uc_start;
 
-	touch_softlockup_watchdog();
+	touch_cpu_softlockup_watchdog();
 
 	if (pool != NULL) {
 		memset((char *)uc_start, 0, size);
diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
index bd513f5..176c97b 100644
--- a/drivers/ide/ide-iops.c
+++ b/drivers/ide/ide-iops.c
@@ -1225,7 +1225,6 @@ int ide_wait_not_busy(ide_hwif_t *hwif, unsigned long timeout)
 		 */
 		if (stat == 0xff)
 			return -ENODEV;
-		touch_softlockup_watchdog();
 		touch_nmi_watchdog();
 	}
 	return -EBUSY;
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index 30175c7..5fb2dce 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -313,7 +313,7 @@ static void ide_pio_datablock(ide_drive_t *drive, struct request *rq,
 	if (rq->bio)	/* fs request */
 		rq->errors = 0;
 
-	touch_softlockup_watchdog();
+	touch_cpu_softlockup_watchdog();
 
 	switch (drive->hwif->data_phase) {
 	case TASKFILE_MULTI_IN:
diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 6af37b8..9669d5f 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -425,7 +425,7 @@ void nand_wait_ready(struct mtd_info *mtd)
 	do {
 		if (chip->dev_ready(mtd))
 			break;
-		touch_softlockup_watchdog();
+		touch_cpu_softlockup_watchdog();
 	} while (time_before(jiffies, timeo));
 	led_trigger_event(nand_led_trigger, LED_OFF);
 }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 49fe299..113421e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -225,6 +225,7 @@ extern void scheduler_tick(void);
 #ifdef CONFIG_DETECT_SOFTLOCKUP
 extern void softlockup_tick(void);
 extern void spawn_softlockup_task(void);
+extern void touch_cpu_softlockup_watchdog(void);
 extern void touch_softlockup_watchdog(void);
 #else
 static inline void softlockup_tick(void)
@@ -233,6 +234,9 @@ static inline void softlockup_tick(void)
 static inline void spawn_softlockup_task(void)
 {
 }
+static inline void touch_cpu_softlockup_watchdog(void)
+{
+}
 static inline void touch_softlockup_watchdog(void)
 {
 }
diff --git a/kernel/panic.c b/kernel/panic.c
index 623d182..9e4565c 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -132,7 +132,7 @@ NORET_TYPE void panic(const char * fmt, ...)
 #endif
 	local_irq_enable();
 	for (i = 0;;) {
-		touch_softlockup_watchdog();
+		touch_cpu_softlockup_watchdog();
 		i += panic_blink(i);
 		mdelay(1);
 		i++;
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 7fb8343..d259b12 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -323,7 +323,7 @@ int swsusp_resume(void)
 	 */
 	swsusp_free();
 	restore_processor_state();
-	touch_softlockup_watchdog();
+	touch_cpu_softlockup_watchdog();
 	device_power_up();
 	local_irq_enable();
 	return error;
diff --git a/kernel/sched.c b/kernel/sched.c
index a4ca632..243f558 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4783,6 +4783,7 @@ void show_state_filter(unsigned long state_filter)
 		if (p->state & state_filter)
 			show_task(p);
 	} while_each_thread(g, p);
+	touch_softlockup_watchdog();
 
 	read_unlock(&tasklist_lock);
 	/*
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 50afeb8..05b66db 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -8,6 +8,7 @@
  */
 #include <linux/mm.h>
 #include <linux/cpu.h>
+#include <linux/cpumask.h>
 #include <linux/init.h>
 #include <linux/delay.h>
 #include <linux/kthread.h>
@@ -34,10 +35,19 @@ static struct notifier_block panic_block = {
 	.notifier_call = softlock_panic,
 };
 
-void touch_softlockup_watchdog(void)
+void touch_cpu_softlockup_watchdog(void)
 {
 	__raw_get_cpu_var(touch_timestamp) = jiffies;
 }
+EXPORT_SYMBOL(touch_cpu_softlockup_watchdog);
+
+void touch_softlockup_watchdog(void)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		per_cpu(touch_timestamp, cpu) = jiffies;
+}
 EXPORT_SYMBOL(touch_softlockup_watchdog);
 
 /*
@@ -57,7 +67,7 @@ void softlockup_tick(void)
 
 	/* do not print during early bootup: */
 	if (unlikely(system_state != SYSTEM_RUNNING)) {
-		touch_softlockup_watchdog();
+		touch_cpu_softlockup_watchdog();
 		return;
 	}
 
@@ -94,7 +104,7 @@ static int watchdog(void * __bind_cpu)
 	 */
 	while (!kthread_should_stop()) {
 		set_current_state(TASK_INTERRUPTIBLE);
-		touch_softlockup_watchdog();
+		touch_cpu_softlockup_watchdog();
 		schedule();
 	}
 

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH]: Fix bogus softlockup warning with sysrq-t 
  2007-03-15 13:22 [PATCH]: Fix bogus softlockup warning with sysrq-t Prarit Bhargava
@ 2007-03-23 23:46 ` Rick Lindsley
  2007-03-27  5:43 ` Jeremy Fitzhardinge
  1 sibling, 0 replies; 8+ messages in thread
From: Rick Lindsley @ 2007-03-23 23:46 UTC (permalink / raw)
  To: linux-kernel; +Cc: Prarit Bhargava, akpm, clalance, mingo, davej

We've seen these here and had arrived at a similar patch.  Extensive prints
on the console can take longer than the watchdog likes.

Acked-by: Rick Lindsley <ricklind@us.ibm.com>

Rick

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH]: Fix bogus softlockup warning with sysrq-t
  2007-03-15 13:22 [PATCH]: Fix bogus softlockup warning with sysrq-t Prarit Bhargava
  2007-03-23 23:46 ` Rick Lindsley
@ 2007-03-27  5:43 ` Jeremy Fitzhardinge
  2007-03-27  6:47   ` Cestonaro, Thilo (external)
  2007-03-27 11:41   ` Prarit Bhargava
  1 sibling, 2 replies; 8+ messages in thread
From: Jeremy Fitzhardinge @ 2007-03-27  5:43 UTC (permalink / raw)
  To: Prarit Bhargava
  Cc: linux-kernel, akpm, clalance, mingo, davej, Thilo.Cestonaro.external

Prarit Bhargava wrote:
> There are some situations when soft lockup warnings are expected in the
> kernel.  For example, when doing an alt-sysrq-t on a large number of processes,
> the dump to console can take a long time and the tasklist_lock is held over
> that period.  This results in a bogus soft lockup warning.
>   

Wouldn't it be better to just temporarily disable softlockups for the
duration?

> This patch reworks touch_softlockup_watchdog to touch ALL cpu's
> touch_timestamp.  It also introduces touch_cpu_softlockup_watchdog to touch
> a single cpu's touch_timestamp.

Doesn't this mean that if one CPU gets locked up, it will be undetected
so long as some other CPU is making progress?

I have another pair of softlockup patches in which I try to address:

    * ignoring time stolen by hypervisors
    * threads going to sleep tickless for long periods of time

I could easy add a "global disable" function, which would allow long
sysrq messages, and it would help Thilo with his long flash update freezes.

    J

^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH]: Fix bogus softlockup warning with sysrq-t
  2007-03-27  5:43 ` Jeremy Fitzhardinge
@ 2007-03-27  6:47   ` Cestonaro, Thilo (external)
  2007-03-27 11:41   ` Prarit Bhargava
  1 sibling, 0 replies; 8+ messages in thread
From: Cestonaro, Thilo (external) @ 2007-03-27  6:47 UTC (permalink / raw)
  To: Jeremy Fitzhardinge; +Cc: linux-kernel

> I could easy add a "global disable" function, which would allow long
> sysrq messages, and it would help Thilo with his long flash update freezes.

A "global disable" and "reenable" functions pair which works during irq disabled,
would be a perfect solution for me. Thx Jeremy for your effort :)

Ciao Thilo

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH]: Fix bogus softlockup warning with sysrq-t
  2007-03-27  5:43 ` Jeremy Fitzhardinge
  2007-03-27  6:47   ` Cestonaro, Thilo (external)
@ 2007-03-27 11:41   ` Prarit Bhargava
  2007-03-27 16:46     ` Jeremy Fitzhardinge
  1 sibling, 1 reply; 8+ messages in thread
From: Prarit Bhargava @ 2007-03-27 11:41 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: linux-kernel, akpm, clalance, mingo, davej, Thilo.Cestonaro.external


> I have another pair of softlockup patches in which I try to address:
>
>     * ignoring time stolen by hypervisors
>     * threads going to sleep tickless for long periods of time
>
>   

I'm looking at the code now.  Your solution is definately better :)

> I could easy add a "global disable" function, which would allow long
> sysrq messages, and it would help Thilo with his long flash update freezes.
>
>   

I think that's a good idea -- I'll propose an add on patch to fix the 
sysrq-t case ...

P.

>     J
>   

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH]: Fix bogus softlockup warning with sysrq-t
  2007-03-27 11:41   ` Prarit Bhargava
@ 2007-03-27 16:46     ` Jeremy Fitzhardinge
  2007-03-27 16:56       ` Prarit Bhargava
  2007-03-27 17:35       ` Prarit Bhargava
  0 siblings, 2 replies; 8+ messages in thread
From: Jeremy Fitzhardinge @ 2007-03-27 16:46 UTC (permalink / raw)
  To: Prarit Bhargava
  Cc: linux-kernel, akpm, clalance, mingo, davej, Thilo.Cestonaro.external

Prarit Bhargava wrote:
> I think that's a good idea -- I'll propose an add on patch to fix the
> sysrq-t case ...

I'm working on this patch at the moment.  I'm just wondering what
happens if you do a global re-enable while a CPU is locally disabled.  I
think it won't matter; it will end up in the "enabled but need to update
timestamp" state, and the next time it gets a timer tick, it will simply
update the timestamp and carry on.

(This is relative to the other two softlockup patches, but modified
since I posted them.)

    J

diff -r 4c81d8cafb67 drivers/char/sysrq.c
--- a/drivers/char/sysrq.c	Tue Mar 27 01:16:07 2007 -0700
+++ b/drivers/char/sysrq.c	Tue Mar 27 01:18:05 2007 -0700
@@ -408,6 +408,8 @@ void __handle_sysrq(int key, struct tty_
 	int i;
 	unsigned long flags;
 
+	softlockup_global_disable();
+
 	spin_lock_irqsave(&sysrq_key_table_lock, flags);
 	orig_log_level = console_loglevel;
 	console_loglevel = 7;
@@ -445,6 +447,8 @@ void __handle_sysrq(int key, struct tty_
 		console_loglevel = orig_log_level;
 	}
 	spin_unlock_irqrestore(&sysrq_key_table_lock, flags);
+
+	softlockup_global_enable();
 }
 
 /*
diff -r 4c81d8cafb67 include/linux/sched.h
--- a/include/linux/sched.h	Tue Mar 27 01:16:07 2007 -0700
+++ b/include/linux/sched.h	Tue Mar 27 01:18:05 2007 -0700
@@ -235,6 +235,8 @@ extern void softlockup_tick(void);
 extern void softlockup_tick(void);
 extern void softlockup_enable(void);
 extern void softlockup_disable(void);
+extern void softlockup_global_enable(void);
+extern void softlockup_global_disable(void);
 extern void spawn_softlockup_task(void);
 extern void touch_softlockup_watchdog(void);
 #else
@@ -245,6 +247,12 @@ static inline void softlockup_enable(voi
 {
 }
 static inline void softlockup_disable(void)
+{
+}
+static inline void softlockup_global_enable(void)
+{
+}
+static inline void softlockup_global_disable(void)
 {
 }
 static inline void spawn_softlockup_task(void)
diff -r 4c81d8cafb67 kernel/softlockup.c
--- a/kernel/softlockup.c	Tue Mar 27 01:16:07 2007 -0700
+++ b/kernel/softlockup.c	Tue Mar 27 01:18:05 2007 -0700
@@ -17,10 +17,16 @@
 
 static DEFINE_SPINLOCK(print_lock);
 
+enum enable {
+	SL_OFF = 0,		/* disabled */
+	SL_UPDATE,		/* enabled, but timestamp old */
+	SL_ON,			/* enabled */
+};
+
 static DEFINE_PER_CPU(unsigned long long, touch_timestamp);
 static DEFINE_PER_CPU(unsigned long long, print_timestamp);
 static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
-static DEFINE_PER_CPU(int, enabled);
+static DEFINE_PER_CPU(enum enable, enabled);
 
 static int did_panic = 0;
 
@@ -39,6 +45,8 @@ void touch_softlockup_watchdog(void)
 void touch_softlockup_watchdog(void)
 {
 	__raw_get_cpu_var(touch_timestamp) = sched_clock();
+	barrier();			/* update timestamp before enable */
+	__raw_get_cpu_var(enabled) = SL_ON;
 }
 EXPORT_SYMBOL(touch_softlockup_watchdog);
 
@@ -57,13 +65,27 @@ void softlockup_enable(void)
 void softlockup_enable(void)
 {
 	touch_softlockup_watchdog();
-	barrier();			/* update timestamp before enable */
-	__get_cpu_var(enabled) = 1;
 }
 
 void softlockup_disable(void)
 {
-	__get_cpu_var(enabled) = 0;
+	__get_cpu_var(enabled) = SL_OFF;
+}
+
+void softlockup_global_enable()
+{
+	unsigned cpu;
+
+	for_each_online_cpu(cpu)
+		per_cpu(enabled, cpu) = SL_UPDATE;
+}
+
+void softlockup_global_disable()
+{
+	unsigned cpu;
+
+	for_each_online_cpu(cpu)
+		per_cpu(enabled, cpu) = SL_OFF;
 }
 
 /*
@@ -79,9 +101,19 @@ void softlockup_tick(void)
 
 	touch_timestamp = get_timestamp(&__get_cpu_var(touch_timestamp));
 
-	/* return if not enabled */
-	if (!__get_cpu_var(enabled))
-		return;
+	switch(__get_cpu_var(enabled)) {
+	case SL_OFF:
+		/* not enabled */
+		return;
+
+	case SL_UPDATE:
+		/* update timestamp */
+		touch_softlockup_watchdog();
+		break;
+
+	case SL_ON:
+		break;
+	}
 
 	print_timestamp = __get_cpu_var(print_timestamp);
 


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH]: Fix bogus softlockup warning with sysrq-t
  2007-03-27 16:46     ` Jeremy Fitzhardinge
@ 2007-03-27 16:56       ` Prarit Bhargava
  2007-03-27 17:35       ` Prarit Bhargava
  1 sibling, 0 replies; 8+ messages in thread
From: Prarit Bhargava @ 2007-03-27 16:56 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: linux-kernel, akpm, clalance, mingo, davej, Thilo.Cestonaro.external



Jeremy Fitzhardinge wrote:
> Prarit Bhargava wrote:
>   
>> I think that's a good idea -- I'll propose an add on patch to fix the
>> sysrq-t case ...
>>     
>
> I'm working on this patch at the moment.  I'm just wondering what
> happens if you do a global re-enable while a CPU is locally disabled.  I
> think it won't matter; it will end up in the "enabled but need to update
> timestamp" state, and the next time it gets a timer tick, it will simply
> update the timestamp and carry on.
>
> (This is relative to the other two softlockup patches, but modified
> since I posted them.)
>
>     J
>
> diff -r 4c81d8cafb67 drivers/char/sysrq.c
> --- a/drivers/char/sysrq.c	Tue Mar 27 01:16:07 2007 -0700
> +++ b/drivers/char/sysrq.c	Tue Mar 27 01:18:05 2007 -0700
> @@ -408,6 +408,8 @@ void __handle_sysrq(int key, struct tty_
>  	int i;
>  	unsigned long flags;
>  
> +	softlockup_global_disable();
> +
>  	spin_lock_irqsave(&sysrq_key_table_lock, flags);
>  	orig_log_level = console_loglevel;
>  	console_loglevel = 7;
> @@ -445,6 +447,8 @@ void __handle_sysrq(int key, struct tty_
>  		console_loglevel = orig_log_level;
>  	}
>  	spin_unlock_irqrestore(&sysrq_key_table_lock, flags);
> +
> +	softlockup_global_enable();
>   

I think that works -- I'll test it out on a big honkin' ia64 box ;)

Shouldn't you also do softlockup_disable/softlockup_enable instead of 
touch_softlockup_watchdog?  Why do we have both?  I can't see why we 
would have two exported methods to stop/reset the softlockup timer...

P.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH]: Fix bogus softlockup warning with sysrq-t
  2007-03-27 16:46     ` Jeremy Fitzhardinge
  2007-03-27 16:56       ` Prarit Bhargava
@ 2007-03-27 17:35       ` Prarit Bhargava
  1 sibling, 0 replies; 8+ messages in thread
From: Prarit Bhargava @ 2007-03-27 17:35 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: linux-kernel, akpm, clalance, mingo, davej, Thilo.Cestonaro.external



Jeremy Fitzhardinge wrote:
>
> diff -r 4c81d8cafb67 drivers/char/sysrq.c
> --- a/drivers/char/sysrq.c	Tue Mar 27 01:16:07 2007 -0700
> +++ b/drivers/char/sysrq.c	Tue Mar 27 01:18:05 2007 -0700
> @@ -408,6 +408,8 @@ void __handle_sysrq(int key, struct tty_
>  	int i;
>  	unsigned long flags;
>  
> +	softlockup_global_disable();
> +
>  	spin_lock_irqsave(&sysrq_key_table_lock, flags);
>  	orig_log_level = console_loglevel;
>  	console_loglevel = 7;
> @@ -445,6 +447,8 @@ void __handle_sysrq(int key, struct tty_
>  		console_loglevel = orig_log_level;
>  	}
>  	spin_unlock_irqrestore(&sysrq_key_table_lock, flags);
> +
> +	softlockup_global_enable();
>  }
>  
>   

I wonder if that's too strong of a fix.  It's only the sysrq-t case that 
is the problem.  AFAIK, none of the other sysrq-t operations hold the 
tasklist_lock for a long time.  I'd move these around show_state.

P.

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2007-03-27 17:35 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-03-15 13:22 [PATCH]: Fix bogus softlockup warning with sysrq-t Prarit Bhargava
2007-03-23 23:46 ` Rick Lindsley
2007-03-27  5:43 ` Jeremy Fitzhardinge
2007-03-27  6:47   ` Cestonaro, Thilo (external)
2007-03-27 11:41   ` Prarit Bhargava
2007-03-27 16:46     ` Jeremy Fitzhardinge
2007-03-27 16:56       ` Prarit Bhargava
2007-03-27 17:35       ` Prarit Bhargava

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).