LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH] s2io: add PCI error recovery support
@ 2007-02-15 23:08 Linas Vepstas
  0 siblings, 0 replies; 14+ messages in thread
From: Linas Vepstas @ 2007-02-15 23:08 UTC (permalink / raw)
  To: Ramkrishna Vepa, Raghavendra Koushik, Ananda Raju
  Cc: Wen Xiong, linux-kernel, linux-pci, netdev, Jeff Garzik, Andrew Morton


Koushik, Raju,

Please review, comment, and if you find this acceptable, 
please forward upstream. This patch incorporates all of 
fixes resulting from the last set of discussions, circa 
November 2006.

--linas

This patch adds PCI error recovery support to the 
s2io 10-Gigabit ethernet device driver. Fourth revision,
blocks interrupts and the watchdog. Adds a flag to 
s2io_down(), to avoid doing I/O when PCI bus is offline.

Tested, seems to work well.

Signed-off-by: Linas Vepstas <linas@austin.ibm.com>
Acked-by: Ramkrishna Vepa <Ramkrishna.Vepa@neterion.com>
Cc: Raghavendra Koushik <raghavendra.koushik@neterion.com>
Cc: Ananda Raju <Ananda.Raju@neterion.com>
Cc: Wen Xiong <wenxiong@us.ibm.com>

----
 drivers/net/s2io.c |  116 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 drivers/net/s2io.h |    5 ++
 2 files changed, 116 insertions(+), 5 deletions(-)

Index: linux-2.6.20-git4/drivers/net/s2io.c
===================================================================
--- linux-2.6.20-git4.orig/drivers/net/s2io.c	2007-02-15 15:39:35.000000000 -0600
+++ linux-2.6.20-git4/drivers/net/s2io.c	2007-02-15 16:15:10.000000000 -0600
@@ -435,11 +435,18 @@ static struct pci_device_id s2io_tbl[] _
 
 MODULE_DEVICE_TABLE(pci, s2io_tbl);
 
+static struct pci_error_handlers s2io_err_handler = {
+	.error_detected = s2io_io_error_detected,
+	.slot_reset = s2io_io_slot_reset,
+	.resume = s2io_io_resume,
+};
+
 static struct pci_driver s2io_driver = {
       .name = "S2IO",
       .id_table = s2io_tbl,
       .probe = s2io_init_nic,
       .remove = __devexit_p(s2io_rem_nic),
+      .err_handler = &s2io_err_handler,
 };
 
 /* A simplifier macro used both by init and free shared_mem Fns(). */
@@ -2577,6 +2584,9 @@ static void s2io_netpoll(struct net_devi
 	u64 val64 = 0xFFFFFFFFFFFFFFFFULL;
 	int i;
 
+	if (pci_channel_offline(nic->pdev))
+		return;
+
 	disable_irq(dev->irq);
 
 	atomic_inc(&nic->isr_cnt);
@@ -3079,6 +3089,8 @@ static void alarm_intr_handler(struct s2
 	int i;
 	if (atomic_read(&nic->card_state) == CARD_DOWN)
 		return;
+	if (pci_channel_offline(nic->pdev))
+		return;
 	nic->mac_control.stats_info->sw_stat.ring_full_cnt = 0;
 	/* Handling the XPAK counters update */
 	if(nic->mac_control.stats_info->xpak_stat.xpak_timer_count < 72000) {
@@ -4117,6 +4129,10 @@ static irqreturn_t s2io_isr(int irq, voi
 	struct mac_info *mac_control;
 	struct config_param *config;
 
+	/* Pretend we handled any irq's from a disconnected card */
+	if (pci_channel_offline(sp->pdev))
+		return IRQ_NONE;
+
 	atomic_inc(&sp->isr_cnt);
 	mac_control = &sp->mac_control;
 	config = &sp->config;
@@ -6188,7 +6204,7 @@ static void s2io_rem_isr(struct s2io_nic
 	} while(cnt < 5);
 }
 
-static void s2io_card_down(struct s2io_nic * sp)
+static void do_s2io_card_down(struct s2io_nic * sp, int do_io)
 {
 	int cnt = 0;
 	struct XENA_dev_config __iomem *bar0 = sp->bar0;
@@ -6203,7 +6219,8 @@ static void s2io_card_down(struct s2io_n
 	atomic_set(&sp->card_state, CARD_DOWN);
 
 	/* disable Tx and Rx traffic on the NIC */
-	stop_nic(sp);
+	if (do_io)
+		stop_nic(sp);
 
 	s2io_rem_isr(sp);
 
@@ -6211,7 +6228,7 @@ static void s2io_card_down(struct s2io_n
 	tasklet_kill(&sp->task);
 
 	/* Check if the device is Quiescent and then Reset the NIC */
-	do {
+	while(do_io) {
 		/* As per the HW requirement we need to replenish the
 		 * receive buffer to avoid the ring bump. Since there is
 		 * no intention of processing the Rx frame at this pointwe are
@@ -6236,8 +6253,9 @@ static void s2io_card_down(struct s2io_n
 				  (unsigned long long) val64);
 			break;
 		}
-	} while (1);
-	s2io_reset(sp);
+	}
+	if (do_io)
+		s2io_reset(sp);
 
 	spin_lock_irqsave(&sp->tx_lock, flags);
 	/* Free all Tx buffers */
@@ -6252,6 +6270,11 @@ static void s2io_card_down(struct s2io_n
 	clear_bit(0, &(sp->link_state));
 }
 
+static void s2io_card_down(struct s2io_nic * sp)
+{
+	do_s2io_card_down(sp, 1);
+}
+
 static int s2io_card_up(struct s2io_nic * sp)
 {
 	int i, ret = 0;
@@ -7536,3 +7559,86 @@ static void lro_append_pkt(struct s2io_n
 	sp->mac_control.stats_info->sw_stat.clubbed_frms_cnt++;
 	return;
 }
+
+/**
+ * s2io_io_error_detected - called when PCI error is detected
+ * @pdev: Pointer to PCI device
+ * @state: The current pci conneection state
+ *
+ * This function is called after a PCI bus error affecting
+ * this device has been detected.
+ */
+static pci_ers_result_t s2io_io_error_detected(struct pci_dev *pdev,
+                                               pci_channel_state_t state)
+{
+	struct net_device *netdev = pci_get_drvdata(pdev);
+	struct s2io_nic *sp = netdev->priv;
+
+	netif_device_detach(netdev);
+
+	if (netif_running(netdev)) {
+		/* Bring down the card, while avoiding PCI I/O */
+		do_s2io_card_down(sp, 0);
+		sp->device_close_flag = TRUE;	/* Device is shut down. */
+	}
+	pci_disable_device(pdev);
+
+	return PCI_ERS_RESULT_NEED_RESET;
+}
+
+/**
+ * s2io_io_slot_reset - called after the pci bus has been reset.
+ * @pdev: Pointer to PCI device
+ *
+ * Restart the card from scratch, as if from a cold-boot.
+ * At this point, the card has exprienced a hard reset,
+ * followed by fixups by BIOS, and has its config space
+ * set up identically to what it was at cold boot.
+ */
+static pci_ers_result_t s2io_io_slot_reset(struct pci_dev *pdev)
+{
+	struct net_device *netdev = pci_get_drvdata(pdev);
+	struct s2io_nic *sp = netdev->priv;
+
+	if (pci_enable_device(pdev)) {
+		printk(KERN_ERR "s2io: "
+		       "Cannot re-enable PCI device after reset.\n");
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+
+	pci_set_master(pdev);
+	s2io_reset(sp);
+
+	return PCI_ERS_RESULT_RECOVERED;
+}
+
+/**
+ * s2io_io_resume - called when traffic can start flowing again.
+ * @pdev: Pointer to PCI device
+ *
+ * This callback is called when the error recovery driver tells
+ * us that its OK to resume normal operation.
+ */
+static void s2io_io_resume(struct pci_dev *pdev)
+{
+	struct net_device *netdev = pci_get_drvdata(pdev);
+	struct s2io_nic *sp = netdev->priv;
+
+	if (netif_running(netdev)) {
+		if (s2io_card_up(sp)) {
+			printk(KERN_ERR "s2io: "
+			       "Can't bring device back up after reset.\n");
+			return;
+		}
+
+		if (s2io_set_mac_addr(netdev, netdev->dev_addr) == FAILURE) {
+			s2io_card_down(sp);
+			printk(KERN_ERR "s2io: "
+			       "Can't resetore mac addr after reset.\n");
+			return;
+		}
+	}
+
+	netif_device_attach(netdev);
+	netif_wake_queue(netdev);
+}
Index: linux-2.6.20-git4/drivers/net/s2io.h
===================================================================
--- linux-2.6.20-git4.orig/drivers/net/s2io.h	2007-02-15 15:39:35.000000000 -0600
+++ linux-2.6.20-git4/drivers/net/s2io.h	2007-02-15 16:59:40.000000000 -0600
@@ -1020,6 +1020,11 @@ static void update_L3L4_header(struct s2
 static void lro_append_pkt(struct s2io_nic *sp, struct lro *lro,
 			   struct sk_buff *skb, u32 tcp_len);
 
+static pci_ers_result_t s2io_io_error_detected(struct pci_dev *pdev,
+			                      pci_channel_state_t state);
+static pci_ers_result_t s2io_io_slot_reset(struct pci_dev *pdev);
+static void s2io_io_resume(struct pci_dev *pdev);
+
 #define s2io_tcp_mss(skb) skb_shinfo(skb)->gso_size
 #define s2io_udp_mss(skb) skb_shinfo(skb)->gso_size
 #define s2io_offload_type(skb) skb_shinfo(skb)->gso_type

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] s2io: add PCI error recovery support
  2007-03-16 19:49 ` Linas Vepstas
@ 2007-03-16 19:58   ` Ramkrishna Vepa
  0 siblings, 0 replies; 14+ messages in thread
From: Ramkrishna Vepa @ 2007-03-16 19:58 UTC (permalink / raw)
  To: Linas Vepstas
  Cc: Wen Xiong, linux-kernel, linux-pci, netdev, Jeff Garzik, Andrew Morton

> > Ideally s2io_reset should
> > return a failure in this case (return is void now)
> 
> Would you care to provide a patch that did this? I could
> experiment a bit, and try to do this myself; but I really
> don't know this hardware, or this driver, that well.
[Ram] Yes, I can submit a patch to address this.

Ram
> 
> --linas
> 


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] s2io: add PCI error recovery support
  2007-03-05 22:33 Ramkrishna Vepa
@ 2007-03-16 19:49 ` Linas Vepstas
  2007-03-16 19:58   ` Ramkrishna Vepa
  0 siblings, 1 reply; 14+ messages in thread
From: Linas Vepstas @ 2007-03-16 19:49 UTC (permalink / raw)
  To: Ramkrishna Vepa
  Cc: Wen Xiong, linux-kernel, linux-pci, netdev, Jeff Garzik, Andrew Morton

On Mon, Mar 05, 2007 at 05:33:39PM -0500, Ramkrishna Vepa wrote:
> Comments on this patch -
> 
> 1. device_close_flag is unused and is not required.

I'll submit a patch to strip this out sometime next week.

> 2. s2io_reset can fail to reset the device. 

I thought I'd seen this occasionally, and its on my to-do list
to look into this further.

> Ideally s2io_reset should
> return a failure in this case (return is void now) 

Would you care to provide a patch that did this? I could 
experiment a bit, and try to do this myself; but I really
don't know this hardware, or this driver, that well.

--linas



^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] s2io: add PCI error recovery support
@ 2007-03-07  0:42 Ramkrishna Vepa
  0 siblings, 0 replies; 14+ messages in thread
From: Ramkrishna Vepa @ 2007-03-07  0:42 UTC (permalink / raw)
  To: Linas Vepstas, Jeff Garzik
  Cc: Wen Xiong, linux-kernel, linux-pci, netdev, Andrew Morton

Jeff,

Please apply and forward this patch upstream.

Ram
> -----Original Message-----
> From: Ramkrishna Vepa
> Sent: Monday, March 05, 2007 2:34 PM
> To: 'Linas Vepstas'
> Cc: Wen Xiong; linux-kernel@vger.kernel.org; linux-
> pci@atrey.karlin.mff.cuni.cz; netdev@vger.kernel.org; Jeff Garzik;
Andrew
> Morton
> Subject: RE: [PATCH] s2io: add PCI error recovery support
> 
> Comments on this patch -
> 
> 1. device_close_flag is unused and is not required.
> > +static pci_ers_result_t s2io_io_error_detected(struct pci_dev
*pdev,
> > +                                               pci_channel_state_t
> state)
> > +{
> 		...
> > +		do_s2io_card_down(sp, 0);
> > +		sp->device_close_flag = TRUE;	/* Device is shut down.
*/
> 
> 2. s2io_reset can fail to reset the device. Ideally s2io_reset should
> return a failure in this case (return is void now) and in this case
could
> s2io_io_slot_reset() be called again, maybe try thrice, in total,
before
> failing to reset the slot?
> 
> Ram
> > -----Original Message-----
> > From: Linas Vepstas [mailto:linas@austin.ibm.com]
> > Sent: Thursday, February 15, 2007 3:09 PM
> > To: Ramkrishna Vepa; Raghavendra Koushik; Ananda Raju
> > Cc: Wen Xiong; linux-kernel@vger.kernel.org; linux-
> > pci@atrey.karlin.mff.cuni.cz; netdev@vger.kernel.org; Jeff Garzik;
> Andrew
> > Morton
> > Subject: [PATCH] s2io: add PCI error recovery support
> >
> >
> > Koushik, Raju,
> >
> > Please review, comment, and if you find this acceptable,
> > please forward upstream. This patch incorporates all of
> > fixes resulting from the last set of discussions, circa
> > November 2006.
> >
> > --linas
> >
> > This patch adds PCI error recovery support to the
> > s2io 10-Gigabit ethernet device driver. Fourth revision,
> > blocks interrupts and the watchdog. Adds a flag to
> > s2io_down(), to avoid doing I/O when PCI bus is offline.
> >
> > Tested, seems to work well.
> >
> > Signed-off-by: Linas Vepstas <linas@austin.ibm.com>
> > Acked-by: Ramkrishna Vepa <Ramkrishna.Vepa@neterion.com>
> > Cc: Raghavendra Koushik <raghavendra.koushik@neterion.com>
> > Cc: Ananda Raju <Ananda.Raju@neterion.com>
> > Cc: Wen Xiong <wenxiong@us.ibm.com>
> >
> > ----
> >  drivers/net/s2io.c |  116
> > ++++++++++++++++++++++++++++++++++++++++++++++++++---
> >  drivers/net/s2io.h |    5 ++
> >  2 files changed, 116 insertions(+), 5 deletions(-)
> >
> > Index: linux-2.6.20-git4/drivers/net/s2io.c
> > ===================================================================
> > --- linux-2.6.20-git4.orig/drivers/net/s2io.c	2007-02-15
> > 15:39:35.000000000 -0600
> > +++ linux-2.6.20-git4/drivers/net/s2io.c	2007-02-15
> 16:15:10.000000000 -
> > 0600
> > @@ -435,11 +435,18 @@ static struct pci_device_id s2io_tbl[] _
> >
> >  MODULE_DEVICE_TABLE(pci, s2io_tbl);
> >
> > +static struct pci_error_handlers s2io_err_handler = {
> > +	.error_detected = s2io_io_error_detected,
> > +	.slot_reset = s2io_io_slot_reset,
> > +	.resume = s2io_io_resume,
> > +};
> > +
> >  static struct pci_driver s2io_driver = {
> >        .name = "S2IO",
> >        .id_table = s2io_tbl,
> >        .probe = s2io_init_nic,
> >        .remove = __devexit_p(s2io_rem_nic),
> > +      .err_handler = &s2io_err_handler,
> >  };
> >
> >  /* A simplifier macro used both by init and free shared_mem Fns().
*/
> > @@ -2577,6 +2584,9 @@ static void s2io_netpoll(struct net_devi
> >  	u64 val64 = 0xFFFFFFFFFFFFFFFFULL;
> >  	int i;
> >
> > +	if (pci_channel_offline(nic->pdev))
> > +		return;
> > +
> >  	disable_irq(dev->irq);
> >
> >  	atomic_inc(&nic->isr_cnt);
> > @@ -3079,6 +3089,8 @@ static void alarm_intr_handler(struct s2
> >  	int i;
> >  	if (atomic_read(&nic->card_state) == CARD_DOWN)
> >  		return;
> > +	if (pci_channel_offline(nic->pdev))
> > +		return;
> >  	nic->mac_control.stats_info->sw_stat.ring_full_cnt = 0;
> >  	/* Handling the XPAK counters update */
> >  	if(nic->mac_control.stats_info->xpak_stat.xpak_timer_count <
72000)
> > {
> > @@ -4117,6 +4129,10 @@ static irqreturn_t s2io_isr(int irq, voi
> >  	struct mac_info *mac_control;
> >  	struct config_param *config;
> >
> > +	/* Pretend we handled any irq's from a disconnected card */
> > +	if (pci_channel_offline(sp->pdev))
> > +		return IRQ_NONE;
> > +
> >  	atomic_inc(&sp->isr_cnt);
> >  	mac_control = &sp->mac_control;
> >  	config = &sp->config;
> > @@ -6188,7 +6204,7 @@ static void s2io_rem_isr(struct s2io_nic
> >  	} while(cnt < 5);
> >  }
> >
> > -static void s2io_card_down(struct s2io_nic * sp)
> > +static void do_s2io_card_down(struct s2io_nic * sp, int do_io)
> >  {
> >  	int cnt = 0;
> >  	struct XENA_dev_config __iomem *bar0 = sp->bar0;
> > @@ -6203,7 +6219,8 @@ static void s2io_card_down(struct s2io_n
> >  	atomic_set(&sp->card_state, CARD_DOWN);
> >
> >  	/* disable Tx and Rx traffic on the NIC */
> > -	stop_nic(sp);
> > +	if (do_io)
> > +		stop_nic(sp);
> >
> >  	s2io_rem_isr(sp);
> >
> > @@ -6211,7 +6228,7 @@ static void s2io_card_down(struct s2io_n
> >  	tasklet_kill(&sp->task);
> >
> >  	/* Check if the device is Quiescent and then Reset the NIC */
> > -	do {
> > +	while(do_io) {
> >  		/* As per the HW requirement we need to replenish the
> >  		 * receive buffer to avoid the ring bump. Since there is
> >  		 * no intention of processing the Rx frame at this
pointwe are
> > @@ -6236,8 +6253,9 @@ static void s2io_card_down(struct s2io_n
> >  				  (unsigned long long) val64);
> >  			break;
> >  		}
> > -	} while (1);
> > -	s2io_reset(sp);
> > +	}
> > +	if (do_io)
> > +		s2io_reset(sp);
> >
> >  	spin_lock_irqsave(&sp->tx_lock, flags);
> >  	/* Free all Tx buffers */
> > @@ -6252,6 +6270,11 @@ static void s2io_card_down(struct s2io_n
> >  	clear_bit(0, &(sp->link_state));
> >  }
> >
> > +static void s2io_card_down(struct s2io_nic * sp)
> > +{
> > +	do_s2io_card_down(sp, 1);
> > +}
> > +
> >  static int s2io_card_up(struct s2io_nic * sp)
> >  {
> >  	int i, ret = 0;
> > @@ -7536,3 +7559,86 @@ static void lro_append_pkt(struct s2io_n
> >  	sp->mac_control.stats_info->sw_stat.clubbed_frms_cnt++;
> >  	return;
> >  }
> > +
> > +/**
> > + * s2io_io_error_detected - called when PCI error is detected
> > + * @pdev: Pointer to PCI device
> > + * @state: The current pci conneection state
> > + *
> > + * This function is called after a PCI bus error affecting
> > + * this device has been detected.
> > + */
> > +static pci_ers_result_t s2io_io_error_detected(struct pci_dev
*pdev,
> > +                                               pci_channel_state_t
> state)
> > +{
> > +	struct net_device *netdev = pci_get_drvdata(pdev);
> > +	struct s2io_nic *sp = netdev->priv;
> > +
> > +	netif_device_detach(netdev);
> > +
> > +	if (netif_running(netdev)) {
> > +		/* Bring down the card, while avoiding PCI I/O */
> > +		do_s2io_card_down(sp, 0);
> > +		sp->device_close_flag = TRUE;	/* Device is shut down.
*/
> > +	}
> > +	pci_disable_device(pdev);
> > +
> > +	return PCI_ERS_RESULT_NEED_RESET;
> > +}
> > +
> > +/**
> > + * s2io_io_slot_reset - called after the pci bus has been reset.
> > + * @pdev: Pointer to PCI device
> > + *
> > + * Restart the card from scratch, as if from a cold-boot.
> > + * At this point, the card has exprienced a hard reset,
> > + * followed by fixups by BIOS, and has its config space
> > + * set up identically to what it was at cold boot.
> > + */
> > +static pci_ers_result_t s2io_io_slot_reset(struct pci_dev *pdev)
> > +{
> > +	struct net_device *netdev = pci_get_drvdata(pdev);
> > +	struct s2io_nic *sp = netdev->priv;
> > +
> > +	if (pci_enable_device(pdev)) {
> > +		printk(KERN_ERR "s2io: "
> > +		       "Cannot re-enable PCI device after reset.\n");
> > +		return PCI_ERS_RESULT_DISCONNECT;
> > +	}
> > +
> > +	pci_set_master(pdev);
> > +	s2io_reset(sp);
> > +
> > +	return PCI_ERS_RESULT_RECOVERED;
> > +}
> > +
> > +/**
> > + * s2io_io_resume - called when traffic can start flowing again.
> > + * @pdev: Pointer to PCI device
> > + *
> > + * This callback is called when the error recovery driver tells
> > + * us that its OK to resume normal operation.
> > + */
> > +static void s2io_io_resume(struct pci_dev *pdev)
> > +{
> > +	struct net_device *netdev = pci_get_drvdata(pdev);
> > +	struct s2io_nic *sp = netdev->priv;
> > +
> > +	if (netif_running(netdev)) {
> > +		if (s2io_card_up(sp)) {
> > +			printk(KERN_ERR "s2io: "
> > +			       "Can't bring device back up after
reset.\n");
> > +			return;
> > +		}
> > +
> > +		if (s2io_set_mac_addr(netdev, netdev->dev_addr) ==
FAILURE) {
> > +			s2io_card_down(sp);
> > +			printk(KERN_ERR "s2io: "
> > +			       "Can't resetore mac addr after
reset.\n");
> > +			return;
> > +		}
> > +	}
> > +
> > +	netif_device_attach(netdev);
> > +	netif_wake_queue(netdev);
> > +}
> > Index: linux-2.6.20-git4/drivers/net/s2io.h
> > ===================================================================
> > --- linux-2.6.20-git4.orig/drivers/net/s2io.h	2007-02-15
> > 15:39:35.000000000 -0600
> > +++ linux-2.6.20-git4/drivers/net/s2io.h	2007-02-15
> 16:59:40.000000000 -
> > 0600
> > @@ -1020,6 +1020,11 @@ static void update_L3L4_header(struct s2
> >  static void lro_append_pkt(struct s2io_nic *sp, struct lro *lro,
> >  			   struct sk_buff *skb, u32 tcp_len);
> >
> > +static pci_ers_result_t s2io_io_error_detected(struct pci_dev
*pdev,
> > +			                      pci_channel_state_t
state);
> > +static pci_ers_result_t s2io_io_slot_reset(struct pci_dev *pdev);
> > +static void s2io_io_resume(struct pci_dev *pdev);
> > +
> >  #define s2io_tcp_mss(skb) skb_shinfo(skb)->gso_size
> >  #define s2io_udp_mss(skb) skb_shinfo(skb)->gso_size
> >  #define s2io_offload_type(skb) skb_shinfo(skb)->gso_type

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] s2io: add PCI error recovery support
@ 2007-03-05 22:33 Ramkrishna Vepa
  2007-03-16 19:49 ` Linas Vepstas
  0 siblings, 1 reply; 14+ messages in thread
From: Ramkrishna Vepa @ 2007-03-05 22:33 UTC (permalink / raw)
  To: Linas Vepstas
  Cc: Wen Xiong, linux-kernel, linux-pci, netdev, Jeff Garzik, Andrew Morton

Comments on this patch -

1. device_close_flag is unused and is not required.
> +static pci_ers_result_t s2io_io_error_detected(struct pci_dev *pdev,
> +                                               pci_channel_state_t
state)
> +{
		...
> +		do_s2io_card_down(sp, 0);
> +		sp->device_close_flag = TRUE;	/* Device is shut down.
*/

2. s2io_reset can fail to reset the device. Ideally s2io_reset should
return a failure in this case (return is void now) and in this case
could s2io_io_slot_reset() be called again, maybe try thrice, in total,
before failing to reset the slot?

Ram
> -----Original Message-----
> From: Linas Vepstas [mailto:linas@austin.ibm.com]
> Sent: Thursday, February 15, 2007 3:09 PM
> To: Ramkrishna Vepa; Raghavendra Koushik; Ananda Raju
> Cc: Wen Xiong; linux-kernel@vger.kernel.org; linux-
> pci@atrey.karlin.mff.cuni.cz; netdev@vger.kernel.org; Jeff Garzik;
Andrew
> Morton
> Subject: [PATCH] s2io: add PCI error recovery support
> 
> 
> Koushik, Raju,
> 
> Please review, comment, and if you find this acceptable,
> please forward upstream. This patch incorporates all of
> fixes resulting from the last set of discussions, circa
> November 2006.
> 
> --linas
> 
> This patch adds PCI error recovery support to the
> s2io 10-Gigabit ethernet device driver. Fourth revision,
> blocks interrupts and the watchdog. Adds a flag to
> s2io_down(), to avoid doing I/O when PCI bus is offline.
> 
> Tested, seems to work well.
> 
> Signed-off-by: Linas Vepstas <linas@austin.ibm.com>
> Acked-by: Ramkrishna Vepa <Ramkrishna.Vepa@neterion.com>
> Cc: Raghavendra Koushik <raghavendra.koushik@neterion.com>
> Cc: Ananda Raju <Ananda.Raju@neterion.com>
> Cc: Wen Xiong <wenxiong@us.ibm.com>
> 
> ----
>  drivers/net/s2io.c |  116
> ++++++++++++++++++++++++++++++++++++++++++++++++++---
>  drivers/net/s2io.h |    5 ++
>  2 files changed, 116 insertions(+), 5 deletions(-)
> 
> Index: linux-2.6.20-git4/drivers/net/s2io.c
> ===================================================================
> --- linux-2.6.20-git4.orig/drivers/net/s2io.c	2007-02-15
> 15:39:35.000000000 -0600
> +++ linux-2.6.20-git4/drivers/net/s2io.c	2007-02-15
16:15:10.000000000 -
> 0600
> @@ -435,11 +435,18 @@ static struct pci_device_id s2io_tbl[] _
> 
>  MODULE_DEVICE_TABLE(pci, s2io_tbl);
> 
> +static struct pci_error_handlers s2io_err_handler = {
> +	.error_detected = s2io_io_error_detected,
> +	.slot_reset = s2io_io_slot_reset,
> +	.resume = s2io_io_resume,
> +};
> +
>  static struct pci_driver s2io_driver = {
>        .name = "S2IO",
>        .id_table = s2io_tbl,
>        .probe = s2io_init_nic,
>        .remove = __devexit_p(s2io_rem_nic),
> +      .err_handler = &s2io_err_handler,
>  };
> 
>  /* A simplifier macro used both by init and free shared_mem Fns(). */
> @@ -2577,6 +2584,9 @@ static void s2io_netpoll(struct net_devi
>  	u64 val64 = 0xFFFFFFFFFFFFFFFFULL;
>  	int i;
> 
> +	if (pci_channel_offline(nic->pdev))
> +		return;
> +
>  	disable_irq(dev->irq);
> 
>  	atomic_inc(&nic->isr_cnt);
> @@ -3079,6 +3089,8 @@ static void alarm_intr_handler(struct s2
>  	int i;
>  	if (atomic_read(&nic->card_state) == CARD_DOWN)
>  		return;
> +	if (pci_channel_offline(nic->pdev))
> +		return;
>  	nic->mac_control.stats_info->sw_stat.ring_full_cnt = 0;
>  	/* Handling the XPAK counters update */
>  	if(nic->mac_control.stats_info->xpak_stat.xpak_timer_count <
72000)
> {
> @@ -4117,6 +4129,10 @@ static irqreturn_t s2io_isr(int irq, voi
>  	struct mac_info *mac_control;
>  	struct config_param *config;
> 
> +	/* Pretend we handled any irq's from a disconnected card */
> +	if (pci_channel_offline(sp->pdev))
> +		return IRQ_NONE;
> +
>  	atomic_inc(&sp->isr_cnt);
>  	mac_control = &sp->mac_control;
>  	config = &sp->config;
> @@ -6188,7 +6204,7 @@ static void s2io_rem_isr(struct s2io_nic
>  	} while(cnt < 5);
>  }
> 
> -static void s2io_card_down(struct s2io_nic * sp)
> +static void do_s2io_card_down(struct s2io_nic * sp, int do_io)
>  {
>  	int cnt = 0;
>  	struct XENA_dev_config __iomem *bar0 = sp->bar0;
> @@ -6203,7 +6219,8 @@ static void s2io_card_down(struct s2io_n
>  	atomic_set(&sp->card_state, CARD_DOWN);
> 
>  	/* disable Tx and Rx traffic on the NIC */
> -	stop_nic(sp);
> +	if (do_io)
> +		stop_nic(sp);
> 
>  	s2io_rem_isr(sp);
> 
> @@ -6211,7 +6228,7 @@ static void s2io_card_down(struct s2io_n
>  	tasklet_kill(&sp->task);
> 
>  	/* Check if the device is Quiescent and then Reset the NIC */
> -	do {
> +	while(do_io) {
>  		/* As per the HW requirement we need to replenish the
>  		 * receive buffer to avoid the ring bump. Since there is
>  		 * no intention of processing the Rx frame at this
pointwe are
> @@ -6236,8 +6253,9 @@ static void s2io_card_down(struct s2io_n
>  				  (unsigned long long) val64);
>  			break;
>  		}
> -	} while (1);
> -	s2io_reset(sp);
> +	}
> +	if (do_io)
> +		s2io_reset(sp);
> 
>  	spin_lock_irqsave(&sp->tx_lock, flags);
>  	/* Free all Tx buffers */
> @@ -6252,6 +6270,11 @@ static void s2io_card_down(struct s2io_n
>  	clear_bit(0, &(sp->link_state));
>  }
> 
> +static void s2io_card_down(struct s2io_nic * sp)
> +{
> +	do_s2io_card_down(sp, 1);
> +}
> +
>  static int s2io_card_up(struct s2io_nic * sp)
>  {
>  	int i, ret = 0;
> @@ -7536,3 +7559,86 @@ static void lro_append_pkt(struct s2io_n
>  	sp->mac_control.stats_info->sw_stat.clubbed_frms_cnt++;
>  	return;
>  }
> +
> +/**
> + * s2io_io_error_detected - called when PCI error is detected
> + * @pdev: Pointer to PCI device
> + * @state: The current pci conneection state
> + *
> + * This function is called after a PCI bus error affecting
> + * this device has been detected.
> + */
> +static pci_ers_result_t s2io_io_error_detected(struct pci_dev *pdev,
> +                                               pci_channel_state_t
state)
> +{
> +	struct net_device *netdev = pci_get_drvdata(pdev);
> +	struct s2io_nic *sp = netdev->priv;
> +
> +	netif_device_detach(netdev);
> +
> +	if (netif_running(netdev)) {
> +		/* Bring down the card, while avoiding PCI I/O */
> +		do_s2io_card_down(sp, 0);
> +		sp->device_close_flag = TRUE;	/* Device is shut down.
*/
> +	}
> +	pci_disable_device(pdev);
> +
> +	return PCI_ERS_RESULT_NEED_RESET;
> +}
> +
> +/**
> + * s2io_io_slot_reset - called after the pci bus has been reset.
> + * @pdev: Pointer to PCI device
> + *
> + * Restart the card from scratch, as if from a cold-boot.
> + * At this point, the card has exprienced a hard reset,
> + * followed by fixups by BIOS, and has its config space
> + * set up identically to what it was at cold boot.
> + */
> +static pci_ers_result_t s2io_io_slot_reset(struct pci_dev *pdev)
> +{
> +	struct net_device *netdev = pci_get_drvdata(pdev);
> +	struct s2io_nic *sp = netdev->priv;
> +
> +	if (pci_enable_device(pdev)) {
> +		printk(KERN_ERR "s2io: "
> +		       "Cannot re-enable PCI device after reset.\n");
> +		return PCI_ERS_RESULT_DISCONNECT;
> +	}
> +
> +	pci_set_master(pdev);
> +	s2io_reset(sp);
> +
> +	return PCI_ERS_RESULT_RECOVERED;
> +}
> +
> +/**
> + * s2io_io_resume - called when traffic can start flowing again.
> + * @pdev: Pointer to PCI device
> + *
> + * This callback is called when the error recovery driver tells
> + * us that its OK to resume normal operation.
> + */
> +static void s2io_io_resume(struct pci_dev *pdev)
> +{
> +	struct net_device *netdev = pci_get_drvdata(pdev);
> +	struct s2io_nic *sp = netdev->priv;
> +
> +	if (netif_running(netdev)) {
> +		if (s2io_card_up(sp)) {
> +			printk(KERN_ERR "s2io: "
> +			       "Can't bring device back up after
reset.\n");
> +			return;
> +		}
> +
> +		if (s2io_set_mac_addr(netdev, netdev->dev_addr) ==
FAILURE) {
> +			s2io_card_down(sp);
> +			printk(KERN_ERR "s2io: "
> +			       "Can't resetore mac addr after
reset.\n");
> +			return;
> +		}
> +	}
> +
> +	netif_device_attach(netdev);
> +	netif_wake_queue(netdev);
> +}
> Index: linux-2.6.20-git4/drivers/net/s2io.h
> ===================================================================
> --- linux-2.6.20-git4.orig/drivers/net/s2io.h	2007-02-15
> 15:39:35.000000000 -0600
> +++ linux-2.6.20-git4/drivers/net/s2io.h	2007-02-15
16:59:40.000000000 -
> 0600
> @@ -1020,6 +1020,11 @@ static void update_L3L4_header(struct s2
>  static void lro_append_pkt(struct s2io_nic *sp, struct lro *lro,
>  			   struct sk_buff *skb, u32 tcp_len);
> 
> +static pci_ers_result_t s2io_io_error_detected(struct pci_dev *pdev,
> +			                      pci_channel_state_t
state);
> +static pci_ers_result_t s2io_io_slot_reset(struct pci_dev *pdev);
> +static void s2io_io_resume(struct pci_dev *pdev);
> +
>  #define s2io_tcp_mss(skb) skb_shinfo(skb)->gso_size
>  #define s2io_udp_mss(skb) skb_shinfo(skb)->gso_size
>  #define s2io_offload_type(skb) skb_shinfo(skb)->gso_type

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] s2io: add PCI error recovery support
@ 2007-01-10 19:54 Ramkrishna Vepa
  0 siblings, 0 replies; 14+ messages in thread
From: Ramkrishna Vepa @ 2007-01-10 19:54 UTC (permalink / raw)
  To: netdev-owner, Linas Vepstas
  Cc: Ananda Raju, Jeff Garzik, linux-kernel, linux-pci, Andrew Morton

Linas,

We reviewed this patch and it looks good.

Thanks,
Ram
> > -----Original Message-----
> > From: netdev-owner@vger.kernel.org
[mailto:netdev-owner@vger.kernel.org]
> > On Behalf Of Linas Vepstas
> > Sent: Tuesday, October 24, 2006 2:55 PM
> > To: Raghavendra Koushik; Ananda Raju; Wen Xiong
> > Cc: linux-kernel@vger.kernel.org;
linux-pci@atrey.karlin.mff.cuni.cz;
> > netdev@vger.kernel.org; Jeff Garzik; Andrew Morton
> > Subject: [PATCH] s2io: add PCI error recovery support
> >
> >
> > Koushik, Raju,
> > Please review, comment, and if you find this acceptable,
> > please forward upstream.
> >
> > --linas
> >
> > This patch adds PCI error recovery support to the
> > s2io 10-Gigabit ethernet device driver.
> >
> > Tested, seems to work well.
> >
> > Signed-off-by: Linas Vepstas <linas@austin.ibm.com>
> > Cc: Raghavendra Koushik <raghavendra.koushik@neterion.com>
> > Cc: Ananda Raju <Ananda.Raju@neterion.com>
> > Cc: Wen Xiong <wenxiong@us.ibm.com>
> >
> > ----
> >  drivers/net/s2io.c |   77
> > +++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  drivers/net/s2io.h |    5 +++
> >  2 files changed, 82 insertions(+)
> >
> > Index: linux-2.6.19-rc1-git11/drivers/net/s2io.c
> > ===================================================================
> > --- linux-2.6.19-rc1-git11.orig/drivers/net/s2io.c	2006-10-20
> > 12:24:17.000000000 -0500
> > +++ linux-2.6.19-rc1-git11/drivers/net/s2io.c	2006-10-24
> > 16:19:49.000000000 -0500
> > @@ -434,11 +434,18 @@ static struct pci_device_id s2io_tbl[] _
> >
> >  MODULE_DEVICE_TABLE(pci, s2io_tbl);
> >
> > +static struct pci_error_handlers s2io_err_handler = {
> > +	.error_detected = s2io_io_error_detected,
> > +	.slot_reset = s2io_io_slot_reset,
> > +	.resume = s2io_io_resume,
> > +};
> > +
> >  static struct pci_driver s2io_driver = {
> >        .name = "S2IO",
> >        .id_table = s2io_tbl,
> >        .probe = s2io_init_nic,
> >        .remove = __devexit_p(s2io_rem_nic),
> > +      .err_handler = &s2io_err_handler,
> >  };
> >
> >  /* A simplifier macro used both by init and free shared_mem Fns().
*/
> > @@ -7564,3 +7571,73 @@ static void lro_append_pkt(nic_t *sp, lr
> >  	sp->mac_control.stats_info->sw_stat.clubbed_frms_cnt++;
> >  	return;
> >  }
> > +
> > +/**
> > + * s2io_io_error_detected - called when PCI error is detected
> > + * @pdev: Pointer to PCI device
> > + * @state: The current pci conneection state
> > + *
> > + * This function is called after a PCI bus error affecting
> > + * this device has been detected.
> > + */
> > +static pci_ers_result_t s2io_io_error_detected(struct pci_dev
*pdev,
> > pci_channel_state_t state)
> > +{
> > +	struct net_device *netdev = pci_get_drvdata(pdev);
> > +	nic_t *sp = netdev->priv;
> > +
> > +	netif_device_detach(netdev);
> > +
> > +	if (netif_running(netdev)) {
> > +		/* Reset card */
> > +		s2io_card_down(sp);
> > +		sp->device_close_flag = TRUE;	/* Device is shut down.
*/
> > +	}
> > +	pci_disable_device(pdev);
> > +
> > +	return PCI_ERS_RESULT_NEED_RESET;
> > +}
> > +
> > +/**
> > + * s2io_io_slot_reset - called after the pci bus has been reset.
> > + * @pdev: Pointer to PCI device
> > + *
> > + * Restart the card from scratch, as if from a cold-boot.
> > + */
> > +static pci_ers_result_t s2io_io_slot_reset(struct pci_dev *pdev)
> > +{
> > +	struct net_device *netdev = pci_get_drvdata(pdev);
> > +	nic_t *sp = netdev->priv;
> > +
> > +	if (pci_enable_device(pdev)) {
> > +		printk(KERN_ERR "s2io: Cannot re-enable PCI device after
> > reset.\n");
> > +		return PCI_ERS_RESULT_DISCONNECT;
> > +	}
> > +
> > +	pci_set_master(pdev);
> > +	s2io_reset(sp);
> > +
> > +	return PCI_ERS_RESULT_RECOVERED;
> > +}
> > +
> > +/**
> > + * s2io_io_resume - called when traffic can start flowing again.
> > + * @pdev: Pointer to PCI device
> > + *
> > + * This callback is called when the error recovery driver tells us
that
> > + * its OK to resume normal operation.
> > + */
> > +static void s2io_io_resume(struct pci_dev *pdev)
> > +{
> > +	struct net_device *netdev = pci_get_drvdata(pdev);
> > +	nic_t *sp = netdev->priv;
> > +
> > +	if (netif_running(netdev)) {
> > +		if (s2io_card_up(sp)) {
> > +			printk(KERN_ERR "s2io: can't bring device back
up after
> > reset\n");
> > +			return;
> > +		}
> > +	}
> > +
> > +	netif_device_attach(netdev);
> > +	netif_wake_queue(netdev);
> > +}
> > Index: linux-2.6.19-rc1-git11/drivers/net/s2io.h
> > ===================================================================
> > --- linux-2.6.19-rc1-git11.orig/drivers/net/s2io.h	2006-10-20
> > 12:24:17.000000000 -0500
> > +++ linux-2.6.19-rc1-git11/drivers/net/s2io.h	2006-10-20
> > 12:41:39.000000000 -0500
> > @@ -1013,6 +1013,11 @@ static void queue_rx_frame(struct sk_buf
> >  static void update_L3L4_header(nic_t *sp, lro_t *lro);
> >  static void lro_append_pkt(nic_t *sp, lro_t *lro, struct sk_buff
*skb,
> > u32 tcp_len);
> >
> > +static pci_ers_result_t s2io_io_error_detected(struct pci_dev
*pdev,
> > +
pci_channel_state_t
> > state);
> > +static pci_ers_result_t s2io_io_slot_reset(struct pci_dev *pdev);
> > +static void s2io_io_resume(struct pci_dev *pdev);
> > +
> >  #define s2io_tcp_mss(skb) skb_shinfo(skb)->gso_size
> >  #define s2io_udp_mss(skb) skb_shinfo(skb)->gso_size
> >  #define s2io_offload_type(skb) skb_shinfo(skb)->gso_type
> > -
> > To unsubscribe from this list: send the line "unsubscribe netdev" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] s2io: add PCI error recovery support
  2006-10-27 11:35 Ananda Raju
@ 2006-10-27 19:32 ` Linas Vepstas
  0 siblings, 0 replies; 14+ messages in thread
From: Linas Vepstas @ 2006-10-27 19:32 UTC (permalink / raw)
  To: Ananda Raju
  Cc: Wen Xiong, linux-kernel, linux-pci, netdev, Jeff Garzik, Andrew Morton

On Fri, Oct 27, 2006 at 07:35:18AM -0400, Ananda Raju wrote:
> Looking at all scenarios I feel the first patch is OK. Can you add the
> watchdog timer fix to first initial patch and resubmit. 

Appended below.

> So -- just for grins, I thought to myself, "Maybe I can make 
> s2io be the first adapter ever to fully recover without 
> a hard reset of the card."

... I couldn't quite make this work. Since the patch below
already works, I didn't see much point exterting myself further.

--linas

This patch adds PCI error recovery support to the 
s2io 10-Gigabit ethernet device driver. Third revision,
blocks interrupts and the watchdog.

Tested, seems to work well.

Signed-off-by: Linas Vepstas <linas@austin.ibm.com>
Cc: Raghavendra Koushik <raghavendra.koushik@neterion.com>
Cc: Ananda Raju <Ananda.Raju@neterion.com>
Cc: Wen Xiong <wenxiong@us.ibm.com>

----
 drivers/net/s2io.c |  121 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/net/s2io.h |    5 ++
 2 files changed, 126 insertions(+)

Index: linux-2.6.19-rc1-git11/drivers/net/s2io.c
===================================================================
--- linux-2.6.19-rc1-git11.orig/drivers/net/s2io.c	2006-10-27 10:49:07.000000000 -0500
+++ linux-2.6.19-rc1-git11/drivers/net/s2io.c	2006-10-27 13:55:01.000000000 -0500
@@ -434,11 +434,18 @@ static struct pci_device_id s2io_tbl[] _
 
 MODULE_DEVICE_TABLE(pci, s2io_tbl);
 
+static struct pci_error_handlers s2io_err_handler = {
+	.error_detected = s2io_io_error_detected,
+	.slot_reset = s2io_io_slot_reset,
+	.resume = s2io_io_resume,
+};
+
 static struct pci_driver s2io_driver = {
       .name = "S2IO",
       .id_table = s2io_tbl,
       .probe = s2io_init_nic,
       .remove = __devexit_p(s2io_rem_nic),
+      .err_handler = &s2io_err_handler,
 };
 
 /* A simplifier macro used both by init and free shared_mem Fns(). */
@@ -3159,6 +3166,11 @@ static void alarm_intr_handler(struct s2
 	register u64 val64 = 0, err_reg = 0;
 	u64 cnt;
 	int i;
+
+	if ((nic->pdev->error_state != pci_channel_io_normal) &&
+		 (nic->pdev->error_state != 0))
+		return;
+
 	nic->mac_control.stats_info->sw_stat.ring_full_cnt = 0;
 	/* Handling the XPAK counters update */
 	if(nic->mac_control.stats_info->xpak_stat.xpak_timer_count < 72000) {
@@ -4171,6 +4183,11 @@ static irqreturn_t s2io_isr(int irq, voi
 	mac_info_t *mac_control;
 	struct config_param *config;
 
+	/* Pretend we handled any irq's from a disconnected card */
+	if ((sp->pdev->error_state != pci_channel_io_normal) &&
+		 (sp->pdev->error_state != 0))
+		return IRQ_HANDLED;
+
 	atomic_inc(&sp->isr_cnt);
 	mac_control = &sp->mac_control;
 	config = &sp->config;
@@ -7564,3 +7581,107 @@ static void lro_append_pkt(nic_t *sp, lr
 	sp->mac_control.stats_info->sw_stat.clubbed_frms_cnt++;
 	return;
 }
+
+/**
+ * s2io_io_error_detected - called when PCI error is detected
+ * @pdev: Pointer to PCI device
+ * @state: The current pci conneection state
+ *
+ * This function is called after a PCI bus error affecting
+ * this device has been detected.
+ */
+static pci_ers_result_t s2io_io_error_detected(struct pci_dev *pdev,
+                                               pci_channel_state_t state)
+{
+	struct net_device *netdev = pci_get_drvdata(pdev);
+	nic_t *sp = netdev->priv;
+
+	netif_device_detach(netdev);
+
+	if (netif_running(netdev)) {
+		unsigned long flags;
+
+		/* The folowing is an abreviated subset of the
+		 * steps taken by s2io_card_down(), avoiding
+		 * steps that touch the card itself.
+		 */
+		del_timer_sync(&sp->alarm_timer);
+		atomic_set(&sp->card_state, CARD_DOWN);
+
+		/* Kill tasklet. */
+		tasklet_kill(&sp->task);
+
+		/* Free all Tx buffers */
+		spin_lock_irqsave(&sp->tx_lock, flags);
+		free_tx_buffers(sp);
+		spin_unlock_irqrestore(&sp->tx_lock, flags);
+
+		/* Free all Rx buffers */
+		spin_lock_irqsave(&sp->rx_lock, flags);
+		free_rx_buffers(sp);
+		spin_unlock_irqrestore(&sp->rx_lock, flags);
+
+		clear_bit(0, &(sp->link_state));
+		sp->device_close_flag = TRUE;	/* Device is shut down. */
+	}
+	pci_disable_device(pdev);
+
+	return PCI_ERS_RESULT_NEED_RESET;
+}
+
+/**
+ * s2io_io_slot_reset - called after the pci bus has been reset.
+ * @pdev: Pointer to PCI device
+ *
+ * Restart the card from scratch, as if from a cold-boot.
+ * At this point, the card has exprienced a hard reset,
+ * followed by fixups by BIOS, and has its config space
+ * set up identically to what it was at cold boot.
+ */
+static pci_ers_result_t s2io_io_slot_reset(struct pci_dev *pdev)
+{
+	struct net_device *netdev = pci_get_drvdata(pdev);
+	nic_t *sp = netdev->priv;
+
+	if (pci_enable_device(pdev)) {
+		printk(KERN_ERR "s2io: "
+		       "Cannot re-enable PCI device after reset.\n");
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+
+	pci_set_master(pdev);
+	s2io_reset(sp);
+
+	return PCI_ERS_RESULT_RECOVERED;
+}
+
+/**
+ * s2io_io_resume - called when traffic can start flowing again.
+ * @pdev: Pointer to PCI device
+ *
+ * This callback is called when the error recovery driver tells
+ * us that its OK to resume normal operation.
+ */
+static void s2io_io_resume(struct pci_dev *pdev)
+{
+	struct net_device *netdev = pci_get_drvdata(pdev);
+	nic_t *sp = netdev->priv;
+
+	if (netif_running(netdev)) {
+		if (s2io_card_up(sp)) {
+			printk(KERN_ERR "s2io: "
+			       "Can't bring device back up after reset.\n");
+			return;
+		}
+
+		if (s2io_set_mac_addr(netdev, netdev->dev_addr) == FAILURE) {
+			s2io_card_down(sp);
+			printk(KERN_ERR "s2io: "
+			       "Can't resetore mac addr after reset.\n");
+			return;
+		}
+	}
+
+	netif_device_attach(netdev);
+	netif_wake_queue(netdev);
+}
Index: linux-2.6.19-rc1-git11/drivers/net/s2io.h
===================================================================
--- linux-2.6.19-rc1-git11.orig/drivers/net/s2io.h	2006-10-27 10:49:07.000000000 -0500
+++ linux-2.6.19-rc1-git11/drivers/net/s2io.h	2006-10-27 10:50:53.000000000 -0500
@@ -1013,6 +1013,11 @@ static void queue_rx_frame(struct sk_buf
 static void update_L3L4_header(nic_t *sp, lro_t *lro);
 static void lro_append_pkt(nic_t *sp, lro_t *lro, struct sk_buff *skb, u32 tcp_len);
 
+static pci_ers_result_t s2io_io_error_detected(struct pci_dev *pdev,
+					                      pci_channel_state_t state);
+static pci_ers_result_t s2io_io_slot_reset(struct pci_dev *pdev);
+static void s2io_io_resume(struct pci_dev *pdev);
+
 #define s2io_tcp_mss(skb) skb_shinfo(skb)->gso_size
 #define s2io_udp_mss(skb) skb_shinfo(skb)->gso_size
 #define s2io_offload_type(skb) skb_shinfo(skb)->gso_type


^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] s2io: add PCI error recovery support
@ 2006-10-27 11:35 Ananda Raju
  2006-10-27 19:32 ` Linas Vepstas
  0 siblings, 1 reply; 14+ messages in thread
From: Ananda Raju @ 2006-10-27 11:35 UTC (permalink / raw)
  To: Linas Vepstas
  Cc: Wen Xiong, linux-kernel, linux-pci, netdev, Jeff Garzik, Andrew Morton

Looking at all scenarios I feel the first patch is OK. Can you add the
watchdog timer fix to first initial patch and resubmit. 

-----Original Message-----
From: Linas Vepstas [mailto:linas@austin.ibm.com] 
Sent: Thursday, October 26, 2006 3:52 PM
To: Ananda Raju
Cc: Wen Xiong; linux-kernel@vger.kernel.org;
linux-pci@atrey.karlin.mff.cuni.cz; netdev@vger.kernel.org; Jeff Garzik;
Andrew Morton
Subject: Re: [PATCH] s2io: add PCI error recovery support

Hi.

On Thu, Oct 26, 2006 at 05:56:34AM -0400, Ananda Raju wrote:
> Hi, 
> Can you try attached patch. The attached patch is simple. We set card
> state as down in error_detecct() so that all entry points return error
> and don't proceed further.
> 
> In slot_reset() we do s2io_card_down() will reset adapter. 
> In io_resume() we bringup the driver. 

Simplicity is always better. However, some questions/comments:

> @@ -4175,6 +4186,10 @@ static irqreturn_t s2io_isr(int irq, voi
>  	mac_info_t *mac_control;
>  	struct config_param *config;
>  
> +	if (atomic_read(&sp->card_state) == CARD_DOWN) {
> +		return IRQ_NONE;
> +	}

I used 

    if ((sp->pdev->error_state != pci_channel_io_normal)

here for a reason: the pdev->error_state is set even in an interrupt
context, that is, it gets set even if interrups are disabled, and
so it represents the actual state immediately. By contrast, the
error callbacks do not get called until possibly much later, 
and so sp->card_state = CARD_DOWN might not get set for a while.

If, for any reason, e.g. some obscure corner case, the s2io 
generates zillions of interupts, this could result in a soft-lockup.
I actually saw this in the symbios device driver, which will
regenerate an interrupt until its acknowledged -- and so it 
sat there, spinning. :-(

I was returning IRQ_HANDLED instead of IRQ_NONE, so as to avoid
falling into handle_bad_irq() or report_bad_irq(). I haven't 
seen this happen on s2io, but thought it would still be wise.

If this can't happen, then there's no problem here.

> +/**
> + * s2io_io_slot_reset - called after the pci bus has been reset.
> + * @pdev: Pointer to PCI device
> + *
> + * Restart the card from scratch, as if from a cold-boot.
> + */
> +static pci_ers_result_t s2io_io_slot_reset(struct pci_dev *pdev)
> +{

At this point, the card has just experienced a hardware reset,
(the #RST wire was held low for 250 millisecs, followed by
a settle time of 2 seconds, followed by whatever BIOS thinks
it needed to do, followed by a restore of the pci config space
to what it was after a cold boot. So the card is in a "fresh"
state; in theory its identitcal to a cold boot. So ... 
are you sure you want to "down" at this point? 

> +		s2io_card_down(sp);
> +		sp->device_close_flag = TRUE;	/* Device is shut down.
*/


One problem I'm having is that the watchdog timer sometimes
pops and tries to reset the card before s2io_card_down()
has a chance to run. I fixed this ... 

======================
So -- just for grins, I thought to myself, "Maybe I can make 
s2io be the first adapter ever to fully recover without 
a hard reset of the card."

The idea is simple: 

1) enable MMIO,
2) call s2io_card_down()
3) enable DMA
4) cal s2io_card_up()

I have a patch that does this, but then hit a few more snags.
I haven't yet nailed down all the trouble spots, maybe tommorrow.

--linas



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] s2io: add PCI error recovery support
  2006-10-26  9:56 Ananda Raju
@ 2006-10-26 22:51 ` Linas Vepstas
  0 siblings, 0 replies; 14+ messages in thread
From: Linas Vepstas @ 2006-10-26 22:51 UTC (permalink / raw)
  To: Ananda Raju
  Cc: Wen Xiong, linux-kernel, linux-pci, netdev, Jeff Garzik, Andrew Morton

Hi.

On Thu, Oct 26, 2006 at 05:56:34AM -0400, Ananda Raju wrote:
> Hi, 
> Can you try attached patch. The attached patch is simple. We set card
> state as down in error_detecct() so that all entry points return error
> and don't proceed further.
> 
> In slot_reset() we do s2io_card_down() will reset adapter. 
> In io_resume() we bringup the driver. 

Simplicity is always better. However, some questions/comments:

> @@ -4175,6 +4186,10 @@ static irqreturn_t s2io_isr(int irq, voi
>  	mac_info_t *mac_control;
>  	struct config_param *config;
>  
> +	if (atomic_read(&sp->card_state) == CARD_DOWN) {
> +		return IRQ_NONE;
> +	}

I used 

    if ((sp->pdev->error_state != pci_channel_io_normal)

here for a reason: the pdev->error_state is set even in an interrupt
context, that is, it gets set even if interrups are disabled, and
so it represents the actual state immediately. By contrast, the
error callbacks do not get called until possibly much later, 
and so sp->card_state = CARD_DOWN might not get set for a while.

If, for any reason, e.g. some obscure corner case, the s2io 
generates zillions of interupts, this could result in a soft-lockup.
I actually saw this in the symbios device driver, which will
regenerate an interrupt until its acknowledged -- and so it 
sat there, spinning. :-(

I was returning IRQ_HANDLED instead of IRQ_NONE, so as to avoid
falling into handle_bad_irq() or report_bad_irq(). I haven't 
seen this happen on s2io, but thought it would still be wise.

If this can't happen, then there's no problem here.

> +/**
> + * s2io_io_slot_reset - called after the pci bus has been reset.
> + * @pdev: Pointer to PCI device
> + *
> + * Restart the card from scratch, as if from a cold-boot.
> + */
> +static pci_ers_result_t s2io_io_slot_reset(struct pci_dev *pdev)
> +{

At this point, the card has just experienced a hardware reset,
(the #RST wire was held low for 250 millisecs, followed by
a settle time of 2 seconds, followed by whatever BIOS thinks
it needed to do, followed by a restore of the pci config space
to what it was after a cold boot. So the card is in a "fresh"
state; in theory its identitcal to a cold boot. So ... 
are you sure you want to "down" at this point? 

> +		s2io_card_down(sp);
> +		sp->device_close_flag = TRUE;	/* Device is shut down. */


One problem I'm having is that the watchdog timer sometimes
pops and tries to reset the card before s2io_card_down()
has a chance to run. I fixed this ... 

======================
So -- just for grins, I thought to myself, "Maybe I can make 
s2io be the first adapter ever to fully recover without 
a hard reset of the card."

The idea is simple: 

1) enable MMIO,
2) call s2io_card_down()
3) enable DMA
4) cal s2io_card_up()

I have a patch that does this, but then hit a few more snags.
I haven't yet nailed down all the trouble spots, maybe tommorrow.

--linas


^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] s2io: add PCI error recovery support
@ 2006-10-26  9:56 Ananda Raju
  2006-10-26 22:51 ` Linas Vepstas
  0 siblings, 1 reply; 14+ messages in thread
From: Ananda Raju @ 2006-10-26  9:56 UTC (permalink / raw)
  To: Linas Vepstas
  Cc: Wen Xiong, linux-kernel, linux-pci, netdev, Jeff Garzik, Andrew Morton

[-- Attachment #1: Type: text/plain, Size: 6357 bytes --]

Hi, 
Can you try attached patch. The attached patch is simple. We set card
state as down in error_detecct() so that all entry points return error
and don't proceed further.

In slot_reset() we do s2io_card_down() will reset adapter. 
In io_resume() we bringup the driver. 

Ananda 

-----Original Message-----
From: Linas Vepstas [mailto:linas@austin.ibm.com] 
Sent: Wednesday, October 25, 2006 1:55 PM
To: Ananda Raju
Cc: Wen Xiong; linux-kernel@vger.kernel.org;
linux-pci@atrey.karlin.mff.cuni.cz; netdev@vger.kernel.org; Jeff Garzik;
Andrew Morton
Subject: Re: [PATCH] s2io: add PCI error recovery support

On Wed, Oct 25, 2006 at 10:11:24AM -0500, Linas Vepstas wrote:
> 
> > Also we have to add following if statement in beginning of
s2io_isr().

Done, below,

> > If it is ok to do BAR0 read/write in error_detected() then patch is
OK. 

I re-wrote that section to avoid doing I/O. It seems to work well,
and generates a few less messages in the process.  New, improved patch
below, please ack and send upstream if you like it.

--linas

This patch adds PCI error recovery support to the 
s2io 10-Gigabit ethernet device driver.

Tested, seems to work well.

Signed-off-by: Linas Vepstas <linas@austin.ibm.com>
Cc: Raghavendra Koushik <raghavendra.koushik@neterion.com>
Cc: Ananda Raju <Ananda.Raju@neterion.com>
Cc: Wen Xiong <wenxiong@us.ibm.com>

----
 drivers/net/s2io.c |  103
+++++++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/net/s2io.h |    5 ++
 2 files changed, 108 insertions(+)

Index: linux-2.6.19-rc1-git11/drivers/net/s2io.c
===================================================================
--- linux-2.6.19-rc1-git11.orig/drivers/net/s2io.c	2006-10-25
14:09:47.000000000 -0500
+++ linux-2.6.19-rc1-git11/drivers/net/s2io.c	2006-10-25
15:18:25.000000000 -0500
@@ -434,11 +434,18 @@ static struct pci_device_id s2io_tbl[] _
 
 MODULE_DEVICE_TABLE(pci, s2io_tbl);
 
+static struct pci_error_handlers s2io_err_handler = {
+	.error_detected = s2io_io_error_detected,
+	.slot_reset = s2io_io_slot_reset,
+	.resume = s2io_io_resume,
+};
+
 static struct pci_driver s2io_driver = {
       .name = "S2IO",
       .id_table = s2io_tbl,
       .probe = s2io_init_nic,
       .remove = __devexit_p(s2io_rem_nic),
+      .err_handler = &s2io_err_handler,
 };
 
 /* A simplifier macro used both by init and free shared_mem Fns(). */
@@ -4171,6 +4178,11 @@ static irqreturn_t s2io_isr(int irq, voi
 	mac_info_t *mac_control;
 	struct config_param *config;
 
+	if ((sp->pdev->error_state != pci_channel_io_normal) &&
+		 (sp->pdev->error_state != 0)) {
+		return IRQ_HANDLED;
+	}
+
 	atomic_inc(&sp->isr_cnt);
 	mac_control = &sp->mac_control;
 	config = &sp->config;
@@ -7564,3 +7576,94 @@ static void lro_append_pkt(nic_t *sp, lr
 	sp->mac_control.stats_info->sw_stat.clubbed_frms_cnt++;
 	return;
 }
+
+/**
+ * s2io_io_error_detected - called when PCI error is detected
+ * @pdev: Pointer to PCI device
+ * @state: The current pci conneection state
+ *
+ * This function is called after a PCI bus error affecting
+ * this device has been detected.
+ */
+static pci_ers_result_t s2io_io_error_detected(struct pci_dev *pdev,
pci_channel_state_t state)
+{
+	struct net_device *netdev = pci_get_drvdata(pdev);
+	nic_t *sp = netdev->priv;
+
+	netif_device_detach(netdev);
+
+	if (netif_running(netdev)) {
+		unsigned long flags;
+
+		/* The folowing is an abreviated subset of the
+		 * steps taken by s2io_card_down(), avoiding
+		 * steps that touch the card itself.
+		 */
+		del_timer_sync(&sp->alarm_timer);
+		atomic_set(&sp->card_state, CARD_DOWN);
+
+		/* Kill tasklet. */
+		tasklet_kill(&sp->task);
+
+		/* Free all Tx buffers */
+		spin_lock_irqsave(&sp->tx_lock, flags);
+		free_tx_buffers(sp);
+		spin_unlock_irqrestore(&sp->tx_lock, flags);
+
+		/* Free all Rx buffers */
+		spin_lock_irqsave(&sp->rx_lock, flags);
+		free_rx_buffers(sp);
+		spin_unlock_irqrestore(&sp->rx_lock, flags);
+	
+		clear_bit(0, &(sp->link_state));
+		sp->device_close_flag = TRUE;	/* Device is shut down.
*/
+	}
+	pci_disable_device(pdev);
+
+	return PCI_ERS_RESULT_NEED_RESET;
+}
+
+/**
+ * s2io_io_slot_reset - called after the pci bus has been reset.
+ * @pdev: Pointer to PCI device
+ *
+ * Restart the card from scratch, as if from a cold-boot.
+ */
+static pci_ers_result_t s2io_io_slot_reset(struct pci_dev *pdev)
+{
+	struct net_device *netdev = pci_get_drvdata(pdev);
+	nic_t *sp = netdev->priv;
+
+	if (pci_enable_device(pdev)) {
+		printk(KERN_ERR "s2io: Cannot re-enable PCI device after
reset.\n");
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+
+	pci_set_master(pdev);
+	s2io_reset(sp);
+
+	return PCI_ERS_RESULT_RECOVERED;
+}
+
+/**
+ * s2io_io_resume - called when traffic can start flowing again.
+ * @pdev: Pointer to PCI device
+ *
+ * This callback is called when the error recovery driver tells us that
+ * its OK to resume normal operation.
+ */
+static void s2io_io_resume(struct pci_dev *pdev)
+{
+	struct net_device *netdev = pci_get_drvdata(pdev);
+	nic_t *sp = netdev->priv;
+
+	if (netif_running(netdev)) {
+		if (s2io_card_up(sp)) {
+			printk(KERN_ERR "s2io: can't bring device back
up after reset\n");
+			return;
+		}
+	}
+
+	netif_device_attach(netdev);
+	netif_wake_queue(netdev);
+}
Index: linux-2.6.19-rc1-git11/drivers/net/s2io.h
===================================================================
--- linux-2.6.19-rc1-git11.orig/drivers/net/s2io.h	2006-10-25
14:09:47.000000000 -0500
+++ linux-2.6.19-rc1-git11/drivers/net/s2io.h	2006-10-25
14:11:05.000000000 -0500
@@ -1013,6 +1013,11 @@ static void queue_rx_frame(struct sk_buf
 static void update_L3L4_header(nic_t *sp, lro_t *lro);
 static void lro_append_pkt(nic_t *sp, lro_t *lro, struct sk_buff *skb,
u32 tcp_len);
 
+static pci_ers_result_t s2io_io_error_detected(struct pci_dev *pdev,
+                                               pci_channel_state_t
state);
+static pci_ers_result_t s2io_io_slot_reset(struct pci_dev *pdev);
+static void s2io_io_resume(struct pci_dev *pdev);
+
 #define s2io_tcp_mss(skb) skb_shinfo(skb)->gso_size
 #define s2io_udp_mss(skb) skb_shinfo(skb)->gso_size
 #define s2io_offload_type(skb) skb_shinfo(skb)->gso_type


[-- Attachment #2: pci_err_rec.patch --]
[-- Type: application/octet-stream, Size: 4436 bytes --]

hi, 

This patch adds PCI error recovery support to the 
s2io 10-Gigabit ethernet device driver. This is modified patch based on previous
submitted by Linas Vepstas <linas@austin.ibm.com>

Signed-off-by: Ananda Raju <Ananda.Raju@neterion.com>
Cc: Wen Xiong <wenxiong@us.ibm.com>
Cc: Linas Vepstas <linas@austin.ibm.com>
---
diff -upNr linux-2.6.19-rc1/drivers/net/s2io.c pci_err_rec/drivers/net/s2io.c
--- linux-2.6.19-rc1/drivers/net/s2io.c	2006-10-26 02:25:36.000000000 +0530
+++ pci_err_rec/drivers/net/s2io.c	2006-10-26 03:20:26.000000000 +0530
@@ -434,11 +434,18 @@ static struct pci_device_id s2io_tbl[] _
 
 MODULE_DEVICE_TABLE(pci, s2io_tbl);
 
+static struct pci_error_handlers s2io_err_handler = {
+	.error_detected = s2io_io_error_detected,
+	.slot_reset = s2io_io_slot_reset,
+	.resume = s2io_io_resume,
+};
+
 static struct pci_driver s2io_driver = {
       .name = "S2IO",
       .id_table = s2io_tbl,
       .probe = s2io_init_nic,
       .remove = __devexit_p(s2io_rem_nic),
+      .err_handler = &s2io_err_handler,
 };
 
 /* A simplifier macro used both by init and free shared_mem Fns(). */
@@ -2580,6 +2587,10 @@ static int s2io_poll(struct net_device *
 	u64 val64 = 0xFFFFFFFFFFFFFFFFULL;
 	int i;
 
+	if (atomic_read(&nic->card_state) == CARD_DOWN) {
+		return 0;
+	}
+
 	atomic_inc(&nic->isr_cnt);
 	mac_control = &nic->mac_control;
 	config = &nic->config;
@@ -4175,6 +4186,10 @@ static irqreturn_t s2io_isr(int irq, voi
 	mac_info_t *mac_control;
 	struct config_param *config;
 
+	if (atomic_read(&sp->card_state) == CARD_DOWN) {
+		return IRQ_NONE;
+	}
+
 	atomic_inc(&sp->isr_cnt);
 	mac_control = &sp->mac_control;
 	config = &sp->config;
@@ -7568,3 +7583,71 @@ static void lro_append_pkt(nic_t *sp, lr
 	sp->mac_control.stats_info->sw_stat.clubbed_frms_cnt++;
 	return;
 }
+
+/**
+ * s2io_io_error_detected - called when PCI error is detected
+ * @pdev: Pointer to PCI device
+ * @state: The current pci conneection state
+ *
+ * This function is called after a PCI bus error affecting
+ * this device has been detected.
+ */
+static pci_ers_result_t s2io_io_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
+{
+	struct net_device *netdev = pci_get_drvdata(pdev);
+	nic_t *sp = netdev->priv;
+
+	netif_device_detach(netdev);
+	atomic_set(&sp->card_state, CARD_DOWN);
+	pci_disable_device(pdev);
+
+	return PCI_ERS_RESULT_NEED_RESET;
+}
+
+/**
+ * s2io_io_slot_reset - called after the pci bus has been reset.
+ * @pdev: Pointer to PCI device
+ *
+ * Restart the card from scratch, as if from a cold-boot.
+ */
+static pci_ers_result_t s2io_io_slot_reset(struct pci_dev *pdev)
+{
+	struct net_device *netdev = pci_get_drvdata(pdev);
+	nic_t *sp = netdev->priv;
+
+	if (pci_enable_device(pdev)) {
+		printk(KERN_ERR "s2io: Cannot re-enable PCI device after reset.\n");
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+
+	pci_set_master(pdev);
+	if (netif_running(netdev)) {
+		s2io_card_down(sp);
+		sp->device_close_flag = TRUE;	/* Device is shut down. */
+	}
+
+	return PCI_ERS_RESULT_RECOVERED;
+}
+
+/**
+ * s2io_io_resume - called when traffic can start flowing again.
+ * @pdev: Pointer to PCI device
+ *
+ * This callback is called when the error recovery driver tells us that
+ * its OK to resume normal operation.
+ */
+static void s2io_io_resume(struct pci_dev *pdev)
+{
+	struct net_device *netdev = pci_get_drvdata(pdev);
+	nic_t *sp = netdev->priv;
+
+	if (netif_running(netdev)) {
+		if (s2io_card_up(sp)) {
+			printk(KERN_ERR "s2io: can't bring device back up after reset\n");
+			return;
+		}
+	}
+
+	netif_device_attach(netdev);
+	netif_wake_queue(netdev);
+}
diff -upNr linux-2.6.19-rc1/drivers/net/s2io.h pci_err_rec/drivers/net/s2io.h
--- linux-2.6.19-rc1/drivers/net/s2io.h	2006-10-26 02:25:42.000000000 +0530
+++ pci_err_rec/drivers/net/s2io.h	2006-10-26 03:06:20.000000000 +0530
@@ -1013,6 +1013,11 @@ static void queue_rx_frame(struct sk_buf
 static void update_L3L4_header(nic_t *sp, lro_t *lro);
 static void lro_append_pkt(nic_t *sp, lro_t *lro, struct sk_buff *skb, u32 tcp_len);
 
+static pci_ers_result_t s2io_io_error_detected(struct pci_dev *pdev,
+						pci_channel_state_t state);
+static pci_ers_result_t s2io_io_slot_reset(struct pci_dev *pdev);
+static void s2io_io_resume(struct pci_dev *pdev);
+
 #define s2io_tcp_mss(skb) skb_shinfo(skb)->gso_size
 #define s2io_udp_mss(skb) skb_shinfo(skb)->gso_size
 #define s2io_offload_type(skb) skb_shinfo(skb)->gso_type

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] s2io: add PCI error recovery support
  2006-10-25 15:11 ` Linas Vepstas
@ 2006-10-25 20:55   ` Linas Vepstas
  0 siblings, 0 replies; 14+ messages in thread
From: Linas Vepstas @ 2006-10-25 20:55 UTC (permalink / raw)
  To: Ananda Raju
  Cc: Wen Xiong, linux-kernel, linux-pci, netdev, Jeff Garzik, Andrew Morton

On Wed, Oct 25, 2006 at 10:11:24AM -0500, Linas Vepstas wrote:
> 
> > Also we have to add following if statement in beginning of s2io_isr().

Done, below,

> > If it is ok to do BAR0 read/write in error_detected() then patch is OK. 

I re-wrote that section to avoid doing I/O. It seems to work well,
and generates a few less messages in the process.  New, improved patch
below, please ack and send upstream if you like it.

--linas

This patch adds PCI error recovery support to the 
s2io 10-Gigabit ethernet device driver.

Tested, seems to work well.

Signed-off-by: Linas Vepstas <linas@austin.ibm.com>
Cc: Raghavendra Koushik <raghavendra.koushik@neterion.com>
Cc: Ananda Raju <Ananda.Raju@neterion.com>
Cc: Wen Xiong <wenxiong@us.ibm.com>

----
 drivers/net/s2io.c |  103 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/net/s2io.h |    5 ++
 2 files changed, 108 insertions(+)

Index: linux-2.6.19-rc1-git11/drivers/net/s2io.c
===================================================================
--- linux-2.6.19-rc1-git11.orig/drivers/net/s2io.c	2006-10-25 14:09:47.000000000 -0500
+++ linux-2.6.19-rc1-git11/drivers/net/s2io.c	2006-10-25 15:18:25.000000000 -0500
@@ -434,11 +434,18 @@ static struct pci_device_id s2io_tbl[] _
 
 MODULE_DEVICE_TABLE(pci, s2io_tbl);
 
+static struct pci_error_handlers s2io_err_handler = {
+	.error_detected = s2io_io_error_detected,
+	.slot_reset = s2io_io_slot_reset,
+	.resume = s2io_io_resume,
+};
+
 static struct pci_driver s2io_driver = {
       .name = "S2IO",
       .id_table = s2io_tbl,
       .probe = s2io_init_nic,
       .remove = __devexit_p(s2io_rem_nic),
+      .err_handler = &s2io_err_handler,
 };
 
 /* A simplifier macro used both by init and free shared_mem Fns(). */
@@ -4171,6 +4178,11 @@ static irqreturn_t s2io_isr(int irq, voi
 	mac_info_t *mac_control;
 	struct config_param *config;
 
+	if ((sp->pdev->error_state != pci_channel_io_normal) &&
+		 (sp->pdev->error_state != 0)) {
+		return IRQ_HANDLED;
+	}
+
 	atomic_inc(&sp->isr_cnt);
 	mac_control = &sp->mac_control;
 	config = &sp->config;
@@ -7564,3 +7576,94 @@ static void lro_append_pkt(nic_t *sp, lr
 	sp->mac_control.stats_info->sw_stat.clubbed_frms_cnt++;
 	return;
 }
+
+/**
+ * s2io_io_error_detected - called when PCI error is detected
+ * @pdev: Pointer to PCI device
+ * @state: The current pci conneection state
+ *
+ * This function is called after a PCI bus error affecting
+ * this device has been detected.
+ */
+static pci_ers_result_t s2io_io_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
+{
+	struct net_device *netdev = pci_get_drvdata(pdev);
+	nic_t *sp = netdev->priv;
+
+	netif_device_detach(netdev);
+
+	if (netif_running(netdev)) {
+		unsigned long flags;
+
+		/* The folowing is an abreviated subset of the
+		 * steps taken by s2io_card_down(), avoiding
+		 * steps that touch the card itself.
+		 */
+		del_timer_sync(&sp->alarm_timer);
+		atomic_set(&sp->card_state, CARD_DOWN);
+
+		/* Kill tasklet. */
+		tasklet_kill(&sp->task);
+
+		/* Free all Tx buffers */
+		spin_lock_irqsave(&sp->tx_lock, flags);
+		free_tx_buffers(sp);
+		spin_unlock_irqrestore(&sp->tx_lock, flags);
+
+		/* Free all Rx buffers */
+		spin_lock_irqsave(&sp->rx_lock, flags);
+		free_rx_buffers(sp);
+		spin_unlock_irqrestore(&sp->rx_lock, flags);
+	
+		clear_bit(0, &(sp->link_state));
+		sp->device_close_flag = TRUE;	/* Device is shut down. */
+	}
+	pci_disable_device(pdev);
+
+	return PCI_ERS_RESULT_NEED_RESET;
+}
+
+/**
+ * s2io_io_slot_reset - called after the pci bus has been reset.
+ * @pdev: Pointer to PCI device
+ *
+ * Restart the card from scratch, as if from a cold-boot.
+ */
+static pci_ers_result_t s2io_io_slot_reset(struct pci_dev *pdev)
+{
+	struct net_device *netdev = pci_get_drvdata(pdev);
+	nic_t *sp = netdev->priv;
+
+	if (pci_enable_device(pdev)) {
+		printk(KERN_ERR "s2io: Cannot re-enable PCI device after reset.\n");
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+
+	pci_set_master(pdev);
+	s2io_reset(sp);
+
+	return PCI_ERS_RESULT_RECOVERED;
+}
+
+/**
+ * s2io_io_resume - called when traffic can start flowing again.
+ * @pdev: Pointer to PCI device
+ *
+ * This callback is called when the error recovery driver tells us that
+ * its OK to resume normal operation.
+ */
+static void s2io_io_resume(struct pci_dev *pdev)
+{
+	struct net_device *netdev = pci_get_drvdata(pdev);
+	nic_t *sp = netdev->priv;
+
+	if (netif_running(netdev)) {
+		if (s2io_card_up(sp)) {
+			printk(KERN_ERR "s2io: can't bring device back up after reset\n");
+			return;
+		}
+	}
+
+	netif_device_attach(netdev);
+	netif_wake_queue(netdev);
+}
Index: linux-2.6.19-rc1-git11/drivers/net/s2io.h
===================================================================
--- linux-2.6.19-rc1-git11.orig/drivers/net/s2io.h	2006-10-25 14:09:47.000000000 -0500
+++ linux-2.6.19-rc1-git11/drivers/net/s2io.h	2006-10-25 14:11:05.000000000 -0500
@@ -1013,6 +1013,11 @@ static void queue_rx_frame(struct sk_buf
 static void update_L3L4_header(nic_t *sp, lro_t *lro);
 static void lro_append_pkt(nic_t *sp, lro_t *lro, struct sk_buff *skb, u32 tcp_len);
 
+static pci_ers_result_t s2io_io_error_detected(struct pci_dev *pdev,
+                                               pci_channel_state_t state);
+static pci_ers_result_t s2io_io_slot_reset(struct pci_dev *pdev);
+static void s2io_io_resume(struct pci_dev *pdev);
+
 #define s2io_tcp_mss(skb) skb_shinfo(skb)->gso_size
 #define s2io_udp_mss(skb) skb_shinfo(skb)->gso_size
 #define s2io_offload_type(skb) skb_shinfo(skb)->gso_type


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] s2io: add PCI error recovery support
  2006-10-25  6:29 Ananda Raju
@ 2006-10-25 15:11 ` Linas Vepstas
  2006-10-25 20:55   ` Linas Vepstas
  0 siblings, 1 reply; 14+ messages in thread
From: Linas Vepstas @ 2006-10-25 15:11 UTC (permalink / raw)
  To: Ananda Raju
  Cc: Wen Xiong, linux-kernel, linux-pci, netdev, Jeff Garzik, Andrew Morton

On Wed, Oct 25, 2006 at 02:29:33AM -0400, Ananda Raju wrote:
> Hi, 
> 
> s2io_card_down() will do few BAR0 read/write. As per
> pci-error-recovery.txt Documentation we are not supposed to do any new
> IO in error_detected(). 

Hmm, actually, its harmless to do further i/o. The s2io driver barks
(as it should) because the result of reads is always 0xffffffff.

> Can you try using 
> 
> atomic_set(&sp->card_state, CARD_DOWN); 
> 
> instead of s2io_card_down().

I will try that. I was mostly concerned that s2io_card_down()
also may do some other "important" things with regards to the driver
state, things which might be needed to keep the down-up sequence
symmetrical. I wasn't sure, so I took the conservative route.

> Also we have to add following if statement in beginning of s2io_isr().
> 
> if (atomic_read(&nic->card_state) == CARD_DOWN)
> 	return IRQ_NOTHANDLED.

Right! Will revise this patch shortly.

> If it is ok to do BAR0 read/write in error_detected() then patch is OK. 

Its OK on pSeries, and I beleive it will be OK on PCIE, but I do
not quite know enough about actual PCIE chipsets.

--linas


^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] s2io: add PCI error recovery support
@ 2006-10-25  6:29 Ananda Raju
  2006-10-25 15:11 ` Linas Vepstas
  0 siblings, 1 reply; 14+ messages in thread
From: Ananda Raju @ 2006-10-25  6:29 UTC (permalink / raw)
  To: Linas Vepstas, Wen Xiong
  Cc: linux-kernel, linux-pci, netdev, Jeff Garzik, Andrew Morton

Hi, 

s2io_card_down() will do few BAR0 read/write. As per
pci-error-recovery.txt Documentation we are not supposed to do any new
IO in error_detected(). 

Can you try using 

atomic_set(&sp->card_state, CARD_DOWN); 

instead of s2io_card_down().

Also we have to add following if statement in beginning of s2io_isr().

if (atomic_read(&nic->card_state) == CARD_DOWN)
	return IRQ_NOTHANDLED.

If it is ok to do BAR0 read/write in error_detected() then patch is OK. 

Ananda 
-----Original Message-----
From: Linas Vepstas [mailto:linas@austin.ibm.com] 
Sent: Tuesday, October 24, 2006 2:55 PM
To: Raghavendra Koushik; Ananda Raju; Wen Xiong
Cc: linux-kernel@vger.kernel.org; linux-pci@atrey.karlin.mff.cuni.cz;
netdev@vger.kernel.org; Jeff Garzik; Andrew Morton
Subject: [PATCH] s2io: add PCI error recovery support


Koushik, Raju,
Please review, comment, and if you find this acceptable, 
please forward upstream.

--linas

This patch adds PCI error recovery support to the 
s2io 10-Gigabit ethernet device driver.

Tested, seems to work well.

Signed-off-by: Linas Vepstas <linas@austin.ibm.com>
Cc: Raghavendra Koushik <raghavendra.koushik@neterion.com>
Cc: Ananda Raju <Ananda.Raju@neterion.com>
Cc: Wen Xiong <wenxiong@us.ibm.com>

----
 drivers/net/s2io.c |   77
+++++++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/net/s2io.h |    5 +++
 2 files changed, 82 insertions(+)

Index: linux-2.6.19-rc1-git11/drivers/net/s2io.c
===================================================================
--- linux-2.6.19-rc1-git11.orig/drivers/net/s2io.c	2006-10-20
12:24:17.000000000 -0500
+++ linux-2.6.19-rc1-git11/drivers/net/s2io.c	2006-10-24
16:19:49.000000000 -0500
@@ -434,11 +434,18 @@ static struct pci_device_id s2io_tbl[] _
 
 MODULE_DEVICE_TABLE(pci, s2io_tbl);
 
+static struct pci_error_handlers s2io_err_handler = {
+	.error_detected = s2io_io_error_detected,
+	.slot_reset = s2io_io_slot_reset,
+	.resume = s2io_io_resume,
+};
+
 static struct pci_driver s2io_driver = {
       .name = "S2IO",
       .id_table = s2io_tbl,
       .probe = s2io_init_nic,
       .remove = __devexit_p(s2io_rem_nic),
+      .err_handler = &s2io_err_handler,
 };
 
 /* A simplifier macro used both by init and free shared_mem Fns(). */
@@ -7564,3 +7571,73 @@ static void lro_append_pkt(nic_t *sp, lr
 	sp->mac_control.stats_info->sw_stat.clubbed_frms_cnt++;
 	return;
 }
+
+/**
+ * s2io_io_error_detected - called when PCI error is detected
+ * @pdev: Pointer to PCI device
+ * @state: The current pci conneection state
+ *
+ * This function is called after a PCI bus error affecting
+ * this device has been detected.
+ */
+static pci_ers_result_t s2io_io_error_detected(struct pci_dev *pdev,
pci_channel_state_t state)
+{
+	struct net_device *netdev = pci_get_drvdata(pdev);
+	nic_t *sp = netdev->priv;
+
+	netif_device_detach(netdev);
+
+	if (netif_running(netdev)) {
+		/* Reset card */
+		s2io_card_down(sp);
+		sp->device_close_flag = TRUE;	/* Device is shut down.
*/
+	}
+	pci_disable_device(pdev);
+
+	return PCI_ERS_RESULT_NEED_RESET;
+}
+
+/**
+ * s2io_io_slot_reset - called after the pci bus has been reset.
+ * @pdev: Pointer to PCI device
+ *
+ * Restart the card from scratch, as if from a cold-boot.
+ */
+static pci_ers_result_t s2io_io_slot_reset(struct pci_dev *pdev)
+{
+	struct net_device *netdev = pci_get_drvdata(pdev);
+	nic_t *sp = netdev->priv;
+
+	if (pci_enable_device(pdev)) {
+		printk(KERN_ERR "s2io: Cannot re-enable PCI device after
reset.\n");
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+
+	pci_set_master(pdev);
+	s2io_reset(sp);
+
+	return PCI_ERS_RESULT_RECOVERED;
+}
+
+/**
+ * s2io_io_resume - called when traffic can start flowing again.
+ * @pdev: Pointer to PCI device
+ *
+ * This callback is called when the error recovery driver tells us that
+ * its OK to resume normal operation.
+ */
+static void s2io_io_resume(struct pci_dev *pdev)
+{
+	struct net_device *netdev = pci_get_drvdata(pdev);
+	nic_t *sp = netdev->priv;
+
+	if (netif_running(netdev)) {
+		if (s2io_card_up(sp)) {
+			printk(KERN_ERR "s2io: can't bring device back
up after reset\n");
+			return;
+		}
+	}
+
+	netif_device_attach(netdev);
+	netif_wake_queue(netdev);
+}
Index: linux-2.6.19-rc1-git11/drivers/net/s2io.h
===================================================================
--- linux-2.6.19-rc1-git11.orig/drivers/net/s2io.h	2006-10-20
12:24:17.000000000 -0500
+++ linux-2.6.19-rc1-git11/drivers/net/s2io.h	2006-10-20
12:41:39.000000000 -0500
@@ -1013,6 +1013,11 @@ static void queue_rx_frame(struct sk_buf
 static void update_L3L4_header(nic_t *sp, lro_t *lro);
 static void lro_append_pkt(nic_t *sp, lro_t *lro, struct sk_buff *skb,
u32 tcp_len);
 
+static pci_ers_result_t s2io_io_error_detected(struct pci_dev *pdev,
+
pci_channel_state_t state);
+static pci_ers_result_t s2io_io_slot_reset(struct pci_dev *pdev);
+static void s2io_io_resume(struct pci_dev *pdev);
+
 #define s2io_tcp_mss(skb) skb_shinfo(skb)->gso_size
 #define s2io_udp_mss(skb) skb_shinfo(skb)->gso_size
 #define s2io_offload_type(skb) skb_shinfo(skb)->gso_type


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] s2io: add PCI error recovery support
@ 2006-10-24 21:54 Linas Vepstas
  0 siblings, 0 replies; 14+ messages in thread
From: Linas Vepstas @ 2006-10-24 21:54 UTC (permalink / raw)
  To: Raghavendra Koushik, Ananda Raju, Wen Xiong
  Cc: linux-kernel, linux-pci, netdev, Jeff Garzik, Andrew Morton


Koushik, Raju,
Please review, comment, and if you find this acceptable, 
please forward upstream.

--linas

This patch adds PCI error recovery support to the 
s2io 10-Gigabit ethernet device driver.

Tested, seems to work well.

Signed-off-by: Linas Vepstas <linas@austin.ibm.com>
Cc: Raghavendra Koushik <raghavendra.koushik@neterion.com>
Cc: Ananda Raju <Ananda.Raju@neterion.com>
Cc: Wen Xiong <wenxiong@us.ibm.com>

----
 drivers/net/s2io.c |   77 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/net/s2io.h |    5 +++
 2 files changed, 82 insertions(+)

Index: linux-2.6.19-rc1-git11/drivers/net/s2io.c
===================================================================
--- linux-2.6.19-rc1-git11.orig/drivers/net/s2io.c	2006-10-20 12:24:17.000000000 -0500
+++ linux-2.6.19-rc1-git11/drivers/net/s2io.c	2006-10-24 16:19:49.000000000 -0500
@@ -434,11 +434,18 @@ static struct pci_device_id s2io_tbl[] _
 
 MODULE_DEVICE_TABLE(pci, s2io_tbl);
 
+static struct pci_error_handlers s2io_err_handler = {
+	.error_detected = s2io_io_error_detected,
+	.slot_reset = s2io_io_slot_reset,
+	.resume = s2io_io_resume,
+};
+
 static struct pci_driver s2io_driver = {
       .name = "S2IO",
       .id_table = s2io_tbl,
       .probe = s2io_init_nic,
       .remove = __devexit_p(s2io_rem_nic),
+      .err_handler = &s2io_err_handler,
 };
 
 /* A simplifier macro used both by init and free shared_mem Fns(). */
@@ -7564,3 +7571,73 @@ static void lro_append_pkt(nic_t *sp, lr
 	sp->mac_control.stats_info->sw_stat.clubbed_frms_cnt++;
 	return;
 }
+
+/**
+ * s2io_io_error_detected - called when PCI error is detected
+ * @pdev: Pointer to PCI device
+ * @state: The current pci conneection state
+ *
+ * This function is called after a PCI bus error affecting
+ * this device has been detected.
+ */
+static pci_ers_result_t s2io_io_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
+{
+	struct net_device *netdev = pci_get_drvdata(pdev);
+	nic_t *sp = netdev->priv;
+
+	netif_device_detach(netdev);
+
+	if (netif_running(netdev)) {
+		/* Reset card */
+		s2io_card_down(sp);
+		sp->device_close_flag = TRUE;	/* Device is shut down. */
+	}
+	pci_disable_device(pdev);
+
+	return PCI_ERS_RESULT_NEED_RESET;
+}
+
+/**
+ * s2io_io_slot_reset - called after the pci bus has been reset.
+ * @pdev: Pointer to PCI device
+ *
+ * Restart the card from scratch, as if from a cold-boot.
+ */
+static pci_ers_result_t s2io_io_slot_reset(struct pci_dev *pdev)
+{
+	struct net_device *netdev = pci_get_drvdata(pdev);
+	nic_t *sp = netdev->priv;
+
+	if (pci_enable_device(pdev)) {
+		printk(KERN_ERR "s2io: Cannot re-enable PCI device after reset.\n");
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+
+	pci_set_master(pdev);
+	s2io_reset(sp);
+
+	return PCI_ERS_RESULT_RECOVERED;
+}
+
+/**
+ * s2io_io_resume - called when traffic can start flowing again.
+ * @pdev: Pointer to PCI device
+ *
+ * This callback is called when the error recovery driver tells us that
+ * its OK to resume normal operation.
+ */
+static void s2io_io_resume(struct pci_dev *pdev)
+{
+	struct net_device *netdev = pci_get_drvdata(pdev);
+	nic_t *sp = netdev->priv;
+
+	if (netif_running(netdev)) {
+		if (s2io_card_up(sp)) {
+			printk(KERN_ERR "s2io: can't bring device back up after reset\n");
+			return;
+		}
+	}
+
+	netif_device_attach(netdev);
+	netif_wake_queue(netdev);
+}
Index: linux-2.6.19-rc1-git11/drivers/net/s2io.h
===================================================================
--- linux-2.6.19-rc1-git11.orig/drivers/net/s2io.h	2006-10-20 12:24:17.000000000 -0500
+++ linux-2.6.19-rc1-git11/drivers/net/s2io.h	2006-10-20 12:41:39.000000000 -0500
@@ -1013,6 +1013,11 @@ static void queue_rx_frame(struct sk_buf
 static void update_L3L4_header(nic_t *sp, lro_t *lro);
 static void lro_append_pkt(nic_t *sp, lro_t *lro, struct sk_buff *skb, u32 tcp_len);
 
+static pci_ers_result_t s2io_io_error_detected(struct pci_dev *pdev,
+					                      pci_channel_state_t state);
+static pci_ers_result_t s2io_io_slot_reset(struct pci_dev *pdev);
+static void s2io_io_resume(struct pci_dev *pdev);
+
 #define s2io_tcp_mss(skb) skb_shinfo(skb)->gso_size
 #define s2io_udp_mss(skb) skb_shinfo(skb)->gso_size
 #define s2io_offload_type(skb) skb_shinfo(skb)->gso_type

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2007-03-16 19:59 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-02-15 23:08 [PATCH] s2io: add PCI error recovery support Linas Vepstas
  -- strict thread matches above, loose matches on Subject: below --
2007-03-07  0:42 Ramkrishna Vepa
2007-03-05 22:33 Ramkrishna Vepa
2007-03-16 19:49 ` Linas Vepstas
2007-03-16 19:58   ` Ramkrishna Vepa
2007-01-10 19:54 Ramkrishna Vepa
2006-10-27 11:35 Ananda Raju
2006-10-27 19:32 ` Linas Vepstas
2006-10-26  9:56 Ananda Raju
2006-10-26 22:51 ` Linas Vepstas
2006-10-25  6:29 Ananda Raju
2006-10-25 15:11 ` Linas Vepstas
2006-10-25 20:55   ` Linas Vepstas
2006-10-24 21:54 Linas Vepstas

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).