LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [RFC v2 PATCH 1/1] PCI: override BIOS/firmware resource allocation
@ 2010-10-06 22:58 Ram Pai
  2010-10-06 23:39 ` Bjorn Helgaas
  0 siblings, 1 reply; 32+ messages in thread
From: Ram Pai @ 2010-10-06 22:58 UTC (permalink / raw)
  To: linux-pci
  Cc: linux-kernel, clemens, Yinghai Lu, Jesse Barnes, Linus Torvalds,
	Bjorn Helgaas

        PCI: override BIOS/firmware memory resource allocation
		through command line parameters

	Platforms that are unaware of  SRIOV  BARs  fail to allocate MMIO
	resources  to SRIOV PCIe  devices. Hence  on  such  platforms the
	OS fails to  enable  SRIOV.
	Some  platforms  where  BIOS/uEFI resource   allocations  conflict
	the conflicting devices are disabled.

	Ideally we  would  want  the  OS  to detect and fix automatically
	such problems and conflicts.  However previous  attempts to do so
	have led to regression on legacy platforms.

	The  following patch provides the ability to override the BIOS
	allocations  using    user-hints   provided   through   kernel
	command-line parameters.

	The patch is based upon an idea by Linus, details of which can
	be found at
	http://marc.info/?l=linux-kernel&m=127846075028242&w=2

	Change log:
	    PCI Bridge window reset is done early just after
		early_dump_pci_devices() (Suggested by Yinghai)
	    command line parameters processing is moved from pci_setup() to
	      	pcibios_setup()   	 (Suggested by Yinghai)
    

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 8dd7248..99ee10e 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1961,6 +1961,18 @@ and is between 256 and 4096 characters. It is defined in the file
 				PAGE_SIZE is used as alignment.
 				PCI-PCI bridge can be specified, if resource
 				windows need to be expanded.
+             	override=[off, conflict, always, <device>]
+                             	off : Do not override BIOS/firmware memory resource
+					allocations. This is the default.
+                             	conflict : override BIOS/firmware memory resource
+					allocations if conflicting or not allocated.
+                             	always : override all BIOS/firmware memory resource
+					allocations.
+                             	<device>: Format [<domain>:]<bus>:<slot>.<func>[; ...]
+                                     override BIOS/firmware memory resource allocations
+				     of specified devices. If the device is a bridge
+				     override allocations of all devices under the
+				     bridge.
 		ecrc=		Enable/disable PCIe ECRC (transaction layer
 				end-to-end CRC checking).
 				bios: Use BIOS/firmware settings. This is the
diff --git a/arch/x86/include/asm/pci-direct.h b/arch/x86/include/asm/pci-direct.h
index b1e7a45..3a2c439 100644
--- a/arch/x86/include/asm/pci-direct.h
+++ b/arch/x86/include/asm/pci-direct.h
@@ -18,4 +18,5 @@ extern int early_pci_allowed(void);
 extern unsigned int pci_early_dump_regs;
 extern void early_dump_pci_device(u8 bus, u8 slot, u8 func);
 extern void early_dump_pci_devices(void);
+extern void early_reset_pci_devices(void);
 #endif /* _ASM_X86_PCI_DIRECT_H */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index c3a4fbb..956d7ac 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -855,6 +855,7 @@ void __init setup_arch(char **cmdline_p)
 	if (pci_early_dump_regs)
 		early_dump_pci_devices();
 #endif
+	early_reset_pci_devices();
 
 	finish_e820_parsing();
 
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index a0772af..b26f782 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -453,6 +453,66 @@ int __init pcibios_init(void)
 	return 0;
 }
 
+#define RESOURCE_RELEASE_PARAM_SIZE 256
+static char __initdata resource_release_param[RESOURCE_RELEASE_PARAM_SIZE] = {0};
+int pci_override=PCI_OVERRIDE_OFF;
+
+char *next_pci_device(int *bus, int *slot, int *func, char *p)
+{
+	int seg, count;
+
+	if ( !p )
+		p = resource_release_param;
+
+	if ( !*p )
+		return NULL;
+
+	count = 0;
+	if (sscanf(p, "%x:%x:%x.%x%n",
+		&seg, bus, slot, func, &count) != 4) {
+		seg = 0;
+		if (sscanf(p, "%x:%x.%x%n",
+				bus, slot, func, &count) != 3) {
+			/* Invalid format */
+			printk(KERN_ERR "PCI: Can't parse resource_alignment parameter: %s\n",
+				p);
+			return NULL;
+		}
+	}
+	p += count;
+	if (!*p)
+		return p;
+	if (*p == ';')
+		return ++p;
+	return NULL;
+}
+
+
+void pci_override_setup(const char *str, int override)
+{
+	int count;
+	if (override && !strncmp(str, "off", 3)) {
+		pci_override = PCI_OVERRIDE_OFF;
+		printk(KERN_INFO "pci: do not override BIOS/uEFI memory allocations\n");
+	} else if (override && !strncmp(str, "conflict", 8)) {
+		pci_override = PCI_OVERRIDE_CONFLICT;
+		printk(KERN_INFO "pci: reallocate BIOS/uEFI allocated memory resource conflicts\n");
+	} else if (override && !strncmp(str, "always", 6)) {
+		pci_override =  PCI_OVERRIDE_ALWAYS;
+		printk(KERN_INFO "pci: override all BIOS/uEFI memory resource allocations\n");
+	} else {
+		pci_override =  PCI_OVERRIDE_DEVICE;
+		count=strlen(str);
+		if (count > RESOURCE_RELEASE_PARAM_SIZE - 1)
+			count = RESOURCE_RELEASE_PARAM_SIZE - 1;
+		strncpy(resource_release_param, str, count);
+		resource_release_param[count] = '\0';
+		printk(KERN_INFO "pci: override BIOS/uEFI memory allocations for %s\n",
+			   resource_release_param);
+	}
+	return;
+}
+
 char * __devinit  pcibios_setup(char *str)
 {
 	if (!strcmp(str, "off")) {
@@ -558,6 +618,9 @@ char * __devinit  pcibios_setup(char *str)
 		if (noioapicreroute != -1)
 			noioapicreroute = 1;
 		return NULL;
+	} else if (!strncmp(str, "override=", 9)) {
+		pci_override_setup(str + 9, 1);
+		return NULL;
 	}
 	return str;
 }
diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c
index d1067d5..7a30a87 100644
--- a/arch/x86/pci/early.c
+++ b/arch/x86/pci/early.c
@@ -109,3 +109,101 @@ void early_dump_pci_devices(void)
 		}
 	}
 }
+
+static void __reset_bridge_window(u8 bus, u8 slot, u8 func)
+{
+	/* reset mem resources */
+	write_pci_config_16(bus, slot, func, PCI_MEMORY_BASE, 0x00);
+
+	/* reset pref mem resources */
+	write_pci_config(bus, slot, func, PCI_PREF_LIMIT_UPPER32, 0x00);
+	write_pci_config(bus, slot, func, PCI_PREF_MEMORY_BASE, 0x00);
+	write_pci_config(bus, slot, func, PCI_PREF_BASE_UPPER32, 0x00);
+	write_pci_config(bus, slot, func, PCI_PREF_LIMIT_UPPER32, 0x00);
+}
+
+
+int find_parent_bridge(u8 *bus, u8 *slot, u8 *func, u8 secondary_bus)
+{
+   u8 tbus, tslot, tfunc;
+	for (tbus = secondary_bus-1; tbus >= 0; tbus--) {
+ 	 for (tslot = 0; tslot < 32; tslot++) {
+	  for (tfunc = 0; tfunc < 8; tfunc++) {
+		if (0xffffffff == read_pci_config(tbus,
+			    tslot, tfunc, PCI_CLASS_REVISION))
+			continue;
+
+		if (read_pci_config_byte(tbus, tslot, tfunc,
+			PCI_HEADER_TYPE) != PCI_HEADER_TYPE_BRIDGE)
+			continue;
+
+		if (secondary_bus == read_pci_config_byte(tbus,
+			  tslot, tfunc, PCI_SECONDARY_BUS)) {
+			*bus=tbus;
+			*slot=tslot;
+			*func=tfunc;
+			return 1;
+		}
+	  }
+         }
+	}
+	return 0;
+}
+
+void reset_bridge_window(u8 bus, u8 slot, u8 func)
+{
+	u8 tbus, tslot, tfunc;
+	if (read_pci_config_byte(bus, slot, func, PCI_HEADER_TYPE) ==
+			PCI_HEADER_TYPE_BRIDGE) {
+		__reset_bridge_window(bus, slot, func);
+	} else if (find_parent_bridge(&tbus, &tslot, &tfunc, bus))
+		__reset_bridge_window(tbus, tslot, tfunc);
+	return;
+}
+
+void reset_topmost_bridges(void)
+{
+	u8 slot, func;
+
+	/* topmost bridges as assumed to be on bus 0. good assumption? */
+	for (slot = 0; slot < 32; slot++) {
+	 for (func = 0; func < 8; func++) {
+		if (0xffffffff == read_pci_config(0, slot, func, PCI_CLASS_REVISION))
+			continue;
+
+		if (read_pci_config_byte(0, slot, func, PCI_HEADER_TYPE) ==
+				PCI_HEADER_TYPE_BRIDGE) {
+			__reset_bridge_window(0, slot, func);
+		}
+	 }
+	}
+	return;
+}
+
+extern char * next_pci_device(int *bus, int *slot, int *func, char *pr);
+extern int pci_override;
+
+void early_reset_pci_devices(void)
+{
+	unsigned bus, slot, func;
+	char *ptr=NULL;
+
+	if (!early_pci_allowed())
+		return;
+
+	if ( pci_override == PCI_OVERRIDE_OFF )
+		return;
+
+	if ( pci_override == PCI_OVERRIDE_ALWAYS ) {
+		reset_topmost_bridges();
+		return;
+	}
+
+	while((ptr = next_pci_device(&bus, &slot, &func, ptr))) {
+		if (0xffffffff == read_pci_config(bus, slot,
+				func, PCI_CLASS_REVISION))
+			continue;
+		reset_bridge_window(bus, slot, func);
+	}
+	return;
+}
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 66cb8f4..981e701 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -768,6 +768,7 @@ static void pci_bridge_release_resources(struct pci_bus *bus,
 	}
 }
 
+
 enum release_type {
 	leaf_only,
 	whole_subtree,
@@ -808,6 +809,7 @@ static void __ref pci_bus_release_bridge_resources(struct pci_bus *bus,
 		pci_bridge_release_resources(bus, type);
 }
 
+
 static void pci_bus_dump_res(struct pci_bus *bus)
 {
 	struct resource *res;
@@ -838,26 +840,78 @@ static void pci_bus_dump_resources(struct pci_bus *bus)
 	}
 }
 
+static void __init
+pci_release_resources(struct resource_list_x *head)
+{
+	struct resource_list_x *list;
+	unsigned long type_mask = IORESOURCE_IO | IORESOURCE_MEM |
+				  IORESOURCE_PREFETCH;
+	enum release_type rel_type = whole_subtree;
+	/*
+	 * Try to release leaf bridge's resources that doesn't fit resource of
+	 * child device under that bridge
+	 */
+	for (list = head->next; list;) {
+		struct pci_bus *bus =  list->dev->bus;
+		pci_bus_release_bridge_resources(bus, list->flags & type_mask,
+						  rel_type);
+		list = list->next;
+ 	}
+	/* restore size and flags */
+	for (list = head->next; list;) {
+		struct resource *res = list->res;
+
+		res->start = list->start;
+		res->end = list->end;
+		res->flags = list->flags;
+		if (list->dev->subordinate)
+			res->flags = 0;
+
+		list = list->next;
+	}
+	free_failed_list(head);
+}
+
 void __init
 pci_assign_unassigned_resources(void)
 {
 	struct pci_bus *bus;
+	int tried_times = 0;
+	struct resource_list_x head;
+
+ 	head.next = NULL;
+	do { /* loop at most 2 times */
+		/* Depth first, calculate sizes and alignments of all
+		   subordinate buses. */
+		list_for_each_entry(bus, &pci_root_buses, node) {
+			pci_bus_size_bridges(bus);
+		}
+		/* Depth last, allocate resources and update the hardware. */
+		list_for_each_entry(bus, &pci_root_buses, node) {
+			__pci_bus_assign_resources(bus, &head);
+		}
+		/* any device complain? */
+		if (!head.next)
+			goto enable_and_dump;
+
+		/* do we care if any device complained? */
+		if (pci_override != PCI_OVERRIDE_CONFLICT) {
+			free_failed_list(&head);
+			goto enable_and_dump;
+		}
+		printk(KERN_INFO "PCI: No. %d try to assign unassigned res\n",
+			tried_times+1);
 
-	/* Depth first, calculate sizes and alignments of all
-	   subordinate buses. */
-	list_for_each_entry(bus, &pci_root_buses, node) {
-		pci_bus_size_bridges(bus);
-	}
-	/* Depth last, allocate resources and update the hardware. */
-	list_for_each_entry(bus, &pci_root_buses, node) {
-		pci_bus_assign_resources(bus);
+		pci_release_resources(&head);
+	} while (!tried_times++);
+
+enable_and_dump:
+	list_for_each_entry(bus, &pci_root_buses, node)
 		pci_enable_bridges(bus);
-	}
 
 	/* dump the resource on buses */
-	list_for_each_entry(bus, &pci_root_buses, node) {
+	list_for_each_entry(bus, &pci_root_buses, node)
 		pci_bus_dump_resources(bus);
-	}
 }
 
 void pci_assign_unassigned_bridge_resources(struct pci_dev *bridge)
diff --git a/include/linux/pci.h b/include/linux/pci.h
index c8d95e3..3449297 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1360,6 +1360,12 @@ extern u8 pci_cache_line_size;
 extern unsigned long pci_hotplug_io_size;
 extern unsigned long pci_hotplug_mem_size;
 
+extern int pci_override;
+#define  PCI_OVERRIDE_OFF	     1
+#define  PCI_OVERRIDE_CONFLICT	     2
+#define  PCI_OVERRIDE_ALWAYS	     3
+#define  PCI_OVERRIDE_DEVICE	     4
+
 int pcibios_add_platform_entries(struct pci_dev *dev);
 void pcibios_disable_device(struct pci_dev *dev);
 int pcibios_set_pcie_reset_state(struct pci_dev *dev,

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v2 PATCH 1/1] PCI: override BIOS/firmware resource allocation
  2010-10-06 22:58 [RFC v2 PATCH 1/1] PCI: override BIOS/firmware resource allocation Ram Pai
@ 2010-10-06 23:39 ` Bjorn Helgaas
  2010-10-07  0:30   ` Ram Pai
  0 siblings, 1 reply; 32+ messages in thread
From: Bjorn Helgaas @ 2010-10-06 23:39 UTC (permalink / raw)
  To: Ram Pai
  Cc: linux-pci, linux-kernel, clemens, Yinghai Lu, Jesse Barnes,
	Linus Torvalds

On Wed, Oct 06, 2010 at 03:58:34PM -0700, Ram Pai wrote:
>         PCI: override BIOS/firmware memory resource allocation
> 		through command line parameters
> 
> 	Platforms that are unaware of  SRIOV  BARs  fail to allocate MMIO
> 	resources  to SRIOV PCIe  devices. Hence  on  such  platforms the
> 	OS fails to  enable  SRIOV.
> 	Some  platforms  where  BIOS/uEFI resource   allocations  conflict
> 	the conflicting devices are disabled.
> 
> 	Ideally we  would  want  the  OS  to detect and fix automatically
> 	such problems and conflicts.  However previous  attempts to do so
> 	have led to regression on legacy platforms.

I'm sorry to be a nay-sayer, but I think we just haven't tried hard
enough.  Our ACPI/PCI/e820 resource management is not well integrated,
and I suspect if we straightened that out, we could avoid some of the
regressions we saw with previous attempts.

My fear is that we'll add this parameter, which is a relatively easy fix
for you.  Later we'll realize that telling users to boot with this or
that parameter, depending on what system they have and how they way to
use it, is not a sustainable strategy.  Then somebody else will be left
with the hard work of figuring out how to make Linux smart enough to do
it automatically.

Bjorn

> 	The  following patch provides the ability to override the BIOS
> 	allocations  using    user-hints   provided   through   kernel
> 	command-line parameters.
> 
> 	The patch is based upon an idea by Linus, details of which can
> 	be found at
> 	http://marc.info/?l=linux-kernel&m=127846075028242&w=2
> 
> 	Change log:
> 	    PCI Bridge window reset is done early just after
> 		early_dump_pci_devices() (Suggested by Yinghai)
> 	    command line parameters processing is moved from pci_setup() to
> 	      	pcibios_setup()   	 (Suggested by Yinghai)
>     
> 
> diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
> index 8dd7248..99ee10e 100644
> --- a/Documentation/kernel-parameters.txt
> +++ b/Documentation/kernel-parameters.txt
> @@ -1961,6 +1961,18 @@ and is between 256 and 4096 characters. It is defined in the file
>  				PAGE_SIZE is used as alignment.
>  				PCI-PCI bridge can be specified, if resource
>  				windows need to be expanded.
> +             	override=[off, conflict, always, <device>]
> +                             	off : Do not override BIOS/firmware memory resource
> +					allocations. This is the default.
> +                             	conflict : override BIOS/firmware memory resource
> +					allocations if conflicting or not allocated.
> +                             	always : override all BIOS/firmware memory resource
> +					allocations.
> +                             	<device>: Format [<domain>:]<bus>:<slot>.<func>[; ...]
> +                                     override BIOS/firmware memory resource allocations
> +				     of specified devices. If the device is a bridge
> +				     override allocations of all devices under the
> +				     bridge.
>  		ecrc=		Enable/disable PCIe ECRC (transaction layer
>  				end-to-end CRC checking).
>  				bios: Use BIOS/firmware settings. This is the
> diff --git a/arch/x86/include/asm/pci-direct.h b/arch/x86/include/asm/pci-direct.h
> index b1e7a45..3a2c439 100644
> --- a/arch/x86/include/asm/pci-direct.h
> +++ b/arch/x86/include/asm/pci-direct.h
> @@ -18,4 +18,5 @@ extern int early_pci_allowed(void);
>  extern unsigned int pci_early_dump_regs;
>  extern void early_dump_pci_device(u8 bus, u8 slot, u8 func);
>  extern void early_dump_pci_devices(void);
> +extern void early_reset_pci_devices(void);
>  #endif /* _ASM_X86_PCI_DIRECT_H */
> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> index c3a4fbb..956d7ac 100644
> --- a/arch/x86/kernel/setup.c
> +++ b/arch/x86/kernel/setup.c
> @@ -855,6 +855,7 @@ void __init setup_arch(char **cmdline_p)
>  	if (pci_early_dump_regs)
>  		early_dump_pci_devices();
>  #endif
> +	early_reset_pci_devices();
>  
>  	finish_e820_parsing();
>  
> diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
> index a0772af..b26f782 100644
> --- a/arch/x86/pci/common.c
> +++ b/arch/x86/pci/common.c
> @@ -453,6 +453,66 @@ int __init pcibios_init(void)
>  	return 0;
>  }
>  
> +#define RESOURCE_RELEASE_PARAM_SIZE 256
> +static char __initdata resource_release_param[RESOURCE_RELEASE_PARAM_SIZE] = {0};
> +int pci_override=PCI_OVERRIDE_OFF;
> +
> +char *next_pci_device(int *bus, int *slot, int *func, char *p)
> +{
> +	int seg, count;
> +
> +	if ( !p )
> +		p = resource_release_param;
> +
> +	if ( !*p )
> +		return NULL;
> +
> +	count = 0;
> +	if (sscanf(p, "%x:%x:%x.%x%n",
> +		&seg, bus, slot, func, &count) != 4) {
> +		seg = 0;
> +		if (sscanf(p, "%x:%x.%x%n",
> +				bus, slot, func, &count) != 3) {
> +			/* Invalid format */
> +			printk(KERN_ERR "PCI: Can't parse resource_alignment parameter: %s\n",
> +				p);
> +			return NULL;
> +		}
> +	}
> +	p += count;
> +	if (!*p)
> +		return p;
> +	if (*p == ';')
> +		return ++p;
> +	return NULL;
> +}
> +
> +
> +void pci_override_setup(const char *str, int override)
> +{
> +	int count;
> +	if (override && !strncmp(str, "off", 3)) {
> +		pci_override = PCI_OVERRIDE_OFF;
> +		printk(KERN_INFO "pci: do not override BIOS/uEFI memory allocations\n");
> +	} else if (override && !strncmp(str, "conflict", 8)) {
> +		pci_override = PCI_OVERRIDE_CONFLICT;
> +		printk(KERN_INFO "pci: reallocate BIOS/uEFI allocated memory resource conflicts\n");
> +	} else if (override && !strncmp(str, "always", 6)) {
> +		pci_override =  PCI_OVERRIDE_ALWAYS;
> +		printk(KERN_INFO "pci: override all BIOS/uEFI memory resource allocations\n");
> +	} else {
> +		pci_override =  PCI_OVERRIDE_DEVICE;
> +		count=strlen(str);
> +		if (count > RESOURCE_RELEASE_PARAM_SIZE - 1)
> +			count = RESOURCE_RELEASE_PARAM_SIZE - 1;
> +		strncpy(resource_release_param, str, count);
> +		resource_release_param[count] = '\0';
> +		printk(KERN_INFO "pci: override BIOS/uEFI memory allocations for %s\n",
> +			   resource_release_param);
> +	}
> +	return;
> +}
> +
>  char * __devinit  pcibios_setup(char *str)
>  {
>  	if (!strcmp(str, "off")) {
> @@ -558,6 +618,9 @@ char * __devinit  pcibios_setup(char *str)
>  		if (noioapicreroute != -1)
>  			noioapicreroute = 1;
>  		return NULL;
> +	} else if (!strncmp(str, "override=", 9)) {
> +		pci_override_setup(str + 9, 1);
> +		return NULL;
>  	}
>  	return str;
>  }
> diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c
> index d1067d5..7a30a87 100644
> --- a/arch/x86/pci/early.c
> +++ b/arch/x86/pci/early.c
> @@ -109,3 +109,101 @@ void early_dump_pci_devices(void)
>  		}
>  	}
>  }
> +
> +static void __reset_bridge_window(u8 bus, u8 slot, u8 func)
> +{
> +	/* reset mem resources */
> +	write_pci_config_16(bus, slot, func, PCI_MEMORY_BASE, 0x00);
> +
> +	/* reset pref mem resources */
> +	write_pci_config(bus, slot, func, PCI_PREF_LIMIT_UPPER32, 0x00);
> +	write_pci_config(bus, slot, func, PCI_PREF_MEMORY_BASE, 0x00);
> +	write_pci_config(bus, slot, func, PCI_PREF_BASE_UPPER32, 0x00);
> +	write_pci_config(bus, slot, func, PCI_PREF_LIMIT_UPPER32, 0x00);
> +}
> +
> +
> +int find_parent_bridge(u8 *bus, u8 *slot, u8 *func, u8 secondary_bus)
> +{
> +   u8 tbus, tslot, tfunc;
> +	for (tbus = secondary_bus-1; tbus >= 0; tbus--) {
> + 	 for (tslot = 0; tslot < 32; tslot++) {
> +	  for (tfunc = 0; tfunc < 8; tfunc++) {
> +		if (0xffffffff == read_pci_config(tbus,
> +			    tslot, tfunc, PCI_CLASS_REVISION))
> +			continue;
> +
> +		if (read_pci_config_byte(tbus, tslot, tfunc,
> +			PCI_HEADER_TYPE) != PCI_HEADER_TYPE_BRIDGE)
> +			continue;
> +
> +		if (secondary_bus == read_pci_config_byte(tbus,
> +			  tslot, tfunc, PCI_SECONDARY_BUS)) {
> +			*bus=tbus;
> +			*slot=tslot;
> +			*func=tfunc;
> +			return 1;
> +		}
> +	  }
> +         }
> +	}
> +	return 0;
> +}
> +
> +void reset_bridge_window(u8 bus, u8 slot, u8 func)
> +{
> +	u8 tbus, tslot, tfunc;
> +	if (read_pci_config_byte(bus, slot, func, PCI_HEADER_TYPE) ==
> +			PCI_HEADER_TYPE_BRIDGE) {
> +		__reset_bridge_window(bus, slot, func);
> +	} else if (find_parent_bridge(&tbus, &tslot, &tfunc, bus))
> +		__reset_bridge_window(tbus, tslot, tfunc);
> +	return;
> +}
> +
> +void reset_topmost_bridges(void)
> +{
> +	u8 slot, func;
> +
> +	/* topmost bridges as assumed to be on bus 0. good assumption? */
> +	for (slot = 0; slot < 32; slot++) {
> +	 for (func = 0; func < 8; func++) {
> +		if (0xffffffff == read_pci_config(0, slot, func, PCI_CLASS_REVISION))
> +			continue;
> +
> +		if (read_pci_config_byte(0, slot, func, PCI_HEADER_TYPE) ==
> +				PCI_HEADER_TYPE_BRIDGE) {
> +			__reset_bridge_window(0, slot, func);
> +		}
> +	 }
> +	}
> +	return;
> +}
> +
> +extern char * next_pci_device(int *bus, int *slot, int *func, char *pr);
> +extern int pci_override;
> +
> +void early_reset_pci_devices(void)
> +{
> +	unsigned bus, slot, func;
> +	char *ptr=NULL;
> +
> +	if (!early_pci_allowed())
> +		return;
> +
> +	if ( pci_override == PCI_OVERRIDE_OFF )
> +		return;
> +
> +	if ( pci_override == PCI_OVERRIDE_ALWAYS ) {
> +		reset_topmost_bridges();
> +		return;
> +	}
> +
> +	while((ptr = next_pci_device(&bus, &slot, &func, ptr))) {
> +		if (0xffffffff == read_pci_config(bus, slot,
> +				func, PCI_CLASS_REVISION))
> +			continue;
> +		reset_bridge_window(bus, slot, func);
> +	}
> +	return;
> +}
> diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
> index 66cb8f4..981e701 100644
> --- a/drivers/pci/setup-bus.c
> +++ b/drivers/pci/setup-bus.c
> @@ -768,6 +768,7 @@ static void pci_bridge_release_resources(struct pci_bus *bus,
>  	}
>  }
>  
> +
>  enum release_type {
>  	leaf_only,
>  	whole_subtree,
> @@ -808,6 +809,7 @@ static void __ref pci_bus_release_bridge_resources(struct pci_bus *bus,
>  		pci_bridge_release_resources(bus, type);
>  }
>  
> +
>  static void pci_bus_dump_res(struct pci_bus *bus)
>  {
>  	struct resource *res;
> @@ -838,26 +840,78 @@ static void pci_bus_dump_resources(struct pci_bus *bus)
>  	}
>  }
>  
> +static void __init
> +pci_release_resources(struct resource_list_x *head)
> +{
> +	struct resource_list_x *list;
> +	unsigned long type_mask = IORESOURCE_IO | IORESOURCE_MEM |
> +				  IORESOURCE_PREFETCH;
> +	enum release_type rel_type = whole_subtree;
> +	/*
> +	 * Try to release leaf bridge's resources that doesn't fit resource of
> +	 * child device under that bridge
> +	 */
> +	for (list = head->next; list;) {
> +		struct pci_bus *bus =  list->dev->bus;
> +		pci_bus_release_bridge_resources(bus, list->flags & type_mask,
> +						  rel_type);
> +		list = list->next;
> + 	}
> +	/* restore size and flags */
> +	for (list = head->next; list;) {
> +		struct resource *res = list->res;
> +
> +		res->start = list->start;
> +		res->end = list->end;
> +		res->flags = list->flags;
> +		if (list->dev->subordinate)
> +			res->flags = 0;
> +
> +		list = list->next;
> +	}
> +	free_failed_list(head);
> +}
> +
>  void __init
>  pci_assign_unassigned_resources(void)
>  {
>  	struct pci_bus *bus;
> +	int tried_times = 0;
> +	struct resource_list_x head;
> +
> + 	head.next = NULL;
> +	do { /* loop at most 2 times */
> +		/* Depth first, calculate sizes and alignments of all
> +		   subordinate buses. */
> +		list_for_each_entry(bus, &pci_root_buses, node) {
> +			pci_bus_size_bridges(bus);
> +		}
> +		/* Depth last, allocate resources and update the hardware. */
> +		list_for_each_entry(bus, &pci_root_buses, node) {
> +			__pci_bus_assign_resources(bus, &head);
> +		}
> +		/* any device complain? */
> +		if (!head.next)
> +			goto enable_and_dump;
> +
> +		/* do we care if any device complained? */
> +		if (pci_override != PCI_OVERRIDE_CONFLICT) {
> +			free_failed_list(&head);
> +			goto enable_and_dump;
> +		}
> +		printk(KERN_INFO "PCI: No. %d try to assign unassigned res\n",
> +			tried_times+1);
>  
> -	/* Depth first, calculate sizes and alignments of all
> -	   subordinate buses. */
> -	list_for_each_entry(bus, &pci_root_buses, node) {
> -		pci_bus_size_bridges(bus);
> -	}
> -	/* Depth last, allocate resources and update the hardware. */
> -	list_for_each_entry(bus, &pci_root_buses, node) {
> -		pci_bus_assign_resources(bus);
> +		pci_release_resources(&head);
> +	} while (!tried_times++);
> +
> +enable_and_dump:
> +	list_for_each_entry(bus, &pci_root_buses, node)
>  		pci_enable_bridges(bus);
> -	}
>  
>  	/* dump the resource on buses */
> -	list_for_each_entry(bus, &pci_root_buses, node) {
> +	list_for_each_entry(bus, &pci_root_buses, node)
>  		pci_bus_dump_resources(bus);
> -	}
>  }
>  
>  void pci_assign_unassigned_bridge_resources(struct pci_dev *bridge)
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index c8d95e3..3449297 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -1360,6 +1360,12 @@ extern u8 pci_cache_line_size;
>  extern unsigned long pci_hotplug_io_size;
>  extern unsigned long pci_hotplug_mem_size;
>  
> +extern int pci_override;
> +#define  PCI_OVERRIDE_OFF	     1
> +#define  PCI_OVERRIDE_CONFLICT	     2
> +#define  PCI_OVERRIDE_ALWAYS	     3
> +#define  PCI_OVERRIDE_DEVICE	     4
> +
>  int pcibios_add_platform_entries(struct pci_dev *dev);
>  void pcibios_disable_device(struct pci_dev *dev);
>  int pcibios_set_pcie_reset_state(struct pci_dev *dev,

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v2 PATCH 1/1] PCI: override BIOS/firmware resource allocation
  2010-10-06 23:39 ` Bjorn Helgaas
@ 2010-10-07  0:30   ` Ram Pai
  2010-10-07  4:13     ` Bjorn Helgaas
  0 siblings, 1 reply; 32+ messages in thread
From: Ram Pai @ 2010-10-07  0:30 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: Ram Pai, linux-pci, linux-kernel, clemens, Yinghai Lu,
	Jesse Barnes, Linus Torvalds

On Wed, Oct 06, 2010 at 05:39:53PM -0600, Bjorn Helgaas wrote:
> On Wed, Oct 06, 2010 at 03:58:34PM -0700, Ram Pai wrote:
> >         PCI: override BIOS/firmware memory resource allocation
> > 		through command line parameters
> > 
> > 	Platforms that are unaware of  SRIOV  BARs  fail to allocate MMIO
> > 	resources  to SRIOV PCIe  devices. Hence  on  such  platforms the
> > 	OS fails to  enable  SRIOV.
> > 	Some  platforms  where  BIOS/uEFI resource   allocations  conflict
> > 	the conflicting devices are disabled.
> > 
> > 	Ideally we  would  want  the  OS  to detect and fix automatically
> > 	such problems and conflicts.  However previous  attempts to do so
> > 	have led to regression on legacy platforms.
> 
> I'm sorry to be a nay-sayer, but I think we just haven't tried hard
> enough.  Our ACPI/PCI/e820 resource management is not well integrated,
> and I suspect if we straightened that out, we could avoid some of the
> regressions we saw with previous attempts.

Can you be more specific as to what can be done to fix it automatically?

Neither accepting this approach nor telling what needs to be straightened out
to automatically fix all the systems out there, is just a deadend.

The choice is between
	(a) an automated patch with the risk of regressing some platforms.
	(b) an semi-automated patch that does not regress *any* platform,
		with the ability to fix platforms that are currently broken.
	(c) status quo, which means broken platforms continue to be so.

I thought the initial proposal was to use (b), with the long
term goal of fixing it automatically, assuming that it is even possible.

Let me know if that is *not* the goal and I will change directions.
RP

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v2 PATCH 1/1] PCI: override BIOS/firmware resource allocation
  2010-10-07  0:30   ` Ram Pai
@ 2010-10-07  4:13     ` Bjorn Helgaas
  2010-10-07 20:42       ` Ram Pai
  0 siblings, 1 reply; 32+ messages in thread
From: Bjorn Helgaas @ 2010-10-07  4:13 UTC (permalink / raw)
  To: Ram Pai
  Cc: linux-pci, linux-kernel, clemens, Yinghai Lu, Jesse Barnes,
	Linus Torvalds

On Wed, Oct 06, 2010 at 05:30:41PM -0700, Ram Pai wrote:
> On Wed, Oct 06, 2010 at 05:39:53PM -0600, Bjorn Helgaas wrote:
> > On Wed, Oct 06, 2010 at 03:58:34PM -0700, Ram Pai wrote:
> > >         PCI: override BIOS/firmware memory resource allocation
> > > 		through command line parameters
> > > 
> > > 	Platforms that are unaware of  SRIOV  BARs  fail to allocate MMIO
> > > 	resources  to SRIOV PCIe  devices. Hence  on  such  platforms the
> > > 	OS fails to  enable  SRIOV.
> > > 	Some  platforms  where  BIOS/uEFI resource   allocations  conflict
> > > 	the conflicting devices are disabled.
> > > 
> > > 	Ideally we  would  want  the  OS  to detect and fix automatically
> > > 	such problems and conflicts.  However previous  attempts to do so
> > > 	have led to regression on legacy platforms.
> > 
> > I'm sorry to be a nay-sayer, but I think we just haven't tried hard
> > enough.  Our ACPI/PCI/e820 resource management is not well integrated,
> > and I suspect if we straightened that out, we could avoid some of the
> > regressions we saw with previous attempts.
> 
> Can you be more specific as to what can be done to fix it automatically?
> 
> Neither accepting this approach nor telling what needs to be straightened out
> to automatically fix all the systems out there, is just a deadend.

Yeah, I guess that wasn't really fair, sorry.  And keep in mind that I'm
not the PCI maintainer, so these are just my opinions, nothing like an
official "nack."

I did look at this dmesg log from the thread you referenced:
    http://marc.info/?l=linux-kernel&m=127178918128740&w=2
but it looks to me like we just completely botched it.  I don't see an
SRIOV device or anything else that didn't have resources, so as far as I
can tell, we started with working resource assignments from the BIOS,
threw them away, and started over from scratch.  We failed because we
tried to assign I/O port space to bridges with nothing behind them, and
there was nothing left by the time we got to the 0000:09:04.0 device
that actually *did* need the space.

I think what would be more interesting is to look at a log from *your*
system, where you have SRIOV devices that don't have resources assigned,
and see whether we have enough information to make the minimal changes
required to assign resources to them.  If we can come up with a strategy
that only does something when absolutely required, and does it a little
more carefully, I think we have a decent chance of success.

> The choice is between
> 	(a) an automated patch with the risk of regressing some platforms.
> 	(b) an semi-automated patch that does not regress *any* platform,
> 		with the ability to fix platforms that are currently broken.
> 	(c) status quo, which means broken platforms continue to be so.
> 
> I thought the initial proposal was to use (b), with the long
> term goal of fixing it automatically, assuming that it is even possible.
> 
> Let me know if that is *not* the goal and I will change directions.

*My* goal is that a user would never need a kernel option except to help
debug kernel problems.  I think of an option like "pci=override" as a
band-aid that covers up a kernel problem without really fixing it, so I
guess my choice would be (a).  Yes, there's a risk of regression, and we
have to do everything we can to avoid it.  But the result is a more
usable system.

Bjorn

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v2 PATCH 1/1] PCI: override BIOS/firmware resource allocation
  2010-10-07  4:13     ` Bjorn Helgaas
@ 2010-10-07 20:42       ` Ram Pai
  2010-10-07 21:41         ` Bjorn Helgaas
  0 siblings, 1 reply; 32+ messages in thread
From: Ram Pai @ 2010-10-07 20:42 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: Ram Pai, linux-pci, linux-kernel, clemens, Yinghai Lu,
	Jesse Barnes, Linus Torvalds

On Wed, Oct 06, 2010 at 10:13:02PM -0600, Bjorn Helgaas wrote:
> On Wed, Oct 06, 2010 at 05:30:41PM -0700, Ram Pai wrote:
> > On Wed, Oct 06, 2010 at 05:39:53PM -0600, Bjorn Helgaas wrote:
> > > On Wed, Oct 06, 2010 at 03:58:34PM -0700, Ram Pai wrote:
> > > >         PCI: override BIOS/firmware memory resource allocation
> > > > 		through command line parameters
> > > > 
> > > > 	Platforms that are unaware of  SRIOV  BARs  fail to allocate MMIO
> > > > 	resources  to SRIOV PCIe  devices. Hence  on  such  platforms the
> > > > 	OS fails to  enable  SRIOV.
> > > > 	Some  platforms  where  BIOS/uEFI resource   allocations  conflict
> > > > 	the conflicting devices are disabled.
> > > > 
> > > > 	Ideally we  would  want  the  OS  to detect and fix automatically
> > > > 	such problems and conflicts.  However previous  attempts to do so
> > > > 	have led to regression on legacy platforms.
> > > 
> > > I'm sorry to be a nay-sayer, but I think we just haven't tried hard
> > > enough.  Our ACPI/PCI/e820 resource management is not well integrated,
> > > and I suspect if we straightened that out, we could avoid some of the
> > > regressions we saw with previous attempts.
> > 
> > Can you be more specific as to what can be done to fix it automatically?
> > 
> > Neither accepting this approach nor telling what needs to be straightened out
> > to automatically fix all the systems out there, is just a deadend.
> 
> Yeah, I guess that wasn't really fair, sorry.  And keep in mind that I'm
> not the PCI maintainer, so these are just my opinions, nothing like an
> official "nack."
> 
> I did look at this dmesg log from the thread you referenced:
>     http://marc.info/?l=linux-kernel&m=127178918128740&w=2
> but it looks to me like we just completely botched it.  I don't see an
> SRIOV device or anything else that didn't have resources, so as far as I
> can tell, we started with working resource assignments from the BIOS,
> threw them away, and started over from scratch.  We failed because we
> tried to assign I/O port space to bridges with nothing behind them, and
> there was nothing left by the time we got to the 0000:09:04.0 device
> that actually *did* need the space.

hmm.. is that possible? Yinghai's patch sized the resource requirement of each
of the bridges, before actually allocating them. Which means a bridge with
no device behind it would not get any i/o space.

> 
> I think what would be more interesting is to look at a log from *your*
> system, where you have SRIOV devices that don't have resources assigned,
> and see whether we have enough information to make the minimal changes
> required to assign resources to them.  If we can come up with a strategy
> that only does something when absolutely required, and does it a little
> more carefully, I think we have a decent chance of success.

In my setup, the bridge has a BIOS allocated window large enough to 
satisfy the standard BARs of my device. But nothing more to satisfy the 
SRIOV BARs.  We could come up with some strategy to satisfy this particular 
configuration. Is that sufficient? Will that be the grand solution or 
just a band-aid solution?


> 
> > The choice is between
> > 	(a) an automated patch with the risk of regressing some platforms.
> > 	(b) an semi-automated patch that does not regress *any* platform,
> > 		with the ability to fix platforms that are currently broken.
> > 	(c) status quo, which means broken platforms continue to be so.
> > 
> > I thought the initial proposal was to use (b), with the long
> > term goal of fixing it automatically, assuming that it is even possible.
> > 
> > Let me know if that is *not* the goal and I will change directions.
> 
> *My* goal is that a user would never need a kernel option except to help
> debug kernel problems.  I think of an option like "pci=override" as a
> band-aid that covers up a kernel problem without really fixing it, so I
> guess my choice would be (a).  Yes, there's a risk of regression, and we
> have to do everything we can to avoid it.  But the result is a more
> usable system.

Ok. I think we agree with your goal, but the disagreement is on the approach.
You want to take one big leap, whereas the consensus  I heard in earlier 
threads was that we need to take baby steps.

RP

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v2 PATCH 1/1] PCI: override BIOS/firmware resource allocation
  2010-10-07 20:42       ` Ram Pai
@ 2010-10-07 21:41         ` Bjorn Helgaas
  2010-10-08 17:32           ` Ram Pai
  0 siblings, 1 reply; 32+ messages in thread
From: Bjorn Helgaas @ 2010-10-07 21:41 UTC (permalink / raw)
  To: Ram Pai
  Cc: linux-pci, linux-kernel, clemens, Yinghai Lu, Jesse Barnes,
	Linus Torvalds

On Thursday, October 07, 2010 02:42:13 pm Ram Pai wrote:
> On Wed, Oct 06, 2010 at 10:13:02PM -0600, Bjorn Helgaas wrote:
> > On Wed, Oct 06, 2010 at 05:30:41PM -0700, Ram Pai wrote:
> > > On Wed, Oct 06, 2010 at 05:39:53PM -0600, Bjorn Helgaas wrote:
> > > > On Wed, Oct 06, 2010 at 03:58:34PM -0700, Ram Pai wrote:
> > > > >         PCI: override BIOS/firmware memory resource allocation
> > > > > 		through command line parameters
> > > > > 
> > > > > 	Platforms that are unaware of  SRIOV  BARs  fail to allocate MMIO
> > > > > 	resources  to SRIOV PCIe  devices. Hence  on  such  platforms the
> > > > > 	OS fails to  enable  SRIOV.
> > > > > 	Some  platforms  where  BIOS/uEFI resource   allocations  conflict
> > > > > 	the conflicting devices are disabled.
> > > > > 
> > > > > 	Ideally we  would  want  the  OS  to detect and fix automatically
> > > > > 	such problems and conflicts.  However previous  attempts to do so
> > > > > 	have led to regression on legacy platforms.
> > > > 
> > > > I'm sorry to be a nay-sayer, but I think we just haven't tried hard
> > > > enough.  Our ACPI/PCI/e820 resource management is not well integrated,
> > > > and I suspect if we straightened that out, we could avoid some of the
> > > > regressions we saw with previous attempts.
> > > 
> > > Can you be more specific as to what can be done to fix it automatically?
> > > 
> > > Neither accepting this approach nor telling what needs to be straightened out
> > > to automatically fix all the systems out there, is just a deadend.
> > 
> > Yeah, I guess that wasn't really fair, sorry.  And keep in mind that I'm
> > not the PCI maintainer, so these are just my opinions, nothing like an
> > official "nack."
> > 
> > I did look at this dmesg log from the thread you referenced:
> >     http://marc.info/?l=linux-kernel&m=127178918128740&w=2
> > but it looks to me like we just completely botched it.  I don't see an
> > SRIOV device or anything else that didn't have resources, so as far as I
> > can tell, we started with working resource assignments from the BIOS,
> > threw them away, and started over from scratch.  We failed because we
> > tried to assign I/O port space to bridges with nothing behind them, and
> > there was nothing left by the time we got to the 0000:09:04.0 device
> > that actually *did* need the space.
> 
> hmm.. is that possible? Yinghai's patch sized the resource requirement of each
> of the bridges, before actually allocating them. Which means a bridge with
> no device behind it would not get any i/o space.

Here's what I see in the dmesg log referenced above:

    ACPI: PCI Root Bridge [PCI0] (0000:00)
    pci_root PNP0A08:00: host bridge window [io 0x0000-0x0cf7]
    pci_root PNP0A08:00: host bridge window [io 0x0d00-0xffff]
    pci 0000:00:1c.0: PCI bridge to [bus 04-09]
    pci 0000:00:1c.0:   bridge window [io 0xd000-0xdfff]
    pci 0000:04:00.0: PCI bridge to [bus 05-09]
    pci 0000:04:00.0:   bridge window [io 0xd000-0xdfff]
    pci 0000:05:01.0: PCI bridge to [bus 08-09]
    pci 0000:05:01.0:   bridge window [io 0xd000-0xdfff]
    pci 0000:08:00.0: PCI bridge to [bus 09-09]
    pci 0000:08:00.0:   bridge window [io 0xd000-0xdfff]
    pci 0000:09:04.0: found [13f6:8788] class 000401 header type 00
    pci 0000:09:04.0: reg 10: [io  0xd800-0xd8ff]
    pci 0000:05:02.0: PCI bridge to [bus 07-07]
    pci 0000:05:02.0:   bridge window [io 0xf000-0x0000] (disabled)
    pci 0000:05:03.0: PCI bridge to [bus 06-06]
    pci 0000:05:03.0:   bridge window [io 0xf000-0x0000] (disabled)
    
The above is the state as we got it from BIOS.  Despite all the bridges,
09:04.0 is the only device below the 00:1c.0 bridge, and it requires only
0x100 I/O ports.

There are no devices on buses 06 (below 05:03.0) or 07 (below 05:02.0).

I didn't look at Yinghai's patch to figure out *why*, but it sure looks like
we released the 09:04.0 space, then tried to assign 0x2000 ports to 05:01.0 
(which needs 0x100 and had 0x1000 originally), 0x1000 to 05:02.0 (which needs
none), and 0x1000 to 05:03.0 (which also needs none):

    PCI: No. 3 try to assign unassigned res
    release child resource [io  0xd800-0xd8ff]
    pci 0000:08:00.0: resource 7 [io 0xd000-0xdfff] released
    pci 0000:04:00.0: BAR 7: can't assign io (size 0x4000)
    pci 0000:05:01.0: BAR 7: can't assign io (size 0x2000)
    pci 0000:05:02.0: BAR 7: can't assign io (size 0x1000)
    pci 0000:05:03.0: BAR 7: can't assign io (size 0x1000)
    pci 0000:08:00.0: BAR 7: can't assign io (size 0x1000)
    pci 0000:09:04.0: BAR 0: can't assign io (size 0x100)
    
I think there are at least two things wrong here:

  1) We rearranged things when we didn't need to, and
  2) We assigned I/O ports to bridge that didn't need them

Neither one is a terrible problem, so I think it's better to just fix
them than to add a special alternate path and a kernel parameter to
enable it.

> > > The choice is between
> > > 	(a) an automated patch with the risk of regressing some platforms.
> > > 	(b) an semi-automated patch that does not regress *any* platform,
> > > 		with the ability to fix platforms that are currently broken.
> > > 	(c) status quo, which means broken platforms continue to be so.
> > > 
> > > I thought the initial proposal was to use (b), with the long
> > > term goal of fixing it automatically, assuming that it is even possible.
> > > 
> > > Let me know if that is *not* the goal and I will change directions.
> > 
> > *My* goal is that a user would never need a kernel option except to help
> > debug kernel problems.  I think of an option like "pci=override" as a
> > band-aid that covers up a kernel problem without really fixing it, so I
> > guess my choice would be (a).  Yes, there's a risk of regression, and we
> > have to do everything we can to avoid it.  But the result is a more
> > usable system.
> 
> Ok. I think we agree with your goal, but the disagreement is on the approach.
> You want to take one big leap, whereas the consensus  I heard in earlier 
> threads was that we need to take baby steps.

We need baby steps in terms of a series of tiny patches.  I'm not yet
convinced that we need to add a "mostly-working" alternate path and
make the user figure out whether to use it.  That makes things more
complicated and reduces testing coverage.

Bjorn

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v2 PATCH 1/1] PCI: override BIOS/firmware resource allocation
  2010-10-07 21:41         ` Bjorn Helgaas
@ 2010-10-08 17:32           ` Ram Pai
  2010-10-08 20:16             ` Bjorn Helgaas
  0 siblings, 1 reply; 32+ messages in thread
From: Ram Pai @ 2010-10-08 17:32 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: Ram Pai, linux-pci, linux-kernel, clemens, Yinghai Lu,
	Jesse Barnes, Linus Torvalds

On Thu, Oct 07, 2010 at 03:41:04PM -0600, Bjorn Helgaas wrote:
> On Thursday, October 07, 2010 02:42:13 pm Ram Pai wrote:
> > On Wed, Oct 06, 2010 at 10:13:02PM -0600, Bjorn Helgaas wrote:
> > > On Wed, Oct 06, 2010 at 05:30:41PM -0700, Ram Pai wrote:
> > > > On Wed, Oct 06, 2010 at 05:39:53PM -0600, Bjorn Helgaas wrote:
> > > > > On Wed, Oct 06, 2010 at 03:58:34PM -0700, Ram Pai wrote:
> > > > > >         PCI: override BIOS/firmware memory resource allocation
> > > > > > 		through command line parameters
> > > > > > 
> > > > > > 	Platforms that are unaware of  SRIOV  BARs  fail to allocate MMIO
> > > > > > 	resources  to SRIOV PCIe  devices. Hence  on  such  platforms the
> > > > > > 	OS fails to  enable  SRIOV.
> > > > > > 	Some  platforms  where  BIOS/uEFI resource   allocations  conflict
> > > > > > 	the conflicting devices are disabled.
> > > > > > 
> > > > > > 	Ideally we  would  want  the  OS  to detect and fix automatically
> > > > > > 	such problems and conflicts.  However previous  attempts to do so
> > > > > > 	have led to regression on legacy platforms.
> > > > > 
> > > > > I'm sorry to be a nay-sayer, but I think we just haven't tried hard
> > > > > enough.  Our ACPI/PCI/e820 resource management is not well integrated,
> > > > > and I suspect if we straightened that out, we could avoid some of the
> > > > > regressions we saw with previous attempts.
> > > > 
> > > > Can you be more specific as to what can be done to fix it automatically?
> > > > 
> > > > Neither accepting this approach nor telling what needs to be straightened out
> > > > to automatically fix all the systems out there, is just a deadend.
> > > 
> > > Yeah, I guess that wasn't really fair, sorry.  And keep in mind that I'm
> > > not the PCI maintainer, so these are just my opinions, nothing like an
> > > official "nack."
> > > 
> > > I did look at this dmesg log from the thread you referenced:
> > >     http://marc.info/?l=linux-kernel&m=127178918128740&w=2
> > > but it looks to me like we just completely botched it.  I don't see an
> > > SRIOV device or anything else that didn't have resources, so as far as I
> > > can tell, we started with working resource assignments from the BIOS,
> > > threw them away, and started over from scratch.  We failed because we
> > > tried to assign I/O port space to bridges with nothing behind them, and
> > > there was nothing left by the time we got to the 0000:09:04.0 device
> > > that actually *did* need the space.
> > 
> > hmm.. is that possible? Yinghai's patch sized the resource requirement of each
> > of the bridges, before actually allocating them. Which means a bridge with
> > no device behind it would not get any i/o space.
> 
> Here's what I see in the dmesg log referenced above:
> 
>     ACPI: PCI Root Bridge [PCI0] (0000:00)
>     pci_root PNP0A08:00: host bridge window [io 0x0000-0x0cf7]
>     pci_root PNP0A08:00: host bridge window [io 0x0d00-0xffff]
>     pci 0000:00:1c.0: PCI bridge to [bus 04-09]
>     pci 0000:00:1c.0:   bridge window [io 0xd000-0xdfff]
>     pci 0000:04:00.0: PCI bridge to [bus 05-09]
>     pci 0000:04:00.0:   bridge window [io 0xd000-0xdfff]
>     pci 0000:05:01.0: PCI bridge to [bus 08-09]
>     pci 0000:05:01.0:   bridge window [io 0xd000-0xdfff]
>     pci 0000:08:00.0: PCI bridge to [bus 09-09]
>     pci 0000:08:00.0:   bridge window [io 0xd000-0xdfff]
>     pci 0000:09:04.0: found [13f6:8788] class 000401 header type 00
>     pci 0000:09:04.0: reg 10: [io  0xd800-0xd8ff]
>     pci 0000:05:02.0: PCI bridge to [bus 07-07]
>     pci 0000:05:02.0:   bridge window [io 0xf000-0x0000] (disabled)
>     pci 0000:05:03.0: PCI bridge to [bus 06-06]
>     pci 0000:05:03.0:   bridge window [io 0xf000-0x0000] (disabled)
>     
> The above is the state as we got it from BIOS.  Despite all the bridges,
> 09:04.0 is the only device below the 00:1c.0 bridge, and it requires only
> 0x100 I/O ports.
> 
> There are no devices on buses 06 (below 05:03.0) or 07 (below 05:02.0).
> 
> I didn't look at Yinghai's patch to figure out *why*, but it sure looks like
> we released the 09:04.0 space, then tried to assign 0x2000 ports to 05:01.0 
> (which needs 0x100 and had 0x1000 originally), 0x1000 to 05:02.0 (which needs
> none), and 0x1000 to 05:03.0 (which also needs none):
> 
>     PCI: No. 3 try to assign unassigned res
>     release child resource [io  0xd800-0xd8ff]
>     pci 0000:08:00.0: resource 7 [io 0xd000-0xdfff] released
>     pci 0000:04:00.0: BAR 7: can't assign io (size 0x4000)
>     pci 0000:05:01.0: BAR 7: can't assign io (size 0x2000)
>     pci 0000:05:02.0: BAR 7: can't assign io (size 0x1000)
>     pci 0000:05:03.0: BAR 7: can't assign io (size 0x1000)
>     pci 0000:08:00.0: BAR 7: can't assign io (size 0x1000)

Actually the message preceeding to them  are even more surprising:

Apr 20 20:31:42 [kernel] pci 0000:04:00.0: BAR 8: can't assign mem (size
0xc00000)
Apr 20 20:31:42 [kernel] pci 0000:05:01.0: BAR 8: can't assign mem (size
0x200000)
Apr 20 20:31:42 [kernel] pci 0000:05:01.0: BAR 9: can't assign mem pref
(size 0x200000)
Apr 20 20:31:42 [kernel] pci 0000:05:02.0: BAR 8: can't assign mem (size
0x400000)
Apr 20 20:31:42 [kernel] pci 0000:05:03.0: BAR 7: can't assign io (size
0x1000)

Do these bridges have IOV BARs and those BARs are demanding i/o resources?
Something is really funny with this machine. Or I am reading this wrong?

RP

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v2 PATCH 1/1] PCI: override BIOS/firmware resource allocation
  2010-10-08 17:32           ` Ram Pai
@ 2010-10-08 20:16             ` Bjorn Helgaas
  2010-10-12  7:05               ` Ram Pai
  0 siblings, 1 reply; 32+ messages in thread
From: Bjorn Helgaas @ 2010-10-08 20:16 UTC (permalink / raw)
  To: Ram Pai
  Cc: linux-pci, linux-kernel, clemens, Yinghai Lu, Jesse Barnes,
	Linus Torvalds

On Friday, October 08, 2010 11:32:24 am Ram Pai wrote:
> On Thu, Oct 07, 2010 at 03:41:04PM -0600, Bjorn Helgaas wrote:
> > On Thursday, October 07, 2010 02:42:13 pm Ram Pai wrote:
> > > On Wed, Oct 06, 2010 at 10:13:02PM -0600, Bjorn Helgaas wrote:
> > > > On Wed, Oct 06, 2010 at 05:30:41PM -0700, Ram Pai wrote:
> > > > > On Wed, Oct 06, 2010 at 05:39:53PM -0600, Bjorn Helgaas wrote:
> > > > > > On Wed, Oct 06, 2010 at 03:58:34PM -0700, Ram Pai wrote:
> > > > > > >         PCI: override BIOS/firmware memory resource allocation
> > > > > > > 		through command line parameters
> > > > > > > 
> > > > > > > 	Platforms that are unaware of  SRIOV  BARs  fail to allocate MMIO
> > > > > > > 	resources  to SRIOV PCIe  devices. Hence  on  such  platforms the
> > > > > > > 	OS fails to  enable  SRIOV.
> > > > > > > 	Some  platforms  where  BIOS/uEFI resource   allocations  conflict
> > > > > > > 	the conflicting devices are disabled.
> > > > > > > 
> > > > > > > 	Ideally we  would  want  the  OS  to detect and fix automatically
> > > > > > > 	such problems and conflicts.  However previous  attempts to do so
> > > > > > > 	have led to regression on legacy platforms.
> > > > > > 
> > > > > > I'm sorry to be a nay-sayer, but I think we just haven't tried hard
> > > > > > enough.  Our ACPI/PCI/e820 resource management is not well integrated,
> > > > > > and I suspect if we straightened that out, we could avoid some of the
> > > > > > regressions we saw with previous attempts.
> > > > > 
> > > > > Can you be more specific as to what can be done to fix it automatically?
> > > > > 
> > > > > Neither accepting this approach nor telling what needs to be straightened out
> > > > > to automatically fix all the systems out there, is just a deadend.
> > > > 
> > > > Yeah, I guess that wasn't really fair, sorry.  And keep in mind that I'm
> > > > not the PCI maintainer, so these are just my opinions, nothing like an
> > > > official "nack."
> > > > 
> > > > I did look at this dmesg log from the thread you referenced:
> > > >     http://marc.info/?l=linux-kernel&m=127178918128740&w=2
> > > > but it looks to me like we just completely botched it.  I don't see an
> > > > SRIOV device or anything else that didn't have resources, so as far as I
> > > > can tell, we started with working resource assignments from the BIOS,
> > > > threw them away, and started over from scratch.  We failed because we
> > > > tried to assign I/O port space to bridges with nothing behind them, and
> > > > there was nothing left by the time we got to the 0000:09:04.0 device
> > > > that actually *did* need the space.
> > > 
> > > hmm.. is that possible? Yinghai's patch sized the resource requirement of each
> > > of the bridges, before actually allocating them. Which means a bridge with
> > > no device behind it would not get any i/o space.
> > 
> > Here's what I see in the dmesg log referenced above:
> > 
> >     ACPI: PCI Root Bridge [PCI0] (0000:00)
> >     pci_root PNP0A08:00: host bridge window [io 0x0000-0x0cf7]
> >     pci_root PNP0A08:00: host bridge window [io 0x0d00-0xffff]
> >     pci 0000:00:1c.0: PCI bridge to [bus 04-09]
> >     pci 0000:00:1c.0:   bridge window [io 0xd000-0xdfff]
> >     pci 0000:04:00.0: PCI bridge to [bus 05-09]
> >     pci 0000:04:00.0:   bridge window [io 0xd000-0xdfff]
> >     pci 0000:05:01.0: PCI bridge to [bus 08-09]
> >     pci 0000:05:01.0:   bridge window [io 0xd000-0xdfff]
> >     pci 0000:08:00.0: PCI bridge to [bus 09-09]
> >     pci 0000:08:00.0:   bridge window [io 0xd000-0xdfff]
> >     pci 0000:09:04.0: found [13f6:8788] class 000401 header type 00
> >     pci 0000:09:04.0: reg 10: [io  0xd800-0xd8ff]
> >     pci 0000:05:02.0: PCI bridge to [bus 07-07]
> >     pci 0000:05:02.0:   bridge window [io 0xf000-0x0000] (disabled)
> >     pci 0000:05:03.0: PCI bridge to [bus 06-06]
> >     pci 0000:05:03.0:   bridge window [io 0xf000-0x0000] (disabled)
> >     
> > The above is the state as we got it from BIOS.  Despite all the bridges,
> > 09:04.0 is the only device below the 00:1c.0 bridge, and it requires only
> > 0x100 I/O ports.
> > 
> > There are no devices on buses 06 (below 05:03.0) or 07 (below 05:02.0).
> > 
> > I didn't look at Yinghai's patch to figure out *why*, but it sure looks like
> > we released the 09:04.0 space, then tried to assign 0x2000 ports to 05:01.0 
> > (which needs 0x100 and had 0x1000 originally), 0x1000 to 05:02.0 (which needs
> > none), and 0x1000 to 05:03.0 (which also needs none):
> > 
> >     PCI: No. 3 try to assign unassigned res
> >     release child resource [io  0xd800-0xd8ff]
> >     pci 0000:08:00.0: resource 7 [io 0xd000-0xdfff] released
> >     pci 0000:04:00.0: BAR 7: can't assign io (size 0x4000)
> >     pci 0000:05:01.0: BAR 7: can't assign io (size 0x2000)
> >     pci 0000:05:02.0: BAR 7: can't assign io (size 0x1000)
> >     pci 0000:05:03.0: BAR 7: can't assign io (size 0x1000)
> >     pci 0000:08:00.0: BAR 7: can't assign io (size 0x1000)
> 
> Actually the message preceeding to them  are even more surprising:
> 
> Apr 20 20:31:42 [kernel] pci 0000:04:00.0: BAR 8: can't assign mem (size
> 0xc00000)
> Apr 20 20:31:42 [kernel] pci 0000:05:01.0: BAR 8: can't assign mem (size
> 0x200000)
> Apr 20 20:31:42 [kernel] pci 0000:05:01.0: BAR 9: can't assign mem pref
> (size 0x200000)
> Apr 20 20:31:42 [kernel] pci 0000:05:02.0: BAR 8: can't assign mem (size
> 0x400000)
> Apr 20 20:31:42 [kernel] pci 0000:05:03.0: BAR 7: can't assign io (size
> 0x1000)
> 
> Do these bridges have IOV BARs and those BARs are demanding i/o resources?
> Something is really funny with this machine. Or I am reading this wrong?

I don't see anything strange about the machine, but I don't understand
what Yinghai's patch was doing.  The lspci output is here:
    http://marc.info/?l=linux-pci&m=127184080929189&w=2

I don't see any SRIOV devices, and this kernel wasn't built with IOV
support anyway.  The messages above are attempts to assign resources
for bridge windows.

But the only real device in the whole hierarchy behind 00:1c.0
is 09:04.0, which doesn't require memory space, so I have no idea
why we're trying to open memory windows.

This is what I meant about "we haven't tried hard enough yet."  I'd
rather spend time fixing the problems like this, than just putting the
buggy code in and adding a kernel parameter to enable it.  That
guarantees that we'll never get enough testing to really shake it
out.

Bjorn

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v2 PATCH 1/1] PCI: override BIOS/firmware resource allocation
  2010-10-08 20:16             ` Bjorn Helgaas
@ 2010-10-12  7:05               ` Ram Pai
  2010-10-12 19:01                 ` Bjorn Helgaas
  0 siblings, 1 reply; 32+ messages in thread
From: Ram Pai @ 2010-10-12  7:05 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: Ram Pai, linux-pci, linux-kernel, clemens, Yinghai Lu,
	Jesse Barnes, Linus Torvalds

On Fri, Oct 08, 2010 at 02:16:57PM -0600, Bjorn Helgaas wrote:
> On Friday, October 08, 2010 11:32:24 am Ram Pai wrote:
> > On Thu, Oct 07, 2010 at 03:41:04PM -0600, Bjorn Helgaas wrote:
> > > On Thursday, October 07, 2010 02:42:13 pm Ram Pai wrote:
> > > > On Wed, Oct 06, 2010 at 10:13:02PM -0600, Bjorn Helgaas wrote:
> > > > > On Wed, Oct 06, 2010 at 05:30:41PM -0700, Ram Pai wrote:
> > > > > > On Wed, Oct 06, 2010 at 05:39:53PM -0600, Bjorn Helgaas wrote:
> > > > > > > On Wed, Oct 06, 2010 at 03:58:34PM -0700, Ram Pai wrote:
> > > > > > > >         PCI: override BIOS/firmware memory resource allocation
> > > > > > > > 		through command line parameters
> > > > > > > > 
> > > > > > > > 	Platforms that are unaware of  SRIOV  BARs  fail to allocate MMIO
> > > > > > > > 	resources  to SRIOV PCIe  devices. Hence  on  such  platforms the
> > > > > > > > 	OS fails to  enable  SRIOV.
> > > > > > > > 	Some  platforms  where  BIOS/uEFI resource   allocations  conflict
> > > > > > > > 	the conflicting devices are disabled.
> > > > > > > > 
> > > > > > > > 	Ideally we  would  want  the  OS  to detect and fix automatically
> > > > > > > > 	such problems and conflicts.  However previous  attempts to do so
> > > > > > > > 	have led to regression on legacy platforms.
> > > > > > > 
> > > > > > > I'm sorry to be a nay-sayer, but I think we just haven't tried hard
> > > > > > > enough.  Our ACPI/PCI/e820 resource management is not well integrated,
> > > > > > > and I suspect if we straightened that out, we could avoid some of the
> > > > > > > regressions we saw with previous attempts.
> > > > > > 
> > > > > > Can you be more specific as to what can be done to fix it automatically?
> > > > > > 
> > > > > > Neither accepting this approach nor telling what needs to be straightened out
> > > > > > to automatically fix all the systems out there, is just a deadend.
> > > > > 
> > > > > Yeah, I guess that wasn't really fair, sorry.  And keep in mind that I'm
> > > > > not the PCI maintainer, so these are just my opinions, nothing like an
> > > > > official "nack."
> > > > > 
> > > > > I did look at this dmesg log from the thread you referenced:
> > > > >     http://marc.info/?l=linux-kernel&m=127178918128740&w=2
> > > > > but it looks to me like we just completely botched it.  I don't see an
> > > > > SRIOV device or anything else that didn't have resources, so as far as I
> > > > > can tell, we started with working resource assignments from the BIOS,
> > > > > threw them away, and started over from scratch.  We failed because we
> > > > > tried to assign I/O port space to bridges with nothing behind them, and
> > > > > there was nothing left by the time we got to the 0000:09:04.0 device
> > > > > that actually *did* need the space.
> > > > 
> > > > hmm.. is that possible? Yinghai's patch sized the resource requirement of each
> > > > of the bridges, before actually allocating them. Which means a bridge with
> > > > no device behind it would not get any i/o space.
> > > 
> > > Here's what I see in the dmesg log referenced above:
> > > 
> > >     ACPI: PCI Root Bridge [PCI0] (0000:00)
> > >     pci_root PNP0A08:00: host bridge window [io 0x0000-0x0cf7]
> > >     pci_root PNP0A08:00: host bridge window [io 0x0d00-0xffff]
> > >     pci 0000:00:1c.0: PCI bridge to [bus 04-09]
> > >     pci 0000:00:1c.0:   bridge window [io 0xd000-0xdfff]
> > >     pci 0000:04:00.0: PCI bridge to [bus 05-09]
> > >     pci 0000:04:00.0:   bridge window [io 0xd000-0xdfff]
> > >     pci 0000:05:01.0: PCI bridge to [bus 08-09]
> > >     pci 0000:05:01.0:   bridge window [io 0xd000-0xdfff]
> > >     pci 0000:08:00.0: PCI bridge to [bus 09-09]
> > >     pci 0000:08:00.0:   bridge window [io 0xd000-0xdfff]
> > >     pci 0000:09:04.0: found [13f6:8788] class 000401 header type 00
> > >     pci 0000:09:04.0: reg 10: [io  0xd800-0xd8ff]
> > >     pci 0000:05:02.0: PCI bridge to [bus 07-07]
> > >     pci 0000:05:02.0:   bridge window [io 0xf000-0x0000] (disabled)
> > >     pci 0000:05:03.0: PCI bridge to [bus 06-06]
> > >     pci 0000:05:03.0:   bridge window [io 0xf000-0x0000] (disabled)
> > >     
> > > The above is the state as we got it from BIOS.  Despite all the bridges,
> > > 09:04.0 is the only device below the 00:1c.0 bridge, and it requires only
> > > 0x100 I/O ports.
> > > 
> > > There are no devices on buses 06 (below 05:03.0) or 07 (below 05:02.0).
> > > 
> > > I didn't look at Yinghai's patch to figure out *why*, but it sure looks like
> > > we released the 09:04.0 space, then tried to assign 0x2000 ports to 05:01.0 
> > > (which needs 0x100 and had 0x1000 originally), 0x1000 to 05:02.0 (which needs
> > > none), and 0x1000 to 05:03.0 (which also needs none):
> > > 
> > >     PCI: No. 3 try to assign unassigned res
> > >     release child resource [io  0xd800-0xd8ff]
> > >     pci 0000:08:00.0: resource 7 [io 0xd000-0xdfff] released
> > >     pci 0000:04:00.0: BAR 7: can't assign io (size 0x4000)
> > >     pci 0000:05:01.0: BAR 7: can't assign io (size 0x2000)
> > >     pci 0000:05:02.0: BAR 7: can't assign io (size 0x1000)
> > >     pci 0000:05:03.0: BAR 7: can't assign io (size 0x1000)
> > >     pci 0000:08:00.0: BAR 7: can't assign io (size 0x1000)
> > 
> > Actually the message preceeding to them  are even more surprising:
> > 
> > Apr 20 20:31:42 [kernel] pci 0000:04:00.0: BAR 8: can't assign mem (size
> > 0xc00000)
> > Apr 20 20:31:42 [kernel] pci 0000:05:01.0: BAR 8: can't assign mem (size
> > 0x200000)
> > Apr 20 20:31:42 [kernel] pci 0000:05:01.0: BAR 9: can't assign mem pref
> > (size 0x200000)
> > Apr 20 20:31:42 [kernel] pci 0000:05:02.0: BAR 8: can't assign mem (size
> > 0x400000)
> > Apr 20 20:31:42 [kernel] pci 0000:05:03.0: BAR 7: can't assign io (size
> > 0x1000)
> > 
> > Do these bridges have IOV BARs and those BARs are demanding i/o resources?
> > Something is really funny with this machine. Or I am reading this wrong?
> 
> I don't see anything strange about the machine, but I don't understand
> what Yinghai's patch was doing.  The lspci output is here:
>     http://marc.info/?l=linux-pci&m=127184080929189&w=2
> 
> I don't see any SRIOV devices, and this kernel wasn't built with IOV
> support anyway.  The messages above are attempts to assign resources
> for bridge windows.
> 
> But the only real device in the whole hierarchy behind 00:1c.0
> is 09:04.0, which doesn't require memory space, so I have no idea
> why we're trying to open memory windows.

I dont understand the code much. But there is some alignment 
constraints on the memory and io space dictated by the PCI code in 
pbus_size_io() and pbus_size_mem(), a constraint that ends up 
associating a minimum of 4096 bytes io window and 
nearly about 2MB mem window to a bridge irrespective of the 
requirements of the devices behind the bridge. That sounds like a
bug to me. But I don't know if those are valid constraints either.
Any idea?

> 
> This is what I meant about "we haven't tried hard enough yet."  I'd
> rather spend time fixing the problems like this, than just putting the
> buggy code in and adding a kernel parameter to enable it.  That
> guarantees that we'll never get enough testing to really shake it
> out.

Ok. I am no fan of kernel parameters either. However I am afraid 
we will attract furious looks as and when someone regresses. 

RP

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v2 PATCH 1/1] PCI: override BIOS/firmware resource allocation
  2010-10-12  7:05               ` Ram Pai
@ 2010-10-12 19:01                 ` Bjorn Helgaas
  2010-10-18 20:10                   ` Jesse Barnes
  0 siblings, 1 reply; 32+ messages in thread
From: Bjorn Helgaas @ 2010-10-12 19:01 UTC (permalink / raw)
  To: Ram Pai
  Cc: linux-pci, linux-kernel, clemens, Yinghai Lu, Jesse Barnes,
	Linus Torvalds

On Tuesday, October 12, 2010 01:05:15 am Ram Pai wrote:
> On Fri, Oct 08, 2010 at 02:16:57PM -0600, Bjorn Helgaas wrote:
> > On Friday, October 08, 2010 11:32:24 am Ram Pai wrote:
> > > On Thu, Oct 07, 2010 at 03:41:04PM -0600, Bjorn Helgaas wrote:
> > > > On Thursday, October 07, 2010 02:42:13 pm Ram Pai wrote:
> > > > > On Wed, Oct 06, 2010 at 10:13:02PM -0600, Bjorn Helgaas wrote:
> > > > > > On Wed, Oct 06, 2010 at 05:30:41PM -0700, Ram Pai wrote:
> > > > > > > On Wed, Oct 06, 2010 at 05:39:53PM -0600, Bjorn Helgaas wrote:
> > > > > > > > On Wed, Oct 06, 2010 at 03:58:34PM -0700, Ram Pai wrote:
> > > > > > > > >         PCI: override BIOS/firmware memory resource allocation
> > > > > > > > > 		through command line parameters
> > > > > > > > > 
> > > > > > > > > 	Platforms that are unaware of  SRIOV  BARs  fail to allocate MMIO
> > > > > > > > > 	resources  to SRIOV PCIe  devices. Hence  on  such  platforms the
> > > > > > > > > 	OS fails to  enable  SRIOV.
> > > > > > > > > 	Some  platforms  where  BIOS/uEFI resource   allocations  conflict
> > > > > > > > > 	the conflicting devices are disabled.
> > > > > > > > > 
> > > > > > > > > 	Ideally we  would  want  the  OS  to detect and fix automatically
> > > > > > > > > 	such problems and conflicts.  However previous  attempts to do so
> > > > > > > > > 	have led to regression on legacy platforms.
> > > > > > > > 
> > > > > > > > I'm sorry to be a nay-sayer, but I think we just haven't tried hard
> > > > > > > > enough.  Our ACPI/PCI/e820 resource management is not well integrated,
> > > > > > > > and I suspect if we straightened that out, we could avoid some of the
> > > > > > > > regressions we saw with previous attempts.
> > > > > > > 
> > > > > > > Can you be more specific as to what can be done to fix it automatically?
> > > > > > > 
> > > > > > > Neither accepting this approach nor telling what needs to be straightened out
> > > > > > > to automatically fix all the systems out there, is just a deadend.
> > > > > > 
> > > > > > Yeah, I guess that wasn't really fair, sorry.  And keep in mind that I'm
> > > > > > not the PCI maintainer, so these are just my opinions, nothing like an
> > > > > > official "nack."
> > > > > > 
> > > > > > I did look at this dmesg log from the thread you referenced:
> > > > > >     http://marc.info/?l=linux-kernel&m=127178918128740&w=2
> > > > > > but it looks to me like we just completely botched it.  I don't see an
> > > > > > SRIOV device or anything else that didn't have resources, so as far as I
> > > > > > can tell, we started with working resource assignments from the BIOS,
> > > > > > threw them away, and started over from scratch.  We failed because we
> > > > > > tried to assign I/O port space to bridges with nothing behind them, and
> > > > > > there was nothing left by the time we got to the 0000:09:04.0 device
> > > > > > that actually *did* need the space.
> > > > > 
> > > > > hmm.. is that possible? Yinghai's patch sized the resource requirement of each
> > > > > of the bridges, before actually allocating them. Which means a bridge with
> > > > > no device behind it would not get any i/o space.
> > > > 
> > > > Here's what I see in the dmesg log referenced above:
> > > > 
> > > >     ACPI: PCI Root Bridge [PCI0] (0000:00)
> > > >     pci_root PNP0A08:00: host bridge window [io 0x0000-0x0cf7]
> > > >     pci_root PNP0A08:00: host bridge window [io 0x0d00-0xffff]
> > > >     pci 0000:00:1c.0: PCI bridge to [bus 04-09]
> > > >     pci 0000:00:1c.0:   bridge window [io 0xd000-0xdfff]
> > > >     pci 0000:04:00.0: PCI bridge to [bus 05-09]
> > > >     pci 0000:04:00.0:   bridge window [io 0xd000-0xdfff]
> > > >     pci 0000:05:01.0: PCI bridge to [bus 08-09]
> > > >     pci 0000:05:01.0:   bridge window [io 0xd000-0xdfff]
> > > >     pci 0000:08:00.0: PCI bridge to [bus 09-09]
> > > >     pci 0000:08:00.0:   bridge window [io 0xd000-0xdfff]
> > > >     pci 0000:09:04.0: found [13f6:8788] class 000401 header type 00
> > > >     pci 0000:09:04.0: reg 10: [io  0xd800-0xd8ff]
> > > >     pci 0000:05:02.0: PCI bridge to [bus 07-07]
> > > >     pci 0000:05:02.0:   bridge window [io 0xf000-0x0000] (disabled)
> > > >     pci 0000:05:03.0: PCI bridge to [bus 06-06]
> > > >     pci 0000:05:03.0:   bridge window [io 0xf000-0x0000] (disabled)
> > > >     
> > > > The above is the state as we got it from BIOS.  Despite all the bridges,
> > > > 09:04.0 is the only device below the 00:1c.0 bridge, and it requires only
> > > > 0x100 I/O ports.
> > > > 
> > > > There are no devices on buses 06 (below 05:03.0) or 07 (below 05:02.0).
> > > > 
> > > > I didn't look at Yinghai's patch to figure out *why*, but it sure looks like
> > > > we released the 09:04.0 space, then tried to assign 0x2000 ports to 05:01.0 
> > > > (which needs 0x100 and had 0x1000 originally), 0x1000 to 05:02.0 (which needs
> > > > none), and 0x1000 to 05:03.0 (which also needs none):
> > > > 
> > > >     PCI: No. 3 try to assign unassigned res
> > > >     release child resource [io  0xd800-0xd8ff]
> > > >     pci 0000:08:00.0: resource 7 [io 0xd000-0xdfff] released
> > > >     pci 0000:04:00.0: BAR 7: can't assign io (size 0x4000)
> > > >     pci 0000:05:01.0: BAR 7: can't assign io (size 0x2000)
> > > >     pci 0000:05:02.0: BAR 7: can't assign io (size 0x1000)
> > > >     pci 0000:05:03.0: BAR 7: can't assign io (size 0x1000)
> > > >     pci 0000:08:00.0: BAR 7: can't assign io (size 0x1000)
> > > 
> > > Actually the message preceeding to them  are even more surprising:
> > > 
> > > Apr 20 20:31:42 [kernel] pci 0000:04:00.0: BAR 8: can't assign mem (size
> > > 0xc00000)
> > > Apr 20 20:31:42 [kernel] pci 0000:05:01.0: BAR 8: can't assign mem (size
> > > 0x200000)
> > > Apr 20 20:31:42 [kernel] pci 0000:05:01.0: BAR 9: can't assign mem pref
> > > (size 0x200000)
> > > Apr 20 20:31:42 [kernel] pci 0000:05:02.0: BAR 8: can't assign mem (size
> > > 0x400000)
> > > Apr 20 20:31:42 [kernel] pci 0000:05:03.0: BAR 7: can't assign io (size
> > > 0x1000)
> > > 
> > > Do these bridges have IOV BARs and those BARs are demanding i/o resources?
> > > Something is really funny with this machine. Or I am reading this wrong?
> > 
> > I don't see anything strange about the machine, but I don't understand
> > what Yinghai's patch was doing.  The lspci output is here:
> >     http://marc.info/?l=linux-pci&m=127184080929189&w=2
> > 
> > I don't see any SRIOV devices, and this kernel wasn't built with IOV
> > support anyway.  The messages above are attempts to assign resources
> > for bridge windows.
> > 
> > But the only real device in the whole hierarchy behind 00:1c.0
> > is 09:04.0, which doesn't require memory space, so I have no idea
> > why we're trying to open memory windows.
> 
> I dont understand the code much. But there is some alignment 
> constraints on the memory and io space dictated by the PCI code in 
> pbus_size_io() and pbus_size_mem(), a constraint that ends up 
> associating a minimum of 4096 bytes io window and 
> nearly about 2MB mem window to a bridge irrespective of the 
> requirements of the devices behind the bridge. That sounds like a
> bug to me. But I don't know if those are valid constraints either.
> Any idea?

I think memory windows comes in 1MB chunks and I/O in 4096 port chunks.
There are some bridges that support smaller I/O windows (search for
"en1k" in drivers/pci/quirks.c), but it looks like we don't take that
into account when allocating space.

And of course, there's no requirement to enable the window at all if
we don't have any downstream devices.

> > This is what I meant about "we haven't tried hard enough yet."  I'd
> > rather spend time fixing the problems like this, than just putting the
> > buggy code in and adding a kernel parameter to enable it.  That
> > guarantees that we'll never get enough testing to really shake it
> > out.
> 
> Ok. I am no fan of kernel parameters either. However I am afraid 
> we will attract furious looks as and when someone regresses. 

Yep, I'm used to that :-)

Bjorn

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v2 PATCH 1/1] PCI: override BIOS/firmware resource allocation
  2010-10-12 19:01                 ` Bjorn Helgaas
@ 2010-10-18 20:10                   ` Jesse Barnes
  2010-10-19 17:17                     ` Ram Pai
  0 siblings, 1 reply; 32+ messages in thread
From: Jesse Barnes @ 2010-10-18 20:10 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: Ram Pai, linux-pci, linux-kernel, clemens, Yinghai Lu, Linus Torvalds

On Tue, 12 Oct 2010 13:01:54 -0600
Bjorn Helgaas <bjorn.helgaas@hp.com> wrote:

> On Tuesday, October 12, 2010 01:05:15 am Ram Pai wrote:
> > On Fri, Oct 08, 2010 at 02:16:57PM -0600, Bjorn Helgaas wrote:
> > > On Friday, October 08, 2010 11:32:24 am Ram Pai wrote:
> > > > On Thu, Oct 07, 2010 at 03:41:04PM -0600, Bjorn Helgaas wrote:
> > > > > On Thursday, October 07, 2010 02:42:13 pm Ram Pai wrote:
> > > > > > On Wed, Oct 06, 2010 at 10:13:02PM -0600, Bjorn Helgaas wrote:
> > > > > > > On Wed, Oct 06, 2010 at 05:30:41PM -0700, Ram Pai wrote:
> > > > > > > > On Wed, Oct 06, 2010 at 05:39:53PM -0600, Bjorn Helgaas wrote:
> > > > > > > > > On Wed, Oct 06, 2010 at 03:58:34PM -0700, Ram Pai wrote:
> > > > > > > > > >         PCI: override BIOS/firmware memory resource allocation
> > > > > > > > > > 		through command line parameters
> > > > > > > > > > 
> > > > > > > > > > 	Platforms that are unaware of  SRIOV  BARs  fail to allocate MMIO
> > > > > > > > > > 	resources  to SRIOV PCIe  devices. Hence  on  such  platforms the
> > > > > > > > > > 	OS fails to  enable  SRIOV.
> > > > > > > > > > 	Some  platforms  where  BIOS/uEFI resource   allocations  conflict
> > > > > > > > > > 	the conflicting devices are disabled.
> > > > > > > > > > 
> > > > > > > > > > 	Ideally we  would  want  the  OS  to detect and fix automatically
> > > > > > > > > > 	such problems and conflicts.  However previous  attempts to do so
> > > > > > > > > > 	have led to regression on legacy platforms.
> > > > > > > > > 
> > > > > > > > > I'm sorry to be a nay-sayer, but I think we just haven't tried hard
> > > > > > > > > enough.  Our ACPI/PCI/e820 resource management is not well integrated,
> > > > > > > > > and I suspect if we straightened that out, we could avoid some of the
> > > > > > > > > regressions we saw with previous attempts.
> > > > > > > > 
> > > > > > > > Can you be more specific as to what can be done to fix it automatically?
> > > > > > > > 
> > > > > > > > Neither accepting this approach nor telling what needs to be straightened out
> > > > > > > > to automatically fix all the systems out there, is just a deadend.
> > > > > > > 
> > > > > > > Yeah, I guess that wasn't really fair, sorry.  And keep in mind that I'm
> > > > > > > not the PCI maintainer, so these are just my opinions, nothing like an
> > > > > > > official "nack."
> > > > > > > 
> > > > > > > I did look at this dmesg log from the thread you referenced:
> > > > > > >     http://marc.info/?l=linux-kernel&m=127178918128740&w=2
> > > > > > > but it looks to me like we just completely botched it.  I don't see an
> > > > > > > SRIOV device or anything else that didn't have resources, so as far as I
> > > > > > > can tell, we started with working resource assignments from the BIOS,
> > > > > > > threw them away, and started over from scratch.  We failed because we
> > > > > > > tried to assign I/O port space to bridges with nothing behind them, and
> > > > > > > there was nothing left by the time we got to the 0000:09:04.0 device
> > > > > > > that actually *did* need the space.
> > > > > > 
> > > > > > hmm.. is that possible? Yinghai's patch sized the resource requirement of each
> > > > > > of the bridges, before actually allocating them. Which means a bridge with
> > > > > > no device behind it would not get any i/o space.
> > > > > 
> > > > > Here's what I see in the dmesg log referenced above:
> > > > > 
> > > > >     ACPI: PCI Root Bridge [PCI0] (0000:00)
> > > > >     pci_root PNP0A08:00: host bridge window [io 0x0000-0x0cf7]
> > > > >     pci_root PNP0A08:00: host bridge window [io 0x0d00-0xffff]
> > > > >     pci 0000:00:1c.0: PCI bridge to [bus 04-09]
> > > > >     pci 0000:00:1c.0:   bridge window [io 0xd000-0xdfff]
> > > > >     pci 0000:04:00.0: PCI bridge to [bus 05-09]
> > > > >     pci 0000:04:00.0:   bridge window [io 0xd000-0xdfff]
> > > > >     pci 0000:05:01.0: PCI bridge to [bus 08-09]
> > > > >     pci 0000:05:01.0:   bridge window [io 0xd000-0xdfff]
> > > > >     pci 0000:08:00.0: PCI bridge to [bus 09-09]
> > > > >     pci 0000:08:00.0:   bridge window [io 0xd000-0xdfff]
> > > > >     pci 0000:09:04.0: found [13f6:8788] class 000401 header type 00
> > > > >     pci 0000:09:04.0: reg 10: [io  0xd800-0xd8ff]
> > > > >     pci 0000:05:02.0: PCI bridge to [bus 07-07]
> > > > >     pci 0000:05:02.0:   bridge window [io 0xf000-0x0000] (disabled)
> > > > >     pci 0000:05:03.0: PCI bridge to [bus 06-06]
> > > > >     pci 0000:05:03.0:   bridge window [io 0xf000-0x0000] (disabled)
> > > > >     
> > > > > The above is the state as we got it from BIOS.  Despite all the bridges,
> > > > > 09:04.0 is the only device below the 00:1c.0 bridge, and it requires only
> > > > > 0x100 I/O ports.
> > > > > 
> > > > > There are no devices on buses 06 (below 05:03.0) or 07 (below 05:02.0).
> > > > > 
> > > > > I didn't look at Yinghai's patch to figure out *why*, but it sure looks like
> > > > > we released the 09:04.0 space, then tried to assign 0x2000 ports to 05:01.0 
> > > > > (which needs 0x100 and had 0x1000 originally), 0x1000 to 05:02.0 (which needs
> > > > > none), and 0x1000 to 05:03.0 (which also needs none):
> > > > > 
> > > > >     PCI: No. 3 try to assign unassigned res
> > > > >     release child resource [io  0xd800-0xd8ff]
> > > > >     pci 0000:08:00.0: resource 7 [io 0xd000-0xdfff] released
> > > > >     pci 0000:04:00.0: BAR 7: can't assign io (size 0x4000)
> > > > >     pci 0000:05:01.0: BAR 7: can't assign io (size 0x2000)
> > > > >     pci 0000:05:02.0: BAR 7: can't assign io (size 0x1000)
> > > > >     pci 0000:05:03.0: BAR 7: can't assign io (size 0x1000)
> > > > >     pci 0000:08:00.0: BAR 7: can't assign io (size 0x1000)
> > > > 
> > > > Actually the message preceeding to them  are even more surprising:
> > > > 
> > > > Apr 20 20:31:42 [kernel] pci 0000:04:00.0: BAR 8: can't assign mem (size
> > > > 0xc00000)
> > > > Apr 20 20:31:42 [kernel] pci 0000:05:01.0: BAR 8: can't assign mem (size
> > > > 0x200000)
> > > > Apr 20 20:31:42 [kernel] pci 0000:05:01.0: BAR 9: can't assign mem pref
> > > > (size 0x200000)
> > > > Apr 20 20:31:42 [kernel] pci 0000:05:02.0: BAR 8: can't assign mem (size
> > > > 0x400000)
> > > > Apr 20 20:31:42 [kernel] pci 0000:05:03.0: BAR 7: can't assign io (size
> > > > 0x1000)
> > > > 
> > > > Do these bridges have IOV BARs and those BARs are demanding i/o resources?
> > > > Something is really funny with this machine. Or I am reading this wrong?
> > > 
> > > I don't see anything strange about the machine, but I don't understand
> > > what Yinghai's patch was doing.  The lspci output is here:
> > >     http://marc.info/?l=linux-pci&m=127184080929189&w=2
> > > 
> > > I don't see any SRIOV devices, and this kernel wasn't built with IOV
> > > support anyway.  The messages above are attempts to assign resources
> > > for bridge windows.
> > > 
> > > But the only real device in the whole hierarchy behind 00:1c.0
> > > is 09:04.0, which doesn't require memory space, so I have no idea
> > > why we're trying to open memory windows.
> > 
> > I dont understand the code much. But there is some alignment 
> > constraints on the memory and io space dictated by the PCI code in 
> > pbus_size_io() and pbus_size_mem(), a constraint that ends up 
> > associating a minimum of 4096 bytes io window and 
> > nearly about 2MB mem window to a bridge irrespective of the 
> > requirements of the devices behind the bridge. That sounds like a
> > bug to me. But I don't know if those are valid constraints either.
> > Any idea?
> 
> I think memory windows comes in 1MB chunks and I/O in 4096 port chunks.
> There are some bridges that support smaller I/O windows (search for
> "en1k" in drivers/pci/quirks.c), but it looks like we don't take that
> into account when allocating space.
> 
> And of course, there's no requirement to enable the window at all if
> we don't have any downstream devices.
> 
> > > This is what I meant about "we haven't tried hard enough yet."  I'd
> > > rather spend time fixing the problems like this, than just putting the
> > > buggy code in and adding a kernel parameter to enable it.  That
> > > guarantees that we'll never get enough testing to really shake it
> > > out.
> > 
> > Ok. I am no fan of kernel parameters either. However I am afraid 
> > we will attract furious looks as and when someone regresses. 
> 
> Yep, I'm used to that :-)

So where do we stand with this machine's problem?

Ram, do you have other machines that require your override patch?

Until we understand what's failing and why, I'm hesitant to apply a
patch that will work around the problem but require an extra kernel
parameter.

Our general approach is to avoid overwriting the BIOS provided
configuration, and when we do, to use available e820 and PnP data to
figure out where to put things, in addition to allocating space in a
similar way to Windows to avoid hitting undiscovered platform bugs
(i.e. Bjorn's recent patchset).

So while I'm not totally opposed to this patch, I'd feel much better
about it if we really understood how the failing machines were failing
before we try to workaround potential BIOS limitations.

-- 
Jesse Barnes, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v2 PATCH 1/1] PCI: override BIOS/firmware resource allocation
  2010-10-18 20:10                   ` Jesse Barnes
@ 2010-10-19 17:17                     ` Ram Pai
  2010-10-19 18:24                       ` Jesse Barnes
  0 siblings, 1 reply; 32+ messages in thread
From: Ram Pai @ 2010-10-19 17:17 UTC (permalink / raw)
  To: Jesse Barnes
  Cc: Bjorn Helgaas, linux-pci, linux-kernel, clemens, Yinghai Lu,
	Linus Torvalds

On Mon, Oct 18, 2010 at 01:10:10PM -0700, Jesse Barnes wrote:
> On Tue, 12 Oct 2010 13:01:54 -0600
> Bjorn Helgaas <bjorn.helgaas@hp.com> wrote:
> 
> > On Tuesday, October 12, 2010 01:05:15 am Ram Pai wrote:
> > > On Fri, Oct 08, 2010 at 02:16:57PM -0600, Bjorn Helgaas wrote:
> > > > On Friday, October 08, 2010 11:32:24 am Ram Pai wrote:
> > > > > On Thu, Oct 07, 2010 at 03:41:04PM -0600, Bjorn Helgaas wrote:
> > > > > > On Thursday, October 07, 2010 02:42:13 pm Ram Pai wrote:
> > > > > > > On Wed, Oct 06, 2010 at 10:13:02PM -0600, Bjorn Helgaas wrote:
> > > > > > > > On Wed, Oct 06, 2010 at 05:30:41PM -0700, Ram Pai wrote:
> > > > > > > > > On Wed, Oct 06, 2010 at 05:39:53PM -0600, Bjorn Helgaas wrote:
> > > > > > > > > > On Wed, Oct 06, 2010 at 03:58:34PM -0700, Ram Pai wrote:
> > > > > > > > > > >         PCI: override BIOS/firmware memory resource allocation
> > > > > > > > > > > 		through command line parameters
> > > > > > > > > > > 
> > > > > > > > > > > 	Platforms that are unaware of  SRIOV  BARs  fail to allocate MMIO
> > > > > > > > > > > 	resources  to SRIOV PCIe  devices. Hence  on  such  platforms the
> > > > > > > > > > > 	OS fails to  enable  SRIOV.
> > > > > > > > > > > 	Some  platforms  where  BIOS/uEFI resource   allocations  conflict
> > > > > > > > > > > 	the conflicting devices are disabled.
> > > > > > > > > > > 
> > > > > > > > > > > 	Ideally we  would  want  the  OS  to detect and fix automatically
> > > > > > > > > > > 	such problems and conflicts.  However previous  attempts to do so
> > > > > > > > > > > 	have led to regression on legacy platforms.
> > > > > > > > > > 
> > > > > > > > > > I'm sorry to be a nay-sayer, but I think we just haven't tried hard
> > > > > > > > > > enough.  Our ACPI/PCI/e820 resource management is not well integrated,
> > > > > > > > > > and I suspect if we straightened that out, we could avoid some of the
> > > > > > > > > > regressions we saw with previous attempts.
> > > > > > > > > 
> > > > > > > > > Can you be more specific as to what can be done to fix it automatically?
> > > > > > > > > 
> > > > > > > > > Neither accepting this approach nor telling what needs to be straightened out
> > > > > > > > > to automatically fix all the systems out there, is just a deadend.
> > > > > > > > 
> > > > > > > > Yeah, I guess that wasn't really fair, sorry.  And keep in mind that I'm
> > > > > > > > not the PCI maintainer, so these are just my opinions, nothing like an
> > > > > > > > official "nack."
> > > > > > > > 
> > > > > > > > I did look at this dmesg log from the thread you referenced:
> > > > > > > >     http://marc.info/?l=linux-kernel&m=127178918128740&w=2
> > > > > > > > but it looks to me like we just completely botched it.  I don't see an
> > > > > > > > SRIOV device or anything else that didn't have resources, so as far as I
> > > > > > > > can tell, we started with working resource assignments from the BIOS,
> > > > > > > > threw them away, and started over from scratch.  We failed because we
> > > > > > > > tried to assign I/O port space to bridges with nothing behind them, and
> > > > > > > > there was nothing left by the time we got to the 0000:09:04.0 device
> > > > > > > > that actually *did* need the space.
> > > > > > > 
> > > > > > > hmm.. is that possible? Yinghai's patch sized the resource requirement of each
> > > > > > > of the bridges, before actually allocating them. Which means a bridge with
> > > > > > > no device behind it would not get any i/o space.
> > > > > > 
> > > > > > Here's what I see in the dmesg log referenced above:
> > > > > > 
> > > > > >     ACPI: PCI Root Bridge [PCI0] (0000:00)
> > > > > >     pci_root PNP0A08:00: host bridge window [io 0x0000-0x0cf7]
> > > > > >     pci_root PNP0A08:00: host bridge window [io 0x0d00-0xffff]
> > > > > >     pci 0000:00:1c.0: PCI bridge to [bus 04-09]
> > > > > >     pci 0000:00:1c.0:   bridge window [io 0xd000-0xdfff]
> > > > > >     pci 0000:04:00.0: PCI bridge to [bus 05-09]
> > > > > >     pci 0000:04:00.0:   bridge window [io 0xd000-0xdfff]
> > > > > >     pci 0000:05:01.0: PCI bridge to [bus 08-09]
> > > > > >     pci 0000:05:01.0:   bridge window [io 0xd000-0xdfff]
> > > > > >     pci 0000:08:00.0: PCI bridge to [bus 09-09]
> > > > > >     pci 0000:08:00.0:   bridge window [io 0xd000-0xdfff]
> > > > > >     pci 0000:09:04.0: found [13f6:8788] class 000401 header type 00
> > > > > >     pci 0000:09:04.0: reg 10: [io  0xd800-0xd8ff]
> > > > > >     pci 0000:05:02.0: PCI bridge to [bus 07-07]
> > > > > >     pci 0000:05:02.0:   bridge window [io 0xf000-0x0000] (disabled)
> > > > > >     pci 0000:05:03.0: PCI bridge to [bus 06-06]
> > > > > >     pci 0000:05:03.0:   bridge window [io 0xf000-0x0000] (disabled)
> > > > > >     
> > > > > > The above is the state as we got it from BIOS.  Despite all the bridges,
> > > > > > 09:04.0 is the only device below the 00:1c.0 bridge, and it requires only
> > > > > > 0x100 I/O ports.
> > > > > > 
> > > > > > There are no devices on buses 06 (below 05:03.0) or 07 (below 05:02.0).
> > > > > > 
> > > > > > I didn't look at Yinghai's patch to figure out *why*, but it sure looks like
> > > > > > we released the 09:04.0 space, then tried to assign 0x2000 ports to 05:01.0 
> > > > > > (which needs 0x100 and had 0x1000 originally), 0x1000 to 05:02.0 (which needs
> > > > > > none), and 0x1000 to 05:03.0 (which also needs none):
> > > > > > 
> > > > > >     PCI: No. 3 try to assign unassigned res
> > > > > >     release child resource [io  0xd800-0xd8ff]
> > > > > >     pci 0000:08:00.0: resource 7 [io 0xd000-0xdfff] released
> > > > > >     pci 0000:04:00.0: BAR 7: can't assign io (size 0x4000)
> > > > > >     pci 0000:05:01.0: BAR 7: can't assign io (size 0x2000)
> > > > > >     pci 0000:05:02.0: BAR 7: can't assign io (size 0x1000)
> > > > > >     pci 0000:05:03.0: BAR 7: can't assign io (size 0x1000)
> > > > > >     pci 0000:08:00.0: BAR 7: can't assign io (size 0x1000)
> > > > > 
> > > > > Actually the message preceeding to them  are even more surprising:
> > > > > 
> > > > > Apr 20 20:31:42 [kernel] pci 0000:04:00.0: BAR 8: can't assign mem (size
> > > > > 0xc00000)
> > > > > Apr 20 20:31:42 [kernel] pci 0000:05:01.0: BAR 8: can't assign mem (size
> > > > > 0x200000)
> > > > > Apr 20 20:31:42 [kernel] pci 0000:05:01.0: BAR 9: can't assign mem pref
> > > > > (size 0x200000)
> > > > > Apr 20 20:31:42 [kernel] pci 0000:05:02.0: BAR 8: can't assign mem (size
> > > > > 0x400000)
> > > > > Apr 20 20:31:42 [kernel] pci 0000:05:03.0: BAR 7: can't assign io (size
> > > > > 0x1000)
> > > > > 
> > > > > Do these bridges have IOV BARs and those BARs are demanding i/o resources?
> > > > > Something is really funny with this machine. Or I am reading this wrong?
> > > > 
> > > > I don't see anything strange about the machine, but I don't understand
> > > > what Yinghai's patch was doing.  The lspci output is here:
> > > >     http://marc.info/?l=linux-pci&m=127184080929189&w=2
> > > > 
> > > > I don't see any SRIOV devices, and this kernel wasn't built with IOV
> > > > support anyway.  The messages above are attempts to assign resources
> > > > for bridge windows.
> > > > 
> > > > But the only real device in the whole hierarchy behind 00:1c.0
> > > > is 09:04.0, which doesn't require memory space, so I have no idea
> > > > why we're trying to open memory windows.
> > > 
> > > I dont understand the code much. But there is some alignment 
> > > constraints on the memory and io space dictated by the PCI code in 
> > > pbus_size_io() and pbus_size_mem(), a constraint that ends up 
> > > associating a minimum of 4096 bytes io window and 
> > > nearly about 2MB mem window to a bridge irrespective of the 
> > > requirements of the devices behind the bridge. That sounds like a
> > > bug to me. But I don't know if those are valid constraints either.
> > > Any idea?
> > 
> > I think memory windows comes in 1MB chunks and I/O in 4096 port chunks.
> > There are some bridges that support smaller I/O windows (search for
> > "en1k" in drivers/pci/quirks.c), but it looks like we don't take that
> > into account when allocating space.
> > 
> > And of course, there's no requirement to enable the window at all if
> > we don't have any downstream devices.
> > 
> > > > This is what I meant about "we haven't tried hard enough yet."  I'd
> > > > rather spend time fixing the problems like this, than just putting the
> > > > buggy code in and adding a kernel parameter to enable it.  That
> > > > guarantees that we'll never get enough testing to really shake it
> > > > out.
> > > 
> > > Ok. I am no fan of kernel parameters either. However I am afraid 
> > > we will attract furious looks as and when someone regresses. 
> > 
> > Yep, I'm used to that :-)
> 
> So where do we stand with this machine's problem?

I think, this machine with the latest mainline kernel, will see memory resource
allocation failure messages. Since the latest kernel does not release and retry
to allocate resources, the io resources allocated by the BIOS continue to stay
put and hence the problem is masked.

However, any attempt to release and reallocate the resource on that machine
will fail; because as pointed out by Bjorn, there is some weird allocation
behavior in the current code. Unfortunately I cannot trigger that behavior on
any of my machines. 

I have requested data from Peter, who originally reported the problem. 
Hope he still has his setup with the Xonar card available for debugging.

Anyway, I do see a smoking gun in pbus_size_io() and pbus_size_mem().  They
call resource_size() to find the size requirement of resources of all devices
behind the bridge.  However for resources whose start and size are set to zero,
resource_size() returns one. Later ALIGN() rounds it up to the next higher
alignment boundary.

> 
> Ram, do you have other machines that require your override patch?

Yes, I have a couple of machines whose BIOS is unaware of SRIOV resources,
These machines need the override patch. :(


> 
> Until we understand what's failing and why, I'm hesitant to apply a
> patch that will work around the problem but require an extra kernel
> parameter.

We have already come a full circle here.  The original approach was
reverted because it regressed a platform. Now we are rejecting this
approach because we want the original approach. 

> 
> Our general approach is to avoid overwriting the BIOS provided
> configuration, and when we do, to use available e820 and PnP data to
> figure out where to put things, in addition to allocating space in a
> similar way to Windows to avoid hitting undiscovered platform bugs
> (i.e. Bjorn's recent patchset).
> 
> So while I'm not totally opposed to this patch, I'd feel much better
> about it if we really understood how the failing machines were failing
> before we try to workaround potential BIOS limitations.

Ok.
RP

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v2 PATCH 1/1] PCI: override BIOS/firmware resource allocation
  2010-10-19 17:17                     ` Ram Pai
@ 2010-10-19 18:24                       ` Jesse Barnes
  2010-10-22  0:28                         ` Ram Pai
  2010-10-22 17:16                         ` [PATCH 1/1] PCI: ignore failure to preallocate minimal resources to hotplug bridges Ram Pai
  0 siblings, 2 replies; 32+ messages in thread
From: Jesse Barnes @ 2010-10-19 18:24 UTC (permalink / raw)
  To: Ram Pai
  Cc: Bjorn Helgaas, linux-pci, linux-kernel, clemens, Yinghai Lu,
	Linus Torvalds

On Tue, 19 Oct 2010 10:17:40 -0700
Ram Pai <linuxram@us.ibm.com> wrote:
> > So where do we stand with this machine's problem?
> 
> I think, this machine with the latest mainline kernel, will see memory resource
> allocation failure messages. Since the latest kernel does not release and retry
> to allocate resources, the io resources allocated by the BIOS continue to stay
> put and hence the problem is masked.
> 
> However, any attempt to release and reallocate the resource on that machine
> will fail; because as pointed out by Bjorn, there is some weird allocation
> behavior in the current code. Unfortunately I cannot trigger that behavior on
> any of my machines. 
> 
> I have requested data from Peter, who originally reported the problem. 
> Hope he still has his setup with the Xonar card available for debugging.
> 
> Anyway, I do see a smoking gun in pbus_size_io() and pbus_size_mem().  They
> call resource_size() to find the size requirement of resources of all devices
> behind the bridge.  However for resources whose start and size are set to zero,
> resource_size() returns one. Later ALIGN() rounds it up to the next higher
> alignment boundary.

Right, if there are no devices with actual sizes behind a given bridge
window we shouldn't bother to allocate space (that may mean
re-allocation later if a device is added, but that needs extra work
anyway).

And like Bjorn said, I/O sizing has special requirements for PCI-PCI
bridges, but for others we may be able to make the windows smaller.

> > Ram, do you have other machines that require your override patch?
> 
> Yes, I have a couple of machines whose BIOS is unaware of SRIOV resources,
> These machines need the override patch. :(

Ok, but hopefully we can make those machines work without extra kernel
options; at worst maybe we can special case SRIOV resources and cause
them to trigger more aggressive reallocation.

> > Until we understand what's failing and why, I'm hesitant to apply a
> > patch that will work around the problem but require an extra kernel
> > parameter.
> 
> We have already come a full circle here.  The original approach was
> reverted because it regressed a platform. Now we are rejecting this
> approach because we want the original approach. 

Well the original approach had several problems:
  - unclear try= parameter
  - undocumented and ad-hoc reallocation behavior
  - poor (well, lack of) overall design

The root of the issue is still that we have poor data on where we're
allowed to put device resources.  Bjorn has been improving this, along
with changing the way we do allocations so as to avoid problem areas,
but ultimately this is the area where we need the most work.

We've tried and failed to add chipset specific drivers to give us safe
ranges, but those just can't keep up with the number of platforms and
variations out there.  On x86, I think the only reasonable approach is
to use the platforms as designed, i.e. use the resources Windows uses
and in the way Windows uses them.  Anything else just means we'll be
playing catch up.

As special cases arise (i.e. ways we use the platform that depart from
its original design and Windows version), as I suspect this SRIOV issue
is, we may need to apply additional conditions.  But I'd like to avoid
that if at all possible.

So on that note, does Windows on these machines support allocation of
SRIOV resources?  If so, how is it handled?  Which resource ranges are
used for the extra BARs?

-- 
Jesse Barnes, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v2 PATCH 1/1] PCI: override BIOS/firmware resource allocation
  2010-10-19 18:24                       ` Jesse Barnes
@ 2010-10-22  0:28                         ` Ram Pai
  2010-10-22 17:55                           ` Bjorn Helgaas
  2010-10-22 17:16                         ` [PATCH 1/1] PCI: ignore failure to preallocate minimal resources to hotplug bridges Ram Pai
  1 sibling, 1 reply; 32+ messages in thread
From: Ram Pai @ 2010-10-22  0:28 UTC (permalink / raw)
  To: Jesse Barnes
  Cc: Bjorn Helgaas, linux-pci, linux-kernel, clemens, Yinghai Lu,
	Linus Torvalds

On Tue, Oct 19, 2010 at 11:24:39AM -0700, Jesse Barnes wrote:
> On Tue, 19 Oct 2010 10:17:40 -0700
> Ram Pai <linuxram@us.ibm.com> wrote:
> > > So where do we stand with this machine's problem?
> > 
> > I think, this machine with the latest mainline kernel, will see memory resource
> > allocation failure messages. Since the latest kernel does not release and retry
> > to allocate resources, the io resources allocated by the BIOS continue to stay
> > put and hence the problem is masked.
> > 
> > However, any attempt to release and reallocate the resource on that machine
> > will fail; because as pointed out by Bjorn, there is some weird allocation
> > behavior in the current code. Unfortunately I cannot trigger that behavior on
> > any of my machines. 
> > 
> > I have requested data from Peter, who originally reported the problem. 
> > Hope he still has his setup with the Xonar card available for debugging.
> > 
> > Anyway, I do see a smoking gun in pbus_size_io() and pbus_size_mem().  They
> > call resource_size() to find the size requirement of resources of all devices
> > behind the bridge.  However for resources whose start and size are set to zero,
> > resource_size() returns one. Later ALIGN() rounds it up to the next higher
> > alignment boundary.
> 
> Right, if there are no devices with actual sizes behind a given bridge
> window we shouldn't bother to allocate space (that may mean
> re-allocation later if a device is added, but that needs extra work
> anyway).
> 
> And like Bjorn said, I/O sizing has special requirements for PCI-PCI
> bridges, but for others we may be able to make the windows smaller.

Ok. After further investigation, I find that the BIOS has not allocated any
resource to hotplug bridges that have no devices behind them. However
the OS tries to allocate some minimal resources, 4096 I/O ports and 2M mem
window, to the these hotplug bridges but fails because there are not enough
resources to satisfy all the hotplug bridges.

This issue exists even today with the latest mainline kernel but is masked
because no devices are really effected. However Yinghai's reallocation patch
exposed the issue, since it released the BIOS allocated window and tried to
reallocate. Unfortunately it ended up allocating in the wrong order. The
bridges with real devices got no resources, where as the hotplug bridges with
no devices got some mimimal resources.

The details are captured at: https://bugzilla.kernel.org/show_bug.cgi?id=15960

I suppose the solution is to not pre-allocate resources to hotplug bridges that
have no devices behind them, and then bring back Yinghai's reallocation patch.  

Do you agree with the approach?

My concern is some, including Linus, might consider this as plugging yet
another hole, with no gaurantees that it will not-regress any other platform.

I hope Linus is listening.

> 
> > > Ram, do you have other machines that require your override patch?
> > 
> > Yes, I have a couple of machines whose BIOS is unaware of SRIOV resources,
> > These machines need the override patch. :(
> 
> Ok, but hopefully we can make those machines work without extra kernel
> options; at worst maybe we can special case SRIOV resources and cause
> them to trigger more aggressive reallocation.
> 
> > > Until we understand what's failing and why, I'm hesitant to apply a
> > > patch that will work around the problem but require an extra kernel
> > > parameter.
> > 
> > We have already come a full circle here.  The original approach was
> > reverted because it regressed a platform. Now we are rejecting this
> > approach because we want the original approach. 
> 
> Well the original approach had several problems:
>   - unclear try= parameter
>   - undocumented and ad-hoc reallocation behavior
>   - poor (well, lack of) overall design
> 
> The root of the issue is still that we have poor data on where we're
> allowed to put device resources.  Bjorn has been improving this, along
> with changing the way we do allocations so as to avoid problem areas,
> but ultimately this is the area where we need the most work.
> 
> We've tried and failed to add chipset specific drivers to give us safe
> ranges, but those just can't keep up with the number of platforms and
> variations out there.  On x86, I think the only reasonable approach is
> to use the platforms as designed, i.e. use the resources Windows uses
> and in the way Windows uses them.  Anything else just means we'll be
> playing catch up.
> 
> As special cases arise (i.e. ways we use the platform that depart from
> its original design and Windows version), as I suspect this SRIOV issue
> is, we may need to apply additional conditions.  But I'd like to avoid
> that if at all possible.
> 
> So on that note, does Windows on these machines support allocation of
> SRIOV resources?  If so, how is it handled?  Which resource ranges are
> used for the extra BARs?

No. I have not tried windows on these boxes. But last I heard windows
did not support SRIOV. Does it?

RP

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [PATCH 1/1] PCI: ignore failure to preallocate minimal resources to hotplug bridges
  2010-10-19 18:24                       ` Jesse Barnes
  2010-10-22  0:28                         ` Ram Pai
@ 2010-10-22 17:16                         ` Ram Pai
  2010-10-22 22:16                           ` Bjorn Helgaas
  2011-01-07 22:32                           ` Jesse Barnes
  1 sibling, 2 replies; 32+ messages in thread
From: Ram Pai @ 2010-10-22 17:16 UTC (permalink / raw)
  To: Jesse Barnes
  Cc: Bjorn Helgaas, linux-pci, linux-kernel, clemens, Yinghai Lu,
	Linus Torvalds, peter.henriksson, ebiederm

        PCI: ignore failure to preallocate minimal resources to 
		hotplug bridges

	Linux tries to pre-allocate minimal resources to hotplug 
	bridges.  This 	works 	fine as long as there are enough 
	resources  to  satisfy   all   other   genuine  resource
	requirements.  However  if  enough  resources   are  not 
	available    to    satisfy    the    pre-allocation, the 
	resource-allocator reports errors and returns failure.
	
	This patch distinguishes between must-need resources and
	nice-to-have   resources.   Any   failure   to  allocate 
	nice-to-have resources are ignored. 

	This  behavior  can  be  particularly  useful to trigger 
	automatic  reallocation, if  the  OS  discovers  genuine 
	resource  allocation  conflicts  or  genuine unallocated 
	BARs  caused  by not-so-smart allocation behavior of the 
	native BIOS.

	The motivation for this patch is due a issue reported in 
	https://bugzilla.kernel.org/show_bug.cgi?id=15960

Signed-off-by: Ram Pai <linuxram@us.ibm.com>

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 66cb8f4..3bbc427 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -412,14 +412,14 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size)
 {
 	struct pci_dev *dev;
 	struct resource *b_res = find_free_bus_resource(bus, IORESOURCE_IO);
-	unsigned long size = 0, size1 = 0, old_size;
+	unsigned long size = 0, size1 = 0, old_size, dev_present=0;
 
 	if (!b_res)
  		return;
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
 		int i;
-
+		dev_present=1;
 		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
 			struct resource *r = &dev->resource[i];
 			unsigned long r_size;
@@ -460,6 +460,12 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size)
 	b_res->start = 4096;
 	b_res->end = b_res->start + size - 1;
 	b_res->flags |= IORESOURCE_STARTALIGN;
+
+	/* if no devices are behind this bus, inform
+	 * resource-allocater to try hard but not to worry
+	 * if allocations fail 
+	 */
+	b_res->flags |= (!dev_present) ? IORESOURCE_IGNORE_FAIL : 0;
 }
 
 /* Calculate the size of the bus and minimal alignment which
@@ -470,7 +476,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
 	struct pci_dev *dev;
 	resource_size_t min_align, align, size, old_size;
 	resource_size_t aligns[12];	/* Alignments from 1Mb to 2Gb */
-	int order, max_order;
+	int order, max_order, dev_present=0;
 	struct resource *b_res = find_free_bus_resource(bus, type);
 	unsigned int mem64_mask = 0;
 
@@ -487,6 +493,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
 	list_for_each_entry(dev, &bus->devices, bus_list) {
 		int i;
 
+		dev_present=1;
 		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
 			struct resource *r = &dev->resource[i];
 			resource_size_t r_size;
@@ -550,6 +557,13 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
 	b_res->end = size + min_align - 1;
 	b_res->flags |= IORESOURCE_STARTALIGN;
 	b_res->flags |= mem64_mask;
+
+	/* if no devices are behind this bus, inform
+	 * resource-allocater to try hard but not to worry
+	 * if allocations fail 
+	 */
+	b_res->flags |= (!dev_present) ? IORESOURCE_IGNORE_FAIL : 0;
+
 	return 1;
 }
 
diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c
index 2aaa131..a8ce953 100644
--- a/drivers/pci/setup-res.c
+++ b/drivers/pci/setup-res.c
@@ -210,7 +210,8 @@ int pci_assign_resource(struct pci_dev *dev, int resno)
 	if (!align) {
 		dev_info(&dev->dev, "BAR %d: can't assign %pR "
 			 "(bogus alignment)\n", resno, res);
-		return -EINVAL;
+		ret = -EINVAL;
+		goto out;
 	}
 
 	bus = dev->bus;
@@ -225,6 +226,10 @@ int pci_assign_resource(struct pci_dev *dev, int resno)
 	}
 
 	if (ret) {
+		if (res->flags & IORESOURCE_IGNORE_FAIL) {
+			ret = 0;
+			goto out;
+		}
 		if (res->flags & IORESOURCE_MEM)
 			if (res->flags & IORESOURCE_PREFETCH)
 				type = "mem pref";
@@ -239,6 +244,8 @@ int pci_assign_resource(struct pci_dev *dev, int resno)
 			 resno, type, (unsigned long long) resource_size(res));
 	}
 
+out:
+	res->flags &= ~IORESOURCE_IGNORE_FAIL;
 	return ret;
 }
 
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index b227902..941624b 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -49,6 +49,7 @@ struct resource_list {
 
 #define IORESOURCE_SIZEALIGN	0x00040000	/* size indicates alignment */
 #define IORESOURCE_STARTALIGN	0x00080000	/* start field is alignment */
+#define IORESOURCE_IGNORE_FAIL	0x00800000 	/* IGNORE if allocation fail*/
 
 #define IORESOURCE_MEM_64	0x00100000
 #define IORESOURCE_WINDOW	0x00200000	/* forwarded by bridge */

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v2 PATCH 1/1] PCI: override BIOS/firmware resource allocation
  2010-10-22  0:28                         ` Ram Pai
@ 2010-10-22 17:55                           ` Bjorn Helgaas
  2010-10-22 18:59                             ` Ram Pai
  0 siblings, 1 reply; 32+ messages in thread
From: Bjorn Helgaas @ 2010-10-22 17:55 UTC (permalink / raw)
  To: Ram Pai
  Cc: Jesse Barnes, linux-pci, linux-kernel, clemens, Yinghai Lu,
	Linus Torvalds

On Thursday, October 21, 2010 06:28:51 pm Ram Pai wrote:
> On Tue, Oct 19, 2010 at 11:24:39AM -0700, Jesse Barnes wrote:

> Ok. After further investigation, I find that the BIOS has not allocated any
> resource to hotplug bridges that have no devices behind them. However
> the OS tries to allocate some minimal resources, 4096 I/O ports and 2M mem
> window, to the these hotplug bridges but fails because there are not enough
> resources to satisfy all the hotplug bridges.
> 
> This issue exists even today with the latest mainline kernel but is masked
> because no devices are really effected. However Yinghai's reallocation patch
> exposed the issue, since it released the BIOS allocated window and tried to
> reallocate. Unfortunately it ended up allocating in the wrong order. The
> bridges with real devices got no resources, where as the hotplug bridges with
> no devices got some mimimal resources.
> 
> The details are captured at: https://bugzilla.kernel.org/show_bug.cgi?id=15960
> 
> I suppose the solution is to not pre-allocate resources to hotplug bridges that
> have no devices behind them, and then bring back Yinghai's reallocation patch.  

If we assign resources to bridges with no devices behind them while
starving bridges that DO have devices behind them, I think that's a bug.
It'd be great if you could isolate and fix that before we complicate
things by throwing Yinghai's patch into the mix.

I think it'd interesting to have a debug parameter like "pci=assign-all"
that meant "ignore all BIOS PCI config and start from scratch."  I'm
sure we'd trip over lots of issues and it would be easier to resolve them
before trying to make things fancier.

> > So on that note, does Windows on these machines support allocation of
> > SRIOV resources?  If so, how is it handled?  Which resource ranges are
> > used for the extra BARs?
> 
> No. I have not tried windows on these boxes. But last I heard windows
> did not support SRIOV. Does it?

I've heard anecdotally that Windows supports SRIOV much better than
Linux does, but I have no evidence.  I would guess that it depends on
driver support in addition to Windows itself.

Bjorn

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v2 PATCH 1/1] PCI: override BIOS/firmware resource allocation
  2010-10-22 17:55                           ` Bjorn Helgaas
@ 2010-10-22 18:59                             ` Ram Pai
  2010-10-22 21:49                               ` Bjorn Helgaas
  0 siblings, 1 reply; 32+ messages in thread
From: Ram Pai @ 2010-10-22 18:59 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: Jesse Barnes, linux-pci, linux-kernel, clemens, Yinghai Lu,
	Linus Torvalds

On Fri, Oct 22, 2010 at 11:55:18AM -0600, Bjorn Helgaas wrote:
> On Thursday, October 21, 2010 06:28:51 pm Ram Pai wrote:
> > On Tue, Oct 19, 2010 at 11:24:39AM -0700, Jesse Barnes wrote:
> 
> > Ok. After further investigation, I find that the BIOS has not allocated any
> > resource to hotplug bridges that have no devices behind them. However
> > the OS tries to allocate some minimal resources, 4096 I/O ports and 2M mem
> > window, to the these hotplug bridges but fails because there are not enough
> > resources to satisfy all the hotplug bridges.
> > 
> > This issue exists even today with the latest mainline kernel but is masked
> > because no devices are really effected. However Yinghai's reallocation patch
> > exposed the issue, since it released the BIOS allocated window and tried to
> > reallocate. Unfortunately it ended up allocating in the wrong order. The
> > bridges with real devices got no resources, where as the hotplug bridges with
> > no devices got some mimimal resources.
> > 
> > The details are captured at: https://bugzilla.kernel.org/show_bug.cgi?id=15960
> > 
> > I suppose the solution is to not pre-allocate resources to hotplug bridges that
> > have no devices behind them, and then bring back Yinghai's reallocation patch.  
> 
> If we assign resources to bridges with no devices behind them while
> starving bridges that DO have devices behind them, I think that's a bug.
> It'd be great if you could isolate and fix that before we complicate
> things by throwing Yinghai's patch into the mix.

Yes. I sent out a patch an hour back with a fix for that.

> 
> I think it'd interesting to have a debug parameter like "pci=assign-all"
> that meant "ignore all BIOS PCI config and start from scratch."  I'm
> sure we'd trip over lots of issues and it would be easier to resolve them
> before trying to make things fancier.

The patch that I had sent 3-weeks back had that feature. It can be triggered
by pci=override=always.

However, your arguement earlier was that it would make no difference because
no one will enable that parameter by default. This means hardly anyone will
report any bugs. 

Moreover I am sure we will encounter lots of bugs if we enable that
parameter, and that would mean we will be fixing bugs for a long long time.

Gating "Yinghai's patch or any other approach" on fixing all the bugs in the
pci subsystem, implicitly means 'go away' ;). I hope you dont mean that.

I propose to bring Yinghai's patch, for that will enable us to expose bugs and
at the same time it will enable SRIOV on *all* the platforms that don't have it.

RP

> 
> > > So on that note, does Windows on these machines support allocation of
> > > SRIOV resources?  If so, how is it handled?  Which resource ranges are
> > > used for the extra BARs?
> > 
> > No. I have not tried windows on these boxes. But last I heard windows
> > did not support SRIOV. Does it?
> 
> I've heard anecdotally that Windows supports SRIOV much better than
> Linux does, but I have no evidence.  I would guess that it depends on
> driver support in addition to Windows itself.
> 
> Bjorn

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v2 PATCH 1/1] PCI: override BIOS/firmware resource allocation
  2010-10-22 18:59                             ` Ram Pai
@ 2010-10-22 21:49                               ` Bjorn Helgaas
  0 siblings, 0 replies; 32+ messages in thread
From: Bjorn Helgaas @ 2010-10-22 21:49 UTC (permalink / raw)
  To: Ram Pai
  Cc: Jesse Barnes, linux-pci, linux-kernel, clemens, Yinghai Lu,
	Linus Torvalds

On Friday, October 22, 2010 12:59:37 pm Ram Pai wrote:
> On Fri, Oct 22, 2010 at 11:55:18AM -0600, Bjorn Helgaas wrote:
> > If we assign resources to bridges with no devices behind them while
> > starving bridges that DO have devices behind them, I think that's a bug.
> > It'd be great if you could isolate and fix that before we complicate
> > things by throwing Yinghai's patch into the mix.
> 
> Yes. I sent out a patch an hour back with a fix for that.

Yeah, I saw that right after I hit "send" :-)

> > I think it'd interesting to have a debug parameter like "pci=assign-all"
> > that meant "ignore all BIOS PCI config and start from scratch."  I'm
> > sure we'd trip over lots of issues and it would be easier to resolve them
> > before trying to make things fancier.
> 
> The patch that I had sent 3-weeks back had that feature. It can be triggered
> by pci=override=always.
> 
> However, your arguement earlier was that it would make no difference because
> no one will enable that parameter by default. This means hardly anyone will
> report any bugs. 

I just meant as a development tool, maybe not even upstream, as a way
to exercise the Linux allocation code.  I don't have much confidence
in it yet.  We certainly don't want Linux to ignore the BIOS config in
general; we should just use it unless we can tell it's broken.

> Gating "Yinghai's patch or any other approach" on fixing all the bugs in the
> pci subsystem, implicitly means 'go away' ;). I hope you dont mean that.

Oh, no.

> I propose to bring Yinghai's patch, for that will enable us to expose bugs and
> at the same time it will enable SRIOV on *all* the platforms that don't have it.

Yes, we should consider Yinghai's patch again after all the known issues
with it are fixed.  I just don't know whether we're there yet.

Bjorn

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] PCI: ignore failure to preallocate minimal resources to hotplug bridges
  2010-10-22 17:16                         ` [PATCH 1/1] PCI: ignore failure to preallocate minimal resources to hotplug bridges Ram Pai
@ 2010-10-22 22:16                           ` Bjorn Helgaas
  2011-01-07 22:32                           ` Jesse Barnes
  1 sibling, 0 replies; 32+ messages in thread
From: Bjorn Helgaas @ 2010-10-22 22:16 UTC (permalink / raw)
  To: Ram Pai
  Cc: Jesse Barnes, linux-pci, linux-kernel, clemens, Yinghai Lu,
	Linus Torvalds, peter.henriksson, ebiederm

On Friday, October 22, 2010 11:16:40 am Ram Pai wrote:
>         PCI: ignore failure to preallocate minimal resources to 
> 		hotplug bridges
> 
> 	Linux tries to pre-allocate minimal resources to hotplug 
> 	bridges.  This 	works 	fine as long as there are enough 
> 	resources  to  satisfy   all   other   genuine  resource
> 	requirements.  However  if  enough  resources   are  not 
> 	available    to    satisfy    the    pre-allocation, the 
> 	resource-allocator reports errors and returns failure.
> 	
> 	This patch distinguishes between must-need resources and
> 	nice-to-have   resources.   Any   failure   to  allocate 
> 	nice-to-have resources are ignored. 
> 
> 	This  behavior  can  be  particularly  useful to trigger 
> 	automatic  reallocation, if  the  OS  discovers  genuine 
> 	resource  allocation  conflicts  or  genuine unallocated 
> 	BARs  caused  by not-so-smart allocation behavior of the 
> 	native BIOS.
> 
> 	The motivation for this patch is due a issue reported in 
> 	https://bugzilla.kernel.org/show_bug.cgi?id=15960

No need to justify both margins, the left is enough :-)
Doesn't need to be indented either.

> Signed-off-by: Ram Pai <linuxram@us.ibm.com>
> 
> diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
> index 66cb8f4..3bbc427 100644
> --- a/drivers/pci/setup-bus.c
> +++ b/drivers/pci/setup-bus.c
> @@ -412,14 +412,14 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size)
>  {
>  	struct pci_dev *dev;
>  	struct resource *b_res = find_free_bus_resource(bus, IORESOURCE_IO);
> -	unsigned long size = 0, size1 = 0, old_size;
> +	unsigned long size = 0, size1 = 0, old_size, dev_present=0;

General rule of thumb: when modifying code always follow the
existing style for spacing, indentation, etc.  In this case
you would need spaces around the "=".  

>  	if (!b_res)
>   		return;
>  
>  	list_for_each_entry(dev, &bus->devices, bus_list) {
>  		int i;
> -
> +		dev_present=1;
>  		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
>  			struct resource *r = &dev->resource[i];
>  			unsigned long r_size;
> @@ -460,6 +460,12 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size)
>  	b_res->start = 4096;
>  	b_res->end = b_res->start + size - 1;
>  	b_res->flags |= IORESOURCE_STARTALIGN;
> +
> +	/* if no devices are behind this bus, inform
> +	 * resource-allocater to try hard but not to worry
> +	 * if allocations fail 
> +	 */
> +	b_res->flags |= (!dev_present) ? IORESOURCE_IGNORE_FAIL : 0;
>  }
>  
>  /* Calculate the size of the bus and minimal alignment which
> @@ -470,7 +476,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
>  	struct pci_dev *dev;
>  	resource_size_t min_align, align, size, old_size;
>  	resource_size_t aligns[12];	/* Alignments from 1Mb to 2Gb */
> -	int order, max_order;
> +	int order, max_order, dev_present=0;
>  	struct resource *b_res = find_free_bus_resource(bus, type);
>  	unsigned int mem64_mask = 0;
>  
> @@ -487,6 +493,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
>  	list_for_each_entry(dev, &bus->devices, bus_list) {
>  		int i;
>  
> +		dev_present=1;
>  		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
>  			struct resource *r = &dev->resource[i];
>  			resource_size_t r_size;
> @@ -550,6 +557,13 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
>  	b_res->end = size + min_align - 1;
>  	b_res->flags |= IORESOURCE_STARTALIGN;
>  	b_res->flags |= mem64_mask;
> +
> +	/* if no devices are behind this bus, inform
> +	 * resource-allocater to try hard but not to worry
> +	 * if allocations fail 
> +	 */
> +	b_res->flags |= (!dev_present) ? IORESOURCE_IGNORE_FAIL : 0;
> +
>  	return 1;
>  }
>  
> diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c
> index 2aaa131..a8ce953 100644
> --- a/drivers/pci/setup-res.c
> +++ b/drivers/pci/setup-res.c
> @@ -210,7 +210,8 @@ int pci_assign_resource(struct pci_dev *dev, int resno)
>  	if (!align) {
>  		dev_info(&dev->dev, "BAR %d: can't assign %pR "
>  			 "(bogus alignment)\n", resno, res);
> -		return -EINVAL;
> +		ret = -EINVAL;
> +		goto out;
>  	}
>  
>  	bus = dev->bus;
> @@ -225,6 +226,10 @@ int pci_assign_resource(struct pci_dev *dev, int resno)
>  	}
>  
>  	if (ret) {
> +		if (res->flags & IORESOURCE_IGNORE_FAIL) {

You might be able to do something like this:

		if (dev->subordinate && list_empty(&dev->subordinate->devices))

and avoid the need for a new flag.

I'm not sure this approach is enough, though.  What if we have 4K of
I/O space available, and we have this situation:

	bridge A (disabled)
	bridge B (disabled)
		dev D (needs I/O space)

I think your patch will let us ignore a failure to allocate space for
bridge A.  But the real problem is the order: we have to allocate space
for bridge B *first*.

Bjorn

> +			ret = 0;
> +			goto out;
> +		}
>  		if (res->flags & IORESOURCE_MEM)
>  			if (res->flags & IORESOURCE_PREFETCH)
>  				type = "mem pref";
> @@ -239,6 +244,8 @@ int pci_assign_resource(struct pci_dev *dev, int resno)
>  			 resno, type, (unsigned long long) resource_size(res));
>  	}
>  
> +out:
> +	res->flags &= ~IORESOURCE_IGNORE_FAIL;
>  	return ret;
>  }
>  
> diff --git a/include/linux/ioport.h b/include/linux/ioport.h
> index b227902..941624b 100644
> --- a/include/linux/ioport.h
> +++ b/include/linux/ioport.h
> @@ -49,6 +49,7 @@ struct resource_list {
>  
>  #define IORESOURCE_SIZEALIGN	0x00040000	/* size indicates alignment */
>  #define IORESOURCE_STARTALIGN	0x00080000	/* start field is alignment */
> +#define IORESOURCE_IGNORE_FAIL	0x00800000 	/* IGNORE if allocation fail*/
>  
>  #define IORESOURCE_MEM_64	0x00100000
>  #define IORESOURCE_WINDOW	0x00200000	/* forwarded by bridge */
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] PCI: ignore failure to preallocate minimal resources to hotplug bridges
  2010-10-22 17:16                         ` [PATCH 1/1] PCI: ignore failure to preallocate minimal resources to hotplug bridges Ram Pai
  2010-10-22 22:16                           ` Bjorn Helgaas
@ 2011-01-07 22:32                           ` Jesse Barnes
  2011-01-11 21:10                             ` Ram Pai
  1 sibling, 1 reply; 32+ messages in thread
From: Jesse Barnes @ 2011-01-07 22:32 UTC (permalink / raw)
  To: Ram Pai
  Cc: Bjorn Helgaas, linux-pci, linux-kernel, clemens, Yinghai Lu,
	Linus Torvalds, peter.henriksson, ebiederm

On Fri, 22 Oct 2010 10:16:40 -0700
Ram Pai <linuxram@us.ibm.com> wrote:

>         PCI: ignore failure to preallocate minimal resources to 
> 		hotplug bridges
> 
> 	Linux tries to pre-allocate minimal resources to hotplug 
> 	bridges.  This 	works 	fine as long as there are enough 
> 	resources  to  satisfy   all   other   genuine  resource
> 	requirements.  However  if  enough  resources   are  not 
> 	available    to    satisfy    the    pre-allocation, the 
> 	resource-allocator reports errors and returns failure.
> 	
> 	This patch distinguishes between must-need resources and
> 	nice-to-have   resources.   Any   failure   to  allocate 
> 	nice-to-have resources are ignored. 
> 
> 	This  behavior  can  be  particularly  useful to trigger 
> 	automatic  reallocation, if  the  OS  discovers  genuine 
> 	resource  allocation  conflicts  or  genuine unallocated 
> 	BARs  caused  by not-so-smart allocation behavior of the 
> 	native BIOS.
> 
> 	The motivation for this patch is due a issue reported in 
> 	https://bugzilla.kernel.org/show_bug.cgi?id=15960
> 
> Signed-off-by: Ram Pai <linuxram@us.ibm.com>
> 

Seems like we want a fix like this; did you see Bjorn's comments?
Maybe we can adjust the order of allocation instead?

What issues do you still have outstanding wrt resource allocation?  Do
you still think you need the big "reallocate everything" option?

Thanks,
-- 
Jesse Barnes, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] PCI: ignore failure to preallocate minimal resources to hotplug bridges
  2011-01-07 22:32                           ` Jesse Barnes
@ 2011-01-11 21:10                             ` Ram Pai
  2011-01-14 18:19                               ` [PATCH 1/1] PCI: allocate essential resources before reserving hotplug resources Ram Pai
  0 siblings, 1 reply; 32+ messages in thread
From: Ram Pai @ 2011-01-11 21:10 UTC (permalink / raw)
  To: Jesse Barnes
  Cc: Ram Pai, Bjorn Helgaas, linux-pci, linux-kernel, clemens,
	Yinghai Lu, Linus Torvalds, peter.henriksson, ebiederm

On Fri, Jan 07, 2011 at 02:32:08PM -0800, Jesse Barnes wrote:
> On Fri, 22 Oct 2010 10:16:40 -0700
> Ram Pai <linuxram@us.ibm.com> wrote:
> 
> >         PCI: ignore failure to preallocate minimal resources to 
> > 		hotplug bridges
> > 
> > 	Linux tries to pre-allocate minimal resources to hotplug 
> > 	bridges.  This 	works 	fine as long as there are enough 
> > 	resources  to  satisfy   all   other   genuine  resource
> > 	requirements.  However  if  enough  resources   are  not 
> > 	available    to    satisfy    the    pre-allocation, the 
> > 	resource-allocator reports errors and returns failure.
> > 	
> > 	This patch distinguishes between must-need resources and
> > 	nice-to-have   resources.   Any   failure   to  allocate 
> > 	nice-to-have resources are ignored. 
> > 
> > 	This  behavior  can  be  particularly  useful to trigger 
> > 	automatic  reallocation, if  the  OS  discovers  genuine 
> > 	resource  allocation  conflicts  or  genuine unallocated 
> > 	BARs  caused  by not-so-smart allocation behavior of the 
> > 	native BIOS.
> > 
> > 	The motivation for this patch is due a issue reported in 
> > 	https://bugzilla.kernel.org/show_bug.cgi?id=15960
> > 
> > Signed-off-by: Ram Pai <linuxram@us.ibm.com>
> > 
> 
> Seems like we want a fix like this; did you see Bjorn's comments?
> Maybe we can adjust the order of allocation instead?
> 
> What issues do you still have outstanding wrt resource allocation?  Do
> you still think you need the big "reallocate everything" option?

Jesse,

 Yes I saw Bjorn's comment but then i got pulled into something else and
 made no progress. Just yesterday I started looking into this and co-incidently
 see your mail too.

 I am not sure how to order the allocation given the current organization
 of the code.
	
 The real problem  is that the kernel wants to pre-allocate some minimum
 memory and ioport to hotplug bridges. On a system with just enough resources to
 satisfy the currently installed devices, the current code organization makes it
 hard to organize the allocation order such that atleast the must-have requests
 are satisfied.
	
 An ideal solution should first satisfy the must-have requests and if
 any additional resources are still available then pre-allocate nice-to-have
 requests.

 I am thinking of structuring the code the following way.

 a) note down the must-have requirement as well as the nice-to-way requirement
	for each pci-resource.
 b) run through each resource and satisfy the must-have requirement. 
 c) if (b) succeeds, run through each resource and *try* to satisfy the
	nice-to-way requirement.

 The other solution is to simply ignore preallocation to hotplug bridges.

Please suggest,
RP
	

> 
> Thanks,
> -- 
> Jesse Barnes, Intel Open Source Technology Center

-- 
Ram Pai
System X Device-Driver Enablement Lead
Linux Technology Center
Beaverton OR-97006
503-5783752 t/l 7753752
linuxram@us.ibm.com

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [PATCH 1/1] PCI: allocate essential resources before reserving hotplug resources
  2011-01-11 21:10                             ` Ram Pai
@ 2011-01-14 18:19                               ` Ram Pai
  2011-01-18 20:52                                 ` Bjorn Helgaas
  2011-01-18 21:30                                 ` [PATCH 1/1 Version 2.0] " Ram Pai
  0 siblings, 2 replies; 32+ messages in thread
From: Ram Pai @ 2011-01-14 18:19 UTC (permalink / raw)
  To: Jesse Barnes
  Cc: Bjorn Helgaas, linux-pci, linux-kernel, clemens, Yinghai Lu,
	Linus Torvalds, peter.henriksson, ebiederm

    PCI: reserve  additional  resources  for hotplug bridges
    only after all essential resources are allocated.
    
    Linux  tries  to reserve minimal  resources for  hotplug
    bridges.  This  works   fine as long as there are enough
    resources  to  satisfy  all  other  essential   resource
    requirements.  However  if  enough  resources   are  not
    available    to    satisfy  both the essential resources 
    and   the   reservation  for   hotplug  resources,   the
    resource-allocator reports error and returns failure.
    
    This patch distinguishes between essential resources and
    nice-to-have   resources.   Any   failure   to  allocate
    nice-to-have resources are ignored.
    
    This  behavior  can  be  particularly  useful to trigger
    automatic  reallocation, if  the  OS  discovers  genuine
    resource  allocation  conflicts  or  genuine unallocated
    requests caused  by  not-so-smart allocation behavior of 
    the native BIOS.
    
    https://bugzilla.kernel.org/show_bug.cgi?id=15960
    captures  the  details  of  the issue and the motivation
    behind this patch.

Signed-off-by: Ram Pai <linuxram@us.ibm.com>

drivers/pci/setup-bus.c |  142 +++++++++++++++++++++++++++++++++++-------------
include/linux/ioport.h  |    1 
2 files changed, 105 insertions(+), 38 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 66cb8f4..76a7fb7 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -91,7 +91,26 @@ static void __dev_sort_resources(struct pci_dev *dev,
 	pdev_sort_resources(dev, head);
 }
 
-static void __assign_resources_sorted(struct resource_list *head,
+
+static void adjust_resources_sorted(struct resource_list *head)
+{
+	struct resource *res;
+	struct resource_list *list, *tmp;
+	int idx;
+
+	for (list = head->next; list;) {
+		res = list->res;
+		idx = res - &list->dev->resource[0];
+		if (res->add_size)
+			adjust_resource(res, res->start, 
+				resource_size(res) + res->add_size);
+		tmp = list;
+		list = list->next;
+		kfree(tmp);
+	}
+}
+
+static void assign_requested_resources_sorted(struct resource_list *head,
 				 struct resource_list_x *fail_head)
 {
 	struct resource *res;
@@ -114,14 +133,25 @@ static void __assign_resources_sorted(struct resource_list *head,
 			}
 			res->start = 0;
 			res->end = 0;
+			res->add_size = 0;
 			res->flags = 0;
 		}
 		tmp = list;
 		list = list->next;
-		kfree(tmp);
 	}
 }
 
+static void __assign_resources_sorted(struct resource_list *head,
+				 struct resource_list_x *fail_head)
+{
+	/* Satisfy the must-have resource requests */
+	assign_requested_resources_sorted(head, fail_head);
+
+	/* Try to satisfy any additional nice-to-have resource 
+		requests */
+	adjust_resources_sorted(head);
+}
+
 static void pdev_assign_resources_sorted(struct pci_dev *dev,
 				 struct resource_list_x *fail_head)
 {
@@ -404,15 +434,37 @@ static struct resource *find_free_bus_resource(struct pci_bus *bus, unsigned lon
 	return NULL;
 }
 
+static resource_size_t calculate_iosize(resource_size_t size, 
+		resource_size_t min_size, 
+		resource_size_t size1, 
+		resource_size_t old_size, 
+		resource_size_t align)
+{
+	if (size < min_size)
+		size = min_size;
+	if (old_size == 1 )
+		old_size = 0;
+	/* To be fixed in 2.5: we should have sort of HAVE_ISA
+	   flag in the struct pci_bus. */
+#if defined(CONFIG_ISA) || defined(CONFIG_EISA)
+	size = (size & 0xff) + ((size & ~0xffUL) << 2);
+#endif
+	size = ALIGN(size + size1, align);
+	if (size < old_size)
+		size = old_size;
+	return size;
+}
+
 /* Sizing the IO windows of the PCI-PCI bridge is trivial,
    since these windows have 4K granularity and the IO ranges
    of non-bridge PCI devices are limited to 256 bytes.
    We must be careful with the ISA aliasing though. */
-static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size)
+static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size, 
+			resource_size_t add_size)
 {
 	struct pci_dev *dev;
 	struct resource *b_res = find_free_bus_resource(bus, IORESOURCE_IO);
-	unsigned long size = 0, size1 = 0, old_size;
+	unsigned long size = 0, size1 = 0;
 
 	if (!b_res)
  		return;
@@ -435,20 +487,16 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size)
 				size1 += r_size;
 		}
 	}
-	if (size < min_size)
-		size = min_size;
-	old_size = resource_size(b_res);
-	if (old_size == 1)
-		old_size = 0;
-/* To be fixed in 2.5: we should have sort of HAVE_ISA
-   flag in the struct pci_bus. */
-#if defined(CONFIG_ISA) || defined(CONFIG_EISA)
-	size = (size & 0xff) + ((size & ~0xffUL) << 2);
-#endif
-	size = ALIGN(size + size1, 4096);
-	if (size < old_size)
-		size = old_size;
-	if (!size) {
+
+	size = calculate_iosize(size, min_size, size1, 
+			resource_size(b_res), 4096);
+	if (add_size)
+		size1 = calculate_iosize(size, min_size+add_size, size1, 
+				resource_size(b_res), 4096);
+	else
+		size1  = size;
+
+	if (!size && !size1) {
 		if (b_res->start || b_res->end)
 			dev_info(&bus->self->dev, "disabling bridge window "
 				 "%pR to [bus %02x-%02x] (unused)\n", b_res,
@@ -459,16 +507,35 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size)
 	/* Alignment of the IO window is always 4K */
 	b_res->start = 4096;
 	b_res->end = b_res->start + size - 1;
+	b_res->add_size = size1-size;
 	b_res->flags |= IORESOURCE_STARTALIGN;
 }
 
+
+static resource_size_t calculate_memsize(resource_size_t size, 
+		resource_size_t min_size, 
+		resource_size_t size1, 
+		resource_size_t old_size, 
+		resource_size_t align)
+{
+	if (size < min_size)
+		size = min_size;
+	if (old_size == 1 )
+		old_size = 0;
+	if (size < old_size)
+		size = old_size;
+	size = ALIGN(size + size1, align);
+	return size;
+}
+
 /* Calculate the size of the bus and minimal alignment which
    guarantees that all child resources fit in this size. */
 static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
-			 unsigned long type, resource_size_t min_size)
+			 unsigned long type, resource_size_t min_size, 
+				resource_size_t add_size)
 {
 	struct pci_dev *dev;
-	resource_size_t min_align, align, size, old_size;
+	resource_size_t min_align, align, size, size1;
 	resource_size_t aligns[12];	/* Alignments from 1Mb to 2Gb */
 	int order, max_order;
 	struct resource *b_res = find_free_bus_resource(bus, type);
@@ -516,13 +583,6 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
 			mem64_mask &= r->flags & IORESOURCE_MEM_64;
 		}
 	}
-	if (size < min_size)
-		size = min_size;
-	old_size = resource_size(b_res);
-	if (old_size == 1)
-		old_size = 0;
-	if (size < old_size)
-		size = old_size;
 
 	align = 0;
 	min_align = 0;
@@ -537,8 +597,14 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
 			min_align = align1 >> 1;
 		align += aligns[order];
 	}
-	size = ALIGN(size, min_align);
-	if (!size) {
+
+	size = calculate_memsize(size, min_size, 0, resource_size(b_res), align);
+	if (add_size)
+		size1 = calculate_memsize(size, min_size+add_size, 0, resource_size(b_res), align);
+	else
+		size1  = size;
+
+	if (!size && !size1) {
 		if (b_res->start || b_res->end)
 			dev_info(&bus->self->dev, "disabling bridge window "
 				 "%pR to [bus %02x-%02x] (unused)\n", b_res,
@@ -548,8 +614,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
 	}
 	b_res->start = min_align;
 	b_res->end = size + min_align - 1;
-	b_res->flags |= IORESOURCE_STARTALIGN;
-	b_res->flags |= mem64_mask;
+	b_res->add_size = size1 - size;
+	b_res->flags |= IORESOURCE_STARTALIGN | mem64_mask;
 	return 1;
 }
 
@@ -606,7 +672,7 @@ void __ref pci_bus_size_bridges(struct pci_bus *bus)
 {
 	struct pci_dev *dev;
 	unsigned long mask, prefmask;
-	resource_size_t min_mem_size = 0, min_io_size = 0;
+	resource_size_t additional_mem_size = 0, additional_io_size = 0;
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
 		struct pci_bus *b = dev->subordinate;
@@ -637,11 +703,11 @@ void __ref pci_bus_size_bridges(struct pci_bus *bus)
 	case PCI_CLASS_BRIDGE_PCI:
 		pci_bridge_check_ranges(bus);
 		if (bus->self->is_hotplug_bridge) {
-			min_io_size  = pci_hotplug_io_size;
-			min_mem_size = pci_hotplug_mem_size;
+			additional_io_size  = pci_hotplug_io_size;
+			additional_mem_size = pci_hotplug_mem_size;
 		}
 	default:
-		pbus_size_io(bus, min_io_size);
+		pbus_size_io(bus, 0, additional_io_size);
 		/* If the bridge supports prefetchable range, size it
 		   separately. If it doesn't, or its prefetchable window
 		   has already been allocated by arch code, try
@@ -649,11 +715,11 @@ void __ref pci_bus_size_bridges(struct pci_bus *bus)
 		   resources. */
 		mask = IORESOURCE_MEM;
 		prefmask = IORESOURCE_MEM | IORESOURCE_PREFETCH;
-		if (pbus_size_mem(bus, prefmask, prefmask, min_mem_size))
+		if (pbus_size_mem(bus, prefmask, prefmask, 0, additional_mem_size))
 			mask = prefmask; /* Success, size non-prefetch only. */
 		else
-			min_mem_size += min_mem_size;
-		pbus_size_mem(bus, mask, IORESOURCE_MEM, min_mem_size);
+			additional_mem_size += additional_mem_size;
+		pbus_size_mem(bus, mask, IORESOURCE_MEM, 0, additional_mem_size);
 		break;
 	}
 }
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index e9bb22c..d00c61c 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -18,6 +18,7 @@
 struct resource {
 	resource_size_t start;
 	resource_size_t end;
+	resource_size_t add_size;
 	const char *name;
 	unsigned long flags;
 	struct resource *parent, *sibling, *child;

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] PCI: allocate essential resources before reserving hotplug resources
  2011-01-14 18:19                               ` [PATCH 1/1] PCI: allocate essential resources before reserving hotplug resources Ram Pai
@ 2011-01-18 20:52                                 ` Bjorn Helgaas
  2011-01-18 21:42                                   ` Ram Pai
  2011-01-18 21:30                                 ` [PATCH 1/1 Version 2.0] " Ram Pai
  1 sibling, 1 reply; 32+ messages in thread
From: Bjorn Helgaas @ 2011-01-18 20:52 UTC (permalink / raw)
  To: Ram Pai
  Cc: Jesse Barnes, linux-pci, linux-kernel, clemens, Yinghai Lu,
	Linus Torvalds, peter.henriksson, ebiederm

On Friday, January 14, 2011 11:19:15 am Ram Pai wrote:
>     PCI: reserve  additional  resources  for hotplug bridges
>     only after all essential resources are allocated.
>     
>     Linux  tries  to reserve minimal  resources for  hotplug
>     bridges.  This  works   fine as long as there are enough
>     resources  to  satisfy  all  other  essential   resource
>     requirements.  However  if  enough  resources   are  not
>     available    to    satisfy  both the essential resources 
>     and   the   reservation  for   hotplug  resources,   the
>     resource-allocator reports error and returns failure.
>     
>     This patch distinguishes between essential resources and
>     nice-to-have   resources.   Any   failure   to  allocate
>     nice-to-have resources are ignored.
>     
>     This  behavior  can  be  particularly  useful to trigger
>     automatic  reallocation, if  the  OS  discovers  genuine
>     resource  allocation  conflicts  or  genuine unallocated
>     requests caused  by  not-so-smart allocation behavior of 
>     the native BIOS.
>     
>     https://bugzilla.kernel.org/show_bug.cgi?id=15960
>     captures  the  details  of  the issue and the motivation
>     behind this patch.

Please don't indent and right-justify the changelog.

This is a PCI-specific issue, so I'm not comfortable adding add_size
to struct resource.  That penalizes all resource users while only
helping PCI.  Also, the struct resource lives forever, and the
add_size information is only useful while we're configuring the
bridge.

Bjorn

> Signed-off-by: Ram Pai <linuxram@us.ibm.com>
> 
> drivers/pci/setup-bus.c |  142 +++++++++++++++++++++++++++++++++++-------------
> include/linux/ioport.h  |    1 
> 2 files changed, 105 insertions(+), 38 deletions(-)
> 
> diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
> index 66cb8f4..76a7fb7 100644
> --- a/drivers/pci/setup-bus.c
> +++ b/drivers/pci/setup-bus.c
> @@ -91,7 +91,26 @@ static void __dev_sort_resources(struct pci_dev *dev,
>  	pdev_sort_resources(dev, head);
>  }
>  
> -static void __assign_resources_sorted(struct resource_list *head,
> +
> +static void adjust_resources_sorted(struct resource_list *head)
> +{
> +	struct resource *res;
> +	struct resource_list *list, *tmp;
> +	int idx;
> +
> +	for (list = head->next; list;) {
> +		res = list->res;
> +		idx = res - &list->dev->resource[0];
> +		if (res->add_size)
> +			adjust_resource(res, res->start, 
> +				resource_size(res) + res->add_size);
> +		tmp = list;
> +		list = list->next;
> +		kfree(tmp);
> +	}
> +}
> +
> +static void assign_requested_resources_sorted(struct resource_list *head,
>  				 struct resource_list_x *fail_head)
>  {
>  	struct resource *res;
> @@ -114,14 +133,25 @@ static void __assign_resources_sorted(struct resource_list *head,
>  			}
>  			res->start = 0;
>  			res->end = 0;
> +			res->add_size = 0;
>  			res->flags = 0;
>  		}
>  		tmp = list;
>  		list = list->next;
> -		kfree(tmp);
>  	}
>  }
>  
> +static void __assign_resources_sorted(struct resource_list *head,
> +				 struct resource_list_x *fail_head)
> +{
> +	/* Satisfy the must-have resource requests */
> +	assign_requested_resources_sorted(head, fail_head);
> +
> +	/* Try to satisfy any additional nice-to-have resource 
> +		requests */
> +	adjust_resources_sorted(head);
> +}
> +
>  static void pdev_assign_resources_sorted(struct pci_dev *dev,
>  				 struct resource_list_x *fail_head)
>  {
> @@ -404,15 +434,37 @@ static struct resource *find_free_bus_resource(struct pci_bus *bus, unsigned lon
>  	return NULL;
>  }
>  
> +static resource_size_t calculate_iosize(resource_size_t size, 
> +		resource_size_t min_size, 
> +		resource_size_t size1, 
> +		resource_size_t old_size, 
> +		resource_size_t align)
> +{
> +	if (size < min_size)
> +		size = min_size;
> +	if (old_size == 1 )
> +		old_size = 0;
> +	/* To be fixed in 2.5: we should have sort of HAVE_ISA
> +	   flag in the struct pci_bus. */
> +#if defined(CONFIG_ISA) || defined(CONFIG_EISA)
> +	size = (size & 0xff) + ((size & ~0xffUL) << 2);
> +#endif
> +	size = ALIGN(size + size1, align);
> +	if (size < old_size)
> +		size = old_size;
> +	return size;
> +}
> +
>  /* Sizing the IO windows of the PCI-PCI bridge is trivial,
>     since these windows have 4K granularity and the IO ranges
>     of non-bridge PCI devices are limited to 256 bytes.
>     We must be careful with the ISA aliasing though. */
> -static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size)
> +static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size, 
> +			resource_size_t add_size)
>  {
>  	struct pci_dev *dev;
>  	struct resource *b_res = find_free_bus_resource(bus, IORESOURCE_IO);
> -	unsigned long size = 0, size1 = 0, old_size;
> +	unsigned long size = 0, size1 = 0;
>  
>  	if (!b_res)
>   		return;
> @@ -435,20 +487,16 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size)
>  				size1 += r_size;
>  		}
>  	}
> -	if (size < min_size)
> -		size = min_size;
> -	old_size = resource_size(b_res);
> -	if (old_size == 1)
> -		old_size = 0;
> -/* To be fixed in 2.5: we should have sort of HAVE_ISA
> -   flag in the struct pci_bus. */
> -#if defined(CONFIG_ISA) || defined(CONFIG_EISA)
> -	size = (size & 0xff) + ((size & ~0xffUL) << 2);
> -#endif
> -	size = ALIGN(size + size1, 4096);
> -	if (size < old_size)
> -		size = old_size;
> -	if (!size) {
> +
> +	size = calculate_iosize(size, min_size, size1, 
> +			resource_size(b_res), 4096);
> +	if (add_size)
> +		size1 = calculate_iosize(size, min_size+add_size, size1, 
> +				resource_size(b_res), 4096);
> +	else
> +		size1  = size;
> +
> +	if (!size && !size1) {
>  		if (b_res->start || b_res->end)
>  			dev_info(&bus->self->dev, "disabling bridge window "
>  				 "%pR to [bus %02x-%02x] (unused)\n", b_res,
> @@ -459,16 +507,35 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size)
>  	/* Alignment of the IO window is always 4K */
>  	b_res->start = 4096;
>  	b_res->end = b_res->start + size - 1;
> +	b_res->add_size = size1-size;
>  	b_res->flags |= IORESOURCE_STARTALIGN;
>  }
>  
> +
> +static resource_size_t calculate_memsize(resource_size_t size, 
> +		resource_size_t min_size, 
> +		resource_size_t size1, 
> +		resource_size_t old_size, 
> +		resource_size_t align)
> +{
> +	if (size < min_size)
> +		size = min_size;
> +	if (old_size == 1 )
> +		old_size = 0;
> +	if (size < old_size)
> +		size = old_size;
> +	size = ALIGN(size + size1, align);
> +	return size;
> +}
> +
>  /* Calculate the size of the bus and minimal alignment which
>     guarantees that all child resources fit in this size. */
>  static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
> -			 unsigned long type, resource_size_t min_size)
> +			 unsigned long type, resource_size_t min_size, 
> +				resource_size_t add_size)
>  {
>  	struct pci_dev *dev;
> -	resource_size_t min_align, align, size, old_size;
> +	resource_size_t min_align, align, size, size1;
>  	resource_size_t aligns[12];	/* Alignments from 1Mb to 2Gb */
>  	int order, max_order;
>  	struct resource *b_res = find_free_bus_resource(bus, type);
> @@ -516,13 +583,6 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
>  			mem64_mask &= r->flags & IORESOURCE_MEM_64;
>  		}
>  	}
> -	if (size < min_size)
> -		size = min_size;
> -	old_size = resource_size(b_res);
> -	if (old_size == 1)
> -		old_size = 0;
> -	if (size < old_size)
> -		size = old_size;
>  
>  	align = 0;
>  	min_align = 0;
> @@ -537,8 +597,14 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
>  			min_align = align1 >> 1;
>  		align += aligns[order];
>  	}
> -	size = ALIGN(size, min_align);
> -	if (!size) {
> +
> +	size = calculate_memsize(size, min_size, 0, resource_size(b_res), align);
> +	if (add_size)
> +		size1 = calculate_memsize(size, min_size+add_size, 0, resource_size(b_res), align);
> +	else
> +		size1  = size;
> +
> +	if (!size && !size1) {
>  		if (b_res->start || b_res->end)
>  			dev_info(&bus->self->dev, "disabling bridge window "
>  				 "%pR to [bus %02x-%02x] (unused)\n", b_res,
> @@ -548,8 +614,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
>  	}
>  	b_res->start = min_align;
>  	b_res->end = size + min_align - 1;
> -	b_res->flags |= IORESOURCE_STARTALIGN;
> -	b_res->flags |= mem64_mask;
> +	b_res->add_size = size1 - size;
> +	b_res->flags |= IORESOURCE_STARTALIGN | mem64_mask;
>  	return 1;
>  }
>  
> @@ -606,7 +672,7 @@ void __ref pci_bus_size_bridges(struct pci_bus *bus)
>  {
>  	struct pci_dev *dev;
>  	unsigned long mask, prefmask;
> -	resource_size_t min_mem_size = 0, min_io_size = 0;
> +	resource_size_t additional_mem_size = 0, additional_io_size = 0;
>  
>  	list_for_each_entry(dev, &bus->devices, bus_list) {
>  		struct pci_bus *b = dev->subordinate;
> @@ -637,11 +703,11 @@ void __ref pci_bus_size_bridges(struct pci_bus *bus)
>  	case PCI_CLASS_BRIDGE_PCI:
>  		pci_bridge_check_ranges(bus);
>  		if (bus->self->is_hotplug_bridge) {
> -			min_io_size  = pci_hotplug_io_size;
> -			min_mem_size = pci_hotplug_mem_size;
> +			additional_io_size  = pci_hotplug_io_size;
> +			additional_mem_size = pci_hotplug_mem_size;
>  		}
>  	default:
> -		pbus_size_io(bus, min_io_size);
> +		pbus_size_io(bus, 0, additional_io_size);
>  		/* If the bridge supports prefetchable range, size it
>  		   separately. If it doesn't, or its prefetchable window
>  		   has already been allocated by arch code, try
> @@ -649,11 +715,11 @@ void __ref pci_bus_size_bridges(struct pci_bus *bus)
>  		   resources. */
>  		mask = IORESOURCE_MEM;
>  		prefmask = IORESOURCE_MEM | IORESOURCE_PREFETCH;
> -		if (pbus_size_mem(bus, prefmask, prefmask, min_mem_size))
> +		if (pbus_size_mem(bus, prefmask, prefmask, 0, additional_mem_size))
>  			mask = prefmask; /* Success, size non-prefetch only. */
>  		else
> -			min_mem_size += min_mem_size;
> -		pbus_size_mem(bus, mask, IORESOURCE_MEM, min_mem_size);
> +			additional_mem_size += additional_mem_size;
> +		pbus_size_mem(bus, mask, IORESOURCE_MEM, 0, additional_mem_size);
>  		break;
>  	}
>  }
> diff --git a/include/linux/ioport.h b/include/linux/ioport.h
> index e9bb22c..d00c61c 100644
> --- a/include/linux/ioport.h
> +++ b/include/linux/ioport.h
> @@ -18,6 +18,7 @@
>  struct resource {
>  	resource_size_t start;
>  	resource_size_t end;
> +	resource_size_t add_size;
>  	const char *name;
>  	unsigned long flags;
>  	struct resource *parent, *sibling, *child;
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [PATCH 1/1 Version 2.0] PCI: allocate essential resources before reserving hotplug resources
  2011-01-14 18:19                               ` [PATCH 1/1] PCI: allocate essential resources before reserving hotplug resources Ram Pai
  2011-01-18 20:52                                 ` Bjorn Helgaas
@ 2011-01-18 21:30                                 ` Ram Pai
  2011-01-18 21:46                                   ` Bjorn Helgaas
  1 sibling, 1 reply; 32+ messages in thread
From: Ram Pai @ 2011-01-18 21:30 UTC (permalink / raw)
  To: Jesse Barnes
  Cc: Bjorn Helgaas, linux-pci, linux-kernel, clemens, Yinghai Lu,
	Linus Torvalds, peter.henriksson, ebiederm

        PCI: pre-allocate additional resources to devices
        only after successful allocation of essential resources.
    
        Linux tries to pre-allocate minimal resources to hotplug
        bridges.  This  works   fine as long as there are enough
        resources  to  satisfy   all   other   genuine  resource
        requirements.  However  if  enough  resources   are  not
        available to   satisfy   any   of   these   nice-to-have
        pre-allocations, the  resource-allocator reports  errors
        and returns failure.
    
        This patch distinguishes between must-have resource from
        nice-to-have   resource.   Any    failure   to  allocate
        nice-to-have resources are ignored.
    
        This  behavior  can  be  particularly  useful to trigger
        automatic  reallocation, if  the  OS  discovers  genuine
        resource  allocation  conflicts  or  genuine unallocated
        resource requests caused   by  buggy allocation behavior
        of the native BIOS/uEFI.
    
        https://bugzilla.kernel.org/show_bug.cgi?id=15960
        captures the movitation behind the patch.
        This patch is verified to resolve the above bug.
    
Signed-off-by: Ram Pai <linuxram@us.ibm.com>

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 66cb8f4..5b0fb2f 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -91,7 +91,39 @@ static void __dev_sort_resources(struct pci_dev *dev,
 	pdev_sort_resources(dev, head);
 }
 
-static void __assign_resources_sorted(struct resource_list *head,
+static inline void reset_resource(struct resource *res)
+{
+	res->start = 0;
+	res->end = 0;
+	res->add_size = 0;
+	res->flags = 0;
+}
+
+static void adjust_resources_sorted(struct resource_list *head)
+{
+	struct resource *res;
+	struct resource_list *list, *tmp;
+	int idx;
+
+	for (list = head->next; list;) {
+		res = list->res;
+		idx = res - &list->dev->resource[0];
+
+		if (!resource_size(res) && res->add_size) {
+			 res->end = res->start + res->add_size - 1;
+			 if(pci_assign_resource(list->dev, idx))
+				reset_resource(res);
+		} else if (res->add_size) {
+			adjust_resource(res, res->start, 
+				resource_size(res) + res->add_size);
+		}
+		tmp = list;
+		list = list->next;
+		kfree(tmp);
+	}
+}
+
+static void assign_requested_resources_sorted(struct resource_list *head,
 				 struct resource_list_x *fail_head)
 {
 	struct resource *res;
@@ -102,7 +134,7 @@ static void __assign_resources_sorted(struct resource_list *head,
 		res = list->res;
 		idx = res - &list->dev->resource[0];
 
-		if (pci_assign_resource(list->dev, idx)) {
+		if (resource_size(res) && pci_assign_resource(list->dev, idx)) {
 			if (fail_head && !pci_is_root_bus(list->dev->bus)) {
 				/*
 				 * if the failed res is for ROM BAR, and it will
@@ -112,16 +144,24 @@ static void __assign_resources_sorted(struct resource_list *head,
 				      (!(res->flags & IORESOURCE_ROM_ENABLE))))
 					add_to_failed_list(fail_head, list->dev, res);
 			}
-			res->start = 0;
-			res->end = 0;
-			res->flags = 0;
+			reset_resource(res);
 		}
 		tmp = list;
 		list = list->next;
-		kfree(tmp);
 	}
 }
 
+static void __assign_resources_sorted(struct resource_list *head,
+				 struct resource_list_x *fail_head)
+{
+	/* Satisfy the must-have resource requests */
+	assign_requested_resources_sorted(head, fail_head);
+
+	/* Try to satisfy any additional nice-to-have resource 
+		requests */
+	adjust_resources_sorted(head);
+}
+
 static void pdev_assign_resources_sorted(struct pci_dev *dev,
 				 struct resource_list_x *fail_head)
 {
@@ -404,15 +444,37 @@ static struct resource *find_free_bus_resource(struct pci_bus *bus, unsigned lon
 	return NULL;
 }
 
+static resource_size_t calculate_iosize(resource_size_t size, 
+		resource_size_t min_size, 
+		resource_size_t size1, 
+		resource_size_t old_size, 
+		resource_size_t align)
+{
+	if (size < min_size)
+		size = min_size;
+	if (old_size == 1 )
+		old_size = 0;
+	/* To be fixed in 2.5: we should have sort of HAVE_ISA
+	   flag in the struct pci_bus. */
+#if defined(CONFIG_ISA) || defined(CONFIG_EISA)
+	size = (size & 0xff) + ((size & ~0xffUL) << 2);
+#endif
+	size = ALIGN(size + size1, align);
+	if (size < old_size)
+		size = old_size;
+	return size;
+}
+
 /* Sizing the IO windows of the PCI-PCI bridge is trivial,
    since these windows have 4K granularity and the IO ranges
    of non-bridge PCI devices are limited to 256 bytes.
    We must be careful with the ISA aliasing though. */
-static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size)
+static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size, 
+			resource_size_t add_size)
 {
 	struct pci_dev *dev;
 	struct resource *b_res = find_free_bus_resource(bus, IORESOURCE_IO);
-	unsigned long size = 0, size1 = 0, old_size;
+	unsigned long size = 0, size1 = 0;
 
 	if (!b_res)
  		return;
@@ -435,20 +497,14 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size)
 				size1 += r_size;
 		}
 	}
-	if (size < min_size)
-		size = min_size;
-	old_size = resource_size(b_res);
-	if (old_size == 1)
-		old_size = 0;
-/* To be fixed in 2.5: we should have sort of HAVE_ISA
-   flag in the struct pci_bus. */
-#if defined(CONFIG_ISA) || defined(CONFIG_EISA)
-	size = (size & 0xff) + ((size & ~0xffUL) << 2);
-#endif
-	size = ALIGN(size + size1, 4096);
-	if (size < old_size)
-		size = old_size;
-	if (!size) {
+
+	size = calculate_iosize(size, min_size, size1, 
+			resource_size(b_res), 4096);
+	size1 = !add_size? size: 
+		calculate_iosize(size, min_size+add_size, size1, 
+			resource_size(b_res), 4096);
+
+	if (!size && !size1) {
 		if (b_res->start || b_res->end)
 			dev_info(&bus->self->dev, "disabling bridge window "
 				 "%pR to [bus %02x-%02x] (unused)\n", b_res,
@@ -459,16 +515,35 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size)
 	/* Alignment of the IO window is always 4K */
 	b_res->start = 4096;
 	b_res->end = b_res->start + size - 1;
+	b_res->add_size = size1-size;
 	b_res->flags |= IORESOURCE_STARTALIGN;
 }
 
+
+static resource_size_t calculate_memsize(resource_size_t size, 
+		resource_size_t min_size, 
+		resource_size_t size1, 
+		resource_size_t old_size, 
+		resource_size_t align)
+{
+	if (size < min_size)
+		size = min_size;
+	if (old_size == 1 )
+		old_size = 0;
+	if (size < old_size)
+		size = old_size;
+	size = ALIGN(size + size1, align);
+	return size;
+}
+
 /* Calculate the size of the bus and minimal alignment which
    guarantees that all child resources fit in this size. */
 static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
-			 unsigned long type, resource_size_t min_size)
+			 unsigned long type, resource_size_t min_size, 
+				resource_size_t add_size)
 {
 	struct pci_dev *dev;
-	resource_size_t min_align, align, size, old_size;
+	resource_size_t min_align, align, size, size1;
 	resource_size_t aligns[12];	/* Alignments from 1Mb to 2Gb */
 	int order, max_order;
 	struct resource *b_res = find_free_bus_resource(bus, type);
@@ -516,13 +591,6 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
 			mem64_mask &= r->flags & IORESOURCE_MEM_64;
 		}
 	}
-	if (size < min_size)
-		size = min_size;
-	old_size = resource_size(b_res);
-	if (old_size == 1)
-		old_size = 0;
-	if (size < old_size)
-		size = old_size;
 
 	align = 0;
 	min_align = 0;
@@ -537,8 +605,13 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
 			min_align = align1 >> 1;
 		align += aligns[order];
 	}
-	size = ALIGN(size, min_align);
-	if (!size) {
+
+	size = calculate_memsize(size, min_size, 0, resource_size(b_res), align);
+	size1 = !add_size ? size : 
+		calculate_memsize(size, min_size+add_size, 0, 
+				resource_size(b_res), align);
+
+	if (!size && !size1) {
 		if (b_res->start || b_res->end)
 			dev_info(&bus->self->dev, "disabling bridge window "
 				 "%pR to [bus %02x-%02x] (unused)\n", b_res,
@@ -548,8 +621,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
 	}
 	b_res->start = min_align;
 	b_res->end = size + min_align - 1;
-	b_res->flags |= IORESOURCE_STARTALIGN;
-	b_res->flags |= mem64_mask;
+	b_res->add_size = size1 - size;
+	b_res->flags |= IORESOURCE_STARTALIGN | mem64_mask;
 	return 1;
 }
 
@@ -606,7 +679,7 @@ void __ref pci_bus_size_bridges(struct pci_bus *bus)
 {
 	struct pci_dev *dev;
 	unsigned long mask, prefmask;
-	resource_size_t min_mem_size = 0, min_io_size = 0;
+	resource_size_t additional_mem_size = 0, additional_io_size = 0;
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
 		struct pci_bus *b = dev->subordinate;
@@ -637,11 +710,11 @@ void __ref pci_bus_size_bridges(struct pci_bus *bus)
 	case PCI_CLASS_BRIDGE_PCI:
 		pci_bridge_check_ranges(bus);
 		if (bus->self->is_hotplug_bridge) {
-			min_io_size  = pci_hotplug_io_size;
-			min_mem_size = pci_hotplug_mem_size;
+			additional_io_size  = pci_hotplug_io_size;
+			additional_mem_size = pci_hotplug_mem_size;
 		}
 	default:
-		pbus_size_io(bus, min_io_size);
+		pbus_size_io(bus, 0, additional_io_size);
 		/* If the bridge supports prefetchable range, size it
 		   separately. If it doesn't, or its prefetchable window
 		   has already been allocated by arch code, try
@@ -649,11 +722,11 @@ void __ref pci_bus_size_bridges(struct pci_bus *bus)
 		   resources. */
 		mask = IORESOURCE_MEM;
 		prefmask = IORESOURCE_MEM | IORESOURCE_PREFETCH;
-		if (pbus_size_mem(bus, prefmask, prefmask, min_mem_size))
+		if (pbus_size_mem(bus, prefmask, prefmask, 0, additional_mem_size))
 			mask = prefmask; /* Success, size non-prefetch only. */
 		else
-			min_mem_size += min_mem_size;
-		pbus_size_mem(bus, mask, IORESOURCE_MEM, min_mem_size);
+			additional_mem_size += additional_mem_size;
+		pbus_size_mem(bus, mask, IORESOURCE_MEM, 0, additional_mem_size);
 		break;
 	}
 }
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index e9bb22c..70b4ae6 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -18,6 +18,7 @@
 struct resource {
 	resource_size_t start;
 	resource_size_t end;
+	resource_size_t add_size; /* additional nice-to-have resource */
 	const char *name;
 	unsigned long flags;
 	struct resource *parent, *sibling, *child;

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] PCI: allocate essential resources before reserving hotplug resources
  2011-01-18 20:52                                 ` Bjorn Helgaas
@ 2011-01-18 21:42                                   ` Ram Pai
  2011-01-18 22:11                                     ` Bjorn Helgaas
  0 siblings, 1 reply; 32+ messages in thread
From: Ram Pai @ 2011-01-18 21:42 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: Jesse Barnes, linux-pci, linux-kernel, clemens, Yinghai Lu,
	Linus Torvalds, peter.henriksson, ebiederm

On Tue, Jan 18, 2011 at 01:52:06PM -0700, Bjorn Helgaas wrote:
> On Friday, January 14, 2011 11:19:15 am Ram Pai wrote:
> >     PCI: reserve  additional  resources  for hotplug bridges
> >     only after all essential resources are allocated.
> >     
> >     Linux  tries  to reserve minimal  resources for  hotplug
> >     bridges.  This  works   fine as long as there are enough
> >     resources  to  satisfy  all  other  essential   resource
> >     requirements.  However  if  enough  resources   are  not
> >     available    to    satisfy  both the essential resources 
> >     and   the   reservation  for   hotplug  resources,   the
> >     resource-allocator reports error and returns failure.
> >     
> >     This patch distinguishes between essential resources and
> >     nice-to-have   resources.   Any   failure   to  allocate
> >     nice-to-have resources are ignored.
> >     
> >     This  behavior  can  be  particularly  useful to trigger
> >     automatic  reallocation, if  the  OS  discovers  genuine
> >     resource  allocation  conflicts  or  genuine unallocated
> >     requests caused  by  not-so-smart allocation behavior of 
> >     the native BIOS.
> >     
> >     https://bugzilla.kernel.org/show_bug.cgi?id=15960
> >     captures  the  details  of  the issue and the motivation
> >     behind this patch.
> 
> Please don't indent and right-justify the changelog.

Ooops. I did not see your comments earlier. I just out a patch without
looking at your comments.

> 
> This is a PCI-specific issue, so I'm not comfortable adding add_size
> to struct resource.  That penalizes all resource users while only
> helping PCI.  Also, the struct resource lives forever, and the
> add_size information is only useful while we're configuring the
> bridge.

Any suggestion on how to do this without adding a field to struct resource?

RP
> 
> Bjorn
> 
> > Signed-off-by: Ram Pai <linuxram@us.ibm.com>
> > 
> > drivers/pci/setup-bus.c |  142 +++++++++++++++++++++++++++++++++++-------------
> > include/linux/ioport.h  |    1 
> > 2 files changed, 105 insertions(+), 38 deletions(-)
> > 
> > diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
> > index 66cb8f4..76a7fb7 100644
> > --- a/drivers/pci/setup-bus.c
> > +++ b/drivers/pci/setup-bus.c
> > @@ -91,7 +91,26 @@ static void __dev_sort_resources(struct pci_dev *dev,
> >  	pdev_sort_resources(dev, head);
> >  }
> >  
> > -static void __assign_resources_sorted(struct resource_list *head,
> > +
> > +static void adjust_resources_sorted(struct resource_list *head)
> > +{
> > +	struct resource *res;
> > +	struct resource_list *list, *tmp;
> > +	int idx;
> > +
> > +	for (list = head->next; list;) {
> > +		res = list->res;
> > +		idx = res - &list->dev->resource[0];
> > +		if (res->add_size)
> > +			adjust_resource(res, res->start, 
> > +				resource_size(res) + res->add_size);
> > +		tmp = list;
> > +		list = list->next;
> > +		kfree(tmp);
> > +	}
> > +}
> > +
> > +static void assign_requested_resources_sorted(struct resource_list *head,
> >  				 struct resource_list_x *fail_head)
> >  {
> >  	struct resource *res;
> > @@ -114,14 +133,25 @@ static void __assign_resources_sorted(struct resource_list *head,
> >  			}
> >  			res->start = 0;
> >  			res->end = 0;
> > +			res->add_size = 0;
> >  			res->flags = 0;
> >  		}
> >  		tmp = list;
> >  		list = list->next;
> > -		kfree(tmp);
> >  	}
> >  }
> >  
> > +static void __assign_resources_sorted(struct resource_list *head,
> > +				 struct resource_list_x *fail_head)
> > +{
> > +	/* Satisfy the must-have resource requests */
> > +	assign_requested_resources_sorted(head, fail_head);
> > +
> > +	/* Try to satisfy any additional nice-to-have resource 
> > +		requests */
> > +	adjust_resources_sorted(head);
> > +}
> > +
> >  static void pdev_assign_resources_sorted(struct pci_dev *dev,
> >  				 struct resource_list_x *fail_head)
> >  {
> > @@ -404,15 +434,37 @@ static struct resource *find_free_bus_resource(struct pci_bus *bus, unsigned lon
> >  	return NULL;
> >  }
> >  
> > +static resource_size_t calculate_iosize(resource_size_t size, 
> > +		resource_size_t min_size, 
> > +		resource_size_t size1, 
> > +		resource_size_t old_size, 
> > +		resource_size_t align)
> > +{
> > +	if (size < min_size)
> > +		size = min_size;
> > +	if (old_size == 1 )
> > +		old_size = 0;
> > +	/* To be fixed in 2.5: we should have sort of HAVE_ISA
> > +	   flag in the struct pci_bus. */
> > +#if defined(CONFIG_ISA) || defined(CONFIG_EISA)
> > +	size = (size & 0xff) + ((size & ~0xffUL) << 2);
> > +#endif
> > +	size = ALIGN(size + size1, align);
> > +	if (size < old_size)
> > +		size = old_size;
> > +	return size;
> > +}
> > +
> >  /* Sizing the IO windows of the PCI-PCI bridge is trivial,
> >     since these windows have 4K granularity and the IO ranges
> >     of non-bridge PCI devices are limited to 256 bytes.
> >     We must be careful with the ISA aliasing though. */
> > -static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size)
> > +static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size, 
> > +			resource_size_t add_size)
> >  {
> >  	struct pci_dev *dev;
> >  	struct resource *b_res = find_free_bus_resource(bus, IORESOURCE_IO);
> > -	unsigned long size = 0, size1 = 0, old_size;
> > +	unsigned long size = 0, size1 = 0;
> >  
> >  	if (!b_res)
> >   		return;
> > @@ -435,20 +487,16 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size)
> >  				size1 += r_size;
> >  		}
> >  	}
> > -	if (size < min_size)
> > -		size = min_size;
> > -	old_size = resource_size(b_res);
> > -	if (old_size == 1)
> > -		old_size = 0;
> > -/* To be fixed in 2.5: we should have sort of HAVE_ISA
> > -   flag in the struct pci_bus. */
> > -#if defined(CONFIG_ISA) || defined(CONFIG_EISA)
> > -	size = (size & 0xff) + ((size & ~0xffUL) << 2);
> > -#endif
> > -	size = ALIGN(size + size1, 4096);
> > -	if (size < old_size)
> > -		size = old_size;
> > -	if (!size) {
> > +
> > +	size = calculate_iosize(size, min_size, size1, 
> > +			resource_size(b_res), 4096);
> > +	if (add_size)
> > +		size1 = calculate_iosize(size, min_size+add_size, size1, 
> > +				resource_size(b_res), 4096);
> > +	else
> > +		size1  = size;
> > +
> > +	if (!size && !size1) {
> >  		if (b_res->start || b_res->end)
> >  			dev_info(&bus->self->dev, "disabling bridge window "
> >  				 "%pR to [bus %02x-%02x] (unused)\n", b_res,
> > @@ -459,16 +507,35 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size)
> >  	/* Alignment of the IO window is always 4K */
> >  	b_res->start = 4096;
> >  	b_res->end = b_res->start + size - 1;
> > +	b_res->add_size = size1-size;
> >  	b_res->flags |= IORESOURCE_STARTALIGN;
> >  }
> >  
> > +
> > +static resource_size_t calculate_memsize(resource_size_t size, 
> > +		resource_size_t min_size, 
> > +		resource_size_t size1, 
> > +		resource_size_t old_size, 
> > +		resource_size_t align)
> > +{
> > +	if (size < min_size)
> > +		size = min_size;
> > +	if (old_size == 1 )
> > +		old_size = 0;
> > +	if (size < old_size)
> > +		size = old_size;
> > +	size = ALIGN(size + size1, align);
> > +	return size;
> > +}
> > +
> >  /* Calculate the size of the bus and minimal alignment which
> >     guarantees that all child resources fit in this size. */
> >  static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
> > -			 unsigned long type, resource_size_t min_size)
> > +			 unsigned long type, resource_size_t min_size, 
> > +				resource_size_t add_size)
> >  {
> >  	struct pci_dev *dev;
> > -	resource_size_t min_align, align, size, old_size;
> > +	resource_size_t min_align, align, size, size1;
> >  	resource_size_t aligns[12];	/* Alignments from 1Mb to 2Gb */
> >  	int order, max_order;
> >  	struct resource *b_res = find_free_bus_resource(bus, type);
> > @@ -516,13 +583,6 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
> >  			mem64_mask &= r->flags & IORESOURCE_MEM_64;
> >  		}
> >  	}
> > -	if (size < min_size)
> > -		size = min_size;
> > -	old_size = resource_size(b_res);
> > -	if (old_size == 1)
> > -		old_size = 0;
> > -	if (size < old_size)
> > -		size = old_size;
> >  
> >  	align = 0;
> >  	min_align = 0;
> > @@ -537,8 +597,14 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
> >  			min_align = align1 >> 1;
> >  		align += aligns[order];
> >  	}
> > -	size = ALIGN(size, min_align);
> > -	if (!size) {
> > +
> > +	size = calculate_memsize(size, min_size, 0, resource_size(b_res), align);
> > +	if (add_size)
> > +		size1 = calculate_memsize(size, min_size+add_size, 0, resource_size(b_res), align);
> > +	else
> > +		size1  = size;
> > +
> > +	if (!size && !size1) {
> >  		if (b_res->start || b_res->end)
> >  			dev_info(&bus->self->dev, "disabling bridge window "
> >  				 "%pR to [bus %02x-%02x] (unused)\n", b_res,
> > @@ -548,8 +614,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
> >  	}
> >  	b_res->start = min_align;
> >  	b_res->end = size + min_align - 1;
> > -	b_res->flags |= IORESOURCE_STARTALIGN;
> > -	b_res->flags |= mem64_mask;
> > +	b_res->add_size = size1 - size;
> > +	b_res->flags |= IORESOURCE_STARTALIGN | mem64_mask;
> >  	return 1;
> >  }
> >  
> > @@ -606,7 +672,7 @@ void __ref pci_bus_size_bridges(struct pci_bus *bus)
> >  {
> >  	struct pci_dev *dev;
> >  	unsigned long mask, prefmask;
> > -	resource_size_t min_mem_size = 0, min_io_size = 0;
> > +	resource_size_t additional_mem_size = 0, additional_io_size = 0;
> >  
> >  	list_for_each_entry(dev, &bus->devices, bus_list) {
> >  		struct pci_bus *b = dev->subordinate;
> > @@ -637,11 +703,11 @@ void __ref pci_bus_size_bridges(struct pci_bus *bus)
> >  	case PCI_CLASS_BRIDGE_PCI:
> >  		pci_bridge_check_ranges(bus);
> >  		if (bus->self->is_hotplug_bridge) {
> > -			min_io_size  = pci_hotplug_io_size;
> > -			min_mem_size = pci_hotplug_mem_size;
> > +			additional_io_size  = pci_hotplug_io_size;
> > +			additional_mem_size = pci_hotplug_mem_size;
> >  		}
> >  	default:
> > -		pbus_size_io(bus, min_io_size);
> > +		pbus_size_io(bus, 0, additional_io_size);
> >  		/* If the bridge supports prefetchable range, size it
> >  		   separately. If it doesn't, or its prefetchable window
> >  		   has already been allocated by arch code, try
> > @@ -649,11 +715,11 @@ void __ref pci_bus_size_bridges(struct pci_bus *bus)
> >  		   resources. */
> >  		mask = IORESOURCE_MEM;
> >  		prefmask = IORESOURCE_MEM | IORESOURCE_PREFETCH;
> > -		if (pbus_size_mem(bus, prefmask, prefmask, min_mem_size))
> > +		if (pbus_size_mem(bus, prefmask, prefmask, 0, additional_mem_size))
> >  			mask = prefmask; /* Success, size non-prefetch only. */
> >  		else
> > -			min_mem_size += min_mem_size;
> > -		pbus_size_mem(bus, mask, IORESOURCE_MEM, min_mem_size);
> > +			additional_mem_size += additional_mem_size;
> > +		pbus_size_mem(bus, mask, IORESOURCE_MEM, 0, additional_mem_size);
> >  		break;
> >  	}
> >  }
> > diff --git a/include/linux/ioport.h b/include/linux/ioport.h
> > index e9bb22c..d00c61c 100644
> > --- a/include/linux/ioport.h
> > +++ b/include/linux/ioport.h
> > @@ -18,6 +18,7 @@
> >  struct resource {
> >  	resource_size_t start;
> >  	resource_size_t end;
> > +	resource_size_t add_size;
> >  	const char *name;
> >  	unsigned long flags;
> >  	struct resource *parent, *sibling, *child;
> > 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

-- 
Ram Pai
System X Device-Driver Enablement Lead
Linux Technology Center
Beaverton OR-97006
503-5783752 t/l 7753752
linuxram@us.ibm.com

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1 Version 2.0] PCI: allocate essential resources before reserving hotplug resources
  2011-01-18 21:30                                 ` [PATCH 1/1 Version 2.0] " Ram Pai
@ 2011-01-18 21:46                                   ` Bjorn Helgaas
  2011-01-18 22:03                                     ` Ram Pai
  0 siblings, 1 reply; 32+ messages in thread
From: Bjorn Helgaas @ 2011-01-18 21:46 UTC (permalink / raw)
  To: Ram Pai
  Cc: Jesse Barnes, linux-pci, linux-kernel, clemens, Yinghai Lu,
	Linus Torvalds, peter.henriksson, ebiederm

On Tuesday, January 18, 2011 02:30:09 pm Ram Pai wrote:
>         PCI: pre-allocate additional resources to devices
>         only after successful allocation of essential resources.
> ...

When you post a new version, can you please include some text about
"what changed between v1 and v2"?

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1 Version 2.0] PCI: allocate essential resources before reserving hotplug resources
  2011-01-18 21:46                                   ` Bjorn Helgaas
@ 2011-01-18 22:03                                     ` Ram Pai
  0 siblings, 0 replies; 32+ messages in thread
From: Ram Pai @ 2011-01-18 22:03 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: Ram Pai, Jesse Barnes, linux-pci, linux-kernel, clemens,
	Yinghai Lu, Linus Torvalds, peter.henriksson, ebiederm

On Tue, Jan 18, 2011 at 02:46:02PM -0700, Bjorn Helgaas wrote:
> On Tuesday, January 18, 2011 02:30:09 pm Ram Pai wrote:
> >         PCI: pre-allocate additional resources to devices
> >         only after successful allocation of essential resources.
> > ...
> 
> When you post a new version, can you please include some text about
> "what changed between v1 and v2"?

Ok. I will next version onwards.

BTW: the earlier version called pci_assign_resource() on resources
that needed no resources, causing the pci_assign_resource()
to return failure. This version fixed that issue.

RP

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] PCI: allocate essential resources before reserving hotplug resources
  2011-01-18 21:42                                   ` Ram Pai
@ 2011-01-18 22:11                                     ` Bjorn Helgaas
  2011-01-19 19:58                                       ` [PATCH 1/1 v3] " Ram Pai
  0 siblings, 1 reply; 32+ messages in thread
From: Bjorn Helgaas @ 2011-01-18 22:11 UTC (permalink / raw)
  To: Ram Pai
  Cc: Jesse Barnes, linux-pci, linux-kernel, clemens, Yinghai Lu,
	Linus Torvalds, peter.henriksson, ebiederm

On Tuesday, January 18, 2011 02:42:53 pm Ram Pai wrote:
> On Tue, Jan 18, 2011 at 01:52:06PM -0700, Bjorn Helgaas wrote:

> > This is a PCI-specific issue, so I'm not comfortable adding add_size
> > to struct resource.  That penalizes all resource users while only
> > helping PCI.  Also, the struct resource lives forever, and the
> > add_size information is only useful while we're configuring the
> > bridge.
> 
> Any suggestion on how to do this without adding a field to struct resource?

It *is* a little ugly.  You compute some of this info early, in
pbus_size_io/mem(), but it's not used until later.  Maybe it would
make sense in struct pci_bus?  That's a pain, too, because then you
have to test the resource type in adjust_resources_sorted() but at
least it's a per-PCI bridge structure.

Bjorn

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [PATCH 1/1 v3] PCI: allocate essential resources before reserving hotplug resources
  2011-01-18 22:11                                     ` Bjorn Helgaas
@ 2011-01-19 19:58                                       ` Ram Pai
  2011-01-20  1:00                                         ` [PATCH 1/1 v4] " Ram Pai
  0 siblings, 1 reply; 32+ messages in thread
From: Ram Pai @ 2011-01-19 19:58 UTC (permalink / raw)
  To: Jesse Barnes
  Cc: Bjorn Helgaas, linux-pci, linux-kernel, clemens, Yinghai Lu,
	Linus Torvalds, peter.henriksson, ebiederm

    PCI: pre-allocate additional resources to devices only after successful
            allocation of essential resources.
    
    Linux tries to pre-allocate minimal resources to hotplug bridges. This
    works fine as long as there are enough resources  to satisfy all other
    genuine resource requirements. However if enough resources are not
    available to satisfy any of these nice-to-have pre-allocations, the
    resource-allocator reports errors and returns failure.
    
    This patch distinguishes between must-have resource from nice-to-have
    resource.  Any failure to allocate nice-to-have resources are ignored.
    
    This behavior can be particularly useful to trigger automatic
    reallocation when the OS discovers genuine allocation-conflicts
    or genuine unallocated-requests caused by buggy allocation behavior
    of the native BIOS/uEFI.
    
    https://bugzilla.kernel.org/show_bug.cgi?id=15960 captures the movitation
    behind the patch.
    
    changelog v2:  o  fixed a bug where pci_assign_resource() was called on a
    		  resource of zero resource size.
    
    changelog v3:  addressed Bjorn's comment
    	       o  "Please don't indent and right-justify the changelog".
    	       o  removed add_size from struct resource.  The additional
    		  size is now tracked using a linked list.
    
    Signed-off-by: Ram Pai <linuxram@us.ibm.com>

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 66cb8f4..665c23b 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -33,11 +33,14 @@ struct resource_list_x {
 	struct pci_dev *dev;
 	resource_size_t start;
 	resource_size_t end;
+	resource_size_t add_size;
 	unsigned long flags;
 };
 
-static void add_to_failed_list(struct resource_list_x *head,
-				 struct pci_dev *dev, struct resource *res)
+
+static void add_to_list(struct resource_list_x *head,
+		 struct pci_dev *dev, struct resource *res, 
+		 resource_size_t size)
 {
 	struct resource_list_x *list = head;
 	struct resource_list_x *ln = list->next;
@@ -45,7 +48,7 @@ static void add_to_failed_list(struct resource_list_x *head,
 
 	tmp = kmalloc(sizeof(*tmp), GFP_KERNEL);
 	if (!tmp) {
-		pr_warning("add_to_failed_list: kmalloc() failed!\n");
+		pr_warning("add_to_list: kmalloc() failed!\n");
 		return;
 	}
 
@@ -55,9 +58,18 @@ static void add_to_failed_list(struct resource_list_x *head,
 	tmp->start = res->start;
 	tmp->end = res->end;
 	tmp->flags = res->flags;
+
+	tmp->add_size = size;
+
 	list->next = tmp;
 }
 
+static void add_to_failed_list(struct resource_list_x *head,
+				 struct pci_dev *dev, struct resource *res)
+{
+	add_to_list(head, dev, res, 0);
+}
+
 static void free_failed_list(struct resource_list_x *head)
 {
 	struct resource_list_x *list, *tmp;
@@ -91,7 +103,66 @@ static void __dev_sort_resources(struct pci_dev *dev,
 	pdev_sort_resources(dev, head);
 }
 
-static void __assign_resources_sorted(struct resource_list *head,
+static inline void reset_resource(struct resource *res)
+{
+	res->start = 0;
+	res->end = 0;
+	res->flags = 0;
+}
+
+/*
+ * Walk through each element of the add_head and try to procure
+ * additional resources for the element, provided that element
+ * is found in the head list.
+ */
+static void adjust_resources_sorted(struct resource_list_x *add_head, 
+		struct resource_list *head)
+{
+	struct resource *res;
+	struct resource_list_x *list, *tmp, *prev;
+	struct resource_list *hlist;
+	resource_size_t add_size;
+	int idx;
+
+	prev = add_head;
+	for (list = add_head->next; list;) {
+		res = list->res;
+
+		/* skip this resource if not found in head list */
+		for (hlist = head->next; hlist;) {
+			if (res == hlist->res) 
+				break;
+			hlist = hlist->next;
+		}
+
+		if (!hlist) {
+			prev = list;
+			list = list->next;
+			continue;
+		}
+	
+		idx = res - &list->dev->resource[0];
+
+		add_size=list->add_size;
+		if (!res->flags)
+			goto out;
+
+		if (!resource_size(res) && add_size) {
+			 res->end = res->start + add_size - 1;
+			 if(pci_assign_resource(list->dev, idx))
+				reset_resource(res);
+		} else if (add_size) {
+			adjust_resource(res, res->start, 
+				resource_size(res) + add_size);
+		}
+out:
+		tmp = list;
+		prev->next = list = list->next;
+		kfree(tmp);
+	}
+}
+
+static void assign_requested_resources_sorted(struct resource_list *head,
 				 struct resource_list_x *fail_head)
 {
 	struct resource *res;
@@ -102,7 +173,7 @@ static void __assign_resources_sorted(struct resource_list *head,
 		res = list->res;
 		idx = res - &list->dev->resource[0];
 
-		if (pci_assign_resource(list->dev, idx)) {
+		if (resource_size(res) && pci_assign_resource(list->dev, idx)) {
 			if (fail_head && !pci_is_root_bus(list->dev->bus)) {
 				/*
 				 * if the failed res is for ROM BAR, and it will
@@ -112,9 +183,7 @@ static void __assign_resources_sorted(struct resource_list *head,
 				      (!(res->flags & IORESOURCE_ROM_ENABLE))))
 					add_to_failed_list(fail_head, list->dev, res);
 			}
-			res->start = 0;
-			res->end = 0;
-			res->flags = 0;
+			reset_resource(res);
 		}
 		tmp = list;
 		list = list->next;
@@ -122,6 +191,19 @@ static void __assign_resources_sorted(struct resource_list *head,
 	}
 }
 
+static void __assign_resources_sorted(struct resource_list *head,
+				 struct resource_list_x *add_head,
+				 struct resource_list_x *fail_head)
+{
+	/* Satisfy the must-have resource requests */
+	assign_requested_resources_sorted(head, fail_head);
+
+	/* Try to satisfy any additional nice-to-have resource 
+		requests */
+	if (add_head)
+		adjust_resources_sorted(add_head, head);
+}
+
 static void pdev_assign_resources_sorted(struct pci_dev *dev,
 				 struct resource_list_x *fail_head)
 {
@@ -129,11 +211,12 @@ static void pdev_assign_resources_sorted(struct pci_dev *dev,
 
 	head.next = NULL;
 	__dev_sort_resources(dev, &head);
-	__assign_resources_sorted(&head, fail_head);
+	__assign_resources_sorted(&head, NULL, fail_head);
 
 }
 
 static void pbus_assign_resources_sorted(const struct pci_bus *bus,
+					 struct resource_list_x *add_head,
 					 struct resource_list_x *fail_head)
 {
 	struct pci_dev *dev;
@@ -143,7 +226,7 @@ static void pbus_assign_resources_sorted(const struct pci_bus *bus,
 	list_for_each_entry(dev, &bus->devices, bus_list)
 		__dev_sort_resources(dev, &head);
 
-	__assign_resources_sorted(&head, fail_head);
+	__assign_resources_sorted(&head, add_head, fail_head);
 }
 
 void pci_setup_cardbus(struct pci_bus *bus)
@@ -404,15 +487,37 @@ static struct resource *find_free_bus_resource(struct pci_bus *bus, unsigned lon
 	return NULL;
 }
 
+static resource_size_t calculate_iosize(resource_size_t size, 
+		resource_size_t min_size, 
+		resource_size_t size1, 
+		resource_size_t old_size, 
+		resource_size_t align)
+{
+	if (size < min_size)
+		size = min_size;
+	if (old_size == 1 )
+		old_size = 0;
+	/* To be fixed in 2.5: we should have sort of HAVE_ISA
+	   flag in the struct pci_bus. */
+#if defined(CONFIG_ISA) || defined(CONFIG_EISA)
+	size = (size & 0xff) + ((size & ~0xffUL) << 2);
+#endif
+	size = ALIGN(size + size1, align);
+	if (size < old_size)
+		size = old_size;
+	return size;
+}
+
 /* Sizing the IO windows of the PCI-PCI bridge is trivial,
    since these windows have 4K granularity and the IO ranges
    of non-bridge PCI devices are limited to 256 bytes.
    We must be careful with the ISA aliasing though. */
-static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size)
+static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size, 
+		resource_size_t add_size, struct resource_list_x *add_head)
 {
 	struct pci_dev *dev;
 	struct resource *b_res = find_free_bus_resource(bus, IORESOURCE_IO);
-	unsigned long size = 0, size1 = 0, old_size;
+	unsigned long size = 0, size1 = 0;
 
 	if (!b_res)
  		return;
@@ -435,20 +540,14 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size)
 				size1 += r_size;
 		}
 	}
-	if (size < min_size)
-		size = min_size;
-	old_size = resource_size(b_res);
-	if (old_size == 1)
-		old_size = 0;
-/* To be fixed in 2.5: we should have sort of HAVE_ISA
-   flag in the struct pci_bus. */
-#if defined(CONFIG_ISA) || defined(CONFIG_EISA)
-	size = (size & 0xff) + ((size & ~0xffUL) << 2);
-#endif
-	size = ALIGN(size + size1, 4096);
-	if (size < old_size)
-		size = old_size;
-	if (!size) {
+
+	size = calculate_iosize(size, min_size, size1, 
+			resource_size(b_res), 4096);
+	size1 = !add_size? size: 
+		calculate_iosize(size, min_size+add_size, size1, 
+			resource_size(b_res), 4096);
+
+	if (!size && !size1) {
 		if (b_res->start || b_res->end)
 			dev_info(&bus->self->dev, "disabling bridge window "
 				 "%pR to [bus %02x-%02x] (unused)\n", b_res,
@@ -459,16 +558,40 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size)
 	/* Alignment of the IO window is always 4K */
 	b_res->start = 4096;
 	b_res->end = b_res->start + size - 1;
+	b_res->add_size = size1-size;
 	b_res->flags |= IORESOURCE_STARTALIGN;
+
+	if (size1 > size && add_head)
+		add_to_list(add_head, bus->self, b_res, size1-size);
+
+}
+
+
+static resource_size_t calculate_memsize(resource_size_t size, 
+		resource_size_t min_size, 
+		resource_size_t size1, 
+		resource_size_t old_size, 
+		resource_size_t align)
+{
+	if (size < min_size)
+		size = min_size;
+	if (old_size == 1 )
+		old_size = 0;
+	if (size < old_size)
+		size = old_size;
+	size = ALIGN(size + size1, align);
+	return size;
 }
 
 /* Calculate the size of the bus and minimal alignment which
    guarantees that all child resources fit in this size. */
 static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
-			 unsigned long type, resource_size_t min_size)
+			 unsigned long type, resource_size_t min_size, 
+			resource_size_t add_size, 
+			struct resource_list_x *add_head)
 {
 	struct pci_dev *dev;
-	resource_size_t min_align, align, size, old_size;
+	resource_size_t min_align, align, size, size1;
 	resource_size_t aligns[12];	/* Alignments from 1Mb to 2Gb */
 	int order, max_order;
 	struct resource *b_res = find_free_bus_resource(bus, type);
@@ -516,13 +639,6 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
 			mem64_mask &= r->flags & IORESOURCE_MEM_64;
 		}
 	}
-	if (size < min_size)
-		size = min_size;
-	old_size = resource_size(b_res);
-	if (old_size == 1)
-		old_size = 0;
-	if (size < old_size)
-		size = old_size;
 
 	align = 0;
 	min_align = 0;
@@ -537,8 +653,13 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
 			min_align = align1 >> 1;
 		align += aligns[order];
 	}
-	size = ALIGN(size, min_align);
-	if (!size) {
+
+	size = calculate_memsize(size, min_size, 0, resource_size(b_res), align);
+	size1 = !add_size ? size : 
+		calculate_memsize(size, min_size+add_size, 0, 
+				resource_size(b_res), align);
+
+	if (!size && !size1) {
 		if (b_res->start || b_res->end)
 			dev_info(&bus->self->dev, "disabling bridge window "
 				 "%pR to [bus %02x-%02x] (unused)\n", b_res,
@@ -548,8 +669,11 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
 	}
 	b_res->start = min_align;
 	b_res->end = size + min_align - 1;
-	b_res->flags |= IORESOURCE_STARTALIGN;
-	b_res->flags |= mem64_mask;
+	b_res->flags |= IORESOURCE_STARTALIGN | mem64_mask;
+
+	if (size1 > size && add_head)
+		add_to_list(add_head, bus->self, b_res, size1-size);
+
 	return 1;
 }
 
@@ -602,11 +726,12 @@ static void pci_bus_size_cardbus(struct pci_bus *bus)
 	}
 }
 
-void __ref pci_bus_size_bridges(struct pci_bus *bus)
+void __ref __pci_bus_size_bridges(struct pci_bus *bus, 
+			struct resource_list_x *add_head)
 {
 	struct pci_dev *dev;
 	unsigned long mask, prefmask;
-	resource_size_t min_mem_size = 0, min_io_size = 0;
+	resource_size_t additional_mem_size = 0, additional_io_size = 0;
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
 		struct pci_bus *b = dev->subordinate;
@@ -620,7 +745,7 @@ void __ref pci_bus_size_bridges(struct pci_bus *bus)
 
 		case PCI_CLASS_BRIDGE_PCI:
 		default:
-			pci_bus_size_bridges(b);
+			__pci_bus_size_bridges(b, add_head);
 			break;
 		}
 	}
@@ -637,11 +762,11 @@ void __ref pci_bus_size_bridges(struct pci_bus *bus)
 	case PCI_CLASS_BRIDGE_PCI:
 		pci_bridge_check_ranges(bus);
 		if (bus->self->is_hotplug_bridge) {
-			min_io_size  = pci_hotplug_io_size;
-			min_mem_size = pci_hotplug_mem_size;
+			additional_io_size  = pci_hotplug_io_size;
+			additional_mem_size = pci_hotplug_mem_size;
 		}
 	default:
-		pbus_size_io(bus, min_io_size);
+		pbus_size_io(bus, 0, additional_io_size, add_head);
 		/* If the bridge supports prefetchable range, size it
 		   separately. If it doesn't, or its prefetchable window
 		   has already been allocated by arch code, try
@@ -649,30 +774,36 @@ void __ref pci_bus_size_bridges(struct pci_bus *bus)
 		   resources. */
 		mask = IORESOURCE_MEM;
 		prefmask = IORESOURCE_MEM | IORESOURCE_PREFETCH;
-		if (pbus_size_mem(bus, prefmask, prefmask, min_mem_size))
+		if (pbus_size_mem(bus, prefmask, prefmask, 0, additional_mem_size, add_head))
 			mask = prefmask; /* Success, size non-prefetch only. */
 		else
-			min_mem_size += min_mem_size;
-		pbus_size_mem(bus, mask, IORESOURCE_MEM, min_mem_size);
+			additional_mem_size += additional_mem_size;
+		pbus_size_mem(bus, mask, IORESOURCE_MEM, 0, additional_mem_size, add_head);
 		break;
 	}
 }
+
+void __ref pci_bus_size_bridges(struct pci_bus *bus)
+{
+	__pci_bus_size_bridges(bus, NULL);
+}
 EXPORT_SYMBOL(pci_bus_size_bridges);
 
 static void __ref __pci_bus_assign_resources(const struct pci_bus *bus,
+					 struct resource_list_x *add_head,
 					 struct resource_list_x *fail_head)
 {
 	struct pci_bus *b;
 	struct pci_dev *dev;
 
-	pbus_assign_resources_sorted(bus, fail_head);
+	pbus_assign_resources_sorted(bus, add_head, fail_head);
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
 		b = dev->subordinate;
 		if (!b)
 			continue;
 
-		__pci_bus_assign_resources(b, fail_head);
+		__pci_bus_assign_resources(b, add_head, fail_head);
 
 		switch (dev->class >> 8) {
 		case PCI_CLASS_BRIDGE_PCI:
@@ -694,7 +825,7 @@ static void __ref __pci_bus_assign_resources(const struct pci_bus *bus,
 
 void __ref pci_bus_assign_resources(const struct pci_bus *bus)
 {
-	__pci_bus_assign_resources(bus, NULL);
+	__pci_bus_assign_resources(bus, NULL, NULL);
 }
 EXPORT_SYMBOL(pci_bus_assign_resources);
 
@@ -709,7 +840,7 @@ static void __ref __pci_bridge_assign_resources(const struct pci_dev *bridge,
 	if (!b)
 		return;
 
-	__pci_bus_assign_resources(b, fail_head);
+	__pci_bus_assign_resources(b, NULL, fail_head);
 
 	switch (bridge->class >> 8) {
 	case PCI_CLASS_BRIDGE_PCI:
@@ -842,15 +973,18 @@ void __init
 pci_assign_unassigned_resources(void)
 {
 	struct pci_bus *bus;
-
+	struct resource_list_x add_list; /* list of resources that 
+					want additional resources */
+	add_list.next = NULL;
 	/* Depth first, calculate sizes and alignments of all
 	   subordinate buses. */
 	list_for_each_entry(bus, &pci_root_buses, node) {
-		pci_bus_size_bridges(bus);
+		__pci_bus_size_bridges(bus, &add_list);
 	}
+
 	/* Depth last, allocate resources and update the hardware. */
 	list_for_each_entry(bus, &pci_root_buses, node) {
-		pci_bus_assign_resources(bus);
+		__pci_bus_assign_resources(bus, &add_list, NULL);
 		pci_enable_bridges(bus);
 	}
 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [PATCH 1/1 v4] PCI: allocate essential resources before reserving hotplug resources
  2011-01-19 19:58                                       ` [PATCH 1/1 v3] " Ram Pai
@ 2011-01-20  1:00                                         ` Ram Pai
  2011-01-21  1:22                                           ` Yinghai Lu
  0 siblings, 1 reply; 32+ messages in thread
From: Ram Pai @ 2011-01-20  1:00 UTC (permalink / raw)
  To: Jesse Barnes
  Cc: Bjorn Helgaas, linux-pci, linux-kernel, clemens, Yinghai Lu,
	Linus Torvalds, peter.henriksson, ebiederm

    PCI: pre-allocate additional resources to devices only after successful
            allocation of essential resources.
    
    Linux tries to pre-allocate minimal resources to hotplug bridges. This
    works fine as long as there are enough resources  to satisfy all other
    genuine resource requirements. However if enough resources are not
    available to satisfy any of these nice-to-have pre-allocations, the
    resource-allocator reports errors and returns failure.
    
    This patch distinguishes between must-have resource from nice-to-have
    resource.  Any failure to allocate nice-to-have resources are ignored.
    
    This behavior can be particularly useful to trigger automatic
    reallocation when the OS discovers genuine allocation-conflicts
    or genuine unallocated-requests caused by buggy allocation behavior
    of the native BIOS/uEFI.
    
    https://bugzilla.kernel.org/show_bug.cgi?id=15960 captures the movitation
    behind the patch.
    
    changelog v2:  o  fixed a bug where pci_assign_resource() was called on a
    		      resource of zero resource size.
    
    changelog v3:  addressed Bjorn's comment
    	       o  "Please don't indent and right-justify the changelog".
    	       o  removed add_size from struct resource.  The additional
    		  size is now tracked using a linked list.
    
    changelog v4:  o moved freeing up of elements of head list from
    		  assign_requested_resources_sorted() to
    		__assign_resources_sorted(). This fixes a corruption bug.
    	       o removed a wrong reference to 'add_size' in
  		   pbus_size_mem(). Erroneously got introduced while
		   generating the patch.
    	       o some code optimizations in adjust_resources_sorted()
    		   and assign_requested_resources_sorted()
    
Signed-off-by: Ram Pai <linuxram@us.ibm.com>

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 66cb8f4..efbdff2 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -33,11 +33,23 @@ struct resource_list_x {
 	struct pci_dev *dev;
 	resource_size_t start;
 	resource_size_t end;
+	resource_size_t add_size;
 	unsigned long flags;
 };
 
-static void add_to_failed_list(struct resource_list_x *head,
-				 struct pci_dev *dev, struct resource *res)
+#define free_list(type, head) do {                      \
+	struct type *list, *tmp;			\
+	for (list = (head)->next; list;) {		\
+		tmp = list;				\
+		list = list->next;			\
+		kfree(tmp);				\
+	}						\
+	(head)->next = NULL;				\
+} while (0)
+
+static void add_to_list(struct resource_list_x *head,
+		 struct pci_dev *dev, struct resource *res, 
+		 resource_size_t size)
 {
 	struct resource_list_x *list = head;
 	struct resource_list_x *ln = list->next;
@@ -45,7 +57,7 @@ static void add_to_failed_list(struct resource_list_x *head,
 
 	tmp = kmalloc(sizeof(*tmp), GFP_KERNEL);
 	if (!tmp) {
-		pr_warning("add_to_failed_list: kmalloc() failed!\n");
+		pr_warning("add_to_list: kmalloc() failed!\n");
 		return;
 	}
 
@@ -55,20 +67,16 @@ static void add_to_failed_list(struct resource_list_x *head,
 	tmp->start = res->start;
 	tmp->end = res->end;
 	tmp->flags = res->flags;
+
+	tmp->add_size = size;
+
 	list->next = tmp;
 }
 
-static void free_failed_list(struct resource_list_x *head)
+static void add_to_failed_list(struct resource_list_x *head,
+				 struct pci_dev *dev, struct resource *res)
 {
-	struct resource_list_x *list, *tmp;
-
-	for (list = head->next; list;) {
-		tmp = list;
-		list = list->next;
-		kfree(tmp);
-	}
-
-	head->next = NULL;
+	add_to_list(head, dev, res, 0);
 }
 
 static void __dev_sort_resources(struct pci_dev *dev,
@@ -91,18 +99,70 @@ static void __dev_sort_resources(struct pci_dev *dev,
 	pdev_sort_resources(dev, head);
 }
 
-static void __assign_resources_sorted(struct resource_list *head,
-				 struct resource_list_x *fail_head)
+static inline void reset_resource(struct resource *res)
+{
+	res->start = 0;
+	res->end = 0;
+	res->flags = 0;
+}
+
+/*
+ * Walk through each element of the add_head and try to procure
+ * additional resources for the element, provided that element
+ * is found in the head list.
+ */
+static void adjust_resources_sorted(struct resource_list_x *add_head, 
+		struct resource_list *head)
 {
 	struct resource *res;
-	struct resource_list *list, *tmp;
+	struct resource_list_x *list, *tmp, *prev;
+	struct resource_list *hlist;
+	resource_size_t add_size;
 	int idx;
 
-	for (list = head->next; list;) {
+	prev = add_head;
+	for (list = add_head->next; list;) {
 		res = list->res;
+		if (!res->flags)
+			goto out;
+
+		/* skip this resource if not found in head list */
+		for (hlist = head->next; hlist && hlist->res != res; 
+				hlist = hlist->next);
+		if (!hlist) {
+			prev = list;
+			list = list->next;
+			continue;
+		}
+
 		idx = res - &list->dev->resource[0];
+		add_size=list->add_size;
+		if (!resource_size(res) && add_size) {
+			 res->end = res->start + add_size - 1;
+			 if(pci_assign_resource(list->dev, idx))
+				reset_resource(res);
+		} else if (add_size) {
+			adjust_resource(res, res->start, 
+				resource_size(res) + add_size);
+		}
+out:
+		tmp = list;
+		prev->next = list = list->next;
+		kfree(tmp);
+	}
+}
+
+static void assign_requested_resources_sorted(struct resource_list *head,
+				 struct resource_list_x *fail_head)
+{
+	struct resource *res;
+	struct resource_list *list;
+	int idx;
 
-		if (pci_assign_resource(list->dev, idx)) {
+	for (list = head->next; list; list = list->next) {
+		res = list->res;
+		idx = res - &list->dev->resource[0];
+		if (resource_size(res) && pci_assign_resource(list->dev, idx)) {
 			if (fail_head && !pci_is_root_bus(list->dev->bus)) {
 				/*
 				 * if the failed res is for ROM BAR, and it will
@@ -112,16 +172,25 @@ static void __assign_resources_sorted(struct resource_list *head,
 				      (!(res->flags & IORESOURCE_ROM_ENABLE))))
 					add_to_failed_list(fail_head, list->dev, res);
 			}
-			res->start = 0;
-			res->end = 0;
-			res->flags = 0;
+			reset_resource(res);
 		}
-		tmp = list;
-		list = list->next;
-		kfree(tmp);
 	}
 }
 
+static void __assign_resources_sorted(struct resource_list *head,
+				 struct resource_list_x *add_head,
+				 struct resource_list_x *fail_head)
+{
+	/* Satisfy the must-have resource requests */
+	assign_requested_resources_sorted(head, fail_head);
+
+	/* Try to satisfy any additional nice-to-have resource 
+		requests */
+	if (add_head)
+		adjust_resources_sorted(add_head, head);
+	free_list(resource_list, head);
+}
+
 static void pdev_assign_resources_sorted(struct pci_dev *dev,
 				 struct resource_list_x *fail_head)
 {
@@ -129,11 +198,12 @@ static void pdev_assign_resources_sorted(struct pci_dev *dev,
 
 	head.next = NULL;
 	__dev_sort_resources(dev, &head);
-	__assign_resources_sorted(&head, fail_head);
+	__assign_resources_sorted(&head, NULL, fail_head);
 
 }
 
 static void pbus_assign_resources_sorted(const struct pci_bus *bus,
+					 struct resource_list_x *add_head,
 					 struct resource_list_x *fail_head)
 {
 	struct pci_dev *dev;
@@ -143,7 +213,7 @@ static void pbus_assign_resources_sorted(const struct pci_bus *bus,
 	list_for_each_entry(dev, &bus->devices, bus_list)
 		__dev_sort_resources(dev, &head);
 
-	__assign_resources_sorted(&head, fail_head);
+	__assign_resources_sorted(&head, add_head, fail_head);
 }
 
 void pci_setup_cardbus(struct pci_bus *bus)
@@ -404,15 +474,37 @@ static struct resource *find_free_bus_resource(struct pci_bus *bus, unsigned lon
 	return NULL;
 }
 
+static resource_size_t calculate_iosize(resource_size_t size, 
+		resource_size_t min_size, 
+		resource_size_t size1, 
+		resource_size_t old_size, 
+		resource_size_t align)
+{
+	if (size < min_size)
+		size = min_size;
+	if (old_size == 1 )
+		old_size = 0;
+	/* To be fixed in 2.5: we should have sort of HAVE_ISA
+	   flag in the struct pci_bus. */
+#if defined(CONFIG_ISA) || defined(CONFIG_EISA)
+	size = (size & 0xff) + ((size & ~0xffUL) << 2);
+#endif
+	size = ALIGN(size + size1, align);
+	if (size < old_size)
+		size = old_size;
+	return size;
+}
+
 /* Sizing the IO windows of the PCI-PCI bridge is trivial,
    since these windows have 4K granularity and the IO ranges
    of non-bridge PCI devices are limited to 256 bytes.
    We must be careful with the ISA aliasing though. */
-static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size)
+static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size, 
+		resource_size_t add_size, struct resource_list_x *add_head)
 {
 	struct pci_dev *dev;
 	struct resource *b_res = find_free_bus_resource(bus, IORESOURCE_IO);
-	unsigned long size = 0, size1 = 0, old_size;
+	unsigned long size = 0, size1 = 0;
 
 	if (!b_res)
  		return;
@@ -435,20 +527,14 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size)
 				size1 += r_size;
 		}
 	}
-	if (size < min_size)
-		size = min_size;
-	old_size = resource_size(b_res);
-	if (old_size == 1)
-		old_size = 0;
-/* To be fixed in 2.5: we should have sort of HAVE_ISA
-   flag in the struct pci_bus. */
-#if defined(CONFIG_ISA) || defined(CONFIG_EISA)
-	size = (size & 0xff) + ((size & ~0xffUL) << 2);
-#endif
-	size = ALIGN(size + size1, 4096);
-	if (size < old_size)
-		size = old_size;
-	if (!size) {
+
+	size = calculate_iosize(size, min_size, size1, 
+			resource_size(b_res), 4096);
+	size1 = !add_size? size: 
+		calculate_iosize(size, min_size+add_size, size1, 
+			resource_size(b_res), 4096);
+
+	if (!size && !size1) {
 		if (b_res->start || b_res->end)
 			dev_info(&bus->self->dev, "disabling bridge window "
 				 "%pR to [bus %02x-%02x] (unused)\n", b_res,
@@ -460,15 +546,38 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size)
 	b_res->start = 4096;
 	b_res->end = b_res->start + size - 1;
 	b_res->flags |= IORESOURCE_STARTALIGN;
+
+	if (size1 > size && add_head)
+		add_to_list(add_head, bus->self, b_res, size1-size);
+
+}
+
+
+static resource_size_t calculate_memsize(resource_size_t size, 
+		resource_size_t min_size, 
+		resource_size_t size1, 
+		resource_size_t old_size, 
+		resource_size_t align)
+{
+	if (size < min_size)
+		size = min_size;
+	if (old_size == 1 )
+		old_size = 0;
+	if (size < old_size)
+		size = old_size;
+	size = ALIGN(size + size1, align);
+	return size;
 }
 
 /* Calculate the size of the bus and minimal alignment which
    guarantees that all child resources fit in this size. */
 static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
-			 unsigned long type, resource_size_t min_size)
+			 unsigned long type, resource_size_t min_size, 
+			resource_size_t add_size, 
+			struct resource_list_x *add_head)
 {
 	struct pci_dev *dev;
-	resource_size_t min_align, align, size, old_size;
+	resource_size_t min_align, align, size, size1;
 	resource_size_t aligns[12];	/* Alignments from 1Mb to 2Gb */
 	int order, max_order;
 	struct resource *b_res = find_free_bus_resource(bus, type);
@@ -516,13 +625,6 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
 			mem64_mask &= r->flags & IORESOURCE_MEM_64;
 		}
 	}
-	if (size < min_size)
-		size = min_size;
-	old_size = resource_size(b_res);
-	if (old_size == 1)
-		old_size = 0;
-	if (size < old_size)
-		size = old_size;
 
 	align = 0;
 	min_align = 0;
@@ -537,8 +639,13 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
 			min_align = align1 >> 1;
 		align += aligns[order];
 	}
-	size = ALIGN(size, min_align);
-	if (!size) {
+
+	size = calculate_memsize(size, min_size, 0, resource_size(b_res), align);
+	size1 = !add_size ? size : 
+		calculate_memsize(size, min_size+add_size, 0, 
+				resource_size(b_res), align);
+
+	if (!size && !size1) {
 		if (b_res->start || b_res->end)
 			dev_info(&bus->self->dev, "disabling bridge window "
 				 "%pR to [bus %02x-%02x] (unused)\n", b_res,
@@ -548,8 +655,11 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
 	}
 	b_res->start = min_align;
 	b_res->end = size + min_align - 1;
-	b_res->flags |= IORESOURCE_STARTALIGN;
-	b_res->flags |= mem64_mask;
+	b_res->flags |= IORESOURCE_STARTALIGN | mem64_mask;
+
+	if (size1 > size && add_head)
+		add_to_list(add_head, bus->self, b_res, size1-size);
+
 	return 1;
 }
 
@@ -602,11 +712,12 @@ static void pci_bus_size_cardbus(struct pci_bus *bus)
 	}
 }
 
-void __ref pci_bus_size_bridges(struct pci_bus *bus)
+void __ref __pci_bus_size_bridges(struct pci_bus *bus, 
+			struct resource_list_x *add_head)
 {
 	struct pci_dev *dev;
 	unsigned long mask, prefmask;
-	resource_size_t min_mem_size = 0, min_io_size = 0;
+	resource_size_t additional_mem_size = 0, additional_io_size = 0;
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
 		struct pci_bus *b = dev->subordinate;
@@ -620,7 +731,7 @@ void __ref pci_bus_size_bridges(struct pci_bus *bus)
 
 		case PCI_CLASS_BRIDGE_PCI:
 		default:
-			pci_bus_size_bridges(b);
+			__pci_bus_size_bridges(b, add_head);
 			break;
 		}
 	}
@@ -637,11 +748,11 @@ void __ref pci_bus_size_bridges(struct pci_bus *bus)
 	case PCI_CLASS_BRIDGE_PCI:
 		pci_bridge_check_ranges(bus);
 		if (bus->self->is_hotplug_bridge) {
-			min_io_size  = pci_hotplug_io_size;
-			min_mem_size = pci_hotplug_mem_size;
+			additional_io_size  = pci_hotplug_io_size;
+			additional_mem_size = pci_hotplug_mem_size;
 		}
 	default:
-		pbus_size_io(bus, min_io_size);
+		pbus_size_io(bus, 0, additional_io_size, add_head);
 		/* If the bridge supports prefetchable range, size it
 		   separately. If it doesn't, or its prefetchable window
 		   has already been allocated by arch code, try
@@ -649,30 +760,36 @@ void __ref pci_bus_size_bridges(struct pci_bus *bus)
 		   resources. */
 		mask = IORESOURCE_MEM;
 		prefmask = IORESOURCE_MEM | IORESOURCE_PREFETCH;
-		if (pbus_size_mem(bus, prefmask, prefmask, min_mem_size))
+		if (pbus_size_mem(bus, prefmask, prefmask, 0, additional_mem_size, add_head))
 			mask = prefmask; /* Success, size non-prefetch only. */
 		else
-			min_mem_size += min_mem_size;
-		pbus_size_mem(bus, mask, IORESOURCE_MEM, min_mem_size);
+			additional_mem_size += additional_mem_size;
+		pbus_size_mem(bus, mask, IORESOURCE_MEM, 0, additional_mem_size, add_head);
 		break;
 	}
 }
+
+void __ref pci_bus_size_bridges(struct pci_bus *bus)
+{
+	__pci_bus_size_bridges(bus, NULL);
+}
 EXPORT_SYMBOL(pci_bus_size_bridges);
 
 static void __ref __pci_bus_assign_resources(const struct pci_bus *bus,
+					 struct resource_list_x *add_head,
 					 struct resource_list_x *fail_head)
 {
 	struct pci_bus *b;
 	struct pci_dev *dev;
 
-	pbus_assign_resources_sorted(bus, fail_head);
+	pbus_assign_resources_sorted(bus, add_head, fail_head);
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
 		b = dev->subordinate;
 		if (!b)
 			continue;
 
-		__pci_bus_assign_resources(b, fail_head);
+		__pci_bus_assign_resources(b, add_head, fail_head);
 
 		switch (dev->class >> 8) {
 		case PCI_CLASS_BRIDGE_PCI:
@@ -694,7 +811,7 @@ static void __ref __pci_bus_assign_resources(const struct pci_bus *bus,
 
 void __ref pci_bus_assign_resources(const struct pci_bus *bus)
 {
-	__pci_bus_assign_resources(bus, NULL);
+	__pci_bus_assign_resources(bus, NULL, NULL);
 }
 EXPORT_SYMBOL(pci_bus_assign_resources);
 
@@ -709,7 +826,7 @@ static void __ref __pci_bridge_assign_resources(const struct pci_dev *bridge,
 	if (!b)
 		return;
 
-	__pci_bus_assign_resources(b, fail_head);
+	__pci_bus_assign_resources(b, NULL, fail_head);
 
 	switch (bridge->class >> 8) {
 	case PCI_CLASS_BRIDGE_PCI:
@@ -842,15 +959,18 @@ void __init
 pci_assign_unassigned_resources(void)
 {
 	struct pci_bus *bus;
-
+	struct resource_list_x add_list; /* list of resources that 
+					want additional resources */
+	add_list.next = NULL;
 	/* Depth first, calculate sizes and alignments of all
 	   subordinate buses. */
 	list_for_each_entry(bus, &pci_root_buses, node) {
-		pci_bus_size_bridges(bus);
+		__pci_bus_size_bridges(bus, &add_list);
 	}
+
 	/* Depth last, allocate resources and update the hardware. */
 	list_for_each_entry(bus, &pci_root_buses, node) {
-		pci_bus_assign_resources(bus);
+		__pci_bus_assign_resources(bus, &add_list, NULL);
 		pci_enable_bridges(bus);
 	}
 
@@ -882,7 +1002,7 @@ again:
 
 	if (tried_times >= 2) {
 		/* still fail, don't need to try more */
-		free_failed_list(&head);
+		free_list(resource_list_x, &head);
 		goto enable_all;
 	}
 
@@ -913,7 +1033,7 @@ again:
 
 		list = list->next;
 	}
-	free_failed_list(&head);
+	free_list(resource_list_x, &head);
 
 	goto again;
 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1 v4] PCI: allocate essential resources before reserving hotplug resources
  2011-01-20  1:00                                         ` [PATCH 1/1 v4] " Ram Pai
@ 2011-01-21  1:22                                           ` Yinghai Lu
  2011-01-21  7:17                                             ` Ram Pai
  0 siblings, 1 reply; 32+ messages in thread
From: Yinghai Lu @ 2011-01-21  1:22 UTC (permalink / raw)
  To: Ram Pai
  Cc: Jesse Barnes, Bjorn Helgaas, linux-pci, linux-kernel, clemens,
	Linus Torvalds, peter.henriksson, ebiederm

On Wed, Jan 19, 2011 at 5:00 PM, Ram Pai <linuxram@us.ibm.com> wrote:
>    PCI: pre-allocate additional resources to devices only after successful
>            allocation of essential resources.
>
>    Linux tries to pre-allocate minimal resources to hotplug bridges. This
>    works fine as long as there are enough resources  to satisfy all other
>    genuine resource requirements. However if enough resources are not
>    available to satisfy any of these nice-to-have pre-allocations, the
>    resource-allocator reports errors and returns failure.
>
>    This patch distinguishes between must-have resource from nice-to-have
>    resource.  Any failure to allocate nice-to-have resources are ignored.
>
>    This behavior can be particularly useful to trigger automatic
>    reallocation when the OS discovers genuine allocation-conflicts
>    or genuine unallocated-requests caused by buggy allocation behavior
>    of the native BIOS/uEFI.
>
>    https://bugzilla.kernel.org/show_bug.cgi?id=15960 captures the movitation
>    behind the patch.
>
>    changelog v2:  o  fixed a bug where pci_assign_resource() was called on a
>                      resource of zero resource size.
>
>    changelog v3:  addressed Bjorn's comment
>               o  "Please don't indent and right-justify the changelog".
>               o  removed add_size from struct resource.  The additional
>                  size is now tracked using a linked list.
>
>    changelog v4:  o moved freeing up of elements of head list from
>                  assign_requested_resources_sorted() to
>                __assign_resources_sorted(). This fixes a corruption bug.
>               o removed a wrong reference to 'add_size' in
>                   pbus_size_mem(). Erroneously got introduced while
>                   generating the patch.
>               o some code optimizations in adjust_resources_sorted()
>                   and assign_requested_resources_sorted()
>
> Signed-off-by: Ram Pai <linuxram@us.ibm.com>
>
> diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
> index 66cb8f4..efbdff2 100644
> --- a/drivers/pci/setup-bus.c
> +++ b/drivers/pci/setup-bus.c
> @@ -33,11 +33,23 @@ struct resource_list_x {
>        struct pci_dev *dev;
>        resource_size_t start;
>        resource_size_t end;
> +       resource_size_t add_size;
>        unsigned long flags;
>  };
>
> -static void add_to_failed_list(struct resource_list_x *head,
> -                                struct pci_dev *dev, struct resource *res)
> +#define free_list(type, head) do {                      \
> +       struct type *list, *tmp;                        \
> +       for (list = (head)->next; list;) {              \
> +               tmp = list;                             \
> +               list = list->next;                      \
> +               kfree(tmp);                             \
> +       }                                               \
> +       (head)->next = NULL;                            \
> +} while (0)

inline function should be better?

Yinghai

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1 v4] PCI: allocate essential resources before reserving hotplug resources
  2011-01-21  1:22                                           ` Yinghai Lu
@ 2011-01-21  7:17                                             ` Ram Pai
  0 siblings, 0 replies; 32+ messages in thread
From: Ram Pai @ 2011-01-21  7:17 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Ram Pai, Jesse Barnes, Bjorn Helgaas, linux-pci, linux-kernel,
	clemens, Linus Torvalds, peter.henriksson, ebiederm

On Thu, Jan 20, 2011 at 05:22:02PM -0800, Yinghai Lu wrote:
> On Wed, Jan 19, 2011 at 5:00 PM, Ram Pai <linuxram@us.ibm.com> wrote:
> >    PCI: pre-allocate additional resources to devices only after successful
> >            allocation of essential resources.
> >
> >    Linux tries to pre-allocate minimal resources to hotplug bridges. This
> >    works fine as long as there are enough resources  to satisfy all other
> >    genuine resource requirements. However if enough resources are not
> >    available to satisfy any of these nice-to-have pre-allocations, the
> >    resource-allocator reports errors and returns failure.
> >
> >    This patch distinguishes between must-have resource from nice-to-have
> >    resource.  Any failure to allocate nice-to-have resources are ignored.
> >
> >    This behavior can be particularly useful to trigger automatic
> >    reallocation when the OS discovers genuine allocation-conflicts
> >    or genuine unallocated-requests caused by buggy allocation behavior
> >    of the native BIOS/uEFI.
> >
> >    https://bugzilla.kernel.org/show_bug.cgi?id=15960 captures the movitation
> >    behind the patch.
> >
> >    changelog v2:  o  fixed a bug where pci_assign_resource() was called on a
> >                      resource of zero resource size.
> >
> >    changelog v3:  addressed Bjorn's comment
> >               o  "Please don't indent and right-justify the changelog".
> >               o  removed add_size from struct resource.  The additional
> >                  size is now tracked using a linked list.
> >
> >    changelog v4:  o moved freeing up of elements of head list from
> >                  assign_requested_resources_sorted() to
> >                __assign_resources_sorted(). This fixes a corruption bug.
> >               o removed a wrong reference to 'add_size' in
> >                   pbus_size_mem(). Erroneously got introduced while
> >                   generating the patch.
> >               o some code optimizations in adjust_resources_sorted()
> >                   and assign_requested_resources_sorted()
> >
> > Signed-off-by: Ram Pai <linuxram@us.ibm.com>
> >
> > diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
> > index 66cb8f4..efbdff2 100644
> > --- a/drivers/pci/setup-bus.c
> > +++ b/drivers/pci/setup-bus.c
> > @@ -33,11 +33,23 @@ struct resource_list_x {
> >        struct pci_dev *dev;
> >        resource_size_t start;
> >        resource_size_t end;
> > +       resource_size_t add_size;
> >        unsigned long flags;
> >  };
> >
> > -static void add_to_failed_list(struct resource_list_x *head,
> > -                                struct pci_dev *dev, struct resource *res)
> > +#define free_list(type, head) do {                      \
> > +       struct type *list, *tmp;                        \
> > +       for (list = (head)->next; list;) {              \
> > +               tmp = list;                             \
> > +               list = list->next;                      \
> > +               kfree(tmp);                             \
> > +       }                                               \
> > +       (head)->next = NULL;                            \
> > +} while (0)
> 
> inline function should be better?

I thought about it and decided to use the macro since the 'head' can
be either a resouce_list_x pointer or a resouce_list pointer.

A datastructure agonistic inline function would not be clean 
enough.

RP

> 
> Yinghai

^ permalink raw reply	[flat|nested] 32+ messages in thread

end of thread, other threads:[~2011-01-21  7:17 UTC | newest]

Thread overview: 32+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-10-06 22:58 [RFC v2 PATCH 1/1] PCI: override BIOS/firmware resource allocation Ram Pai
2010-10-06 23:39 ` Bjorn Helgaas
2010-10-07  0:30   ` Ram Pai
2010-10-07  4:13     ` Bjorn Helgaas
2010-10-07 20:42       ` Ram Pai
2010-10-07 21:41         ` Bjorn Helgaas
2010-10-08 17:32           ` Ram Pai
2010-10-08 20:16             ` Bjorn Helgaas
2010-10-12  7:05               ` Ram Pai
2010-10-12 19:01                 ` Bjorn Helgaas
2010-10-18 20:10                   ` Jesse Barnes
2010-10-19 17:17                     ` Ram Pai
2010-10-19 18:24                       ` Jesse Barnes
2010-10-22  0:28                         ` Ram Pai
2010-10-22 17:55                           ` Bjorn Helgaas
2010-10-22 18:59                             ` Ram Pai
2010-10-22 21:49                               ` Bjorn Helgaas
2010-10-22 17:16                         ` [PATCH 1/1] PCI: ignore failure to preallocate minimal resources to hotplug bridges Ram Pai
2010-10-22 22:16                           ` Bjorn Helgaas
2011-01-07 22:32                           ` Jesse Barnes
2011-01-11 21:10                             ` Ram Pai
2011-01-14 18:19                               ` [PATCH 1/1] PCI: allocate essential resources before reserving hotplug resources Ram Pai
2011-01-18 20:52                                 ` Bjorn Helgaas
2011-01-18 21:42                                   ` Ram Pai
2011-01-18 22:11                                     ` Bjorn Helgaas
2011-01-19 19:58                                       ` [PATCH 1/1 v3] " Ram Pai
2011-01-20  1:00                                         ` [PATCH 1/1 v4] " Ram Pai
2011-01-21  1:22                                           ` Yinghai Lu
2011-01-21  7:17                                             ` Ram Pai
2011-01-18 21:30                                 ` [PATCH 1/1 Version 2.0] " Ram Pai
2011-01-18 21:46                                   ` Bjorn Helgaas
2011-01-18 22:03                                     ` Ram Pai

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).