LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [Fwd: [PATCH] x86_64:  (NEW) Dynamically allocate arch specific system vectors]
@ 2008-08-08 15:37 Alan Mayer
  2008-08-11 16:59 ` [PATCH] x86_64: (NEW) Dynamically allocate arch specific system vectors Ingo Molnar
  0 siblings, 1 reply; 35+ messages in thread
From: Alan Mayer @ 2008-08-08 15:37 UTC (permalink / raw)
  To: Eric W. Biederman, jeremy, rusty, suresh.b.siddha, mingo,
	torvalds, linux-kernel, Dean Nelson, Cliff Wickman


Subject: [PATCH] x86_64:  (NEW) Dynamically allocate arch specific 
system vectors

From: Alan Mayer <ajm@sgi.com>

On some systems (e.g., UV) it is necessary to use an interrupt vector as a
"system" vector, that is, a vector generated by system hardware rather than
by an IO device.  This patch dynamically allocates such vectors from the pool
of interrupt vectors below the fixed system vectors.  Doing so may take
vectors away from the device interrupt vector pool, so they are allocated
dynamically rather than reserved up front, and other archs don't have to pay
the price.  On UV, examples of the hardware and software subsystems that need
dynamically allocated vectors are the GRU, the BAU, and XPM/XPC.

Signed-off-by: Alan Mayer <ajm@sgi.com>

Reviewed-by: Robin Holt <holt@sgi.com>
Reviewed-by: Dean Nelson <dcn@sgi.com>
Reviewed-by: Cliff Wickman <cpw@sgi.com>

---
Index: linuxnext.latest/arch/x86/kernel/io_apic_64.c
===================================================================
--- linuxnext.latest.orig/arch/x86/kernel/io_apic_64.c	2008-08-07 
09:46:37.000000000 -0500
+++ linuxnext.latest/arch/x86/kernel/io_apic_64.c	2008-08-07 
13:26:18.000000000 -0500
@@ -85,10 +85,6 @@

  static int assign_irq_vector(int irq, cpumask_t mask);

-int first_system_vector = 0xfe;
-
-char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = 
SYS_VECTOR_FREE};
-
  #define __apicdebuginit  __init

  int sis_apic_bug; /* not actually supported, dummy for compile */
@@ -770,7 +766,7 @@
  	return irq;
  }

-static int __assign_irq_vector(int irq, cpumask_t mask)
+static int __assign_irq_vector(int irq, int priority, cpumask_t *mask)
  {
  	/*
  	 * NOTE! The local APIC isn't very good at handling
@@ -783,63 +779,99 @@
  	 * Also, we've got to be careful not to trash gate
  	 * 0x80, because int 0x80 is hm, kind of importantish. ;)
  	 */
-	static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
+	static int current_device_vector = FIRST_DYNAMIC_VECTOR;
+	static int current_device_offset;	/* initially 0 */
+	int current_vector;
+	int current_offset;
  	unsigned int old_vector;
-	int cpu;
+	cpumask_t target_cpu_mask;
+	int target_cpu;
+	cpumask_t domain_cpu_mask;
  	struct irq_cfg *cfg;

  	BUG_ON((unsigned)irq >= NR_IRQS);
  	cfg = &irq_cfg[irq];

-	/* Only try and allocate irqs on cpus that are present */
-	cpus_and(mask, mask, cpu_online_map);
-
  	if ((cfg->move_in_progress) || cfg->move_cleanup_count)
  		return -EBUSY;

+	if (priority == IRQ_PRIORITY_NONE) {
+		/* Only try and allocate irqs on cpus that are online */
+		cpus_and(target_cpu_mask, *mask, cpu_online_map);
+
+		current_vector = current_device_vector;
+		current_offset = current_device_offset;
+	} else {
+		cpus_and(target_cpu_mask, *mask, cpu_possible_map);
+		domain_cpu_mask = target_cpu_mask;
+
+		if (priority == IRQ_PRIORITY_HIGH)
+			current_vector = first_fixed_system_vector;
+		else if (priority == IRQ_PRIORITY_LOW)
+			current_vector = FIRST_DYNAMIC_VECTOR - 1;
+		else
+			BUG();
+		current_offset = 0;
+	}
+
  	old_vector = cfg->vector;
  	if (old_vector) {
  		cpumask_t tmp;
-		cpus_and(tmp, cfg->domain, mask);
+		cpus_and(tmp, cfg->domain, target_cpu_mask);
  		if (!cpus_empty(tmp))
  			return 0;
  	}

-	for_each_cpu_mask_nr(cpu, mask) {
-		cpumask_t domain, new_mask;
-		int new_cpu;
+	for_each_cpu_mask_nr(target_cpu, target_cpu_mask) {
+		int domain_cpu;
  		int vector, offset;

-		domain = vector_allocation_domain(cpu);
-		cpus_and(new_mask, domain, cpu_online_map);
+		if (priority == IRQ_PRIORITY_NONE) {
+			domain_cpu_mask = vector_allocation_domain(target_cpu);
+			cpus_and(domain_cpu_mask, domain_cpu_mask,
+				 cpu_online_map);
+		}

  		vector = current_vector;
  		offset = current_offset;
  next:
-		vector += 8;
-		if (vector >= first_system_vector) {
-			/* If we run out of vectors on large boxen, must share them. */
-			offset = (offset + 1) % 8;
-			vector = FIRST_DEVICE_VECTOR + offset;
+		if (priority == IRQ_PRIORITY_HIGH) {
+			if (--vector < FIRST_DYNAMIC_VECTOR)
+				break;
+		} else if (priority == IRQ_PRIORITY_LOW) {
+			if (++vector == first_fixed_system_vector)
+				break;
+		} else {
+			vector += 8;
+			if (vector > last_dynamic_device_vector) {
+				/*
+				 * If we run out of vectors on large boxes,
+				 * must share them.
+				 */
+				offset = (offset + 1) % 8;
+				vector = FIRST_DYNAMIC_VECTOR + offset;
+			}
+			if (unlikely(current_vector == vector))
+				continue;
  		}
-		if (unlikely(current_vector == vector))
-			continue;
  		if (vector == IA32_SYSCALL_VECTOR)
  			goto next;
-		for_each_cpu_mask_nr(new_cpu, new_mask)
-			if (per_cpu(vector_irq, new_cpu)[vector] != -1)
+		for_each_cpu_mask_nr(domain_cpu, domain_cpu_mask)
+			if (per_cpu(vector_irq, domain_cpu)[vector] != -1)
  				goto next;
  		/* Found one! */
-		current_vector = vector;
-		current_offset = offset;
+		if (priority == IRQ_PRIORITY_NONE) {
+			current_device_vector = vector;
+			current_device_offset = offset;
+		}
  		if (old_vector) {
  			cfg->move_in_progress = 1;
  			cfg->old_domain = cfg->domain;
  		}
-		for_each_cpu_mask_nr(new_cpu, new_mask)
-			per_cpu(vector_irq, new_cpu)[vector] = irq;
+		for_each_cpu_mask_nr(domain_cpu, domain_cpu_mask)
+			per_cpu(vector_irq, domain_cpu)[vector] = irq;
  		cfg->vector = vector;
-		cfg->domain = domain;
+		cfg->domain = domain_cpu_mask;
  		return 0;
  	}
  	return -ENOSPC;
@@ -851,7 +883,7 @@
  	unsigned long flags;

  	spin_lock_irqsave(&vector_lock, flags);
-	err = __assign_irq_vector(irq, mask);
+	err = __assign_irq_vector(irq, IRQ_PRIORITY_NONE, &mask);
  	spin_unlock_irqrestore(&vector_lock, flags);
  	return err;
  }
@@ -2256,23 +2288,30 @@
  device_initcall(ioapic_init_sysfs);

  /*
- * Dynamic irq allocate and deallocation
+ * Dynamically allocate an irq vector mapping.
   */
-int create_irq(void)
+static int do_create_irq(int priority, cpumask_t *mask)
  {
  	/* Allocate an unused irq */
  	int irq;
  	int new;
  	unsigned long flags;
+	cpumask_t target_cpu_mask;

  	irq = -ENOSPC;
  	spin_lock_irqsave(&vector_lock, flags);
+
+	if (mask)
+		target_cpu_mask = *mask;
+	else
+		target_cpu_mask = TARGET_CPUS;
+
  	for (new = (NR_IRQS - 1); new >= 0; new--) {
  		if (platform_legacy_irq(new))
  			continue;
  		if (irq_cfg[new].vector != 0)
  			continue;
-		if (__assign_irq_vector(new, TARGET_CPUS) == 0)
+		if (__assign_irq_vector(new, priority, &target_cpu_mask) == 0)
  			irq = new;
  		break;
  	}
@@ -2284,6 +2323,17 @@
  	return irq;
  }

+/*
+ * Dynamically allocate an irq device vector mapping.
+ */
+int create_irq(void)
+{
+	return do_create_irq(IRQ_PRIORITY_NONE, NULL);
+}
+
+/*
+ * Free a dynamically allocated irq device vector mapping.
+ */
  void destroy_irq(unsigned int irq)
  {
  	unsigned long flags;
@@ -2299,6 +2349,106 @@
  }

  /*
+ * NOP functions
+ */
+static void noop(unsigned int irq)
+{
+}
+
+static unsigned int noop_ret(unsigned int irq)
+{
+	return 0;
+}
+
+static void ack_apic(unsigned int irq)
+{
+	ack_APIC_irq();
+}
+
+/*
+ * For dynamic allocation of system vectors where
+ * an ack_APIC_irq() is needed after handling the IRQ
+ */
+static struct irq_chip ack_apic_chip = {
+	.name		= "ack_apic",
+	.startup	= noop_ret,
+	.shutdown	= noop,
+	.enable		= noop,
+	.disable	= noop,
+	.ack		= noop,
+	.mask		= noop,
+	.unmask		= noop,
+	.eoi		= ack_apic,
+	.end		= noop,
+};
+
+/*
+ * Dynamically allocate an irq system vector mapping.
+ * (The irq is not to be shared.)
+ *
+ * After calling this function, the caller is responsible for any needed
+ * calls to:
+ *   set_irq_data(irq, &any_driver_data);
+ *   set_irq_type(irq, IRQ_TYPE...);
+ * Then make the call to request_irq() to create the irqaction:
+ *   request_irq(irq, interrupt_handler, irqflags, "devname", NULL);
+ *   You might consider the flag IRQF_NOBALANCING.
+ */
+int create_irq_system_vector(int priority, cpumask_t *mask, char *irq_name,
+			     int *assigned_vector)
+{
+	unsigned long flags;
+	int irq;
+
+	/* locate an available irq */
+	irq = do_create_irq(priority, mask);
+	if (irq < 0)
+		return irq;
+
+	spin_lock_irqsave(&vector_lock, flags);
+	set_irq_chip_and_handler_name(irq, &ack_apic_chip, handle_percpu_irq,
+				      irq_name);
+
+	spin_unlock_irqrestore(&vector_lock, flags);
+
+	*assigned_vector = irq_cfg[irq].vector;
+	return irq;
+}
+EXPORT_SYMBOL(create_irq_system_vector);
+
+/*
+ * Free a dynamically allocated irq system vector mapping.
+ *
+ * Before calling this function, the caller is responsible for calling
+ * free_irq(irq, dev_id);  to free the irqaction.
+ */
+void destroy_irq_system_vector(int irq)
+{
+	unsigned long flags;
+	int cpu;
+
+	if ((unsigned)irq >= NR_IRQS || irq_cfg[irq].vector == 0)
+		return;
+
+#ifdef CONFIG_SMP
+	synchronize_irq(irq);
+#endif
+	dynamic_irq_cleanup(irq);
+	disable_irq(irq);
+
+	spin_lock_irqsave(&vector_lock, flags);
+
+	for_each_cpu_mask_nr(cpu, irq_cfg[irq].domain)
+		per_cpu(vector_irq, cpu)[irq_cfg[irq].vector] = -1;
+
+	irq_cfg[irq].vector = 0;
+	cpus_clear(irq_cfg[irq].domain);
+
+	spin_unlock_irqrestore(&vector_lock, flags);
+}
+EXPORT_SYMBOL(destroy_irq_system_vector);
+
+/*
   * MSI message composition
   */
  #ifdef CONFIG_PCI_MSI
@@ -2533,7 +2683,7 @@
  {
  	int irq, ret;

-	irq = create_irq();
+	irq = do_create_irq(IRQ_PRIORITY_NONE, NULL);
  	if (irq < 0)
  		return irq;

@@ -2571,7 +2721,7 @@

  	sub_handle = 0;
  	list_for_each_entry(desc, &dev->msi_list, list) {
-		irq = create_irq();
+		irq = do_create_irq(IRQ_PRIORITY_NONE, NULL);
  		if (irq < 0)
  			return irq;
  #ifdef CONFIG_INTR_REMAP
Index: linuxnext.latest/include/linux/irq.h
===================================================================
--- linuxnext.latest.orig/include/linux/irq.h	2008-08-07 
09:46:37.000000000 -0500
+++ linuxnext.latest/include/linux/irq.h	2008-08-07 09:46:42.000000000 -0500
@@ -352,10 +352,14 @@
  extern void set_irq_noprobe(unsigned int irq);
  extern void set_irq_probe(unsigned int irq);

-/* Handle dynamic irq creation and destruction */
+/* Handle dynamic irq device vector mapping and unmapping */
  extern int create_irq(void);
  extern void destroy_irq(unsigned int irq);

+/* Handle dynamic irq system vector mapping and unmapping */
+extern int create_irq_system_vector(int, cpumask_t *, char *, int *);
+extern void destroy_irq_system_vector(int);
+
  /* Test to see if a driver has successfully requested an irq */
  static inline int irq_has_action(unsigned int irq)
  {
Index: linuxnext.latest/include/asm-x86/irq_vectors.h
===================================================================
--- linuxnext.latest.orig/include/asm-x86/irq_vectors.h	2008-08-07 
09:46:37.000000000 -0500
+++ linuxnext.latest/include/asm-x86/irq_vectors.h	2008-08-07 
09:46:42.000000000 -0500
@@ -91,14 +91,40 @@
  #define LOCAL_TIMER_VECTOR	0xef

  /*
+ * The first device or system vector (lowest numbered) available for 
dynamic
+ * allocation is defined by FIRST_DYNAMIC_VECTOR.
+ *
+ * The last device vector available for dynamic allocation is defined by
+ * last_dynamic_device_vector, which is initially set to
+ * LAST_DYNAMIC_DEVICE_VECTOR.
+ *
+ * The last system vector available for dynamic allocation is defined by
+ * first_fixed_system_vector - 1. The variable first_fixed_system_vector
+ * is initially set to FIRST_FIXED_SYSTEM_VECTOR.
+ *
+ * SGI-UV uses LAST_UV_DYNAMIC_DEVICE_VECTOR to reserve a range of
+ * vectors that falls between the first_fixed_system_vector and
+ * last_dynamic_device_vector for dynamic system vector allocations.
+ */
+#define FIRST_FIXED_SYSTEM_VECTOR		0xfe
+#define LAST_DYNAMIC_DEVICE_VECTOR		FIRST_FIXED_SYSTEM_VECTOR
+#define LAST_UV_DYNAMIC_DEVICE_VECTOR		0xe0
+#define IRQ_PRIORITY_NONE			1
+#define IRQ_PRIORITY_LOW			2
+#define IRQ_PRIORITY_HIGH			3
+
+/*
   * First APIC vector available to drivers: (vectors 0x30-0xee) we
   * start at 0x31(0x41) to spread out vectors evenly between priority
   * levels. (0x80 is the syscall vector)
+ *
+ * Device vectors are dynamically allocated as numbers in the range of
+ * FIRST_DYNAMIC_VECTOR to last_dynamic_device_vector (inclusive).
   */
  #ifdef CONFIG_X86_32
-# define FIRST_DEVICE_VECTOR	0x31
+# define FIRST_DYNAMIC_VECTOR	0x31
  #else
-# define FIRST_DEVICE_VECTOR	(IRQ15_VECTOR + 2)
+# define FIRST_DYNAMIC_VECTOR	(IRQ15_VECTOR + 2)
  #endif

  #define NR_VECTORS		256
Index: linuxnext.latest/arch/x86/kernel/io_apic_32.c
===================================================================
--- linuxnext.latest.orig/arch/x86/kernel/io_apic_32.c	2008-08-07 
09:46:37.000000000 -0500
+++ linuxnext.latest/arch/x86/kernel/io_apic_32.c	2008-08-07 
13:32:37.000000000 -0500
@@ -1165,11 +1165,15 @@
  }

  /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
-static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { 
FIRST_DEVICE_VECTOR , 0 };
+static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = {
+	FIRST_DYNAMIC_VECTOR,
+	0
+};

  static int __assign_irq_vector(int irq)
  {
-	static int current_vector = FIRST_DEVICE_VECTOR, current_offset;
+	static int current_vector = FIRST_DYNAMIC_VECTOR;
+	static int current_offset;
  	int vector, offset;

  	BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
@@ -1181,9 +1185,9 @@
  	offset = current_offset;
  next:
  	vector += 8;
-	if (vector >= first_system_vector) {
+	if (vector > last_dynamic_device_vector) {
  		offset = (offset + 1) % 8;
-		vector = FIRST_DEVICE_VECTOR + offset;
+		vector = FIRST_DYNAMIC_VECTOR + offset;
  	}
  	if (vector == current_vector)
  		return -ENOSPC;
@@ -2314,7 +2318,7 @@
  	int i;

  	/* Reserve all the system vectors. */
-	for (i = first_system_vector; i < NR_VECTORS; i++)
+	for (i = last_dynamic_device_vector + 1; i < NR_VECTORS; i++)
  		set_bit(i, used_vectors);

  	enable_IO_APIC();
@@ -2435,9 +2439,9 @@
  device_initcall(ioapic_init_sysfs);

  /*
- * Dynamic irq allocate and deallocation
+ * Dynamically allocate an irq vector mapping.
   */
-int create_irq(void)
+static int do_create_irq(int priority, cpumask_t *mask)
  {
  	/* Allocate an unused irq */
  	int irq, new, vector = 0;
@@ -2464,6 +2468,17 @@
  	return irq;
  }

+/*
+ * Dynamically allocate an irq device vector mapping.
+ */
+int create_irq(void)
+{
+	return do_create_irq(IRQ_PRIORITY_NONE, NULL);
+}
+
+/*
+ * Free a dynamically allocated irq device vector mapping.
+ */
  void destroy_irq(unsigned int irq)
  {
  	unsigned long flags;
@@ -2560,7 +2575,7 @@
  {
  	struct msi_msg msg;
  	int irq, ret;
-	irq = create_irq();
+	irq = do_create_irq(IRQ_PRIORITY_NONE, NULL);
  	if (irq < 0)
  		return irq;

Index: linuxnext.latest/include/asm-x86/desc.h
===================================================================
--- linuxnext.latest.orig/include/asm-x86/desc.h	2008-08-07 
09:46:37.000000000 -0500
+++ linuxnext.latest/include/asm-x86/desc.h	2008-08-07 
09:46:42.000000000 -0500
@@ -310,22 +310,25 @@
  #define SYS_VECTOR_FREE		0
  #define SYS_VECTOR_ALLOCED	1

-extern int first_system_vector;
-extern char system_vectors[];
-
-static inline void alloc_system_vector(int vector)
-{
-	if (system_vectors[vector] == SYS_VECTOR_FREE) {
-		system_vectors[vector] = SYS_VECTOR_ALLOCED;
-		if (first_system_vector > vector)
-			first_system_vector = vector;
+extern int last_dynamic_device_vector;
+extern int first_fixed_system_vector;
+extern char fixed_system_vectors[];
+
+static inline void alloc_fixed_system_vector(int vector)
+{
+	if (fixed_system_vectors[vector] == SYS_VECTOR_FREE) {
+		fixed_system_vectors[vector] = SYS_VECTOR_ALLOCED;
+		if (first_fixed_system_vector > vector)
+			first_fixed_system_vector = vector;
+		if (last_dynamic_device_vector >= vector)
+			last_dynamic_device_vector = vector - 1;
  	} else
  		BUG();
  }

  static inline void alloc_intr_gate(unsigned int n, void *addr)
  {
-	alloc_system_vector(n);
+	alloc_fixed_system_vector(n);
  	set_intr_gate(n, addr);
  }

Index: linuxnext.latest/arch/x86/kernel/apic_32.c
===================================================================
--- linuxnext.latest.orig/arch/x86/kernel/apic_32.c	2008-08-07 
09:46:37.000000000 -0500
+++ linuxnext.latest/arch/x86/kernel/apic_32.c	2008-08-07 
09:46:42.000000000 -0500
@@ -68,9 +68,11 @@
  int local_apic_timer_c2_ok;
  EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);

-int first_system_vector = 0xfe;
-
-char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = 
SYS_VECTOR_FREE};
+int last_dynamic_device_vector = LAST_DYNAMIC_DEVICE_VECTOR;
+int first_fixed_system_vector = FIRST_FIXED_SYSTEM_VECTOR;
+char fixed_system_vectors[NR_VECTORS] = {
+	[0 ... NR_VECTORS-1] = SYS_VECTOR_FREE
+};

  /*
   * Debug level, exported for io_apic.c
@@ -1361,7 +1363,7 @@
  	 * IRQ0 must be given a fixed assignment and initialized,
  	 * because it's used before the IO-APIC is set up.
  	 */
-	set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
+	set_intr_gate(FIRST_DYNAMIC_VECTOR, interrupt[0]);

  	/*
  	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
Index: linuxnext.latest/arch/x86/kernel/irqinit_64.c
===================================================================
--- linuxnext.latest.orig/arch/x86/kernel/irqinit_64.c	2008-08-07 
09:46:37.000000000 -0500
+++ linuxnext.latest/arch/x86/kernel/irqinit_64.c	2008-08-07 
09:46:42.000000000 -0500
@@ -22,6 +22,7 @@
  #include <asm/desc.h>
  #include <asm/apic.h>
  #include <asm/i8259.h>
+#include <asm/genapic.h>

  /*
   * Common place to define all x86 IRQ vectors
@@ -217,6 +218,11 @@
  	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
  	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);

+	if (is_uv_system() &&
+	    LAST_UV_DYNAMIC_DEVICE_VECTOR < last_dynamic_device_vector) {
+		last_dynamic_device_vector = LAST_UV_DYNAMIC_DEVICE_VECTOR;
+	}
+
  	if (!acpi_ioapic)
  		setup_irq(2, &irq2);
  }
Index: linuxnext.latest/arch/x86/kernel/vmiclock_32.c
===================================================================
--- linuxnext.latest.orig/arch/x86/kernel/vmiclock_32.c	2008-08-07 
09:46:37.000000000 -0500
+++ linuxnext.latest/arch/x86/kernel/vmiclock_32.c	2008-08-07 
09:46:42.000000000 -0500
@@ -81,7 +81,7 @@
  static inline unsigned int vmi_get_timer_vector(void)
  {
  #ifdef CONFIG_X86_IO_APIC
-	return FIRST_DEVICE_VECTOR;
+	return FIRST_DYNAMIC_VECTOR;
  #else
  	return FIRST_EXTERNAL_VECTOR;
  #endif
Index: linuxnext.latest/arch/x86/kernel/apic_64.c
===================================================================
--- linuxnext.latest.orig/arch/x86/kernel/apic_64.c	2008-08-07 
09:46:37.000000000 -0500
+++ linuxnext.latest/arch/x86/kernel/apic_64.c	2008-08-07 
09:46:42.000000000 -0500
@@ -33,6 +33,7 @@
  #include <asm/smp.h>
  #include <asm/mtrr.h>
  #include <asm/mpspec.h>
+#include <asm/desc.h>
  #include <asm/hpet.h>
  #include <asm/pgalloc.h>
  #include <asm/nmi.h>
@@ -58,6 +59,13 @@
  int local_apic_timer_c2_ok;
  EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);

+int last_dynamic_device_vector = LAST_DYNAMIC_DEVICE_VECTOR;
+int first_fixed_system_vector = FIRST_FIXED_SYSTEM_VECTOR;
+char fixed_system_vectors[NR_VECTORS] = {
+	[0 ... NR_VECTORS-1] = SYS_VECTOR_FREE
+};
+
+
  /*
   * Debug level, exported for io_apic.c
   */
Index: linuxnext.latest/kernel/irq/chip.c
===================================================================
--- linuxnext.latest.orig/kernel/irq/chip.c	2008-08-07 
09:46:37.000000000 -0500
+++ linuxnext.latest/kernel/irq/chip.c	2008-08-07 09:46:42.000000000 -0500
@@ -78,6 +78,7 @@
  	desc->chip_data = NULL;
  	desc->handle_irq = handle_bad_irq;
  	desc->chip = &no_irq_chip;
+	desc->name = "none";
  	spin_unlock_irqrestore(&desc->lock, flags);
  }


-- 
Somebody just stopped callin' you "Angel."
--
Alan J. Mayer
SGI
ajm@sgi.com
WORK: 651-683-3131
HOME: 651-407-0134
--

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] x86_64:  (NEW) Dynamically allocate arch specific system vectors
  2008-08-08 15:37 [Fwd: [PATCH] x86_64: (NEW) Dynamically allocate arch specific system vectors] Alan Mayer
@ 2008-08-11 16:59 ` Ingo Molnar
  2008-08-11 17:14   ` Alan Mayer
  0 siblings, 1 reply; 35+ messages in thread
From: Ingo Molnar @ 2008-08-11 16:59 UTC (permalink / raw)
  To: Alan Mayer
  Cc: Eric W. Biederman, jeremy, rusty, suresh.b.siddha, torvalds,
	linux-kernel, Dean Nelson, Cliff Wickman, H. Peter Anvin,
	Thomas Gleixner


* Alan Mayer <ajm@sgi.com> wrote:

> Subject: [PATCH] x86_64: (NEW) Dynamically allocate arch specific 
> system vectors
>
> From: Alan Mayer <ajm@sgi.com>
>
> On some systems (e. g., UV) it is necessary to use an interrupt vector 
> as a "system" vector, that is, it is generated by system hardware, not 
> an IO device.  This patch dynamically allocates them from the pool of 
> interrupt vectors below the fixed system vectors.  This may include 
> stealing some from the device interrupt vector pool, so they are 
> allocated dynamically so that other archs don't have to pay the price.  
> In UV, examples of these hardware and software systems that need 
> dynamically allocated vectors are the GRU, the BAU, and XPM/XPC.

patch has severe inlined-as-text corruption, please check 
Documentation/email-clients.txt about how to send patches. (or send it 
as an attachment, i can process that)

Also, given the extensive feedback from Eric, it would be nice to have 
his Acked-by line as well to any patch that is resubmitted for 
inclusion, to make sure you meet all the requirements he has outlined.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] x86_64:  (NEW) Dynamically allocate arch specific system vectors
  2008-08-11 16:59 ` [PATCH] x86_64: (NEW) Dynamically allocate arch specific system vectors Ingo Molnar
@ 2008-08-11 17:14   ` Alan Mayer
  2008-08-11 19:39     ` Eric W. Biederman
  0 siblings, 1 reply; 35+ messages in thread
From: Alan Mayer @ 2008-08-11 17:14 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Eric W. Biederman, jeremy, rusty, suresh.b.siddha, torvalds,
	linux-kernel, Dean Nelson, Cliff Wickman, H. Peter Anvin,
	Thomas Gleixner

[-- Attachment #1: Type: text/plain, Size: 1482 bytes --]

Okay, here it is as an attachment.  I think my email client is munging it.
I haven't been able to fix it, apparently.

I, too, would like to know what Eric thinks.

		--ajm

Ingo Molnar wrote:
> * Alan Mayer <ajm@sgi.com> wrote:
> 
>> Subject: [PATCH] x86_64: (NEW) Dynamically allocate arch specific 
>> system vectors
>>
>> From: Alan Mayer <ajm@sgi.com>
>>
>> On some systems (e. g., UV) it is necessary to use an interrupt vector 
>> as a "system" vector, that is, it is generated by system hardware, not 
>> an IO device.  This patch dynamically allocates them from the pool of 
>> interrupt vectors below the fixed system vectors.  This may include 
>> stealing some from the device interrupt vector pool, so they are 
>> allocated dynamically so that other archs don't have to pay the price.  
>> In UV, examples of these hardware and software systems that need 
>> dynamically allocated vectors are the GRU, the BAU, and XPM/XPC.
> 
> patch has severe inlined-as-text corruption, please check 
> Documentation/email-clients.txt about how to send patches. (or send it 
> as an attachment, i can process that)
> 
> Also, given the extensive feedback from Eric, it would be nice to have 
> his Acked-by line as well to any patch that is resubmitted for 
> inclusion, to make sure you meet all the requirements he has outlined.
> 
> Thanks,
> 
> 	Ingo

-- 
I know
it's only rock and roll,
But I like it.
--
Alan J. Mayer
SGI
ajm@sgi.com
WORK: 651-683-3131
HOME: 651-407-0134
--

[-- Attachment #2: system_irq_vector_patch2 --]
[-- Type: text/plain, Size: 19313 bytes --]

Subject: [PATCH] x86_64:  (NEW) Dynamically allocate arch specific system vectors

From: Alan Mayer <ajm@sgi.com>

On some systems (e.g., UV) it is necessary to use an interrupt vector as a
"system" vector, that is, a vector generated by system hardware rather than
by an IO device.  This patch dynamically allocates such vectors from the pool
of interrupt vectors below the fixed system vectors.  Doing so may take
vectors away from the device interrupt vector pool, so they are allocated
dynamically rather than reserved up front, and other archs don't have to pay
the price.  On UV, examples of the hardware and software subsystems that need
dynamically allocated vectors are the GRU, the BAU, and XPM/XPC.

Signed-off-by: Alan Mayer <ajm@sgi.com>

Reviewed-by: Robin Holt <holt@sgi.com>
Reviewed-by: Dean Nelson <dcn@sgi.com>
Reviewed-by: Cliff Wickman <cpw@sgi.com>

---
Index: linuxnext.latest/arch/x86/kernel/io_apic_64.c
===================================================================
--- linuxnext.latest.orig/arch/x86/kernel/io_apic_64.c	2008-08-07 09:46:37.000000000 -0500
+++ linuxnext.latest/arch/x86/kernel/io_apic_64.c	2008-08-07 13:26:18.000000000 -0500
@@ -85,10 +85,6 @@
 
 static int assign_irq_vector(int irq, cpumask_t mask);
 
-int first_system_vector = 0xfe;
-
-char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
-
 #define __apicdebuginit  __init
 
 int sis_apic_bug; /* not actually supported, dummy for compile */
@@ -770,7 +766,7 @@
 	return irq;
 }
 
-static int __assign_irq_vector(int irq, cpumask_t mask)
+static int __assign_irq_vector(int irq, int priority, cpumask_t *mask)
 {
 	/*
 	 * NOTE! The local APIC isn't very good at handling
@@ -783,63 +779,99 @@
 	 * Also, we've got to be careful not to trash gate
 	 * 0x80, because int 0x80 is hm, kind of importantish. ;)
 	 */
-	static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
+	static int current_device_vector = FIRST_DYNAMIC_VECTOR;
+	static int current_device_offset;	/* initially 0 */
+	int current_vector;
+	int current_offset;
 	unsigned int old_vector;
-	int cpu;
+	cpumask_t target_cpu_mask;
+	int target_cpu;
+	cpumask_t domain_cpu_mask;
 	struct irq_cfg *cfg;
 
 	BUG_ON((unsigned)irq >= NR_IRQS);
 	cfg = &irq_cfg[irq];
 
-	/* Only try and allocate irqs on cpus that are present */
-	cpus_and(mask, mask, cpu_online_map);
-
 	if ((cfg->move_in_progress) || cfg->move_cleanup_count)
 		return -EBUSY;
 
+	if (priority == IRQ_PRIORITY_NONE) {
+		/* Only try and allocate irqs on cpus that are online */
+		cpus_and(target_cpu_mask, *mask, cpu_online_map);
+
+		current_vector = current_device_vector;
+		current_offset = current_device_offset;
+	} else {
+		cpus_and(target_cpu_mask, *mask, cpu_possible_map);
+		domain_cpu_mask = target_cpu_mask;
+
+		if (priority == IRQ_PRIORITY_HIGH)
+			current_vector = first_fixed_system_vector;
+		else if (priority == IRQ_PRIORITY_LOW)
+			current_vector = FIRST_DYNAMIC_VECTOR - 1;
+		else
+			BUG();
+		current_offset = 0;
+	}
+
 	old_vector = cfg->vector;
 	if (old_vector) {
 		cpumask_t tmp;
-		cpus_and(tmp, cfg->domain, mask);
+		cpus_and(tmp, cfg->domain, target_cpu_mask);
 		if (!cpus_empty(tmp))
 			return 0;
 	}
 
-	for_each_cpu_mask_nr(cpu, mask) {
-		cpumask_t domain, new_mask;
-		int new_cpu;
+	for_each_cpu_mask_nr(target_cpu, target_cpu_mask) {
+		int domain_cpu;
 		int vector, offset;
 
-		domain = vector_allocation_domain(cpu);
-		cpus_and(new_mask, domain, cpu_online_map);
+		if (priority == IRQ_PRIORITY_NONE) {
+			domain_cpu_mask = vector_allocation_domain(target_cpu);
+			cpus_and(domain_cpu_mask, domain_cpu_mask,
+				 cpu_online_map);
+		}
 
 		vector = current_vector;
 		offset = current_offset;
 next:
-		vector += 8;
-		if (vector >= first_system_vector) {
-			/* If we run out of vectors on large boxen, must share them. */
-			offset = (offset + 1) % 8;
-			vector = FIRST_DEVICE_VECTOR + offset;
+		if (priority == IRQ_PRIORITY_HIGH) {
+			if (--vector < FIRST_DYNAMIC_VECTOR)
+				break;
+		} else if (priority == IRQ_PRIORITY_LOW) {
+			if (++vector == first_fixed_system_vector)
+				break;
+		} else {
+			vector += 8;
+			if (vector > last_dynamic_device_vector) {
+				/*
+				 * If we run out of vectors on large boxes,
+				 * must share them.
+				 */
+				offset = (offset + 1) % 8;
+				vector = FIRST_DYNAMIC_VECTOR + offset;
+			}
+			if (unlikely(current_vector == vector))
+				continue;
 		}
-		if (unlikely(current_vector == vector))
-			continue;
 		if (vector == IA32_SYSCALL_VECTOR)
 			goto next;
-		for_each_cpu_mask_nr(new_cpu, new_mask)
-			if (per_cpu(vector_irq, new_cpu)[vector] != -1)
+		for_each_cpu_mask_nr(domain_cpu, domain_cpu_mask)
+			if (per_cpu(vector_irq, domain_cpu)[vector] != -1)
 				goto next;
 		/* Found one! */
-		current_vector = vector;
-		current_offset = offset;
+		if (priority == IRQ_PRIORITY_NONE) {
+			current_device_vector = vector;
+			current_device_offset = offset;
+		}
 		if (old_vector) {
 			cfg->move_in_progress = 1;
 			cfg->old_domain = cfg->domain;
 		}
-		for_each_cpu_mask_nr(new_cpu, new_mask)
-			per_cpu(vector_irq, new_cpu)[vector] = irq;
+		for_each_cpu_mask_nr(domain_cpu, domain_cpu_mask)
+			per_cpu(vector_irq, domain_cpu)[vector] = irq;
 		cfg->vector = vector;
-		cfg->domain = domain;
+		cfg->domain = domain_cpu_mask;
 		return 0;
 	}
 	return -ENOSPC;
@@ -851,7 +883,7 @@
 	unsigned long flags;
 
 	spin_lock_irqsave(&vector_lock, flags);
-	err = __assign_irq_vector(irq, mask);
+	err = __assign_irq_vector(irq, IRQ_PRIORITY_NONE, &mask);
 	spin_unlock_irqrestore(&vector_lock, flags);
 	return err;
 }
@@ -2256,23 +2288,30 @@
 device_initcall(ioapic_init_sysfs);
 
 /*
- * Dynamic irq allocate and deallocation
+ * Dynamically allocate an irq vector mapping.
  */
-int create_irq(void)
+static int do_create_irq(int priority, cpumask_t *mask)
 {
 	/* Allocate an unused irq */
 	int irq;
 	int new;
 	unsigned long flags;
+	cpumask_t target_cpu_mask;
 
 	irq = -ENOSPC;
 	spin_lock_irqsave(&vector_lock, flags);
+
+	if (mask)
+		target_cpu_mask = *mask;
+	else
+		target_cpu_mask = TARGET_CPUS;
+
 	for (new = (NR_IRQS - 1); new >= 0; new--) {
 		if (platform_legacy_irq(new))
 			continue;
 		if (irq_cfg[new].vector != 0)
 			continue;
-		if (__assign_irq_vector(new, TARGET_CPUS) == 0)
+		if (__assign_irq_vector(new, priority, &target_cpu_mask) == 0)
 			irq = new;
 		break;
 	}
@@ -2284,6 +2323,17 @@
 	return irq;
 }
 
+/*
+ * Dynamically allocate an irq device vector mapping.
+ */
+int create_irq(void)
+{
+	return do_create_irq(IRQ_PRIORITY_NONE, NULL);
+}
+
+/*
+ * Free a dynamically allocated irq device vector mapping.
+ */
 void destroy_irq(unsigned int irq)
 {
 	unsigned long flags;
@@ -2299,6 +2349,106 @@
 }
 
 /*
+ * NOP functions
+ */
+static void noop(unsigned int irq)
+{
+}
+
+static unsigned int noop_ret(unsigned int irq)
+{
+	return 0;
+}
+
+static void ack_apic(unsigned int irq)
+{
+	ack_APIC_irq();
+}
+
+/*
+ * For dynamic allocation of system vectors where
+ * an ack_APIC_irq() is needed after handling the IRQ
+ */
+static struct irq_chip ack_apic_chip = {
+	.name		= "ack_apic",
+	.startup	= noop_ret,
+	.shutdown	= noop,
+	.enable		= noop,
+	.disable	= noop,
+	.ack		= noop,
+	.mask		= noop,
+	.unmask		= noop,
+	.eoi		= ack_apic,
+	.end		= noop,
+};
+
+/*
+ * Dynamically allocate an irq system vector mapping.
+ * (The irq is not to be shared.)
+ *
+ * After calling this function, the caller is responsible for any needed
+ * calls to:
+ *   set_irq_data(irq, &any_driver_data);
+ *   set_irq_type(irq, IRQ_TYPE...);
+ * Then make the call to request_irq() to create the irqaction:
+ *   request_irq(irq, interrupt_handler, irqflags, "devname", NULL);
+ *   You might consider the flag IRQF_NOBALANCING.
+ */
+int create_irq_system_vector(int priority, cpumask_t *mask, char *irq_name,
+			     int *assigned_vector)
+{
+	unsigned long flags;
+	int irq;
+
+	/* locate an available irq */
+	irq = do_create_irq(priority, mask);
+	if (irq < 0)
+		return irq;
+
+	spin_lock_irqsave(&vector_lock, flags);
+	set_irq_chip_and_handler_name(irq, &ack_apic_chip, handle_percpu_irq,
+				      irq_name);
+
+	spin_unlock_irqrestore(&vector_lock, flags);
+
+	*assigned_vector = irq_cfg[irq].vector;
+	return irq;
+}
+EXPORT_SYMBOL(create_irq_system_vector);
+
+/*
+ * Free a dynamically allocated irq system vector mapping.
+ *
+ * Before calling this function, the caller is responsible for calling
+ * free_irq(irq, dev_id);  to free the irqaction.
+ */
+void destroy_irq_system_vector(int irq)
+{
+	unsigned long flags;
+	int cpu;
+
+	if ((unsigned)irq >= NR_IRQS || irq_cfg[irq].vector == 0)
+		return;
+
+#ifdef CONFIG_SMP
+	synchronize_irq(irq);
+#endif
+	dynamic_irq_cleanup(irq);
+	disable_irq(irq);
+
+	spin_lock_irqsave(&vector_lock, flags);
+
+	for_each_cpu_mask_nr(cpu, irq_cfg[irq].domain)
+		per_cpu(vector_irq, cpu)[irq_cfg[irq].vector] = -1;
+
+	irq_cfg[irq].vector = 0;
+	cpus_clear(irq_cfg[irq].domain);
+
+	spin_unlock_irqrestore(&vector_lock, flags);
+}
+EXPORT_SYMBOL(destroy_irq_system_vector);
+
+/*
  * MSI message composition
  */
 #ifdef CONFIG_PCI_MSI
@@ -2533,7 +2683,7 @@
 {
 	int irq, ret;
 
-	irq = create_irq();
+	irq = do_create_irq(IRQ_PRIORITY_NONE, NULL);
 	if (irq < 0)
 		return irq;
 
@@ -2571,7 +2721,7 @@
 
 	sub_handle = 0;
 	list_for_each_entry(desc, &dev->msi_list, list) {
-		irq = create_irq();
+		irq = do_create_irq(IRQ_PRIORITY_NONE, NULL);
 		if (irq < 0)
 			return irq;
 #ifdef CONFIG_INTR_REMAP
Index: linuxnext.latest/include/linux/irq.h
===================================================================
--- linuxnext.latest.orig/include/linux/irq.h	2008-08-07 09:46:37.000000000 -0500
+++ linuxnext.latest/include/linux/irq.h	2008-08-07 09:46:42.000000000 -0500
@@ -352,10 +352,14 @@
 extern void set_irq_noprobe(unsigned int irq);
 extern void set_irq_probe(unsigned int irq);
 
-/* Handle dynamic irq creation and destruction */
+/* Handle dynamic irq device vector mapping and unmapping */
 extern int create_irq(void);
 extern void destroy_irq(unsigned int irq);
 
+/* Handle dynamic irq system vector mapping and unmapping */
+extern int create_irq_system_vector(int, cpumask_t *, char *, int *);
+extern void destroy_irq_system_vector(int);
+
 /* Test to see if a driver has successfully requested an irq */
 static inline int irq_has_action(unsigned int irq)
 {
Index: linuxnext.latest/include/asm-x86/irq_vectors.h
===================================================================
--- linuxnext.latest.orig/include/asm-x86/irq_vectors.h	2008-08-07 09:46:37.000000000 -0500
+++ linuxnext.latest/include/asm-x86/irq_vectors.h	2008-08-07 09:46:42.000000000 -0500
@@ -91,14 +91,40 @@
 #define LOCAL_TIMER_VECTOR	0xef
 
 /*
+ * The first device or system vector (lowest numbered) available for dynamic
+ * allocation is defined by FIRST_DYNAMIC_VECTOR.
+ *
+ * The last device vector available for dynamic allocation is defined by
+ * last_dynamic_device_vector, which is initially set to
+ * LAST_DYNAMIC_DEVICE_VECTOR.
+ *
+ * The last system vector available for dynamic allocation is defined by
+ * first_fixed_system_vector - 1. The variable first_fixed_system_vector
+ * is initially set to FIRST_FIXED_SYSTEM_VECTOR.
+ *
+ * SGI-UV uses LAST_UV_DYNAMIC_DEVICE_VECTOR to reserve a range of
+ * vectors that falls between the first_fixed_system_vector and
+ * last_dynamic_device_vector for dynamic system vector allocations.
+ */
+#define FIRST_FIXED_SYSTEM_VECTOR		0xfe
+#define LAST_DYNAMIC_DEVICE_VECTOR		FIRST_FIXED_SYSTEM_VECTOR
+#define LAST_UV_DYNAMIC_DEVICE_VECTOR		0xe0
+#define IRQ_PRIORITY_NONE			1
+#define IRQ_PRIORITY_LOW			2
+#define IRQ_PRIORITY_HIGH			3
+
+/*
  * First APIC vector available to drivers: (vectors 0x30-0xee) we
  * start at 0x31(0x41) to spread out vectors evenly between priority
  * levels. (0x80 is the syscall vector)
+ *
+ * Device vectors are dynamically allocated as numbers in the range of
+ * FIRST_DYNAMIC_VECTOR to last_dynamic_device_vector (inclusive).
  */
 #ifdef CONFIG_X86_32
-# define FIRST_DEVICE_VECTOR	0x31
+# define FIRST_DYNAMIC_VECTOR	0x31
 #else
-# define FIRST_DEVICE_VECTOR	(IRQ15_VECTOR + 2)
+# define FIRST_DYNAMIC_VECTOR	(IRQ15_VECTOR + 2)
 #endif
 
 #define NR_VECTORS		256
Index: linuxnext.latest/arch/x86/kernel/io_apic_32.c
===================================================================
--- linuxnext.latest.orig/arch/x86/kernel/io_apic_32.c	2008-08-07 09:46:37.000000000 -0500
+++ linuxnext.latest/arch/x86/kernel/io_apic_32.c	2008-08-07 13:32:37.000000000 -0500
@@ -1165,11 +1165,15 @@
 }
 
 /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
-static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
+static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = {
+	FIRST_DYNAMIC_VECTOR,
+	0
+};
 
 static int __assign_irq_vector(int irq)
 {
-	static int current_vector = FIRST_DEVICE_VECTOR, current_offset;
+	static int current_vector = FIRST_DYNAMIC_VECTOR;
+	static int current_offset;
 	int vector, offset;
 
 	BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
@@ -1181,9 +1185,9 @@
 	offset = current_offset;
 next:
 	vector += 8;
-	if (vector >= first_system_vector) {
+	if (vector > last_dynamic_device_vector) {
 		offset = (offset + 1) % 8;
-		vector = FIRST_DEVICE_VECTOR + offset;
+		vector = FIRST_DYNAMIC_VECTOR + offset;
 	}
 	if (vector == current_vector)
 		return -ENOSPC;
@@ -2314,7 +2318,7 @@
 	int i;
 
 	/* Reserve all the system vectors. */
-	for (i = first_system_vector; i < NR_VECTORS; i++)
+	for (i = last_dynamic_device_vector + 1; i < NR_VECTORS; i++)
 		set_bit(i, used_vectors);
 
 	enable_IO_APIC();
@@ -2435,9 +2439,9 @@
 device_initcall(ioapic_init_sysfs);
 
 /*
- * Dynamic irq allocate and deallocation
+ * Dynamically allocate an irq vector mapping.
  */
-int create_irq(void)
+static int do_create_irq(int priority, cpumask_t *mask)
 {
 	/* Allocate an unused irq */
 	int irq, new, vector = 0;
@@ -2464,6 +2468,17 @@
 	return irq;
 }
 
+/*
+ * Dynamically allocate an irq device vector mapping.
+ */
+int create_irq(void)
+{
+	return do_create_irq(IRQ_PRIORITY_NONE, NULL);
+}
+
+/*
+ * Free a dynamically allocated irq device vector mapping.
+ */
 void destroy_irq(unsigned int irq)
 {
 	unsigned long flags;
@@ -2560,7 +2575,7 @@
 {
 	struct msi_msg msg;
 	int irq, ret;
-	irq = create_irq();
+	irq = do_create_irq(IRQ_PRIORITY_NONE, NULL);
 	if (irq < 0)
 		return irq;
 
Index: linuxnext.latest/include/asm-x86/desc.h
===================================================================
--- linuxnext.latest.orig/include/asm-x86/desc.h	2008-08-07 09:46:37.000000000 -0500
+++ linuxnext.latest/include/asm-x86/desc.h	2008-08-07 09:46:42.000000000 -0500
@@ -310,22 +310,25 @@
 #define SYS_VECTOR_FREE		0
 #define SYS_VECTOR_ALLOCED	1
 
-extern int first_system_vector;
-extern char system_vectors[];
-
-static inline void alloc_system_vector(int vector)
-{
-	if (system_vectors[vector] == SYS_VECTOR_FREE) {
-		system_vectors[vector] = SYS_VECTOR_ALLOCED;
-		if (first_system_vector > vector)
-			first_system_vector = vector;
+extern int last_dynamic_device_vector;
+extern int first_fixed_system_vector;
+extern char fixed_system_vectors[];
+
+static inline void alloc_fixed_system_vector(int vector)
+{
+	if (fixed_system_vectors[vector] == SYS_VECTOR_FREE) {
+		fixed_system_vectors[vector] = SYS_VECTOR_ALLOCED;
+		if (first_fixed_system_vector > vector)
+			first_fixed_system_vector = vector;
+		if (last_dynamic_device_vector >= vector)
+			last_dynamic_device_vector = vector - 1;
 	} else
 		BUG();
 }
 
 static inline void alloc_intr_gate(unsigned int n, void *addr)
 {
-	alloc_system_vector(n);
+	alloc_fixed_system_vector(n);
 	set_intr_gate(n, addr);
 }
 
Index: linuxnext.latest/arch/x86/kernel/apic_32.c
===================================================================
--- linuxnext.latest.orig/arch/x86/kernel/apic_32.c	2008-08-07 09:46:37.000000000 -0500
+++ linuxnext.latest/arch/x86/kernel/apic_32.c	2008-08-07 09:46:42.000000000 -0500
@@ -68,9 +68,11 @@
 int local_apic_timer_c2_ok;
 EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
 
-int first_system_vector = 0xfe;
-
-char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
+int last_dynamic_device_vector = LAST_DYNAMIC_DEVICE_VECTOR;
+int first_fixed_system_vector = FIRST_FIXED_SYSTEM_VECTOR;
+char fixed_system_vectors[NR_VECTORS] = {
+	[0 ... NR_VECTORS-1] = SYS_VECTOR_FREE
+};
 
 /*
  * Debug level, exported for io_apic.c
@@ -1361,7 +1363,7 @@
 	 * IRQ0 must be given a fixed assignment and initialized,
 	 * because it's used before the IO-APIC is set up.
 	 */
-	set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
+	set_intr_gate(FIRST_DYNAMIC_VECTOR, interrupt[0]);
 
 	/*
 	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
Index: linuxnext.latest/arch/x86/kernel/irqinit_64.c
===================================================================
--- linuxnext.latest.orig/arch/x86/kernel/irqinit_64.c	2008-08-07 09:46:37.000000000 -0500
+++ linuxnext.latest/arch/x86/kernel/irqinit_64.c	2008-08-07 09:46:42.000000000 -0500
@@ -22,6 +22,7 @@
 #include <asm/desc.h>
 #include <asm/apic.h>
 #include <asm/i8259.h>
+#include <asm/genapic.h>
 
 /*
  * Common place to define all x86 IRQ vectors
@@ -217,6 +218,11 @@
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
 
+	if (is_uv_system() &&
+	    LAST_UV_DYNAMIC_DEVICE_VECTOR < last_dynamic_device_vector) {
+		last_dynamic_device_vector = LAST_UV_DYNAMIC_DEVICE_VECTOR;
+	}
+
 	if (!acpi_ioapic)
 		setup_irq(2, &irq2);
 }
Index: linuxnext.latest/arch/x86/kernel/vmiclock_32.c
===================================================================
--- linuxnext.latest.orig/arch/x86/kernel/vmiclock_32.c	2008-08-07 09:46:37.000000000 -0500
+++ linuxnext.latest/arch/x86/kernel/vmiclock_32.c	2008-08-07 09:46:42.000000000 -0500
@@ -81,7 +81,7 @@
 static inline unsigned int vmi_get_timer_vector(void)
 {
 #ifdef CONFIG_X86_IO_APIC
-	return FIRST_DEVICE_VECTOR;
+	return FIRST_DYNAMIC_VECTOR;
 #else
 	return FIRST_EXTERNAL_VECTOR;
 #endif
Index: linuxnext.latest/arch/x86/kernel/apic_64.c
===================================================================
--- linuxnext.latest.orig/arch/x86/kernel/apic_64.c	2008-08-07 09:46:37.000000000 -0500
+++ linuxnext.latest/arch/x86/kernel/apic_64.c	2008-08-07 09:46:42.000000000 -0500
@@ -33,6 +33,7 @@
 #include <asm/smp.h>
 #include <asm/mtrr.h>
 #include <asm/mpspec.h>
+#include <asm/desc.h>
 #include <asm/hpet.h>
 #include <asm/pgalloc.h>
 #include <asm/nmi.h>
@@ -58,6 +59,13 @@
 int local_apic_timer_c2_ok;
 EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
 
+int last_dynamic_device_vector = LAST_DYNAMIC_DEVICE_VECTOR;
+int first_fixed_system_vector = FIRST_FIXED_SYSTEM_VECTOR;
+char fixed_system_vectors[NR_VECTORS] = {
+	[0 ... NR_VECTORS-1] = SYS_VECTOR_FREE
+};
+
+
 /*
  * Debug level, exported for io_apic.c
  */
Index: linuxnext.latest/kernel/irq/chip.c
===================================================================
--- linuxnext.latest.orig/kernel/irq/chip.c	2008-08-07 09:46:37.000000000 -0500
+++ linuxnext.latest/kernel/irq/chip.c	2008-08-07 09:46:42.000000000 -0500
@@ -78,6 +78,7 @@
 	desc->chip_data = NULL;
 	desc->handle_irq = handle_bad_irq;
 	desc->chip = &no_irq_chip;
+	desc->name = "none";
 	spin_unlock_irqrestore(&desc->lock, flags);
 }
 

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] x86_64:  (NEW) Dynamically allocate arch specific system vectors
  2008-08-11 17:14   ` Alan Mayer
@ 2008-08-11 19:39     ` Eric W. Biederman
  2008-08-11 19:51       ` Ingo Molnar
                         ` (2 more replies)
  0 siblings, 3 replies; 35+ messages in thread
From: Eric W. Biederman @ 2008-08-11 19:39 UTC (permalink / raw)
  To: Alan Mayer
  Cc: Ingo Molnar, jeremy, rusty, suresh.b.siddha, torvalds,
	linux-kernel, Dean Nelson, Cliff Wickman, H. Peter Anvin,
	Thomas Gleixner, Yinghai Lu

Alan Mayer <ajm@sgi.com> writes:

> Okay, here it is as an attachment.  I think my email client is munging it.
> I haven't been able to fix it, apparently.
>
> I, too, would like to know what Eric thinks.

I think arch/x86 is about to fall over from accidental complexity of
the irq handling.  Looking at your problem and the problem of killing
NR_IRQS I spent way too much time playing with it this weekend than
I should have, but I think I have found a path that works and is
fairly easily verifiable.

The short version is we make vector_irq the one repository of knowledge
about what we are doing with vectors.

We create a common factor of assign_irq_vector that looks something like:

bool __grab_irq_vector(struct irq_desc *desc, unsigned vector, cpumask_t new_domain)
{
        /* Must be called with vector lock */
        struct irq_cfg *cfg;
        bool grabbed = false;
        unsigned int old_vector;
        cpumask_t mask;
        int cpu;

        cfg = get_irqp_cfg(irq);
        old_vector = cfg->vector;
        cpus_and(mask, new_domain, cpu_online_map);

        for_each_cpu_mask_nr(cpu, mask) {
		if (per_cpu(vector_irq, cpu)[vector])
                	goto out;
        }
        /* Available reserve it */
        for_each_cpu_mask_nr(cpu, mask)
  	      per_cpu(vector_irq, cpu)[vector] = desc;
        if (cfg->vector) {
        	cfg->move_in_progress;
                cfg->old_domain = cfg->domain;
        }
        cfg->vector = vector;
        cfg->domain = mask;
        grabbed = true;
        
out:
        return grabbed;
}

Then in your allocator for per cpu irqs you can do:
spin_lock(&vector_lock);
for (vector = FIRST_VECTOR; vector != LAST_VECTOR, vector--) {
	if (__grab_irq_vector(desc, CPU_MASK_ALL))
        	goto found;
}
spin_unlock(&vector_lock);

Although I am not at all convinced that dynamic allocation of
the vector number (instead of statically reserving it) makes sense.
The only way I can see to guarantee all of the special is to
statically allocate them with a lot of good comments.  I think
the introduction of system_vectors quite likely defeated the
errata workaround where we have the lapic timer in a separate priority.

Still if we go in for dynamic allocation of the system vectors 
the above looks much simpler and easier to work with than
a lot of other possibilities.

I think used_vectors and system_vectors are data structures that
we need to remove, as their interactions with assign_irq_vector
are not at all well defined or nice.

I think vector_irq should return an irq_desc and have an entry for
all of the static vectors as well (if we are going to do weird
things with dynamic high priority vector allocation, and dynamic
detection of which vectors assign_irq_vector may use).

I have a patch series that gets me 90% of the way there, and the
rest appears easy but I don't have any time to mess with it right
now.  I will try and post something in the next couple of days.

Eric



^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] x86_64:  (NEW) Dynamically allocate arch specific system vectors
  2008-08-11 19:39     ` Eric W. Biederman
@ 2008-08-11 19:51       ` Ingo Molnar
  2008-08-11 19:55         ` Jeremy Fitzhardinge
  2008-08-11 20:10         ` Eric W. Biederman
  2008-08-11 20:02       ` Alan Mayer
  2008-09-11 15:23       ` [RFC 0/4] dynamically " Dean Nelson
  2 siblings, 2 replies; 35+ messages in thread
From: Ingo Molnar @ 2008-08-11 19:51 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Alan Mayer, jeremy, rusty, suresh.b.siddha, torvalds,
	linux-kernel, Dean Nelson, Cliff Wickman, H. Peter Anvin,
	Thomas Gleixner, Yinghai Lu


* Eric W. Biederman <ebiederm@xmission.com> wrote:

> Alan Mayer <ajm@sgi.com> writes:
> 
> > Okay, here it is as an attachment.  I think my email client is munging it.
> > I haven't been able to fix it, apparently.
> >
> > I, too, would like to know what Eric thinks.
> 
> I think arch/x86 is about to fall over from accidental complexity of 
> the irq handling. [...]

it was in that state for many years already ;-) Unification, cleanups of 
other historic messes and the constant push for new hw support just made 
it stand out more visibly. IRQ and APIC code unification is definitely 
the final (and by far hardest) major step of x86 unification.

> [...]  Looking at your problem and the problem of killing NR_IRQS I 
> spent way to much time playing with it this weekend then I should 
> have, but I think I have found a path that works and is fairly easily 
> verifiable.

cool :-)

> I have a patch series that gets me 90% of the way there, and the rest 
> appears easy but I don't have any time to mess with it right now.  I 
> will try and post it something in the next couple of days.

i'm very interested in it, even if it's incomplete and won't build/boot 
at all. So please consider posting your existing incomplete series as an 
RFC right now, maybe we can help finish it sooner than you will find the 
time? We can put it into a new tip/x86/irq-unification branch, without 
merging it into tip/master just yet.

	Ingo

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] x86_64:  (NEW) Dynamically allocate arch specific system vectors
  2008-08-11 19:51       ` Ingo Molnar
@ 2008-08-11 19:55         ` Jeremy Fitzhardinge
  2008-08-11 20:10         ` Eric W. Biederman
  1 sibling, 0 replies; 35+ messages in thread
From: Jeremy Fitzhardinge @ 2008-08-11 19:55 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Eric W. Biederman, Alan Mayer, rusty, suresh.b.siddha, torvalds,
	linux-kernel, Dean Nelson, Cliff Wickman, H. Peter Anvin,
	Thomas Gleixner, Yinghai Lu

Ingo Molnar wrote:
> i'm very interested in it, even if it's incomplete and wont build/boot 
> at all. So please consider posting your existing incomplete series as an 
> RFC right now, maybe we can help finish it sooner than you will find the 
> time? We can put it into a new tip/x86/irq-unification branch, without 
> merging it into tip/master just yet.

Seconded.  This is the area which most concerns me wrt Xen dom0 
integration, and getting it right should make the Xen-specific parts 
turn out as minor adjuncts.

    J

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] x86_64:  (NEW) Dynamically allocate arch specific system vectors
  2008-08-11 19:39     ` Eric W. Biederman
  2008-08-11 19:51       ` Ingo Molnar
@ 2008-08-11 20:02       ` Alan Mayer
  2008-09-11 15:23       ` [RFC 0/4] dynamically " Dean Nelson
  2 siblings, 0 replies; 35+ messages in thread
From: Alan Mayer @ 2008-08-11 20:02 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Ingo Molnar, jeremy, rusty, suresh.b.siddha, torvalds,
	linux-kernel, Dean Nelson, Cliff Wickman, H. Peter Anvin,
	Thomas Gleixner, Yinghai Lu

Okay, so we'll wait for Eric to send out his patch and work off that.

Eric W. Biederman wrote:
> Alan Mayer <ajm@sgi.com> writes:
> 
>> Okay, here it is as an attachment.  I think my email client is munging it.
>> I haven't been able to fix it, apparently.
>>
>> I, too, would like to know what Eric thinks.
> 
> I think arch/x86 is about to fall over from accidental complexity of
> the irq handling.  Looking at your problem and the problem of killing
> NR_IRQS I spent way to much time playing with it this weekend then
> I should have, but I think I have found a path that works and is
> fairly easily verifiable.

I quite agree.

> 
> The short version is we make vector_irq the one repository of knowledge
> about what we are doing with vectors.
> 
> We create a common factor of assign_irq_vector that looks something like:
> 
> bool __grab_irq_vector(struct irq_desc *desc, unsigned vector, cpumask_t new_domain)
> {
>         /* Must be called with vector lock */
>         struct irq_cfg *cfg;
>         bool grabbed = false;
>         unsigned int old_vector;
>         cpumask_t mask;
>         int cpu;
> 
>         cfg = get_irqp_cfg(irq);
>         old_vector = cfg->vector;
>         cpus_and(mask, new_domain, cpu_online_map);
> 
>         for_each_cpu_mask_nr(cpu, mask) {
> 		if (per_cpu(vector_irq, cpu)[vector])
>                 	goto out;
>         }
>         /* Available reserve it */
>         for_each_cpu_mask_nr(cpu, mask)
>   	      per_cpu(vector_irq, cpu)[vector] = desc;
>         if (cfg->vector) {
>         	cfg->move_in_progress;
>                 cfg->old_domain = cfg->domain;
>         }
>         cfg->vector = vector;
>         cfg->domain = mask;
>         grabbed = true;
>         
> out:
>         return grabbed;
> }
> 
> Then in your allocator for per cpu irqs you can do:
> spin_lock(&vector_lock);
> for (vector = FIRST_VECTOR; vector != LAST_VECTOR, vector--) {
> 	if (__grab_irq_vector(desc, CPU_MASK_ALL))
>         	goto found;
> }
> spin_unlock(&vector_lock);
> 
> Although I am not at all convinced that dynamic allocation of
> the vector number (instead of statically reserving it makes sense).
> The only way I can see to guarantee all of the special is to
> statically allocate them with a lot of good comments.  I think
> the introduction of system_vectors quite likely defeated the
> errata work around we have the lapic timer in a separate priority.

Our system requires some extra system vectors.  They are meaningless on
other systems.  So, rather than statically allocate them for everyone
or clutter the code with ifdef's, we dynamically allocate them.

> 
> Still if we go in for dynamic allocation of the system vectors 
> the above looks much simpler and easier to work with than
> a lot of other possibilities.
> 
> I think used_vectors and system_vectors are data structures that
> we need to remove, as their interactions with assign_irq_vector
> are not at all well defined or nice.
> 
> I think vector_irq should return an irq_desc and have an entry for
> all of the static vectors as well (if we are going to do weird
> things with dynamic high priority vector allocation, and dynamic
> detection of which vectors assign_irq_vector may use).
> 
> I have a patch series that gets me 90% of the way there, and the
> rest appears easy but I don't have any time to mess with it right
> now.  I will try and post it something in the next couple of days.
> 
> Eric
> 

If I can get a sense of where you're headed with your patch and you don't mind,
maybe I can do the last 10%.

		--ajm

-- 
Somebody just stopped callin' you "Angel."
--
Alan J. Mayer
SGI
ajm@sgi.com
WORK: 651-683-3131
HOME: 651-407-0134
--

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] x86_64:  (NEW) Dynamically allocate arch specific system vectors
  2008-08-11 19:51       ` Ingo Molnar
  2008-08-11 19:55         ` Jeremy Fitzhardinge
@ 2008-08-11 20:10         ` Eric W. Biederman
  1 sibling, 0 replies; 35+ messages in thread
From: Eric W. Biederman @ 2008-08-11 20:10 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Alan Mayer, jeremy, rusty, suresh.b.siddha, torvalds,
	linux-kernel, Dean Nelson, Cliff Wickman, H. Peter Anvin,
	Thomas Gleixner, Yinghai Lu

Ingo Molnar <mingo@elte.hu> writes:

> * Eric W. Biederman <ebiederm@xmission.com> wrote:
>
>> Alan Mayer <ajm@sgi.com> writes:
>> 
>> > Okay, here it is as an attachment.  I think my email client is munging it.
>> > I haven't been able to fix it, apparently.
>> >
>> > I, too, would like to know what Eric thinks.
>> 
>> I think arch/x86 is about to fall over from accidental complexity of 
>> the irq handling. [...]
>
> it was in that state for many years already ;-) Unification, cleanups of 
> other historic messes and the constant push for new hw support just made 
> it stand out more visibly. IRQ and APIC code unification is definitely 
> the final (and by far hardest) major step of x86 unification.

Well I think we are looking at the final couple of straws...

Yes unification (aka porting the infrastructure improvements
to x86_32) and then sharing the code is a piece of it.

>> [...]  Looking at your problem and the problem of killing NR_IRQS I 
>> spent way to much time playing with it this weekend then I should 
>> have, but I think I have found a path that works and is fairly easily 
>> verifiable.
>
> cool :-)
>
>> I have a patch series that gets me 90% of the way there, and the rest 
>> appears easy but I don't have any time to mess with it right now.  I 
>> will try and post it something in the next couple of days.
>
> i'm very interested in it, even if it's incomplete and wont build/boot 
> at all. So please consider posting your existing incomplete series as an 
> RFC right now, maybe we can help finish it sooner than you will find the 
> time? We can put it into a new tip/x86/irq-unification branch, without 
> merging it into tip/master just yet.

Sure. Mostly I just need to allocate a few minutes to post it, later today
or tomorrow hopefully.

Eric



^ permalink raw reply	[flat|nested] 35+ messages in thread

* [RFC 0/4] dynamically allocate arch specific system vectors
  2008-08-11 19:39     ` Eric W. Biederman
  2008-08-11 19:51       ` Ingo Molnar
  2008-08-11 20:02       ` Alan Mayer
@ 2008-09-11 15:23       ` Dean Nelson
  2008-09-11 15:25         ` [RFC 1/4] switch vector_irq[] from irq number to irq_desc pointer Dean Nelson
                           ` (5 more replies)
  2 siblings, 6 replies; 35+ messages in thread
From: Dean Nelson @ 2008-09-11 15:23 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Alan Mayer, Ingo Molnar, jeremy, rusty, suresh.b.siddha,
	torvalds, linux-kernel, H. Peter Anvin, Thomas Gleixner,
	Yinghai Lu

On Mon, Aug 11, 2008 at 12:39:22PM -0700, Eric W. Biederman wrote:
>
> Although I am not at all convinced that dynamic allocation of
> the vector number (instead of statically reserving it makes sense).

We (SGI) need somewhere around eight vectors.

There are two kernel modules, sgi-gru and sgi-xp (in drivers/misc), that
each need two vectors. And there's the broadcast assist unit (BAU) that is
involved in tlb shootdown on uv, which currently uses statically reserved
vector 0xf8 (UV_BAU_MESSAGE -- see uv_bau_init()). I know of a debugger that
also uses 0xf8 because it was previously available until UV_BAU_MESSAGE came
along. The BAU would be happy with a dynamically allocated system vector.
We have a couple of other things in the works that also need vectors.

All of these eight or so vectors are only meaningful on SGI uv systems.

We'll go with whichever way you decide on this (dynamically allocated or
statically reserved).

But we really need to get something into 2.6.28. So in order to move forward
I'm submitting the following patchset for comment. This set of four patches
represents my take on Eric's suggested approach to supporting dynamically
allocated system vectors (i.e., __grab_irq_vector()).

Eric, is this what you were thinking of? Or did I miss the mark?

Thanks,
Dean

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [RFC 1/4] switch vector_irq[] from irq number to irq_desc pointer
  2008-09-11 15:23       ` [RFC 0/4] dynamically " Dean Nelson
@ 2008-09-11 15:25         ` Dean Nelson
  2008-09-11 15:27         ` [RFC 2/4] introduce dynamically allocated system vectors Dean Nelson
                           ` (4 subsequent siblings)
  5 siblings, 0 replies; 35+ messages in thread
From: Dean Nelson @ 2008-09-11 15:25 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Alan Mayer, Ingo Molnar, jeremy, rusty, suresh.b.siddha,
	torvalds, linux-kernel, H. Peter Anvin, Thomas Gleixner,
	Yinghai Lu

Change per_cpu variable vector_irq[] from holding an 'int' irq number to
holding a 'struct irq_desc' pointer.

Signed-off-by: Dean Nelson <dcn@sgi.com>

---

Note that this pre-allocates the irq_desc structure before we know whether
a vector will be successfully found. And should it not, the irq_desc structure
is left created with the irq still unallocated. Should someone in the future
attempt to allocate a vector with that same irq and succeed, they will get
the previously allocated irq_desc structure.

Also, I won't claim the changes to arch/x86/xen/irq.c were correctly done, and
I know they weren't tested.

 arch/x86/kernel/io_apic.c     |   32 ++++++++++++++++++--------------
 arch/x86/kernel/irq_32.c      |   14 ++++++--------
 arch/x86/kernel/irq_64.c      |   10 ++++------
 arch/x86/kernel/irqinit_32.c  |   29 +++++++++--------------------
 arch/x86/kernel/irqinit_64.c  |   29 +++++++++--------------------
 arch/x86/kernel/vmiclock_32.c |    2 +-
 arch/x86/xen/irq.c            |    3 ++-
 include/asm-x86/hw_irq.h      |    2 +-
 8 files changed, 50 insertions(+), 71 deletions(-)

Index: linux/arch/x86/kernel/io_apic.c
===================================================================
--- linux.orig/arch/x86/kernel/io_apic.c	2008-09-09 12:57:08.000000000 -0500
+++ linux/arch/x86/kernel/io_apic.c	2008-09-10 09:21:29.000000000 -0500
@@ -1222,6 +1222,7 @@ static int __assign_irq_vector(int irq, 
 	unsigned int old_vector;
 	int cpu;
 	struct irq_cfg *cfg;
+	struct irq_desc *desc;
 
 	cfg = irq_cfg(irq);
 
@@ -1239,6 +1240,8 @@ static int __assign_irq_vector(int irq, 
 			return 0;
 	}
 
+	desc = irq_to_desc_alloc(irq);
+
 	for_each_cpu_mask_nr(cpu, mask) {
 		cpumask_t domain, new_mask;
 		int new_cpu;
@@ -1266,7 +1269,7 @@ next:
 			goto next;
 #endif
 		for_each_cpu_mask_nr(new_cpu, new_mask)
-			if (per_cpu(vector_irq, new_cpu)[vector] != -1)
+			if (per_cpu(vector_irq, new_cpu)[vector] != NULL)
 				goto next;
 		/* Found one! */
 		current_vector = vector;
@@ -1276,7 +1279,7 @@ next:
 			cfg->old_domain = cfg->domain;
 		}
 		for_each_cpu_mask_nr(new_cpu, new_mask)
-			per_cpu(vector_irq, new_cpu)[vector] = irq;
+			per_cpu(vector_irq, new_cpu)[vector] = desc;
 		cfg->vector = vector;
 		cfg->domain = domain;
 		return 0;
@@ -1307,7 +1310,7 @@ static void __clear_irq_vector(int irq)
 	vector = cfg->vector;
 	cpus_and(mask, cfg->domain, cpu_online_map);
 	for_each_cpu_mask_nr(cpu, mask)
-		per_cpu(vector_irq, cpu)[vector] = -1;
+		per_cpu(vector_irq, cpu)[vector] = NULL;
 
 	cfg->vector = 0;
 	cpus_clear(cfg->domain);
@@ -1319,23 +1322,26 @@ void __setup_vector_irq(int cpu)
 	/* This function must be called with vector_lock held */
 	int irq, vector;
 	struct irq_cfg *cfg;
+	struct irq_desc *desc;
 
 	/* Mark the inuse vectors */
 	for_each_irq_cfg(irq, cfg) {
 		if (!cpu_isset(cpu, cfg->domain))
 			continue;
 		vector = cfg->vector;
-		per_cpu(vector_irq, cpu)[vector] = irq;
+		desc = irq_to_desc(irq);
+		BUG_ON(desc == NULL);
+		per_cpu(vector_irq, cpu)[vector] = desc;
 	}
 	/* Mark the free vectors */
 	for (vector = 0; vector < NR_VECTORS; ++vector) {
-		irq = per_cpu(vector_irq, cpu)[vector];
-		if (irq < 0)
+		desc = per_cpu(vector_irq, cpu)[vector];
+		if (desc == NULL)
 			continue;
 
-		cfg = irq_cfg(irq);
+		cfg = irq_cfg(desc->irq);
 		if (!cpu_isset(cpu, cfg->domain))
-			per_cpu(vector_irq, cpu)[vector] = -1;
+			per_cpu(vector_irq, cpu)[vector] = NULL;
 	}
 }
 
@@ -2370,16 +2376,14 @@ asmlinkage void smp_irq_move_cleanup_int
 
 	me = smp_processor_id();
 	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
-		unsigned int irq;
 		struct irq_desc *desc;
 		struct irq_cfg *cfg;
-		irq = __get_cpu_var(vector_irq)[vector];
 
-		desc = irq_to_desc(irq);
-		if (!desc)
+		desc = __get_cpu_var(vector_irq)[vector];
+		if (desc == NULL)
 			continue;
 
-		cfg = irq_cfg(irq);
+		cfg = irq_cfg(desc->irq);
 		spin_lock(&desc->lock);
 		if (!cfg->move_cleanup_count)
 			goto unlock;
@@ -2387,7 +2391,7 @@ asmlinkage void smp_irq_move_cleanup_int
 		if ((vector == cfg->vector) && cpu_isset(me, cfg->domain))
 			goto unlock;
 
-		__get_cpu_var(vector_irq)[vector] = -1;
+		__get_cpu_var(vector_irq)[vector] = NULL;
 		cfg->move_cleanup_count--;
 unlock:
 		spin_unlock(&desc->lock);
Index: linux/include/asm-x86/hw_irq.h
===================================================================
--- linux.orig/include/asm-x86/hw_irq.h	2008-09-09 12:57:08.000000000 -0500
+++ linux/include/asm-x86/hw_irq.h	2008-09-09 12:57:13.000000000 -0500
@@ -113,7 +113,7 @@ extern asmlinkage void smp_invalidate_in
 extern void (*const interrupt[NR_VECTORS])(void);
 #endif
 
-typedef int vector_irq_t[NR_VECTORS];
+typedef struct irq_desc *vector_irq_t[NR_VECTORS];
 DECLARE_PER_CPU(vector_irq_t, vector_irq);
 
 #ifdef CONFIG_X86_IO_APIC
Index: linux/arch/x86/kernel/irq_32.c
===================================================================
--- linux.orig/arch/x86/kernel/irq_32.c	2008-09-09 12:57:08.000000000 -0500
+++ linux/arch/x86/kernel/irq_32.c	2008-09-10 09:20:10.000000000 -0500
@@ -226,26 +226,24 @@ unsigned int do_IRQ(struct pt_regs *regs
 	int overflow;
 	unsigned vector = ~regs->orig_ax;
 	struct irq_desc *desc;
-	unsigned irq;
 
 
 	old_regs = set_irq_regs(regs);
 	irq_enter();
-	irq = __get_cpu_var(vector_irq)[vector];
 
 	overflow = check_stack_overflow();
 
-	desc = irq_to_desc(irq);
-	if (unlikely(!desc)) {
-		printk(KERN_EMERG "%s: cannot handle IRQ %d vector %#x cpu %d\n",
-					__func__, irq, vector, smp_processor_id());
+	desc = __get_cpu_var(vector_irq)[vector];
+	if (unlikely(desc == NULL)) {
+		printk(KERN_EMERG "%s: cannot handle IRQ vector %#x cpu %d\n",
+					__func__, vector, smp_processor_id());
 		BUG();
 	}
 
-	if (!execute_on_irq_stack(overflow, desc, irq)) {
+	if (!execute_on_irq_stack(overflow, desc, desc->irq)) {
 		if (unlikely(overflow))
 			print_stack_overflow();
-		desc->handle_irq(irq, desc);
+		desc->handle_irq(desc->irq, desc);
 	}
 
 	irq_exit();
Index: linux/arch/x86/kernel/irq_64.c
===================================================================
--- linux.orig/arch/x86/kernel/irq_64.c	2008-09-09 12:57:08.000000000 -0500
+++ linux/arch/x86/kernel/irq_64.c	2008-09-10 09:20:10.000000000 -0500
@@ -213,20 +213,18 @@ asmlinkage unsigned int do_IRQ(struct pt
 
 	/* high bit used in ret_from_ code  */
 	unsigned vector = ~regs->orig_ax;
-	unsigned irq;
 
 	exit_idle();
 	irq_enter();
-	irq = __get_cpu_var(vector_irq)[vector];
 
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
 	stack_overflow_check(regs);
 #endif
 
-	desc = irq_to_desc(irq);
-	if (likely(desc))
-		generic_handle_irq_desc(irq, desc);
-	else {
+	desc = __get_cpu_var(vector_irq)[vector];
+	if (likely(desc != NULL)) {
+		generic_handle_irq_desc(desc->irq, desc);
+	} else {
 		if (!disable_apic)
 			ack_APIC_irq();
 
Index: linux/arch/x86/kernel/vmiclock_32.c
===================================================================
--- linux.orig/arch/x86/kernel/vmiclock_32.c	2008-09-09 12:57:08.000000000 -0500
+++ linux/arch/x86/kernel/vmiclock_32.c	2008-09-09 12:57:13.000000000 -0500
@@ -242,7 +242,7 @@ void __init vmi_time_init(void)
 	vmi_time_init_clockevent();
 	setup_irq(0, &vmi_clock_action);
 	for_each_possible_cpu(cpu)
-		per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0;
+		per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = NULL;
 }
 
 #ifdef CONFIG_X86_LOCAL_APIC
Index: linux/arch/x86/xen/irq.c
===================================================================
--- linux.orig/arch/x86/xen/irq.c	2008-09-09 12:57:08.000000000 -0500
+++ linux/arch/x86/xen/irq.c	2008-09-09 12:57:13.000000000 -0500
@@ -27,8 +27,9 @@ static void __init __xen_init_IRQ(void)
 	for(i = 0; i < NR_VECTORS; i++) {
 		int cpu;
 
+		desc = irq_to_desc_alloc(i);
 		for_each_possible_cpu(cpu)
-			per_cpu(vector_irq, cpu)[i] = i;
+			per_cpu(vector_irq, cpu)[i] = desc;
 	}
 
 	xen_init_IRQ();
Index: linux/arch/x86/kernel/irqinit_32.c
===================================================================
--- linux.orig/arch/x86/kernel/irqinit_32.c	2008-09-09 12:57:08.000000000 -0500
+++ linux/arch/x86/kernel/irqinit_32.c	2008-09-09 12:57:13.000000000 -0500
@@ -59,6 +59,8 @@ static struct irqaction fpu_irq = {
 void __init init_ISA_irqs (void)
 {
 	int i;
+	int cpu;
+	unsigned int vector;
 
 #ifdef CONFIG_X86_LOCAL_APIC
 	init_bsp_APIC();
@@ -76,6 +78,12 @@ void __init init_ISA_irqs (void)
 		desc->action = NULL;
 		desc->depth = 1;
 
+		vector = IRQ0_VECTOR + i;
+		for_each_cpu_mask_nr(cpu, cpu_possible_map) {
+			BUG_ON(per_cpu(vector_irq, cpu)[vector] != NULL);
+			per_cpu(vector_irq, cpu)[vector] = desc;
+		}
+
 		set_irq_chip_and_handler_name(i, &i8259A_chip,
 					      handle_level_irq, "XT");
 	}
@@ -90,26 +98,7 @@ static struct irqaction irq2 = {
 	.name = "cascade",
 };
 
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
-	[0 ... IRQ0_VECTOR - 1] = -1,
-	[IRQ0_VECTOR] = 0,
-	[IRQ1_VECTOR] = 1,
-	[IRQ2_VECTOR] = 2,
-	[IRQ3_VECTOR] = 3,
-	[IRQ4_VECTOR] = 4,
-	[IRQ5_VECTOR] = 5,
-	[IRQ6_VECTOR] = 6,
-	[IRQ7_VECTOR] = 7,
-	[IRQ8_VECTOR] = 8,
-	[IRQ9_VECTOR] = 9,
-	[IRQ10_VECTOR] = 10,
-	[IRQ11_VECTOR] = 11,
-	[IRQ12_VECTOR] = 12,
-	[IRQ13_VECTOR] = 13,
-	[IRQ14_VECTOR] = 14,
-	[IRQ15_VECTOR] = 15,
-	[IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
-};
+DEFINE_PER_CPU(vector_irq_t, vector_irq);
 
 /* Overridden in paravirt.c */
 void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
Index: linux/arch/x86/kernel/irqinit_64.c
===================================================================
--- linux.orig/arch/x86/kernel/irqinit_64.c	2008-09-09 12:57:08.000000000 -0500
+++ linux/arch/x86/kernel/irqinit_64.c	2008-09-09 12:57:13.000000000 -0500
@@ -114,30 +114,13 @@ static struct irqaction irq2 = {
 	.mask = CPU_MASK_NONE,
 	.name = "cascade",
 };
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
-	[0 ... IRQ0_VECTOR - 1] = -1,
-	[IRQ0_VECTOR] = 0,
-	[IRQ1_VECTOR] = 1,
-	[IRQ2_VECTOR] = 2,
-	[IRQ3_VECTOR] = 3,
-	[IRQ4_VECTOR] = 4,
-	[IRQ5_VECTOR] = 5,
-	[IRQ6_VECTOR] = 6,
-	[IRQ7_VECTOR] = 7,
-	[IRQ8_VECTOR] = 8,
-	[IRQ9_VECTOR] = 9,
-	[IRQ10_VECTOR] = 10,
-	[IRQ11_VECTOR] = 11,
-	[IRQ12_VECTOR] = 12,
-	[IRQ13_VECTOR] = 13,
-	[IRQ14_VECTOR] = 14,
-	[IRQ15_VECTOR] = 15,
-	[IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
-};
+DEFINE_PER_CPU(vector_irq_t, vector_irq);
 
 static void __init init_ISA_irqs (void)
 {
 	int i;
+	int cpu;
+	unsigned int vector;
 
 	init_bsp_APIC();
 	init_8259A(0);
@@ -150,6 +133,12 @@ static void __init init_ISA_irqs (void)
 		desc->action = NULL;
 		desc->depth = 1;
 
+		vector = IRQ0_VECTOR + i;
+		for_each_cpu_mask_nr(cpu, cpu_possible_map) {
+			BUG_ON(per_cpu(vector_irq, cpu)[vector] != NULL);
+			per_cpu(vector_irq, cpu)[vector] = desc;
+		}
+
 		/*
 		 * 16 old-style INTA-cycle interrupts:
 		 */

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [RFC 2/4] introduce dynamically allocated system vectors
  2008-09-11 15:23       ` [RFC 0/4] dynamically " Dean Nelson
  2008-09-11 15:25         ` [RFC 1/4] switch vector_irq[] from irq number to irq_desc pointer Dean Nelson
@ 2008-09-11 15:27         ` Dean Nelson
  2008-09-14 15:39           ` Ingo Molnar
  2008-09-14 15:46           ` Ingo Molnar
  2008-09-11 15:28         ` [RFC 3/4] switch static system vector allocation to use vector_irq[] Dean Nelson
                           ` (3 subsequent siblings)
  5 siblings, 2 replies; 35+ messages in thread
From: Dean Nelson @ 2008-09-11 15:27 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Alan Mayer, Ingo Molnar, jeremy, rusty, suresh.b.siddha,
	torvalds, linux-kernel, H. Peter Anvin, Thomas Gleixner,
	Yinghai Lu

Introduce the dynamic allocation and deallocation of system vectors, which
are mapped to irq numbers, allowing the use of request_irq()/free_irq().

Signed-off-by: Dean Nelson <dcn@sgi.com>

---

 arch/x86/kernel/apic.c        |    3 
 arch/x86/kernel/io_apic.c     |  264 +++++++++++++++++++++++++++++++++-----
 arch/x86/kernel/irqinit_64.c  |    4 
 include/asm-x86/desc.h        |   13 +
 include/asm-x86/irq_vectors.h |    1 
 include/linux/irq.h           |   13 +
 6 files changed, 258 insertions(+), 40 deletions(-)

Index: linux/arch/x86/kernel/io_apic.c
===================================================================
--- linux.orig/arch/x86/kernel/io_apic.c	2008-09-10 12:08:46.000000000 -0500
+++ linux/arch/x86/kernel/io_apic.c	2008-09-11 07:17:33.000000000 -0500
@@ -1205,7 +1205,34 @@ void unlock_vector_lock(void)
 	spin_unlock(&vector_lock);
 }
 
-static int __assign_irq_vector(int irq, cpumask_t mask)
+bool __grab_irq_vector(struct irq_desc *desc, unsigned int vector,
+		       cpumask_t *new_domain_mask)
+{
+	/* Must be called with vector lock */
+	struct irq_cfg *cfg;
+	int cpu;
+
+	for_each_cpu_mask_nr(cpu, *new_domain_mask) {
+		if (per_cpu(vector_irq, cpu)[vector] != NULL)
+			return false;
+	}
+
+	/* Available reserve it */
+	for_each_cpu_mask_nr(cpu, *new_domain_mask)
+		per_cpu(vector_irq, cpu)[vector] = desc;
+
+	cfg = irq_cfg(desc->irq);
+	if (cfg->vector) {
+		cfg->move_in_progress = 1;
+		cfg->old_domain = cfg->domain;
+	}
+	cfg->vector = vector;
+	cfg->domain = *new_domain_mask;
+
+	return true;
+}
+
+static int __assign_irq_vector(int irq, cpumask_t *mask)
 {
 	/*
 	 * NOTE! The local APIC isn't very good at handling
@@ -1219,42 +1246,40 @@ static int __assign_irq_vector(int irq, 
 	 * 0x80, because int 0x80 is hm, kind of importantish. ;)
 	 */
 	static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
-	unsigned int old_vector;
+	cpumask_t target_cpus_mask;
 	int cpu;
 	struct irq_cfg *cfg;
 	struct irq_desc *desc;
 
 	cfg = irq_cfg(irq);
 
-	/* Only try and allocate irqs on cpus that are present */
-	cpus_and(mask, mask, cpu_online_map);
-
 	if ((cfg->move_in_progress) || cfg->move_cleanup_count)
 		return -EBUSY;
 
-	old_vector = cfg->vector;
-	if (old_vector) {
+	/* Only try and allocate irqs on cpus that are present */
+	cpus_and(target_cpus_mask, *mask, cpu_online_map);
+
+	if (cfg->vector) {
 		cpumask_t tmp;
-		cpus_and(tmp, cfg->domain, mask);
+		cpus_and(tmp, cfg->domain, target_cpus_mask);
 		if (!cpus_empty(tmp))
 			return 0;
 	}
 
 	desc = irq_to_desc_alloc(irq);
 
-	for_each_cpu_mask_nr(cpu, mask) {
-		cpumask_t domain, new_mask;
-		int new_cpu;
+	for_each_cpu_mask_nr(cpu, target_cpus_mask) {
+		cpumask_t domain, new_domain_mask;
 		int vector, offset;
 
 		domain = vector_allocation_domain(cpu);
-		cpus_and(new_mask, domain, cpu_online_map);
+		cpus_and(new_domain_mask, domain, cpu_online_map);
 
 		vector = current_vector;
 		offset = current_offset;
 next:
 		vector += 8;
-		if (vector >= first_system_vector) {
+		if (vector > last_device_vector) {
 			/* If we run out of vectors on large boxen, must share them. */
 			offset = (offset + 1) % 8;
 			vector = FIRST_DEVICE_VECTOR + offset;
@@ -1268,20 +1293,12 @@ next:
 		if (vector == SYSCALL_VECTOR)
 			goto next;
 #endif
-		for_each_cpu_mask_nr(new_cpu, new_mask)
-			if (per_cpu(vector_irq, new_cpu)[vector] != NULL)
-				goto next;
+		if (!__grab_irq_vector(desc, vector, &new_domain_mask))
+			goto next;
+
 		/* Found one! */
 		current_vector = vector;
 		current_offset = offset;
-		if (old_vector) {
-			cfg->move_in_progress = 1;
-			cfg->old_domain = cfg->domain;
-		}
-		for_each_cpu_mask_nr(new_cpu, new_mask)
-			per_cpu(vector_irq, new_cpu)[vector] = desc;
-		cfg->vector = vector;
-		cfg->domain = domain;
 		return 0;
 	}
 	return -ENOSPC;
@@ -1293,11 +1310,51 @@ static int assign_irq_vector(int irq, cp
 	unsigned long flags;
 
 	spin_lock_irqsave(&vector_lock, flags);
-	err = __assign_irq_vector(irq, mask);
+	err = __assign_irq_vector(irq, &mask);
 	spin_unlock_irqrestore(&vector_lock, flags);
 	return err;
 }
 
+static int __assign_irq_system_vector(int irq, cpumask_t *mask, int priority)
+{
+	int vector;
+	cpumask_t target_cpus_mask;
+	int cpu;
+	cpumask_t domain;
+	cpumask_t new_domain_mask = CPU_MASK_NONE;
+	struct irq_desc *desc;
+
+	if (priority == IRQ_PRIORITY_HIGH)
+		vector = first_static_system_vector;
+	else if (priority == IRQ_PRIORITY_LOW)
+		vector = FIRST_DEVICE_VECTOR - 1;
+	else
+		BUG();
+
+	cpus_and(target_cpus_mask, *mask, cpu_possible_map);
+	for_each_cpu_mask_nr(cpu, target_cpus_mask) {
+		domain = vector_allocation_domain(cpu);
+		cpus_and(domain, domain, cpu_possible_map);
+		cpus_or(new_domain_mask, new_domain_mask, domain);
+	}
+
+	desc = irq_to_desc_alloc(irq);
+
+	do {
+		if (priority == IRQ_PRIORITY_HIGH) {
+			if (--vector < FIRST_DEVICE_VECTOR)
+				return -ENOSPC;
+		} else {	/* IRQ_PRIORITY_LOW */
+			if (++vector == first_static_system_vector)
+				return -ENOSPC;
+		}
+
+	} while (!__grab_irq_vector(desc, vector, &new_domain_mask));
+
+	/* found one */
+	return 0;
+}
+
 static void __clear_irq_vector(int irq)
 {
 	struct irq_cfg *cfg;
@@ -3045,21 +3102,22 @@ static int __init ioapic_init_sysfs(void
 
 device_initcall(ioapic_init_sysfs);
 
-/*
- * Dynamic irq allocate and deallocation
- */
-unsigned int create_irq_nr(unsigned int irq_want)
+#define DEVICE_VECTOR	1
+#define SYSTEM_VECTOR	2
+
+static unsigned int __create_irq_nr(int vector_type, unsigned int irq_want,
+				    cpumask_t *mask, int priority)
 {
 	/* Allocate an unused irq */
 	unsigned int irq;
 	unsigned int new;
 	unsigned long flags;
 	struct irq_cfg *cfg_new;
+	int ret;
 
 #ifndef CONFIG_HAVE_SPARSE_IRQ
 	irq_want = nr_irqs - 1;
 #endif
-
 	irq = 0;
 	spin_lock_irqsave(&vector_lock, flags);
 	for (new = irq_want; new > 0; new--) {
@@ -3071,18 +3129,34 @@ unsigned int create_irq_nr(unsigned int 
 		/* check if need to create one */
 		if (!cfg_new)
 			cfg_new = irq_cfg_alloc(new);
-		if (__assign_irq_vector(new, TARGET_CPUS) == 0)
+		if (vector_type == DEVICE_VECTOR)
+			ret = __assign_irq_vector(new, mask);
+		else
+			ret = __assign_irq_system_vector(new, mask, priority);
+
+		if (ret == 0)
 			irq = new;
 		break;
 	}
 	spin_unlock_irqrestore(&vector_lock, flags);
 
-	if (irq > 0) {
+	if (irq > 0)
 		dynamic_irq_init(irq);
-	}
+
 	return irq;
 }
 
+unsigned int create_irq_nr(unsigned int irq_want)
+{
+	cpumask_t mask = TARGET_CPUS;
+
+	return __create_irq_nr(DEVICE_VECTOR, irq_want, &mask,
+			       IRQ_PRIORITY_LOW);
+}
+
+/*
+ * Dynamic irq device vector allocation.
+ */
 int create_irq(void)
 {
 	int irq;
@@ -3095,6 +3169,9 @@ int create_irq(void)
 	return irq;
 }
 
+/*
+ * Dynamic irq device vector deallocation.
+ */
 void destroy_irq(unsigned int irq)
 {
 	unsigned long flags;
@@ -3109,6 +3186,127 @@ void destroy_irq(unsigned int irq)
 	spin_unlock_irqrestore(&vector_lock, flags);
 }
 
+static void noop(unsigned int irq)
+{
+}
+
+static unsigned int noop_ret(unsigned int irq)
+{
+	return 0;
+}
+
+static void ack_apic(unsigned int irq)
+{
+	ack_APIC_irq();
+}
+
+static struct irq_chip ack_apic_chip = {
+	.name		= "ack_apic",
+	.startup	= noop_ret,
+	.shutdown	= noop,
+	.enable		= noop,
+	.disable	= noop,
+	.ack		= noop,
+	.mask		= noop,
+	.unmask		= noop,
+	.eoi		= ack_apic,
+	.end		= noop,
+};
+
+unsigned int create_irq_system_vector_nr(unsigned int irq_want, cpumask_t *mask,
+					 int priority)
+{
+	return __create_irq_nr(SYSTEM_VECTOR, irq_want, mask, priority);
+}
+
+/*
+ * Dynamic irq system vector allocation.
+ */
+unsigned int create_irq_system_vector(cpumask_t *mask, int priority,
+				      char *irq_name, int *assigned_vector)
+{
+	unsigned long flags;
+	struct irq_cfg *cfg;
+	int irq;
+
+	/* allocate an available irq and vector mapping */
+	irq = create_irq_system_vector_nr(nr_irqs - 1, mask, priority);
+	if (irq == 0)
+		return -1;
+
+	spin_lock_irqsave(&vector_lock, flags);
+	set_irq_chip_and_handler_name(irq, &ack_apic_chip, handle_percpu_irq,
+				      irq_name);
+	spin_unlock_irqrestore(&vector_lock, flags);
+
+	cfg = irq_cfg(irq);
+	*assigned_vector = cfg->vector;
+	return irq;
+}
+EXPORT_SYMBOL(create_irq_system_vector);
+
+/*
+ * Dynamic irq system vector deallocation.
+ */
+void destroy_irq_system_vector(unsigned int irq)
+{
+	unsigned long flags;
+	struct irq_cfg *cfg;
+	int cpu;
+
+	if (irq >= nr_irqs)
+		return;
+	cfg = irq_cfg(irq);
+	if (cfg->vector == 0)
+		return;
+
+#ifdef CONFIG_SMP
+	synchronize_irq(irq);
+#endif
+	dynamic_irq_cleanup(irq);
+	disable_irq(irq);
+
+	spin_lock_irqsave(&vector_lock, flags);
+
+	for_each_cpu_mask_nr(cpu, cfg->domain)
+		per_cpu(vector_irq, cpu)[cfg->vector] = NULL;
+
+	cfg->vector = 0;
+	cpus_clear(cfg->domain);
+
+	spin_unlock_irqrestore(&vector_lock, flags);
+}
+EXPORT_SYMBOL(destroy_irq_system_vector);
+
+int reserve_system_vectors(int number)
+{
+	unsigned long flags;
+	int new_last_device_vector;
+	int vector;
+	int cpu;
+	int ret = -EBUSY;
+
+	spin_lock_irqsave(&vector_lock, flags);
+
+	new_last_device_vector = last_device_vector - number;
+	if (new_last_device_vector < MIN_LAST_DEVICE_VECTOR)
+		goto out;
+
+	for (vector = last_device_vector; vector > new_last_device_vector;
+	     vector--) {
+		for_each_cpu_mask_nr(cpu, cpu_possible_map) {
+			if (per_cpu(vector_irq, cpu)[vector] != NULL)
+				goto out;
+		}
+	}
+
+	last_device_vector = new_last_device_vector;
+	ret = 0;
+out:
+	spin_unlock_irqrestore(&vector_lock, flags);
+	return ret;
+}
+
 /*
  * MSI message composition
  */
Index: linux/include/linux/irq.h
===================================================================
--- linux.orig/include/linux/irq.h	2008-09-10 12:08:46.000000000 -0500
+++ linux/include/linux/irq.h	2008-09-11 06:53:16.000000000 -0500
@@ -390,11 +390,22 @@ set_irq_chained_handler(unsigned int irq
 extern void set_irq_noprobe(unsigned int irq);
 extern void set_irq_probe(unsigned int irq);
 
-/* Handle dynamic irq creation and destruction */
+/* Handle dynamic irq device vector allocation and deallocation */
 extern unsigned int create_irq_nr(unsigned int irq_want);
 extern int create_irq(void);
 extern void destroy_irq(unsigned int irq);
 
+/* Handle dynamic irq system vector allocation and deallocation */
+extern unsigned int create_irq_system_vector(cpumask_t *mask, int priority,
+					     char *irq_name,
+					     int *assigned_vector);
+#define IRQ_PRIORITY_LOW	1
+#define IRQ_PRIORITY_HIGH	2
+
+extern void destroy_irq_system_vector(unsigned int irq);
+
+extern int reserve_system_vectors(int number);
+
 /* Test to see if a driver has successfully requested an irq */
 static inline int irq_has_action(unsigned int irq)
 {
Index: linux/arch/x86/kernel/apic.c
===================================================================
--- linux.orig/arch/x86/kernel/apic.c	2008-09-10 12:08:46.000000000 -0500
+++ linux/arch/x86/kernel/apic.c	2008-09-11 06:42:34.000000000 -0500
@@ -116,7 +116,8 @@ static int disable_apic_timer __cpuinitd
 int local_apic_timer_c2_ok;
 EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
 
-int first_system_vector = 0xfe;
+int first_static_system_vector = 0xfe;
+int last_device_vector = 0xfd;
 
 char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
 
Index: linux/include/asm-x86/desc.h
===================================================================
--- linux.orig/include/asm-x86/desc.h	2008-09-10 12:08:46.000000000 -0500
+++ linux/include/asm-x86/desc.h	2008-09-11 06:42:34.000000000 -0500
@@ -323,22 +323,25 @@ static inline void set_intr_gate(unsigne
 #define SYS_VECTOR_FREE		0
 #define SYS_VECTOR_ALLOCED	1
 
-extern int first_system_vector;
+extern int first_static_system_vector;
+extern int last_device_vector;
 extern char system_vectors[];
 
-static inline void alloc_system_vector(int vector)
+static inline void alloc_static_system_vector(int vector)
 {
 	if (system_vectors[vector] == SYS_VECTOR_FREE) {
 		system_vectors[vector] = SYS_VECTOR_ALLOCED;
-		if (first_system_vector > vector)
-			first_system_vector = vector;
+		if (first_static_system_vector > vector)
+			first_static_system_vector = vector;
+		if (last_device_vector > vector - 1)
+			last_device_vector = vector - 1;
 	} else
 		BUG();
 }
 
 static inline void alloc_intr_gate(unsigned int n, void *addr)
 {
-	alloc_system_vector(n);
+	alloc_static_system_vector(n);
 	set_intr_gate(n, addr);
 }
 
Index: linux/include/asm-x86/irq_vectors.h
===================================================================
--- linux.orig/include/asm-x86/irq_vectors.h	2008-09-05 08:38:48.000000000 -0500
+++ linux/include/asm-x86/irq_vectors.h	2008-09-11 07:14:54.000000000 -0500
@@ -92,6 +92,7 @@
  * levels. (0x80 is the syscall vector)
  */
 #define FIRST_DEVICE_VECTOR	(IRQ15_VECTOR + 2)
+#define MIN_LAST_DEVICE_VECTOR	(LOCAL_TIMER_VECTOR - 16)
 
 #define NR_VECTORS		256
 
Index: linux/arch/x86/kernel/irqinit_64.c
===================================================================
--- linux.orig/arch/x86/kernel/irqinit_64.c	2008-09-09 12:57:13.000000000 -0500
+++ linux/arch/x86/kernel/irqinit_64.c	2008-09-11 07:21:41.000000000 -0500
@@ -22,6 +22,7 @@
 #include <asm/desc.h>
 #include <asm/apic.h>
 #include <asm/i8259.h>
+#include <asm/genapic.h>
 
 /*
  * Common place to define all x86 IRQ vectors
@@ -202,6 +203,9 @@ void __init native_init_IRQ(void)
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
 
+	if (is_uv_system())
+		reserve_system_vectors(8);
+
 	if (!acpi_ioapic)
 		setup_irq(2, &irq2);
 }

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [RFC 3/4] switch static system vector allocation to use vector_irq[]
  2008-09-11 15:23       ` [RFC 0/4] dynamically " Dean Nelson
  2008-09-11 15:25         ` [RFC 1/4] switch vector_irq[] from irq number to irq_desc pointer Dean Nelson
  2008-09-11 15:27         ` [RFC 2/4] introduce dynamically allocated system vectors Dean Nelson
@ 2008-09-11 15:28         ` Dean Nelson
  2008-09-11 15:29         ` [RFC 4/4] switch non-standard SYSCALL_VECTOR " Dean Nelson
                           ` (2 subsequent siblings)
  5 siblings, 0 replies; 35+ messages in thread
From: Dean Nelson @ 2008-09-11 15:28 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Alan Mayer, Ingo Molnar, jeremy, rusty, suresh.b.siddha,
	torvalds, linux-kernel, H. Peter Anvin, Thomas Gleixner,
	Yinghai Lu

Replace the current use of system_vectors[] for the allocation of static
system vectors by also using the per_cpu variable vector_irq[].

Signed-off-by: Dean Nelson <dcn@sgi.com>

---

 arch/x86/kernel/apic.c    |    2 --
 arch/x86/kernel/io_apic.c |   22 ++++++++++++----------
 arch/x86/kernel/irq_32.c  |    2 +-
 arch/x86/kernel/irq_64.c  |    2 +-
 include/asm-x86/desc.h    |   21 +++++++++++++--------
 include/asm-x86/irq.h     |    2 ++
 include/linux/irq.h       |    5 +++++
 7 files changed, 34 insertions(+), 22 deletions(-)

Index: linux/arch/x86/kernel/apic.c
===================================================================
--- linux.orig/arch/x86/kernel/apic.c	2008-09-10 12:09:43.000000000 -0500
+++ linux/arch/x86/kernel/apic.c	2008-09-10 12:10:35.000000000 -0500
@@ -119,8 +119,6 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok
 int first_static_system_vector = 0xfe;
 int last_device_vector = 0xfd;
 
-char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
-
 /*
  * Debug level, exported for io_apic.c
  */
Index: linux/include/asm-x86/desc.h
===================================================================
--- linux.orig/include/asm-x86/desc.h	2008-09-10 12:09:43.000000000 -0500
+++ linux/include/asm-x86/desc.h	2008-09-10 12:10:35.000000000 -0500
@@ -6,6 +6,7 @@
 #include <asm/ldt.h>
 #include <asm/mmu.h>
 #include <linux/smp.h>
+#include <linux/irq.h>
 
 static inline void fill_ldt(struct desc_struct *desc,
 			    const struct user_desc *info)
@@ -329,14 +330,18 @@ extern char system_vectors[];
 
 static inline void alloc_static_system_vector(int vector)
 {
-	if (system_vectors[vector] == SYS_VECTOR_FREE) {
-		system_vectors[vector] = SYS_VECTOR_ALLOCED;
-		if (first_static_system_vector > vector)
-			first_static_system_vector = vector;
-		if (last_device_vector > vector - 1)
-			last_device_vector = vector - 1;
-	} else
-		BUG();
+	unsigned long flags;
+	bool ret;
+
+	spin_lock_irqsave(&vector_lock, flags);
+	ret = __grab_irq_vector(NON_IRQ_DESC, vector, &cpu_possible_map);
+	BUG_ON(ret == false);
+
+	if (first_static_system_vector > vector)
+		first_static_system_vector = vector;
+	if (last_device_vector > vector - 1)
+		last_device_vector = vector - 1;
+	spin_unlock_irqrestore(&vector_lock, flags);
 }
 
 static inline void alloc_intr_gate(unsigned int n, void *addr)
Index: linux/arch/x86/kernel/io_apic.c
===================================================================
--- linux.orig/arch/x86/kernel/io_apic.c	2008-09-10 12:09:43.000000000 -0500
+++ linux/arch/x86/kernel/io_apic.c	2008-09-10 14:17:12.000000000 -0500
@@ -70,7 +70,7 @@
 int sis_apic_bug = -1;
 
 static DEFINE_SPINLOCK(ioapic_lock);
-static DEFINE_SPINLOCK(vector_lock);
+DEFINE_SPINLOCK(vector_lock);
 
 /*
  * # of IRQ routing registers
@@ -1221,13 +1221,15 @@ bool __grab_irq_vector(struct irq_desc *
 	for_each_cpu_mask_nr(cpu, *new_domain_mask)
 		per_cpu(vector_irq, cpu)[vector] = desc;
 
-	cfg = irq_cfg(desc->irq);
-	if (cfg->vector) {
-		cfg->move_in_progress = 1;
-		cfg->old_domain = cfg->domain;
+	if (desc != NON_IRQ_DESC) {
+		cfg = irq_cfg(desc->irq);
+		if (cfg->vector) {
+			cfg->move_in_progress = 1;
+			cfg->old_domain = cfg->domain;
+		}
+		cfg->vector = vector;
+		cfg->domain = *new_domain_mask;
 	}
-	cfg->vector = vector;
-	cfg->domain = *new_domain_mask;
 
 	return true;
 }
@@ -1387,13 +1389,13 @@ void __setup_vector_irq(int cpu)
 			continue;
 		vector = cfg->vector;
 		desc = irq_to_desc(irq);
-		BUG_ON(desc == NULL);
+		BUG_ON(desc == NULL || desc == NON_IRQ_DESC);
 		per_cpu(vector_irq, cpu)[vector] = desc;
 	}
 	/* Mark the free vectors */
 	for (vector = 0; vector < NR_VECTORS; ++vector) {
 		desc = per_cpu(vector_irq, cpu)[vector];
-		if (desc == NULL)
+		if (desc == NULL || desc == NON_IRQ_DESC)
 			continue;
 
 		cfg = irq_cfg(desc->irq);
@@ -2437,7 +2439,7 @@ asmlinkage void smp_irq_move_cleanup_int
 		struct irq_cfg *cfg;
 
 		desc = __get_cpu_var(vector_irq)[vector];
-		if (desc == NULL)
+		if (desc == NULL || desc == NON_IRQ_DESC)
 			continue;
 
 		cfg = irq_cfg(desc->irq);
Index: linux/include/linux/irq.h
===================================================================
--- linux.orig/include/linux/irq.h	2008-09-10 12:09:43.000000000 -0500
+++ linux/include/linux/irq.h	2008-09-10 12:10:35.000000000 -0500
@@ -390,6 +390,11 @@ set_irq_chained_handler(unsigned int irq
 extern void set_irq_noprobe(unsigned int irq);
 extern void set_irq_probe(unsigned int irq);
 
+extern bool __grab_irq_vector(struct irq_desc *desc, unsigned int vector,
+			      cpumask_t *new_domain_mask);
+
+#define NON_IRQ_DESC	((struct irq_desc *)-1UL)
+
 /* Handle dynamic irq device vector allocation and deallocation */
 extern unsigned int create_irq_nr(unsigned int irq_want);
 extern int create_irq(void);
Index: linux/arch/x86/kernel/irq_32.c
===================================================================
--- linux.orig/arch/x86/kernel/irq_32.c	2008-09-10 12:08:37.000000000 -0500
+++ linux/arch/x86/kernel/irq_32.c	2008-09-10 12:10:35.000000000 -0500
@@ -234,7 +234,7 @@ unsigned int do_IRQ(struct pt_regs *regs
 	overflow = check_stack_overflow();
 
 	desc = __get_cpu_var(vector_irq)[vector];
-	if (unlikely(desc == NULL)) {
+	if (unlikely(desc == NULL || desc == NON_IRQ_DESC)) {
 		printk(KERN_EMERG "%s: cannot handle IRQ vector %#x cpu %d\n",
 					__func__, vector, smp_processor_id());
 		BUG();
Index: linux/arch/x86/kernel/irq_64.c
===================================================================
--- linux.orig/arch/x86/kernel/irq_64.c	2008-09-10 12:08:37.000000000 -0500
+++ linux/arch/x86/kernel/irq_64.c	2008-09-10 12:10:35.000000000 -0500
@@ -222,7 +222,7 @@ asmlinkage unsigned int do_IRQ(struct pt
 #endif
 
 	desc = __get_cpu_var(vector_irq)[vector];
-	if (likely(desc != NULL)) {
+	if (likely(desc != NULL && desc != NON_IRQ_DESC)) {
 		generic_handle_irq_desc(desc->irq, desc);
 	} else {
 		if (!disable_apic)
Index: linux/include/asm-x86/irq.h
===================================================================
--- linux.orig/include/asm-x86/irq.h	2008-09-10 12:07:29.000000000 -0500
+++ linux/include/asm-x86/irq.h	2008-09-10 14:22:12.000000000 -0500
@@ -47,4 +47,6 @@ extern void native_init_IRQ(void);
 /* Interrupt vector management */
 extern DECLARE_BITMAP(used_vectors, NR_VECTORS);
 
+extern spinlock_t vector_lock;
+
 #endif /* ASM_X86__IRQ_H */

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [RFC 4/4] switch non-standard SYSCALL_VECTOR allocation to use vector_irq[]
  2008-09-11 15:23       ` [RFC 0/4] dynamically " Dean Nelson
                           ` (2 preceding siblings ...)
  2008-09-11 15:28         ` [RFC 3/4] switch static system vector allocation to use vector_irq[] Dean Nelson
@ 2008-09-11 15:29         ` Dean Nelson
  2008-09-14 15:40           ` Ingo Molnar
  2008-09-14 15:42           ` Ingo Molnar
  2008-09-11 20:04         ` [RFC 0/4] dynamically allocate arch specific system vectors H. Peter Anvin
  2008-09-14 15:35         ` Ingo Molnar
  5 siblings, 2 replies; 35+ messages in thread
From: Dean Nelson @ 2008-09-11 15:29 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Alan Mayer, Ingo Molnar, jeremy, rusty, suresh.b.siddha,
	torvalds, linux-kernel, H. Peter Anvin, Thomas Gleixner,
	Yinghai Lu

Replace the current use of used_vectors[] for the allocation of a non-standard
SYSCALL_VECTOR by also using the per_cpu variable vector_irq[].

Signed-off-by: Dean Nelson <dcn@sgi.com>

---

 arch/x86/kernel/traps_32.c            |   17 +++++++++++------
 drivers/lguest/interrupts_and_traps.c |   28 ++++++++++++++++++++++------
 include/asm-x86/irq.h                 |    3 ---
 3 files changed, 33 insertions(+), 15 deletions(-)

Index: linux/arch/x86/kernel/traps_32.c
===================================================================
--- linux.orig/arch/x86/kernel/traps_32.c	2008-09-10 14:25:23.000000000 -0500
+++ linux/arch/x86/kernel/traps_32.c	2008-09-10 14:25:28.000000000 -0500
@@ -63,9 +63,6 @@
 
 #include "mach_traps.h"
 
-DECLARE_BITMAP(used_vectors, NR_VECTORS);
-EXPORT_SYMBOL_GPL(used_vectors);
-
 asmlinkage int system_call(void);
 
 /* Do we ignore FPU interrupts ? */
@@ -1189,6 +1186,8 @@ asmlinkage void math_emulate(long arg)
 void __init trap_init(void)
 {
 	int i;
+	bool ret;
+	unsigned long flags;
 
 #ifdef CONFIG_EISA
 	void __iomem *p = early_ioremap(0x0FFFD9, 4);
@@ -1236,10 +1235,16 @@ void __init trap_init(void)
 	set_system_gate(SYSCALL_VECTOR, &system_call);
 
 	/* Reserve all the builtin and the syscall vector: */
-	for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
-		set_bit(i, used_vectors);
+	spin_lock_irqsave(&vector_lock, flags);
+	for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) {
+		ret =__grab_irq_vector(NON_IRQ_DESC, i, &cpu_possible_map);
+		BUG_ON(ret == false);
+	}
 
-	set_bit(SYSCALL_VECTOR, used_vectors);
+	ret = __grab_irq_vector(NON_IRQ_DESC, SYSCALL_VECTOR,
+				&cpu_possible_map);
+	BUG_ON(ret == false);
+	spin_unlock_irqrestore(&vector_lock, flags);
 
 	/*
 	 * Should be a barrier for any external CPU state:
Index: linux/include/asm-x86/irq.h
===================================================================
--- linux.orig/include/asm-x86/irq.h	2008-09-10 14:25:23.000000000 -0500
+++ linux/include/asm-x86/irq.h	2008-09-10 14:25:28.000000000 -0500
@@ -44,9 +44,6 @@ extern unsigned int do_IRQ(struct pt_reg
 extern void init_IRQ(void);
 extern void native_init_IRQ(void);
 
-/* Interrupt vector management */
-extern DECLARE_BITMAP(used_vectors, NR_VECTORS);
-
 extern spinlock_t vector_lock;
 
 #endif /* ASM_X86__IRQ_H */
Index: linux/drivers/lguest/interrupts_and_traps.c
===================================================================
--- linux.orig/drivers/lguest/interrupts_and_traps.c	2008-09-10 14:25:23.000000000 -0500
+++ linux/drivers/lguest/interrupts_and_traps.c	2008-09-10 14:25:28.000000000 -0500
@@ -221,19 +221,35 @@ bool check_syscall_vector(struct lguest 
 
 int init_interrupts(void)
 {
+	unsigned long flags;
+	bool ret;
+
 	/* If they want some strange system call vector, reserve it now */
-	if (syscall_vector != SYSCALL_VECTOR
-	    && test_and_set_bit(syscall_vector, used_vectors)) {
-		printk("lg: couldn't reserve syscall %u\n", syscall_vector);
-		return -EBUSY;
+	if (syscall_vector != SYSCALL_VECTOR) {
+		spin_lock_irqsave(&vector_lock, flags);
+		ret = __grab_irq_vector(NON_IRQ_DESC, syscall_vector,
+					&cpu_possible_map);
+		spin_unlock_irqrestore(&vector_lock, flags);
+		if (ret == false) {
+			printk("lg: couldn't reserve syscall %u\n",
+			       syscall_vector);
+			return -EBUSY;
+		}
 	}
 	return 0;
 }
 
 void free_interrupts(void)
 {
-	if (syscall_vector != SYSCALL_VECTOR)
-		clear_bit(syscall_vector, used_vectors);
+	int cpu;
+
+	if (syscall_vector != SYSCALL_VECTOR) {
+		for_each_cpu_mask_nr(cpu, cpu_possible_map) {
+			BUG_ON(per_cpu(vector_irq, cpu)[syscall_vector] !=
+			       NON_IRQ_DESC);
+			per_cpu(vector_irq, cpu)[syscall_vector] = NULL;
+		}
+	}
 }
 
 /*H:220 Now we've got the routines to deliver interrupts, delivering traps like

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC 0/4] dynamically allocate arch specific system vectors
  2008-09-11 15:23       ` [RFC 0/4] dynamically " Dean Nelson
                           ` (3 preceding siblings ...)
  2008-09-11 15:29         ` [RFC 4/4] switch non-standard SYSCALL_VECTOR " Dean Nelson
@ 2008-09-11 20:04         ` H. Peter Anvin
  2008-09-12 11:46           ` Dean Nelson
  2008-09-14 15:35         ` Ingo Molnar
  5 siblings, 1 reply; 35+ messages in thread
From: H. Peter Anvin @ 2008-09-11 20:04 UTC (permalink / raw)
  To: Dean Nelson
  Cc: Eric W. Biederman, Alan Mayer, Ingo Molnar, jeremy, rusty,
	suresh.b.siddha, torvalds, linux-kernel, Thomas Gleixner,
	Yinghai Lu

Dean Nelson wrote:
> 
> We (SGI) need somewhere around eight vectors.
> 
> There are two kernel modules, sgi-gru and sgi-xp (in drivers/misc), that
> each need two vectors. And there's the broadcast assist unit (BAU) that is
> involved in tlb shootdown on uv, which currently uses statically reserved
> vector 0xf8 (UV_BAU_MESSAGE -- see uv_bau_init()). I know of a debugger that
> also uses 0xf8 because it was previously available until UV_BAU_MESSAGE came
> along. The BAU would be happy with a dynamically allocated system vector.
> We have a couple of other things in the works that also need vectors.
> 
> All of these eight or so vectors are only meaningful on SGI uv systems.
> 

Are these kernel-internal vectors, or exposed to userspace (i.e. the INT 
instruction works in userspace)?  From what I'm gathering, I think this 
is the former.

There is the occasional user who wants fixed user-space addressable 
vectors, effectively as secondary system call entry vectors (lguest is 
the only in-kernel user that I know of); those need to be static and 
non-conflicting, which is kind of difficult.  It seems to me that 
anything else probably should be possible to be dynamic.

	-hpa

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC 0/4] dynamically allocate arch specific system vectors
  2008-09-11 20:04         ` [RFC 0/4] dynamically allocate arch specific system vectors H. Peter Anvin
@ 2008-09-12 11:46           ` Dean Nelson
  0 siblings, 0 replies; 35+ messages in thread
From: Dean Nelson @ 2008-09-12 11:46 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Eric W. Biederman, Alan Mayer, Ingo Molnar, jeremy, rusty,
	suresh.b.siddha, torvalds, linux-kernel, Thomas Gleixner,
	Yinghai Lu

On Thu, Sep 11, 2008 at 01:04:35PM -0700, H. Peter Anvin wrote:
> Dean Nelson wrote:
> >
> >We (SGI) need somewhere around eight vectors.
> >
> >There are two kernel modules, sgi-gru and sgi-xp (in drivers/misc), that
> >each need two vectors. And there's the broadcast assist unit (BAU) that is
> >involved in tlb shootdown on uv, which currently uses statically reserved
> >vector 0xf8 (UV_BAU_MESSAGE -- see uv_bau_init()). I know of a debugger 
> >that
> >also uses 0xf8 because it was previously available until UV_BAU_MESSAGE 
> >came
> >along. The BAU would be happy with a dynamically allocated system vector.
> >We have a couple of other things in the works that also need vectors.
> >
> >All of these eight or so vectors are only meaningful on SGI uv systems.
> >
> 
> Are these kernel-internal vectors, or exposed to userspace (i.e. the INT 
> instruction works in userspace)?  From what I'm gathering, I think this 
> is the former.

Yeah, these are kernel-internal vectors and are not exposed to userspace.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC 0/4] dynamically allocate arch specific system vectors
  2008-09-11 15:23       ` [RFC 0/4] dynamically " Dean Nelson
                           ` (4 preceding siblings ...)
  2008-09-11 20:04         ` [RFC 0/4] dynamically allocate arch specific system vectors H. Peter Anvin
@ 2008-09-14 15:35         ` Ingo Molnar
  2008-09-14 15:48           ` Ingo Molnar
  2008-09-15 21:50           ` Dean Nelson
  5 siblings, 2 replies; 35+ messages in thread
From: Ingo Molnar @ 2008-09-14 15:35 UTC (permalink / raw)
  To: Dean Nelson
  Cc: Eric W. Biederman, Alan Mayer, jeremy, rusty, suresh.b.siddha,
	torvalds, linux-kernel, H. Peter Anvin, Thomas Gleixner,
	Yinghai Lu


* Dean Nelson <dcn@sgi.com> wrote:

> On Mon, Aug 11, 2008 at 12:39:22PM -0700, Eric W. Biederman wrote:
> >
> > Although I am not at all convinced that dynamic allocation of
> > the vector number (instead of statically reserving it makes sense).
> 
> We (SGI) need somewhere around eight vectors.
> 
> There are two kernel modules, sgi-gru and sgi-xp (in drivers/misc), 
> that each need two vectors. And there's the broadcast assist unit 
> (BAU) that is involved in tlb shootdown on uv, which currently uses 
> statically reserved vector 0xf8 (UV_BAU_MESSAGE -- see uv_bau_init()). 

while i understand the UV_BAU_MESSAGE case (TLB flushes are special), 
why do sgi-gru and sgi-xp need to go that deep? They are drivers, they 
should be able to make use of an ordinary irq just like the other 2000 
drivers we have do.

> I know of a debugger that also uses 0xf8 because it was previously 
> available until UV_BAU_MESSAGE came along. The BAU would be happy with 
> a dynamically allocated system vector. We have a couple of other 
> things in the works that also need vectors.

which debugger is this?

	Ingo

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC 2/4] introduce dynamically allocated system vectors
  2008-09-11 15:27         ` [RFC 2/4] introduce dynamically allocated system vectors Dean Nelson
@ 2008-09-14 15:39           ` Ingo Molnar
  2008-09-14 15:46           ` Ingo Molnar
  1 sibling, 0 replies; 35+ messages in thread
From: Ingo Molnar @ 2008-09-14 15:39 UTC (permalink / raw)
  To: Dean Nelson
  Cc: Eric W. Biederman, Alan Mayer, jeremy, rusty, suresh.b.siddha,
	torvalds, linux-kernel, H. Peter Anvin, Thomas Gleixner,
	Yinghai Lu


* Dean Nelson <dcn@sgi.com> wrote:

> Introduce the dynamic allocation and deallocation of system vectors which
> are mapped to irq numbers allowing the use of request_irq()/free_irq().
> 
> Signed-off-by: Dean Nelson <dcn@sgi.com>
> 
> ---
> 
>  arch/x86/kernel/apic.c        |    3 
>  arch/x86/kernel/io_apic.c     |  264 +++++++++++++++++++++++++++++++++-----

it's not clean to put it into io_apic.c:

> -static int __assign_irq_vector(int irq, cpumask_t mask)
> +bool __grab_irq_vector(struct irq_desc *desc, unsigned int vector,
> +		       cpumask_t *new_domain_mask)

please put it into arch/x86/kernel/irq.c or so.

this bit:

> Index: linux/include/linux/irq.h
> ===================================================================
> --- linux.orig/include/linux/irq.h	2008-09-10 12:08:46.000000000 -0500
> +++ linux/include/linux/irq.h	2008-09-11 06:53:16.000000000 -0500
> @@ -390,11 +390,22 @@ set_irq_chained_handler(unsigned int irq
>  extern void set_irq_noprobe(unsigned int irq);
>  extern void set_irq_probe(unsigned int irq);
>  
> -/* Handle dynamic irq creation and destruction */
> +/* Handle dynamic irq device vector allocation and deallocation */
>  extern unsigned int create_irq_nr(unsigned int irq_want);
>  extern int create_irq(void);
>  extern void destroy_irq(unsigned int irq);
>  
> +/* Handle dynamic irq system vector allocation and deallocation */
> +extern unsigned int create_irq_system_vector(cpumask_t *mask, int priority,
> +					     char *irq_name,
> +					     int *assigned_vector);
> +#define IRQ_PRIORITY_LOW	1
> +#define IRQ_PRIORITY_HIGH	2
> +
> +extern void destroy_irq_system_vector(unsigned int irq);
> +
> +extern int reserve_system_vectors(int number);

does not belong in the generic kernel code - it's an x86 property.

	Ingo

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC 4/4] switch non-standard SYSCALL_VECTOR allocation to use vector_irq[]
  2008-09-11 15:29         ` [RFC 4/4] switch non-standard SYSCALL_VECTOR " Dean Nelson
@ 2008-09-14 15:40           ` Ingo Molnar
  2008-09-14 15:42           ` Ingo Molnar
  1 sibling, 0 replies; 35+ messages in thread
From: Ingo Molnar @ 2008-09-14 15:40 UTC (permalink / raw)
  To: Dean Nelson
  Cc: Eric W. Biederman, Alan Mayer, jeremy, rusty, suresh.b.siddha,
	torvalds, linux-kernel, H. Peter Anvin, Thomas Gleixner,
	Yinghai Lu


* Dean Nelson <dcn@sgi.com> wrote:

> Replace the current use of used_vectors[] for the allocation of a 
> non-standard SYSCALL_VECTOR by also using the per_cpu variable 
> vector_irq[].

this:

> Index: linux/arch/x86/kernel/traps_32.c
> ===================================================================
> --- linux.orig/arch/x86/kernel/traps_32.c	2008-09-10 14:25:23.000000000 -0500
> +++ linux/arch/x86/kernel/traps_32.c	2008-09-10 14:25:28.000000000 -0500

> +	ret = __grab_irq_vector(NON_IRQ_DESC, SYSCALL_VECTOR,
> +				&cpu_possible_map);

breaks the build if IO_APIC is disabled. (see previous mail)

	Ingo

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC 4/4] switch non-standard SYSCALL_VECTOR allocation to use vector_irq[]
  2008-09-11 15:29         ` [RFC 4/4] switch non-standard SYSCALL_VECTOR " Dean Nelson
  2008-09-14 15:40           ` Ingo Molnar
@ 2008-09-14 15:42           ` Ingo Molnar
  1 sibling, 0 replies; 35+ messages in thread
From: Ingo Molnar @ 2008-09-14 15:42 UTC (permalink / raw)
  To: Dean Nelson
  Cc: Eric W. Biederman, Alan Mayer, jeremy, rusty, suresh.b.siddha,
	torvalds, linux-kernel, H. Peter Anvin, Thomas Gleixner,
	Yinghai Lu


* Dean Nelson <dcn@sgi.com> wrote:

> +		ret = __grab_irq_vector(NON_IRQ_DESC, syscall_vector,
> +					&cpu_possible_map);
> +		spin_unlock_irqrestore(&vector_lock, flags);
> +		if (ret == false) {
> +			printk("lg: couldn't reserve syscall %u\n",
> +			       syscall_vector);
> +			return -EBUSY;
> +		}

please convert __grab_irq_vector() to a standard return type: -EINVAL (or 
-EBUSY) on failure, vector on success. That will get rid of the 'ret == 
false' ugliness too.

	Ingo

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC 2/4] introduce dynamically allocated system vectors
  2008-09-11 15:27         ` [RFC 2/4] introduce dynamically allocated system vectors Dean Nelson
  2008-09-14 15:39           ` Ingo Molnar
@ 2008-09-14 15:46           ` Ingo Molnar
  1 sibling, 0 replies; 35+ messages in thread
From: Ingo Molnar @ 2008-09-14 15:46 UTC (permalink / raw)
  To: Dean Nelson, Mike Travis
  Cc: Eric W. Biederman, Alan Mayer, jeremy, rusty, suresh.b.siddha,
	torvalds, linux-kernel, H. Peter Anvin, Thomas Gleixner,
	Yinghai Lu


* Dean Nelson <dcn@sgi.com> wrote:

> @@ -1219,42 +1246,40 @@ static int __assign_irq_vector(int irq, 
>  	 * 0x80, because int 0x80 is hm, kind of importantish. ;)
>  	 */
>  	static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
> -	unsigned int old_vector;
> +	cpumask_t target_cpus_mask;

hm. One SGI developer (Mike Travis) is busy removing on-kernel-stack 
cpumask_t variables. The other one (you) is busy reintroducing them:

> +	cpumask_t target_cpus_mask;
> +	cpumask_t domain;
> +	cpumask_t new_domain_mask = CPU_MASK_NONE;

in multiple ...

> +	cpumask_t mask = TARGET_CPUS;

... functions.

	Ingo

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC 0/4] dynamically allocate arch specific system vectors
  2008-09-14 15:35         ` Ingo Molnar
@ 2008-09-14 15:48           ` Ingo Molnar
  2008-09-15 21:50           ` Dean Nelson
  1 sibling, 0 replies; 35+ messages in thread
From: Ingo Molnar @ 2008-09-14 15:48 UTC (permalink / raw)
  To: Dean Nelson
  Cc: Eric W. Biederman, Alan Mayer, jeremy, rusty, suresh.b.siddha,
	torvalds, linux-kernel, H. Peter Anvin, Thomas Gleixner,
	Yinghai Lu


* Ingo Molnar <mingo@elte.hu> wrote:

> > There are two kernel modules, sgi-gru and sgi-xp (in drivers/misc), 
> > that each need two vectors. And there's the broadcast assist unit 
> > (BAU) that is involved in tlb shootdown on uv, which currently uses 
> > statically reserved vector 0xf8 (UV_BAU_MESSAGE -- see 
> > uv_bau_init()).
> 
> while i understand the UV_BAU_MESSAGE case (TLB flushes are special), 
> why does sgi-gru and sgi-xp need to go that deep? They are drivers, 
> they should be able to make use of an ordinary irq just like the other 
> 2000 drivers we have do.

... but all in all, i still like this concept as it's a nice clean-up. I 
just don't think it should be exposed to generic drivers, and i think it 
needs further fixes and cleanups. (see my previous mails)

	Ingo

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC 0/4] dynamically allocate arch specific system vectors
  2008-09-14 15:35         ` Ingo Molnar
  2008-09-14 15:48           ` Ingo Molnar
@ 2008-09-15 21:50           ` Dean Nelson
  2008-09-16  8:24             ` Ingo Molnar
  1 sibling, 1 reply; 35+ messages in thread
From: Dean Nelson @ 2008-09-15 21:50 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Eric W. Biederman, Alan Mayer, jeremy, rusty, suresh.b.siddha,
	torvalds, linux-kernel, H. Peter Anvin, Thomas Gleixner,
	Yinghai Lu

On Sun, Sep 14, 2008 at 05:35:22PM +0200, Ingo Molnar wrote:
> 
> * Dean Nelson <dcn@sgi.com> wrote:
> 
> > On Mon, Aug 11, 2008 at 12:39:22PM -0700, Eric W. Biederman wrote:
> > >
> > > Although I am not at all convinced that dynamic allocation of
> > > the vector number (instead of statically reserving it makes sense).
> > 
> > We (SGI) need somewhere around eight vectors.
> > 
> > There are two kernel modules, sgi-gru and sgi-xp (in drivers/misc), 
> > that each need two vectors. And there's the broadcast assist unit 
> > (BAU) that is involved in tlb shootdown on uv, which currently uses 
> > statically reserved vector 0xf8 (UV_BAU_MESSAGE -- see uv_bau_init()). 
> 
> while i understand the UV_BAU_MESSAGE case (TLB flushes are special), 
> why does sgi-gru and sgi-xp need to go that deep? They are drivers, they 
> should be able to make use of an ordinary irq just like the other 2000 
> drivers we have do.

The sgi-gru driver needs to be able to allocate a single irq/vector pair for
all CPUs even those that are not currently online. The sgi-xp driver has
similar but not as stringent needs.

The current __assign_irq_vector() restricts the allocation of the irq/vector
pair to a single CPU and its vector_allocation_domain().


> > I know of a debugger that also uses 0xf8 because it was previously 
> > available until UV_BAU_MESSAGE came along. The BAU would be happy with 
> > a dynamically allocated system vector. We have a couple of other 
> > things in the works that also need vectors.
> 
> which debugger is this?

KDB.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC 0/4] dynamically allocate arch specific system vectors
  2008-09-15 21:50           ` Dean Nelson
@ 2008-09-16  8:24             ` Ingo Molnar
  2008-09-16 20:46               ` Dean Nelson
  0 siblings, 1 reply; 35+ messages in thread
From: Ingo Molnar @ 2008-09-16  8:24 UTC (permalink / raw)
  To: Dean Nelson
  Cc: Eric W. Biederman, Alan Mayer, jeremy, rusty, suresh.b.siddha,
	torvalds, linux-kernel, H. Peter Anvin, Thomas Gleixner,
	Yinghai Lu


* Dean Nelson <dcn@sgi.com> wrote:

> > while i understand the UV_BAU_MESSAGE case (TLB flushes are 
> > special), why does sgi-gru and sgi-xp need to go that deep? They are 
> > drivers, they should be able to make use of an ordinary irq just 
> > like the other 2000 drivers we have do.
> 
> The sgi-gru driver needs to be able to allocate a single irq/vector 
> pair for all CPUs even those that are not currently online. The sgi-xp 
> driver has similar but not as stringent needs.

why does it need to allocate a single irq/vector pair? Why is a regular 
interrupt not good?

	Ingo

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC 0/4] dynamically allocate arch specific system vectors
  2008-09-16  8:24             ` Ingo Molnar
@ 2008-09-16 20:46               ` Dean Nelson
  2008-09-17 17:30                 ` Dimitri Sivanich
  2008-09-17 19:15                 ` H. Peter Anvin
  0 siblings, 2 replies; 35+ messages in thread
From: Dean Nelson @ 2008-09-16 20:46 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Eric W. Biederman, Alan Mayer, jeremy, rusty, suresh.b.siddha,
	torvalds, linux-kernel, H. Peter Anvin, Thomas Gleixner,
	Yinghai Lu

On Tue, Sep 16, 2008 at 10:24:48AM +0200, Ingo Molnar wrote:
> 
> * Dean Nelson <dcn@sgi.com> wrote:
> 
> > > while i understand the UV_BAU_MESSAGE case (TLB flushes are 
> > > special), why does sgi-gru and sgi-xp need to go that deep? They are 
> > > drivers, they should be able to make use of an ordinary irq just 
> > > like the other 2000 drivers we have do.
> > 
> > The sgi-gru driver needs to be able to allocate a single irq/vector 
> > pair for all CPUs even those that are not currently online. The sgi-xp 
> > driver has similar but not as stringent needs.
> 
> why does it need to allocate a single irq/vector pair? Why is a regular 
> interrupt not good?

When you speak of a 'regular interrupt' I assume you are referring to simply
the irq number, with the knowledge of what vector and CPU(s) it is mapped to
being hidden?


    sgi-gru driver

The GRU is not an actual external device that is connected to an IOAPIC.
The gru is a hardware mechanism that is embedded in the node controller
(UV hub) that directly connects to the cpu socket. Any cpu (with permission)
can do direct loads and stores to the gru. Some of these stores will result
in an interrupt being sent back to the cpu that did the store.

The interrupt vector used for this interrupt is not in an IOAPIC. Instead
it must be loaded into the GRU at boot or driver initialization time.

The OS needs to route these interrupts back to the GRU driver interrupt
handler on the cpu that received the interrupt. Also, this is a performance
critical path. There should be no globally shared cachelines involved in the
routing.

The actual vector associated with the IRQ does not matter as long as it is
a relatively high priority interrupt. The vector does need to be mapped to
all of the possible CPUs in the partition. The GRU driver needs to know
the vector's value, so that it can load it into the GRU.

    sgi-xp driver

The sgi-xp driver utilizes the node controller's message queue capability to
send messages from one system partition (a single SSI) to another partition.

A message queue can be configured to have the node controller raise an
interrupt whenever a message is written into it. This configuration is
accomplished by setting up two processor writable MMRs located in the
node controller. The vector number and apicid of the targeted CPU need
to be written into one of these MMRs. There is no IOAPIC associated with
this.

So one thought was that, once insmod'd, sgi-xp would allocate a message queue,
allocate an irq/vector pair for a CPU located on the node where the message
queue resides, and then set the MMRs with the memory address and length of the
message queue and the vector and CPU's apicid. And then repeat, as there are
actually two message queues required by this driver.


I hope this helps answer your question, or at least shows you what problem
we are trying to solve.

Thanks,
Dean


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC 0/4] dynamically allocate arch specific system vectors
  2008-09-16 20:46               ` Dean Nelson
@ 2008-09-17 17:30                 ` Dimitri Sivanich
  2008-09-17 18:59                   ` Eric W. Biederman
  2008-09-17 19:15                 ` H. Peter Anvin
  1 sibling, 1 reply; 35+ messages in thread
From: Dimitri Sivanich @ 2008-09-17 17:30 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Dean Nelson, Eric W. Biederman, Alan Mayer, jeremy, rusty,
	suresh.b.siddha, torvalds, linux-kernel, H. Peter Anvin,
	Thomas Gleixner, Yinghai Lu

On Tue, Sep 16, 2008 at 03:46:54PM -0500, Dean Nelson wrote:
> On Tue, Sep 16, 2008 at 10:24:48AM +0200, Ingo Molnar wrote:
> > 
> > * Dean Nelson <dcn@sgi.com> wrote:
> > 
> > > > while i understand the UV_BAU_MESSAGE case (TLB flushes are 
> > > > special), why does sgi-gru and sgi-xp need to go that deep? They are 
> > > > drivers, they should be able to make use of an ordinary irq just 
> > > > like the other 2000 drivers we have do.
> > > 
> > > The sgi-gru driver needs to be able to allocate a single irq/vector 
> > > pair for all CPUs even those that are not currently online. The sgi-xp 
> > > driver has similar but not as stringent needs.
> > 
> > why does it need to allocate a single irq/vector pair? Why is a regular 
> > interrupt not good?
> 
> When you speak of a 'regular interrupt' I assume you are referring to simply
> the irq number, with the knowledge of what vector and CPU(s) it is mapped to
> being hidden?
> 
> 
>     sgi-gru driver
> 
> The GRU is not an actual external device that is connected to an IOAPIC.
> The gru is a hardware mechanism that is embedded in the node controller
> (UV hub) that directly connects to the cpu socket. Any cpu (with permission)
> can do direct loads and stores to the gru. Some of these stores will result
> in an interrupt being sent back to the cpu that did the store.
> 
> The interrupt vector used for this interrupt is not in an IOAPIC. Instead
> it must be loaded into the GRU at boot or driver initialization time.
> 
> The OS needs to route these interrupts back to the GRU driver interrupt
> handler on the cpu that received the interrupt. Also, this is a performance
> critical path. There should be no globally shared cachelines involved in the
> routing.
> 
> The actual vector associated with the IRQ does not matter as long as it is
> a relatively high priority interrupt. The vector does need to be mapped to
> all of the possible CPUs in the partition. The GRU driver needs to know
> vector's value, so that it can load it into the GRU.
> 
>     sgi-xp driver
> 
> The sgi-xp driver utilizes the node controller's message queue capability to
> send messages from one system partition (a single SSI) to another partition.
> 
> A message queue can be configured to have the node controller raise an
> interrupt whenever a message is written into it. This configuration is
> accomplished by setting up two processor writable MMRs located in the
> node controller. The vector number and apicid of the targeted CPU need
> to be written into one of these MMRs. There is no IOAPIC associated with
> this.
> 
> So one thought was that, once insmod'd, sgi-xp would allocate a message queue,
> allocate an irq/vector pair for a CPU located on the node where the message
> queue resides, and then set the MMRs with the memory address and length of the
> message queue and the vector and CPU's apicid. And then repeat, as there are
> actually two message queues required by this driver.

In addition to the above, the high resolution RTC timers in the UV hardware require that a vector be specified in order to send an interrupt to a specific destination when a timer expires.  The MMRs for these timers require a vector to be or'ed in with other values, including the interrupt's destination.  This is therefore done at run-time.

Like the GRU's vector, this vector is not in an IOAPIC.  This vector would be made available to all cpus within a partition (SSI) and should be coupled with a per-cpu irq.

This is very similar to what was available in earlier SGI hardware and used in drivers/char/mmtimer.c.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC 0/4] dynamically allocate arch specific system vectors
  2008-09-17 17:30                 ` Dimitri Sivanich
@ 2008-09-17 18:59                   ` Eric W. Biederman
  2008-09-18 13:37                     ` Dean Nelson
  0 siblings, 1 reply; 35+ messages in thread
From: Eric W. Biederman @ 2008-09-17 18:59 UTC (permalink / raw)
  To: Dimitri Sivanich
  Cc: Ingo Molnar, Dean Nelson, Alan Mayer, jeremy, rusty,
	suresh.b.siddha, torvalds, linux-kernel, H. Peter Anvin,
	Thomas Gleixner, Yinghai Lu

Dimitri Sivanich <sivanich@sgi.com> writes:

> On Tue, Sep 16, 2008 at 03:46:54PM -0500, Dean Nelson wrote:
>> On Tue, Sep 16, 2008 at 10:24:48AM +0200, Ingo Molnar wrote:
>> > 
>> > * Dean Nelson <dcn@sgi.com> wrote:
>> > 
>> > > > while i understand the UV_BAU_MESSAGE case (TLB flushes are 
>> > > > special), why does sgi-gru and sgi-xp need to go that deep? They are 
>> > > > drivers, they should be able to make use of an ordinary irq just 
>> > > > like the other 2000 drivers we have do.
>> > > 
>> > > The sgi-gru driver needs to be able to allocate a single irq/vector 
>> > > pair for all CPUs even those that are not currently online. The sgi-xp 
>> > > driver has similar but not as stringent needs.

I need to look at these patches some more (apologies I got busy).

We can not assign a vector to all CPUS.

We don't have the fields for vector -> irq mapping for cpus that
are not online.  So we can only assign to cpus that are online now,
and later add the other cpus when they come online.

I have had that oops, it sucks, I don't want to go back.

Eric




^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC 0/4] dynamically allocate arch specific system vectors
  2008-09-16 20:46               ` Dean Nelson
  2008-09-17 17:30                 ` Dimitri Sivanich
@ 2008-09-17 19:15                 ` H. Peter Anvin
  2008-09-17 20:21                   ` Jack Steiner
  1 sibling, 1 reply; 35+ messages in thread
From: H. Peter Anvin @ 2008-09-17 19:15 UTC (permalink / raw)
  To: Dean Nelson
  Cc: Ingo Molnar, Eric W. Biederman, Alan Mayer, jeremy, rusty,
	suresh.b.siddha, torvalds, linux-kernel, Thomas Gleixner,
	Yinghai Lu

Dean Nelson wrote:
> 
>     sgi-gru driver
> 
> The GRU is not an actual external device that is connected to an IOAPIC.
> The gru is a hardware mechanism that is embedded in the node controller
> (UV hub) that directly connects to the cpu socket. Any cpu (with permission)
> can do direct loads and stores to the gru. Some of these stores will result
> in an interrupt being sent back to the cpu that did the store.
> 
> The interrupt vector used for this interrupt is not in an IOAPIC. Instead
> it must be loaded into the GRU at boot or driver initialization time.
> 

Could you clarify there: is this one vector number per CPU, or are you 
issuing a specific vector number and just varying the CPU number?

	-hpa

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC 0/4] dynamically allocate arch specific system vectors
  2008-09-17 19:15                 ` H. Peter Anvin
@ 2008-09-17 20:21                   ` Jack Steiner
  2008-09-17 22:15                     ` Eric W. Biederman
  0 siblings, 1 reply; 35+ messages in thread
From: Jack Steiner @ 2008-09-17 20:21 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Dean Nelson, Ingo Molnar, Eric W. Biederman, Alan Mayer, jeremy,
	rusty, suresh.b.siddha, torvalds, linux-kernel, Thomas Gleixner,
	Yinghai Lu

On Wed, Sep 17, 2008 at 12:15:42PM -0700, H. Peter Anvin wrote:
> Dean Nelson wrote:
> >
> >    sgi-gru driver
> >
> >The GRU is not an actual external device that is connected to an IOAPIC.
> >The gru is a hardware mechanism that is embedded in the node controller
> >(UV hub) that directly connects to the cpu socket. Any cpu (with 
> >permission)
> >can do direct loads and stores to the gru. Some of these stores will result
> >in an interrupt being sent back to the cpu that did the store.
> >
> >The interrupt vector used for this interrupt is not in an IOAPIC. Instead
> >it must be loaded into the GRU at boot or driver initialization time.
> >
> 
> Could you clarify there: is this one vector number per CPU, or are you 
> issuing a specific vector number and just varying the CPU number?

It is one vector for each cpu.

It is more efficient for software if the vector # is the same for all cpus
but the software/hardware can support a unique vector for each cpu. This
assumes, of course, that the driver can determine the irq->vector mapping for
each cpu.


<probably-more-detail-than-you-want>

Physically, the system contains a large number of blades. Each blade has
several processor sockets plus a UV hub (node controller).  There are 2 GRUs
located in each UV hub.

Each GRU supports multiple users simultaneously using the GRU.
Each user is assigned a context number (0 .. N-1). If an exception occurs,
the GRU uses the context number as an index into an array of [vector-apicid] pairs.
The [vector-apicid] identifies the cpu & vector for the interrupt.

Although supported by hardware, we do not intend to send interrupts
off-blade.

The array of [vector-apicid] pairs is located in each GRU and must be
initialized at boot time or when the driver is loaded. There is a
separate array for each GRU.

When the driver receives the interrupt, the vector number (or IRQ number) is
used by the driver to determine the GRU that sent the interrupt.


The simplest scheme would be to assign 2 vectors - one for each GRU in the UV hub.
Vector #0 would be loaded into each "vector" of the  [vector-apicid] array for GRU
#0; vector #1 would be loaded into the [vector-apicid] array for GRU #1.

The [vector-apicid] arrays on all nodes would be identical as far as vectors are
concerned. (Apicids would be different and would target blade-local cpus).
Since interrupts are not sent offnode, the driver can use the vector (irq)
to uniquely identify the source of the interrupt.

However, we have a lot of flexibility here. Any scheme that provides the right
information to the driver is ok. Note that servicing of these interrupts
is likely to be time critical. We need this path to be as efficient as possible.



--- jack





^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC 0/4] dynamically allocate arch specific system vectors
  2008-09-17 20:21                   ` Jack Steiner
@ 2008-09-17 22:15                     ` Eric W. Biederman
  2008-09-18  1:09                       ` H. Peter Anvin
  2008-09-18 19:10                       ` Jack Steiner
  0 siblings, 2 replies; 35+ messages in thread
From: Eric W. Biederman @ 2008-09-17 22:15 UTC (permalink / raw)
  To: Jack Steiner
  Cc: H. Peter Anvin, Dean Nelson, Ingo Molnar, Alan Mayer, jeremy,
	rusty, suresh.b.siddha, torvalds, linux-kernel, Thomas Gleixner,
	Yinghai Lu

Jack Steiner <steiner@sgi.com> writes:

> On Wed, Sep 17, 2008 at 12:15:42PM -0700, H. Peter Anvin wrote:
>> Dean Nelson wrote:
>> >
>> >    sgi-gru driver
>> >
>> >The GRU is not an actual external device that is connected to an IOAPIC.
>> >The gru is a hardware mechanism that is embedded in the node controller
>> >(UV hub) that directly connects to the cpu socket. Any cpu (with 
>> >permission)
>> >can do direct loads and stores to the gru. Some of these stores will result
>> >in an interrupt being sent back to the cpu that did the store.
>> >
>> >The interrupt vector used for this interrupt is not in an IOAPIC. Instead
>> >it must be loaded into the GRU at boot or driver initialization time.
>> >
>> 
>> Could you clarify there: is this one vector number per CPU, or are you 
>> issuing a specific vector number and just varying the CPU number?
>
> It is one vector for each cpu.
>
> It is more efficient for software if the vector # is the same for all cpus
Why?  Especially in terms of irq counting that would seem to lead to cache
line conflicts.

> but the software/hardware can support a unique vector for each cpu. This
> assumes, of course, that the driver can determine the irq->vector mapping for
> each cpu.
>
>
> <probably-more-detail-than-you-want>
>
> Physically, the system contains a large number of blades. Each blade has
> several processor sockets plus a UV hub (node controller).  There are 2 GRUs
> located in each UV hub.
>
> Each GRU supports multiple users simultaneously using the GRU.
> Each user is assigned a context number (0 .. N-1). If an exception occurs,
> the GRU uses the context number as an index into an array of [vector-apicid]
> pairs.
> The [vector-apicid] identifies the cpu & vector for the interrupt.
>
> Although supported by hardware, we do not intend to send interrupts
> off-blade.
>
> The array of [vector-apicid] pairs is located in each GRU and must be
> initialized at boot time or when the driver is loaded. There is a
> separate array for each GRU.
>
> When the driver receives the interrupt, the vector number (or IRQ number) is
> used by the driver to determine the GRU that sent the interrupt.
>
>
> The simpliest scheme would be to assign 2 vectors - one for each GRU in the UV
> hub.
> Vector #0 would be loaded into each "vector" of the [vector-apicid] array for
> GRU
> #0; vector #1 would be loaded into the [vector-apicid] array for GRU #1.
>
> The [vector-apicid] arrays on all nodes would be identical as far as vectors are
> concerned. (Apicids would be different and would target blade-local cpus).
> Since interrupts are not sent offnode, the driver can use the vector (irq)
> to uniquely identify the source of the interrupt.
>
> However, we have a lot of flexibilty here. Any scheme that provides the right
> information to the driver is ok. Note that servicing of these interrupts
> is likely to be time critical. We need this path to be as efficient as possible.

That sounds like you have a non-standard MSI-X vector.  You certainly have all of
the same properties.  At which point create_irq() sounds like what you want.

One irq per cpu, per device.

It is the trend.  Don't worry, all of the high performance drivers are doing it.
That is the path that will be optimized.

What you are proposing is some silly side path that will be ignored, and will be
increasingly less well supported over time as no other hardware does that.  Please
join the rest of the world.  Weird formats for programming irq information
into the hardware are easier to support than many other weird restrictions.

What function does the GRU perform that makes it more important and more special
than other hardware devices that requires it to have a high priority interrupt?

Eric


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC 0/4] dynamically allocate arch specific system vectors
  2008-09-17 22:15                     ` Eric W. Biederman
@ 2008-09-18  1:09                       ` H. Peter Anvin
  2008-09-18 19:10                       ` Jack Steiner
  1 sibling, 0 replies; 35+ messages in thread
From: H. Peter Anvin @ 2008-09-18  1:09 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Jack Steiner, Dean Nelson, Ingo Molnar, Alan Mayer, jeremy,
	rusty, suresh.b.siddha, torvalds, linux-kernel, Thomas Gleixner,
	Yinghai Lu

Eric W. Biederman wrote:
>> It is one vector for each cpu.
>>
>> It is more efficient for software if the vector # is the same for all cpus
> Why?  Especially in terms of irq counting that would seem to lead to cache
> line conflicts.
> 
>> but the software/hardware can support a unique vector for each cpu. This
>> assumes, of course, that the driver can determine the irq->vector mapping for
>> each cpu.
> 
> That sounds like you have a non-standard MSI-X vector.  You certainly have all of
> the same properties.  At which point create_irq() sounds like what you want.
> 
> One irq per cpu, per device.
> 
> It is the trend.  Don't worry all of the high performance drivers are doing it.
> That is the path that will be optimized.
> 

In particular, it's just another interrupt type.  We already have quite 
a few of those, from XT-PIC to the various IOAPIC ones, to MSI and MSI-X.

Just treating these as variants of MSI seems to me to make most sense, too.

	-hpa


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC 0/4] dynamically allocate arch specific system vectors
  2008-09-17 18:59                   ` Eric W. Biederman
@ 2008-09-18 13:37                     ` Dean Nelson
  2008-09-18 19:18                       ` H. Peter Anvin
  0 siblings, 1 reply; 35+ messages in thread
From: Dean Nelson @ 2008-09-18 13:37 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Ingo Molnar, Alan Mayer, Dimitri Sivanich, jeremy, rusty,
	suresh.b.siddha, torvalds, linux-kernel, H. Peter Anvin,
	Thomas Gleixner, Yinghai Lu

On Wed, Sep 17, 2008 at 11:59:05AM -0700, Eric W. Biederman wrote:
> 
> I need to look at these patches some more (apologies I got busy).

Thanks for taking a look.

I've got a not-quite-complete new version of the patchset that addresses
most of the issues raised by Ingo. At its heart is the following:


 arch/x86/kernel/irq.c___(new)_________________________________

#include <linux/irq.h>

/*
 * Scratch mask written by grab_irq_vector(): the caller's domain restricted
 * to cpu_online_map.  It is file-scope state rather than a local variable,
 * so it is overwritten on every call.
 */
static cpumask_t domain_online;

/*
 * grab_irq_vector - reserve @vector for @desc on the online CPUs of @domain
 * @desc:	value stored into each selected CPU's vector_irq[@vector] slot
 * @vector:	index into the per-cpu vector_irq[] table to reserve
 * @domain:	requested set of CPUs; only those also in cpu_online_map
 *		are examined and updated
 *
 * The reservation is all-or-nothing: every selected CPU is checked first,
 * and the table is only written once none of them has the slot in use.
 *
 * Returns @vector on success, or -EBUSY if any selected CPU already has a
 * non-NULL entry at vector_irq[@vector] (in which case nothing is modified).
 *
 * NOTE(review): if no CPU in @domain is online, neither loop appears to
 * have anything to visit, yet @vector is still returned -- confirm that
 * callers expect success in that case.
 */
int grab_irq_vector(struct irq_desc *desc, unsigned int vector,
		   cpumask_t *domain)
{
	/* Must be called with vector lock held */
	int cpu;

	/* Restrict the request to the CPUs that are currently online */
	cpus_and(domain_online, *domain, cpu_online_map);

	/* Pass 1: refuse if the slot is already taken on any selected CPU */
	for_each_cpu_mask_nr(cpu, domain_online) {
		if (per_cpu(vector_irq, cpu)[vector] != NULL)
			return -EBUSY;
	}

	/* Pass 2: the slot is free on every selected CPU, so reserve it */
	for_each_cpu_mask_nr(cpu, domain_online)
		per_cpu(vector_irq, cpu)[vector] = desc;

	return vector;
}


 arch/x86/kernel/io_apic.c_____________________________________

/*
 * ioapic_grab_irq_vector - reserve @vector via grab_irq_vector() and record
 * the result in the irq_cfg of @desc->irq
 * @desc:	descriptor passed through to grab_irq_vector(); its ->irq
 *		selects the irq_cfg that gets updated
 * @vector:	vector to reserve
 * @domain:	requested CPU set, passed through to grab_irq_vector()
 *
 * The irq_cfg is only touched when grab_irq_vector() returns @vector.
 *
 * Returns whatever grab_irq_vector() returned, unchanged.
 *
 * NOTE(review): the result of irq_cfg() is dereferenced without a check --
 * confirm it cannot be NULL for descriptors that reach this function.
 */
static int ioapic_grab_irq_vector(struct irq_desc *desc, unsigned int vector,
				  cpumask_t *domain)
{
	/* Must be called with vector lock held */
	struct irq_cfg *cfg;
	int ret;

	ret = grab_irq_vector(desc, vector, domain);
	if (ret == vector) {
		cfg = irq_cfg(desc->irq);
		/*
		 * A non-zero cfg->vector means this irq already had a vector:
		 * flag a move and save the previous domain before it is
		 * overwritten below.
		 */
		if (cfg->vector) {
			cfg->move_in_progress = 1;
			cfg->old_domain = cfg->domain;
		}
		cfg->vector = vector;
		/*
		 * Stores the full requested domain, not the online-only
		 * subset that grab_irq_vector() computes internally.
		 */
		cfg->domain = *domain;
	}
	return ret;
}

I've also restructured the order of the patchset so that the first
three patches switch vector_irq from an irq # to an irq_desc pointer, 
replace system_vectors[] usage by a call to grab_irq_vector(), and
finally replace used_vectors[] usage by a call to grab_irq_vector().
You may find these three patches meaningful in and of themselves
since Ingo seemed to indicate that they cleaned up some of the code.

The last patch adds the dynamic allocation of system irq, which, if I'm
understanding correctly, needs to be reworked so that SGI's UV irq
needs get satisfied through a variant of MSI. The MSI code isn't
something I've looked at before.


> We can not assign a vector to all CPUS.
> 
> We don't have the fields for vector -> irq mapping for cpus that
> are not-online.  So we can only assign to cpus that are online now.
> And later add the other cpus when they come online.
> 
> I have had that oops, it sucks, I don't want to go back.

I reworked the above functions so that grab_irq_vector() enforces
online CPUS only. (I'm assuming your statements apply to per_cpu
vector_irq, as well. If not, then grab_irq_vector() could accept a
cpu_possible_map domain if NON_IRQ_DESC is passed as the desc pointer.)

We'll need to ensure that when a CPU comes online that its vector_irqs
that need to be set to NON_IRQ_DESC, get set properly by
__setup_vector_irq().


Thanks,
Dean

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC 0/4] dynamically allocate arch specific system vectors
  2008-09-17 22:15                     ` Eric W. Biederman
  2008-09-18  1:09                       ` H. Peter Anvin
@ 2008-09-18 19:10                       ` Jack Steiner
  2008-09-19  0:28                         ` Eric W. Biederman
  1 sibling, 1 reply; 35+ messages in thread
From: Jack Steiner @ 2008-09-18 19:10 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: H. Peter Anvin, Dean Nelson, Ingo Molnar, Alan Mayer, jeremy,
	rusty, suresh.b.siddha, torvalds, linux-kernel, Thomas Gleixner,
	Yinghai Lu

On Wed, Sep 17, 2008 at 03:15:07PM -0700, Eric W. Biederman wrote:
> Jack Steiner <steiner@sgi.com> writes:
> 
> > On Wed, Sep 17, 2008 at 12:15:42PM -0700, H. Peter Anvin wrote:
> >> Dean Nelson wrote:
> >> >
> >> >    sgi-gru driver
> >> >
> >> >The GRU is not an actual external device that is connected to an IOAPIC.
> >> >The gru is a hardware mechanism that is embedded in the node controller
> >> >(UV hub) that directly connects to the cpu socket. Any cpu (with 
> >> >permission)
> >> >can do direct loads and stores to the gru. Some of these stores will result
> >> >in an interrupt being sent back to the cpu that did the store.
> >> >
> >> >The interrupt vector used for this interrupt is not in an IOAPIC. Instead
> >> >it must be loaded into the GRU at boot or driver initialization time.
> >> >
> >> 
> >> Could you clarify there: is this one vector number per CPU, or are you 
> >> issuing a specific vector number and just varying the CPU number?
> >
> > It is one vector for each cpu.
> >
> > It is more efficient for software if the vector # is the same for all cpus
> Why?  Especially in terms of irq counting that would seem to lead to cache
> line conflicts.

Functionally, it does not matter. However, if the IRQ is not a per-cpu IRQ, a
very large number of IRQs (and vectors) may be needed. The GRU requires 32 interrupt
lines on each blade. A large system can currently support up to 512 blades.

After looking thru the MSI code, we are starting to believe that we should separate
the GRU requirements from the XPC requirements. It looks like XPC can easily use
the MSI infrastructure.  XPC needs a small number of IRQs, and interrupts are typically
targeted to a single cpu. They can also be retargeted using the standard methods.

The GRU, OTOH, is more like a timer interrupt or like a co-processor interrupt.
GRU interrupts can occur on any cpu using the GRU. When interrupts do occur, all that
needs to happen is to call an interrupt handler. I'm thinking of something like
the following:

	- permanently reserve 2 system vectors in include/asm-x86/irq_vectors.h
	- in uv_system_init(), call alloc_intr_gate() to route the
	  interrupts to a function in the file containing uv_system_init().
	- initialize the GRU chipset with the vector, etc, ...
	- if an interrupt occurs and the GRU driver is NOT loaded, print
	  an error message (rate limited or one time)

	- provide a special UV hook for the GRU driver to register/deregister a
	  special callback function for GRU interrupts


> 
> > but the software/hardware can support a unique vector for each cpu. This
> > assumes, of course, that the driver can determine the irq->vector mapping for
> > each cpu.
> >
> >
> > <probably-more-detail-than-you-want>
> >
> > Physically, the system contains a large number of blades. Each blade has
> > several processor sockets plus a UV hub (node controller).  There are 2 GRUs
> > located in each UV hub.
> >
> > Each GRU supports multiple users simultaneously using the GRU.
> > Each user is assigned a context number (0 .. N-1). If an exception occurs,
> > the GRU uses the context number as an index into an array of [vector-apicid]
> > pairs.
> > The [vector-apicid] identifies the cpu & vector for the interrupt.
> >
> > Although supported by hardware, we do not intend to send interrupts
> > off-blade.
> >
> > The array of [vector-apicid] pairs is located in each GRU and must be
> > initialized at boot time or when the driver is loaded. There is a
> > separate array for each GRU.
> >
> > When the driver receives the interrupt, the vector number (or IRQ number) is
> > used by the driver to determine the GRU that sent the interrupt.
> >
> >
> > The simplest scheme would be to assign 2 vectors - one for each GRU in the UV
> > hub.
> > Vector #0 would be loaded into each "vector" of the [vector-apicid] array for
> > GRU
> > #0; vector #1 would be loaded into the [vector-apicid] array for GRU #1.
> >
> > The [vector-apicid] arrays on all nodes would be identical as far as vectors are
> > concerned. (Apicids would be different and would target blade-local cpus).
> > Since interrupts are not sent offnode, the driver can use the vector (irq)
> > to uniquely identify the source of the interrupt.
> >
> > However, we have a lot of flexibility here. Any scheme that provides the right
> > information to the driver is ok. Note that servicing of these interrupts
> > is likely to be time critical. We need this path to be as efficient as possible.
> 
> That sounds like you have a non-standard MSI-X vector.  You certainly have all of
> the same properties.  At which point create_irq() sounds like what you want.
> 
> One irq per cpu, per device.
> 
> It is the trend.  Don't worry all of the high performance drivers are doing it.
> That is the path that will be optimized.
> 
> What you are proposing is some silly side path that will be ignored, and will be
> increasingly less well supported over time as no other hardware does that.  Please
> join the rest of the world.  Weird formats for programming irq information
> into the hardware are easier to support than many other weird restrictions.
> 
> What function does the GRU perform that makes it more important and more special
> than other hardware devices that requires it to have a high priority interrupt?
> 
> Eric

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC 0/4] dynamically allocate arch specific system vectors
  2008-09-18 13:37                     ` Dean Nelson
@ 2008-09-18 19:18                       ` H. Peter Anvin
  0 siblings, 0 replies; 35+ messages in thread
From: H. Peter Anvin @ 2008-09-18 19:18 UTC (permalink / raw)
  To: Dean Nelson
  Cc: Eric W. Biederman, Ingo Molnar, Alan Mayer, Dimitri Sivanich,
	jeremy, rusty, suresh.b.siddha, torvalds, linux-kernel,
	Thomas Gleixner, Yinghai Lu

Dean Nelson wrote:
> On Wed, Sep 17, 2008 at 11:59:05AM -0700, Eric W. Biederman wrote:
>> I need to look at these patches some more (apologies I got busy).
> 
> Thanks for taking a look.
> 
> I've got a not-quite-complete new version of the patchset that addresses
> most of the issues raised by Ingo. At its heart is the following:
> 

What would seem most intuitive to me would have been, in this case, to
add this as a new MSI IRQ chip type.  We already have several of those
(PCI vs HT), and this *should* give you all the hooks you need.

	-hpa


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC 0/4] dynamically allocate arch specific system vectors
  2008-09-18 19:10                       ` Jack Steiner
@ 2008-09-19  0:28                         ` Eric W. Biederman
  2008-09-19  8:48                           ` Ingo Molnar
  0 siblings, 1 reply; 35+ messages in thread
From: Eric W. Biederman @ 2008-09-19  0:28 UTC (permalink / raw)
  To: Jack Steiner
  Cc: H. Peter Anvin, Dean Nelson, Ingo Molnar, Alan Mayer, jeremy,
	rusty, suresh.b.siddha, torvalds, linux-kernel, Thomas Gleixner,
	Yinghai Lu

Jack Steiner <steiner@sgi.com> writes:

> On Wed, Sep 17, 2008 at 03:15:07PM -0700, Eric W. Biederman wrote:
>> Jack Steiner <steiner@sgi.com> writes:
>> 
>> > On Wed, Sep 17, 2008 at 12:15:42PM -0700, H. Peter Anvin wrote:
>> >> Dean Nelson wrote:
>> >> >
>> >> >    sgi-gru driver
>> >> >
>> >> >The GRU is not an actual external device that is connected to an IOAPIC.
>> >> >The gru is a hardware mechanism that is embedded in the node controller
>> >> >(UV hub) that directly connects to the cpu socket. Any cpu (with 
>> >> >permission)
>> >> >can do direct loads and stores to the gru. Some of these stores will
> result
>> >> >in an interrupt being sent back to the cpu that did the store.
>> >> >
>> >> >The interrupt vector used for this interrupt is not in an IOAPIC. Instead
>> >> >it must be loaded into the GRU at boot or driver initialization time.
>> >> >
>> >> 
>> >> Could you clarify there: is this one vector number per CPU, or are you 
>> >> issuing a specific vector number and just varying the CPU number?
>> >
>> > It is one vector for each cpu.
>> >
>> > It is more efficient for software if the vector # is the same for all cpus
>> Why?  Especially in terms of irq counting that would seem to lead to cache
>> line conflicts.
>
> Functionally, it does not matter. However, if the IRQ is not a per-cpu IRQ, a
> very large number of IRQs (and vectors) may be needed. The GRU requires 32
> interrupt
> lines on each blade. A large system can currently support up to 512 blades.

Every vendor of high end hardware is saying they intend to provide
1 or 2 queues per cpu and 1 irq per queue.  So the GRU is not special in
that regard.  Also a very large number of IRQs is not a problem as
soon as we start dynamically allocating them, which is currently
in progress.

Once we start dynamically allocating irq_desc structures we can put
them in node-local memory and guarantee there is no data shared between
cpus.

> After looking thru the MSI code, we are starting to believe that we should
> separate
> the GRU requirements from the XPC requirements. It looks like XPC can easily use
> the MSI infrastructure.  XPC needs a small number of IRQs, and interrupts are
> typically
> targeted to a single cpu. They can also be retargeted using the standard
> methods.

Alright. 

I would be completely happy if there were interrupts whose affinity we can
not change, and are always targeted at a single cpu.

> The GRU, OTOH, is more like a timer interrupt or like a co-processor interrupt.
> GRU interrupts can occur on any cpu using the GRU. When interrupts do occur, all
> that
> needs to happen is to call an interrupt handler. I'm thinking of something like
> the following:
>
> 	- permanently reserve 2 system vectors in include/asm-x86/irq_vectors.h
> 	- in uv_system_init(), call alloc_intr_gate() to route the
> 	  interrupts to a function in the file containing uv_system_init().
> 	- initialize the GRU chipset with the vector, etc, ...
> 	- if an interrupt occurs and the GRU driver is NOT loaded, print
> 	  an error message (rate limited or one time)
>
> 	- provide a special UV hook for the GRU driver to register/deregister a
> 	  special callback function for GRU interrupts

That would work.  So far the GRU doesn't sound that special.

For a lot of this I would much rather solve the general case on this
giving us a solution that works for all high end interrupts rather than
one specific solution just for the GRU.  Especially since it looks like
we have most of the infrastructure already present to solve the general
case and we have to develop and review the specific case from scratch.

Eric

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC 0/4] dynamically allocate arch specific system vectors
  2008-09-19  0:28                         ` Eric W. Biederman
@ 2008-09-19  8:48                           ` Ingo Molnar
  0 siblings, 0 replies; 35+ messages in thread
From: Ingo Molnar @ 2008-09-19  8:48 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Jack Steiner, H. Peter Anvin, Dean Nelson, Alan Mayer, jeremy,
	rusty, suresh.b.siddha, torvalds, linux-kernel, Thomas Gleixner,
	Yinghai Lu


* Eric W. Biederman <ebiederm@xmission.com> wrote:

> Jack Steiner <steiner@sgi.com> writes:
> 
> > On Wed, Sep 17, 2008 at 03:15:07PM -0700, Eric W. Biederman wrote:
> >> Jack Steiner <steiner@sgi.com> writes:
> >> 
> >> > On Wed, Sep 17, 2008 at 12:15:42PM -0700, H. Peter Anvin wrote:
> >> >> Dean Nelson wrote:
> >> >> >
> >> >> >    sgi-gru driver
> >> >> >
> >> >> >The GRU is not an actual external device that is connected to an IOAPIC.
> >> >> >The gru is a hardware mechanism that is embedded in the node controller
> >> >> >(UV hub) that directly connects to the cpu socket. Any cpu (with 
> >> >> >permission)
> >> >> >can do direct loads and stores to the gru. Some of these stores will
> > result
> >> >> >in an interrupt being sent back to the cpu that did the store.
> >> >> >
> >> >> >The interrupt vector used for this interrupt is not in an IOAPIC. Instead
> >> >> >it must be loaded into the GRU at boot or driver initialization time.
> >> >> >
> >> >> 
> >> >> Could you clarify there: is this one vector number per CPU, or are you 
> >> >> issuing a specific vector number and just varying the CPU number?
> >> >
> >> > It is one vector for each cpu.
> >> >
> >> > It is more efficient for software if the vector # is the same for all cpus
> >> Why?  Especially in terms of irq counting that would seem to lead to cache
> >> line conflicts.
> >
> > Functionally, it does not matter. However, if the IRQ is not a per-cpu IRQ, a
> > very large number of IRQs (and vectors) may be needed. The GRU requires 32
> > interrupt
> > lines on each blade. A large system can currently support up to 512 blades.
> 
> Every vendor of high end hardware is saying they intend to provide
> 1 or 2 queues per cpu and 1 irq per queue.  So the GRU is not special in
> that regard.  Also a very large number of IRQs is not a problem as
> soon as we start dynamically allocating them, which is currently
> in progress.
> 
> Once we start dynamically allocating irq_desc structures we can put
> them in node-local memory and guarantee there is no data shared between
> cpus.
> 
> > After looking thru the MSI code, we are starting to believe that we should
> > separate
> > the GRU requirements from the XPC requirements. It looks like XPC can easily use
> > the MSI infrastructure.  XPC needs a small number of IRQs, and interrupts are
> > typically
> > targeted to a single cpu. They can also be retargeted using the standard
> > methods.
> 
> Alright. 
> 
> I would be completely happy if there were interrupts whose affinity we can
> not change, and are always targeted at a single cpu.
> 
> > The GRU, OTOH, is more like a timer interrupt or like a co-processor interrupt.
> > GRU interrupts can occur on any cpu using the GRU. When interrupts do occur, all
> > that
> > needs to happen is to call an interrupt handler. I'm thinking of something like
> > the following:
> >
> > 	- permanently reserve 2 system vectors in include/asm-x86/irq_vectors.h
> > 	- in uv_system_init(), call alloc_intr_gate() to route the
> > 	  interrupts to a function in the file containing uv_system_init().
> > 	- initialize the GRU chipset with the vector, etc, ...
> > 	- if an interrupt occurs and the GRU driver is NOT loaded, print
> > 	  an error message (rate limited or one time)
> >
> > 	- provide a special UV hook for the GRU driver to register/deregister a
> > 	  special callback function for GRU interrupts
> 
> That would work.  So far the GRU doesn't sound that special.
> 
> For a lot of this I would much rather solve the general case on this 
> giving us a solution that works for all high end interrupts rather 
> than one specific solution just for the GRU.  Especially since it 
> looks like we have most of the infrastructure already present to solve 
> the general case and we have to develop and review the specific case 
> from scratch.

ok, great.

Dean, just to make sure the useful bits are not lost now that the 
direction has been changed: could you please repost the patchset but 
without the driver API bits? It's still all a nice and useful 
generalization and cleanup of the x86 vector allocation code, and we can 
check it in -tip how well it works in practice.

	Ingo

^ permalink raw reply	[flat|nested] 35+ messages in thread

end of thread, other threads:[~2008-09-19  8:49 UTC | newest]

Thread overview: 35+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-08-08 15:37 [Fwd: [PATCH] x86_64: (NEW) Dynamically allocate arch specific system vectors] Alan Mayer
2008-08-11 16:59 ` [PATCH] x86_64: (NEW) Dynamically allocate arch specific system vectors Ingo Molnar
2008-08-11 17:14   ` Alan Mayer
2008-08-11 19:39     ` Eric W. Biederman
2008-08-11 19:51       ` Ingo Molnar
2008-08-11 19:55         ` Jeremy Fitzhardinge
2008-08-11 20:10         ` Eric W. Biederman
2008-08-11 20:02       ` Alan Mayer
2008-09-11 15:23       ` [RFC 0/4] dynamically " Dean Nelson
2008-09-11 15:25         ` [RFC 1/4] switch vector_irq[] from irq number to irq_desc pointer Dean Nelson
2008-09-11 15:27         ` [RFC 2/4] introduce dynamically allocated system vectors Dean Nelson
2008-09-14 15:39           ` Ingo Molnar
2008-09-14 15:46           ` Ingo Molnar
2008-09-11 15:28         ` [RFC 3/4] switch static system vector allocation to use vector_irq[] Dean Nelson
2008-09-11 15:29         ` [RFC 4/4] switch non-standard SYSCALL_VECTOR " Dean Nelson
2008-09-14 15:40           ` Ingo Molnar
2008-09-14 15:42           ` Ingo Molnar
2008-09-11 20:04         ` [RFC 0/4] dynamically allocate arch specific system vectors H. Peter Anvin
2008-09-12 11:46           ` Dean Nelson
2008-09-14 15:35         ` Ingo Molnar
2008-09-14 15:48           ` Ingo Molnar
2008-09-15 21:50           ` Dean Nelson
2008-09-16  8:24             ` Ingo Molnar
2008-09-16 20:46               ` Dean Nelson
2008-09-17 17:30                 ` Dimitri Sivanich
2008-09-17 18:59                   ` Eric W. Biederman
2008-09-18 13:37                     ` Dean Nelson
2008-09-18 19:18                       ` H. Peter Anvin
2008-09-17 19:15                 ` H. Peter Anvin
2008-09-17 20:21                   ` Jack Steiner
2008-09-17 22:15                     ` Eric W. Biederman
2008-09-18  1:09                       ` H. Peter Anvin
2008-09-18 19:10                       ` Jack Steiner
2008-09-19  0:28                         ` Eric W. Biederman
2008-09-19  8:48                           ` Ingo Molnar

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).