LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH 0/2] cpusets: support for irqs
@ 2008-03-11 5:57 maxk
2008-03-11 5:57 ` [PATCH 1/2] cpuset: cpuset irq affinities maxk
0 siblings, 1 reply; 11+ messages in thread
From: maxk @ 2008-03-11 5:57 UTC (permalink / raw)
To: mingo; +Cc: pj, a.p.zijlstra, linux-kernel, menage
Here is an updated patchset that adds support for irqs to the cpusets.
First part is Peter's patch and the second part is my fixes and extensions for it.
Ingo mentioned that if this is test booted and stuff he'll include it in
sched-devel. I tested it on a couple of x86-64 boxes and everything
seems to work just fine. ie Irqs are handled just like tasks. I tested a
bunch of scenarios (cpu hotplug, child set removal, etc).
btw Ingo, Peter's original patches with cpu_system_map are no longer needed.
Can you please nuke them from sched-devel.
Peter, looks like I'll have some time tomorrow. So I'll move on to the
in kernel 'boot' set. If you have some code for it already send it my way.
I'll test/fix/extend it and send a patchset back.
b/include/linux/cpuset.h | 8 +
b/include/linux/irq.h | 9 +
b/kernel/cpuset.c | 160 ++++++++++++++++++++
b/kernel/irq/chip.c | 5
b/kernel/irq/manage.c | 20 ++
include/linux/irq.h | 2
kernel/cpuset.c | 367 +++++++++++++++++++++++++++++------------------
kernel/irq/manage.c | 28 ++-
8 files changed, 453 insertions(+), 146 deletions(-)
Thanx
Max
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 1/2] cpuset: cpuset irq affinities
2008-03-11 5:57 [PATCH 0/2] cpusets: support for irqs maxk
@ 2008-03-11 5:57 ` maxk
2008-03-11 5:57 ` [PATCH 2/2] cpusets: Improved irq affinity handling maxk
` (2 more replies)
0 siblings, 3 replies; 11+ messages in thread
From: maxk @ 2008-03-11 5:57 UTC (permalink / raw)
To: mingo; +Cc: pj, a.p.zijlstra, linux-kernel, menage
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Hi Paul,
How about something like this; along with the in-kernel version
of /cgroup/boot this could also provide the desired semantics.
Another benefit of this approach would be that it no longer requires
PF_THREAD_BIND, as we'd only stick unbounded kthreads into that cgroup.
(compile tested only)
---
include/linux/irq.h | 9 +++
kernel/cpuset.c | 160 +++++++++++++++++++++++++++++++++++++++++++++++++++
kernel/irq/manage.c | 19 ++++++
3 files changed, 188 insertions(+), 0 deletions(-)
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 176e5e7..450c0de 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -174,11 +174,20 @@ struct irq_desc {
#ifdef CONFIG_PROC_FS
struct proc_dir_entry *dir;
#endif
+#ifdef CONFIG_CPUSETS
+ struct cpuset *cs;
+#endif
const char *name;
} ____cacheline_internodealigned_in_smp;
extern struct irq_desc irq_desc[NR_IRQS];
+struct irq_iterator {
+ int (*function)(struct irq_iterator *, int, struct irq_desc *);
+};
+
+extern int irq_iterator(struct irq_iterator *);
+
/*
* Migration helpers for obsolete names, they will go away:
*/
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index a1b61f4..e82a258 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -50,6 +50,9 @@
#include <linux/time.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>
+#ifdef CONFIG_GENERIC_HARDIRQS
+#include <linux/irq.h>
+#endif
#include <asm/uaccess.h>
#include <asm/atomic.h>
@@ -732,6 +735,44 @@ void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan)
set_cpus_allowed(tsk, (cgroup_cs(scan->cg))->cpus_allowed);
}
+#ifdef CONFIG_GENERIC_HARDIRQS
+struct cpuset_irq_cpumask {
+ struct irq_iterator v;
+ struct cpuset *cs;
+ cpumask_t mask;
+};
+
+static int
+update_irq_cpumask(struct irq_iterator *v, int irq, struct irq_desc *desc)
+{
+ struct cpuset_irq_cpumask *s =
+ container_of(v, struct cpuset_irq_cpumask, v);
+
+ if (desc->cs != s->cs)
+ return 0;
+
+ irq_set_affinity(irq, s->mask);
+
+ return 0;
+}
+
+static void update_irqs_cpumask(struct cpuset *cs)
+{
+ struct cpuset_irq_cpumask s = {
+ .v = { .function = update_irq_cpumask },
+ .cs = cs,
+ };
+
+ cpus_and(s.mask, cpu_online_map, cs->cpus_allowed);
+
+ irq_iterator(&s.v);
+}
+#else
+static void update_irqs_cpumask(struct cpuset *cs)
+{
+}
+#endif
+
/**
* update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
* @cs: the cpuset to consider
@@ -795,6 +836,8 @@ static int update_cpumask(struct cpuset *cs, char *buf)
cgroup_scan_tasks(&scan);
heap_free(&heap);
+ update_irqs_cpumask(cs);
+
if (is_load_balanced)
rebuild_sched_domains();
return 0;
@@ -1056,6 +1099,52 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
return 0;
}
+#ifdef CONFIG_GENERIC_HARDIRQS
+struct cpuset_irq_update {
+ struct irq_iterator v;
+ struct cpuset *cs;
+ int irq;
+};
+
+static int
+cpuset_update_irq(struct irq_iterator *v, int irq, struct irq_desc *desc)
+{
+ struct cpuset_irq_update *s =
+ container_of(v, struct cpuset_irq_update, v);
+ cpumask_t online_set;
+ int ret;
+
+ if (irq != s->irq)
+ return 0;
+
+ cpus_and(online_set, cpu_online_map, s->cs->cpus_allowed);
+
+ ret = irq_set_affinity(irq, online_set);
+ if (!ret)
+ desc->cs = s->cs;
+
+ return ret;
+}
+
+static int update_irqs(struct cpuset *cs, char *buf)
+{
+ struct cpuset_irq_update s = {
+ .v = { .function = cpuset_update_irq },
+ .cs = cs,
+ };
+
+ if (sscanf(buf, "%d", &s.irq) != 1)
+ return -EIO;
+
+ return irq_iterator(&s.v);
+}
+#else
+static int update_irqs(struct cpuset *cs, char *buf)
+{
+ return 0;
+}
+#endif
+
/*
* Frequency meter - How fast is some event occurring?
*
@@ -1206,6 +1295,7 @@ typedef enum {
FILE_MEMORY_PRESSURE,
FILE_SPREAD_PAGE,
FILE_SPREAD_SLAB,
+ FILE_IRQS,
} cpuset_filetype_t;
static ssize_t cpuset_common_file_write(struct cgroup *cont,
@@ -1273,6 +1363,9 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont,
retval = update_flag(CS_SPREAD_SLAB, cs, buffer);
cs->mems_generation = cpuset_mems_generation++;
break;
+ case FILE_IRQS:
+ retval = update_irqs(cs, buffer);
+ break;
default:
retval = -EINVAL;
goto out2;
@@ -1321,6 +1414,59 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
return nodelist_scnprintf(page, PAGE_SIZE, mask);
}
+#ifdef CONFIG_GENERIC_HARDIRQS
+struct cpuset_irq_print {
+ struct irq_iterator v;
+ struct cpuset *cs;
+ char *buf;
+ int len;
+ int buflen;
+};
+
+static int
+cpuset_sprintf_irq(struct irq_iterator *v, int irq, struct irq_desc *desc)
+{
+ struct cpuset_irq_print *s =
+ container_of(v, struct cpuset_irq_print, v);
+
+ if (desc->cs != s->cs)
+ return 0;
+
+ if (s->len > 0)
+ s->len += scnprintf(s->buf + s->len, s->buflen - s->len, " ");
+ s->len += scnprintf(s->buf + s->len, s->buflen - s->len, "%d", irq);
+
+ return 0;
+}
+
+static int cpuset_sprintf_irqlist(char *page, struct cpuset *cs)
+{
+ int ret;
+
+ struct cpuset_irq_print s = {
+ .v = { .function = cpuset_sprintf_irq },
+ .cs = cs,
+ .buf = page,
+ .len = 0,
+ .buflen = PAGE_SIZE,
+ };
+
+ mutex_lock(&callback_mutex);
+ ret = irq_iterator(&s.v);
+ mutex_unlock(&callback_mutex);
+
+ if (!ret)
+ ret = s.len;
+
+ return ret;
+}
+#else
+static int cpuset_sprintf_irqlist(char *page, struct cpuset *cs)
+{
+ return 0;
+}
+#endif
+
static ssize_t cpuset_common_file_read(struct cgroup *cont,
struct cftype *cft,
struct file *file,
@@ -1369,6 +1515,9 @@ static ssize_t cpuset_common_file_read(struct cgroup *cont,
case FILE_SPREAD_SLAB:
*s++ = is_spread_slab(cs) ? '1' : '0';
break;
+ case FILE_IRQS:
+ s += cpuset_sprintf_irqlist(s, cs);
+ break;
default:
retval = -EINVAL;
goto out;
@@ -1459,6 +1608,13 @@ static struct cftype cft_spread_slab = {
.private = FILE_SPREAD_SLAB,
};
+static struct cftype cft_irqs = {
+ .name = "irqs",
+ .read = cpuset_common_file_read,
+ .write = cpuset_common_file_write,
+ .private = FILE_IRQS,
+};
+
static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
{
int err;
@@ -1481,6 +1637,10 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
return err;
if ((err = cgroup_add_file(cont, ss, &cft_spread_slab)) < 0)
return err;
+#ifdef CONFIG_GENERIC_HARDIRQS
+ if ((err = cgroup_add_file(cont, ss, &cft_irqs)) < 0)
+ return err;
+#endif
/* memory_pressure_enabled is in root cpuset only */
if (err == 0 && !cont->parent)
err = cgroup_add_file(cont, ss,
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 438a014..5154c25 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -96,6 +96,25 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
#endif
+int irq_iterator(struct irq_iterator *v)
+{
+ int ret = 0;
+ int irq;
+
+ for (irq = 0; irq < NR_IRQS; irq++) {
+ struct irq_desc *desc = &irq_desc[irq];
+
+ if (desc->chip == &no_irq_chip)
+ continue;
+
+ ret = v->function(v, irq, desc);
+ if (ret)
+ return ret;
+ }
+
+ return ret;
+}
+
/**
* disable_irq_nosync - disable an irq without waiting
* @irq: Interrupt to disable
--
1.5.4.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH 2/2] cpusets: Improved irq affinity handling
2008-03-11 5:57 ` [PATCH 1/2] cpuset: cpuset irq affinities maxk
@ 2008-03-11 5:57 ` maxk
2008-03-11 6:35 ` [PATCH 1/2] cpuset: cpuset irq affinities Christoph Hellwig
2008-03-11 6:57 ` Paul Jackson
2 siblings, 0 replies; 11+ messages in thread
From: maxk @ 2008-03-11 5:57 UTC (permalink / raw)
To: mingo; +Cc: pj, a.p.zijlstra, linux-kernel, menage, Max Krasnyansky
From: Max Krasnyansky <maxk@qualcomm.com>
This builds on top of Peter's patch. Irqs are handled just like
tasks now. ie Moved when set goes empty, etc.
irq_set_affinity() now ensures that specified mask is a subset
of allowed cpus just like sched_setscheduler() does for tasks.
Other minor bugfixes.
Booted and tested on a couple of different x86-64 boxes:
Core2Duo laptop and dual Opteron numa machine.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
---
include/linux/cpuset.h | 8 +
include/linux/irq.h | 2 +
kernel/cpuset.c | 367 ++++++++++++++++++++++++++++++------------------
kernel/irq/chip.c | 5 +
kernel/irq/manage.c | 27 +++-
5 files changed, 265 insertions(+), 144 deletions(-)
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 0a26be3..e3d4daf 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -13,6 +13,8 @@
#include <linux/nodemask.h>
#include <linux/cgroup.h>
+struct cpuset;
+
#ifdef CONFIG_CPUSETS
extern int number_of_cpusets; /* How many cpusets are defined in system? */
@@ -20,6 +22,7 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */
extern int cpuset_init_early(void);
extern int cpuset_init(void);
extern void cpuset_init_smp(void);
+extern int cpuset_cpumask_allowed(cpumask_t *mask, struct cpuset *cs);
extern cpumask_t cpuset_cpus_allowed(struct task_struct *p);
extern cpumask_t cpuset_cpus_allowed_locked(struct task_struct *p);
extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
@@ -93,6 +96,11 @@ static inline cpumask_t cpuset_cpus_allowed_locked(struct task_struct *p)
return cpu_possible_map;
}
+static inline int cpuset_cpumask_allowed(cpumask_t *mask, struct cpuset *cs)
+{
+ return 1;
+}
+
static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
{
return node_possible_map;
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 450c0de..28799b5 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -17,6 +17,7 @@
#include <linux/cache.h>
#include <linux/spinlock.h>
#include <linux/cpumask.h>
+#include <linux/cpuset.h>
#include <linux/irqreturn.h>
#include <linux/errno.h>
@@ -238,6 +239,7 @@ static inline void set_pending_irq(unsigned int irq, cpumask_t mask)
#endif /* CONFIG_GENERIC_PENDING_IRQ */
extern int irq_set_affinity(unsigned int irq, cpumask_t cpumask);
+extern int __irq_set_affinity(unsigned int irq, cpumask_t cpumask);
extern int irq_can_set_affinity(unsigned int irq);
#else /* CONFIG_SMP */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index e82a258..b59d635 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -260,6 +260,214 @@ static struct file_system_type cpuset_fs_type = {
.get_sb = cpuset_get_sb,
};
+
+#ifdef CONFIG_GENERIC_HARDIRQS
+struct cpuset_irq_cpumask {
+ struct irq_iterator v;
+ struct cpuset *cs;
+ cpumask_t mask;
+};
+
+static int update_irq_cpumask(struct irq_iterator *v, int irq, struct irq_desc *desc)
+{
+ struct cpuset_irq_cpumask *s =
+ container_of(v, struct cpuset_irq_cpumask, v);
+
+ if (desc->cs != s->cs)
+ return 0;
+
+ __irq_set_affinity(irq, s->mask);
+ return 0;
+}
+
+static void update_irqs_cpumask(struct cpuset *cs)
+{
+ struct cpuset_irq_cpumask s = {
+ .v = { .function = update_irq_cpumask },
+ .cs = cs,
+ };
+
+ cpus_and(s.mask, cpu_online_map, cs->cpus_allowed);
+
+ irq_iterator(&s.v);
+}
+
+struct cpuset_irq_update {
+ struct irq_iterator v;
+ struct cpuset *cs;
+ int irq;
+ int force;
+};
+
+static int cpuset_update_irq(struct irq_iterator *v, int irq, struct irq_desc *desc)
+{
+ struct cpuset_irq_update *s =
+ container_of(v, struct cpuset_irq_update, v);
+ cpumask_t online_set;
+ int ret;
+
+ if (irq != s->irq)
+ return 0;
+
+ cpus_and(online_set, cpu_online_map, s->cs->cpus_allowed);
+
+ ret = __irq_set_affinity(irq, online_set);
+ if (!ret)
+ desc->cs = s->cs;
+ else if (s->force) {
+ /* Move to the new cpuset failed, we're forced
+ * to move it all the way up. Should we maybe
+ * printk some warning here ? */
+ __irq_set_affinity(irq, cpu_online_map);
+ desc->cs = NULL;
+ }
+
+ return ret;
+}
+
+static int update_irqs(struct cpuset *cs, char *buf)
+{
+ struct cpuset_irq_update s = {
+ .v = { .function = cpuset_update_irq },
+ .cs = cs,
+ .force = 0
+ };
+
+ if (sscanf(buf, "%d", &s.irq) != 1)
+ return -EIO;
+
+ return irq_iterator(&s.v);
+}
+
+struct cpuset_irq_print {
+ struct irq_iterator v;
+ struct cpuset *cs;
+ char *buf;
+ int len;
+ int buflen;
+};
+
+static int cpuset_sprintf_irq(struct irq_iterator *v, int irq, struct irq_desc *desc)
+{
+ struct cpuset_irq_print *s =
+ container_of(v, struct cpuset_irq_print, v);
+
+ if (desc->cs != s->cs)
+ return 0;
+
+ if (s->len > 0)
+ s->len += scnprintf(s->buf + s->len, s->buflen - s->len, " ");
+ s->len += scnprintf(s->buf + s->len, s->buflen - s->len, "%d", irq);
+
+ return 0;
+}
+
+static int cpuset_sprintf_irqlist(char *page, struct cpuset *cs)
+{
+ int ret;
+
+ struct cpuset_irq_print s = {
+ .v = { .function = cpuset_sprintf_irq },
+ .cs = cs,
+ .buf = page,
+ .len = 0,
+ .buflen = PAGE_SIZE,
+ };
+
+ mutex_lock(&callback_mutex);
+ ret = irq_iterator(&s.v);
+ mutex_unlock(&callback_mutex);
+
+ if (!ret)
+ ret = s.len;
+
+ return ret;
+}
+
+struct cpuset_irq_count {
+ struct irq_iterator v;
+ const struct cpuset *cs;
+ int count;
+};
+
+static int cpuset_count_irq(struct irq_iterator *v, int irq, struct irq_desc *desc)
+{
+ struct cpuset_irq_count *s =
+ container_of(v, struct cpuset_irq_count, v);
+
+ if (desc->cs != s->cs)
+ return 0;
+
+ s->count++;
+ return 0;
+}
+
+static int cpuset_count_irqs(const struct cpuset *cs)
+{
+ struct cpuset_irq_count s = {
+ .v = { .function = cpuset_count_irq },
+ .cs = cs,
+ .count = 0
+ };
+
+ irq_iterator(&s.v);
+
+ return s.count;
+}
+
+/*
+ * This is called if last CPU is removed from a cpuset or cpuset is destroyed.
+ * We move the irqs in the cpuset to its next-highest non-empty parent.
+ *
+ * Called with cgroup_mutex held
+ * callback_mutex must not be held, as cpuset_attach() will take it.
+ */
+static void move_irqs_to_parent(struct cpuset *cs)
+{
+ struct cpuset_irq_update s = {
+ .v = { .function = cpuset_update_irq },
+ .force = 1
+ };
+
+ struct cpuset *parent;
+
+ /*
+ * Find its next-highest non-empty parent, (top cpuset
+ * has online cpus, it can't be empty).
+ */
+ parent = cs->parent;
+ while (cpus_empty(parent->cpus_allowed))
+ parent = parent->parent;
+
+ s.cs = parent;
+ irq_iterator(&s.v);
+}
+
+#else /* CONFIG_GENERIC_HARDIRQS */
+static void update_irqs_cpumask(struct cpuset *cs)
+{
+}
+
+static int update_irqs(struct cpuset *cs, char *buf)
+{
+ return 0;
+}
+
+static int cpuset_sprintf_irqlist(char *page, struct cpuset *cs)
+{
+ return 0;
+}
+
+static int cpuset_count_irqs(const struct cpuset *cs)
+{
+ return 0;
+}
+
+static void move_irqs_to_parent(struct cpuset *cs)
+{
+}
+#endif
+
/*
* Return in *pmask the portion of a cpusets's cpus_allowed that
* are online. If none are online, walk up the cpuset hierarchy
@@ -468,6 +676,10 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
}
}
+ /* Cpusets with irqs can't have empty cpus_allowed */
+ if (cpus_empty(trial->cpus_allowed) && cpuset_count_irqs(cur))
+ return -ENOSPC;
+
return 0;
}
@@ -735,44 +947,6 @@ void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan)
set_cpus_allowed(tsk, (cgroup_cs(scan->cg))->cpus_allowed);
}
-#ifdef CONFIG_GENERIC_HARDIRQS
-struct cpuset_irq_cpumask {
- struct irq_iterator v;
- struct cpuset *cs;
- cpumask_t mask;
-};
-
-static int
-update_irq_cpumask(struct irq_iterator *v, int irq, struct irq_desc *desc)
-{
- struct cpuset_irq_cpumask *s =
- container_of(v, struct cpuset_irq_cpumask, v);
-
- if (desc->cs != s->cs)
- return 0;
-
- irq_set_affinity(irq, s->mask);
-
- return 0;
-}
-
-static void update_irqs_cpumask(struct cpuset *cs)
-{
- struct cpuset_irq_cpumask s = {
- .v = { .function = update_irq_cpumask },
- .cs = cs,
- };
-
- cpus_and(s.mask, cpu_online_map, cs->cpus_allowed);
-
- irq_iterator(&s.v);
-}
-#else
-static void update_irqs_cpumask(struct cpuset *cs)
-{
-}
-#endif
-
/**
* update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
* @cs: the cpuset to consider
@@ -1099,52 +1273,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
return 0;
}
-#ifdef CONFIG_GENERIC_HARDIRQS
-struct cpuset_irq_update {
- struct irq_iterator v;
- struct cpuset *cs;
- int irq;
-};
-
-static int
-cpuset_update_irq(struct irq_iterator *v, int irq, struct irq_desc *desc)
-{
- struct cpuset_irq_update *s =
- container_of(v, struct cpuset_irq_update, v);
- cpumask_t online_set;
- int ret;
-
- if (irq != s->irq)
- return 0;
-
- cpus_and(online_set, cpu_online_map, s->cs->cpus_allowed);
-
- ret = irq_set_affinity(irq, online_set);
- if (!ret)
- desc->cs = s->cs;
-
- return ret;
-}
-
-static int update_irqs(struct cpuset *cs, char *buf)
-{
- struct cpuset_irq_update s = {
- .v = { .function = cpuset_update_irq },
- .cs = cs,
- };
-
- if (sscanf(buf, "%d", &s.irq) != 1)
- return -EIO;
-
- return irq_iterator(&s.v);
-}
-#else
-static int update_irqs(struct cpuset *cs, char *buf)
-{
- return 0;
-}
-#endif
-
/*
* Frequency meter - How fast is some event occurring?
*
@@ -1414,59 +1542,6 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
return nodelist_scnprintf(page, PAGE_SIZE, mask);
}
-#ifdef CONFIG_GENERIC_HARDIRQS
-struct cpuset_irq_print {
- struct irq_iterator v;
- struct cpuset *cs;
- char *buf;
- int len;
- int buflen;
-};
-
-static int
-cpuset_sprintf_irq(struct irq_iterator *v, int irq, struct irq_desc *desc)
-{
- struct cpuset_irq_print *s =
- container_of(v, struct cpuset_irq_print, v);
-
- if (desc->cs != s->cs)
- return 0;
-
- if (s->len > 0)
- s->len += scnprintf(s->buf + s->len, s->buflen - s->len, " ");
- s->len += scnprintf(s->buf + s->len, s->buflen - s->len, "%d", irq);
-
- return 0;
-}
-
-static int cpuset_sprintf_irqlist(char *page, struct cpuset *cs)
-{
- int ret;
-
- struct cpuset_irq_print s = {
- .v = { .function = cpuset_sprintf_irq },
- .cs = cs,
- .buf = page,
- .len = 0,
- .buflen = PAGE_SIZE,
- };
-
- mutex_lock(&callback_mutex);
- ret = irq_iterator(&s.v);
- mutex_unlock(&callback_mutex);
-
- if (!ret)
- ret = s.len;
-
- return ret;
-}
-#else
-static int cpuset_sprintf_irqlist(char *page, struct cpuset *cs)
-{
- return 0;
-}
-#endif
-
static ssize_t cpuset_common_file_read(struct cgroup *cont,
struct cftype *cft,
struct file *file,
@@ -1743,6 +1818,11 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
cpuset_update_task_memory_state();
+ /* Move irqs (if any) to the parent set. Ideally we should disallow
+ * destruction of the cpusets that have irqs (that's how it works
+ * for the tasks), but refcounting seems messy. */
+ move_irqs_to_parent(cs);
+
if (is_sched_load_balance(cs))
update_flag(CS_SCHED_LOAD_BALANCE, cs, "0");
@@ -1927,6 +2007,10 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
if (cpus_empty(cp->cpus_allowed) ||
nodes_empty(cp->mems_allowed))
remove_tasks_in_empty_cpuset(cp);
+
+ /* Move irqs from the empty cpuset to a parent */
+ if (cpus_empty(cp->cpus_allowed))
+ move_irqs_to_parent(cp);
}
}
@@ -2037,6 +2121,15 @@ cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk)
return mask;
}
+/**
+ * cpuset_cpumask_allowed - returns 1 if provided cpumask is a subset
+ * of allowed cpus in the set.
+ **/
+int cpuset_cpumask_allowed(cpumask_t *mask, struct cpuset *cs)
+{
+ return cpus_subset(*mask, cs->cpus_allowed);
+}
+
void cpuset_init_current_mems_allowed(void)
{
current->mems_allowed = NODE_MASK_ALL;
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index fdb3fbe..2588d08 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -49,6 +49,11 @@ void dynamic_irq_init(unsigned int irq)
#ifdef CONFIG_SMP
desc->affinity = CPU_MASK_ALL;
#endif
+
+#ifdef CONFIG_CPUSETS
+ desc->cs = NULL;
+#endif
+
spin_unlock_irqrestore(&desc->lock, flags);
}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5154c25..5a91d1c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -70,13 +70,7 @@ int irq_can_set_affinity(unsigned int irq)
return 1;
}
-/**
- * irq_set_affinity - Set the irq affinity of a given irq
- * @irq: Interrupt to set affinity
- * @cpumask: cpumask
- *
- */
-int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
+int __irq_set_affinity(unsigned int irq, cpumask_t cpumask)
{
struct irq_desc *desc = irq_desc + irq;
@@ -94,6 +88,25 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
return 0;
}
+/**
+ * irq_set_affinity - Set the irq affinity of a given irq
+ * @irq: Interrupt to set affinity
+ * @cpumask: cpumask
+ *
+ */
+int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
+{
+ struct irq_desc *desc = irq_desc + irq;
+
+#ifdef CONFIG_CPUSETS
+ /* FIXME: Not sure about locking here. maxk */
+ if (desc->cs && !cpuset_cpumask_allowed(&cpumask, desc->cs))
+ return -EINVAL;
+#endif
+
+ return __irq_set_affinity(irq, cpumask);
+}
+
#endif
int irq_iterator(struct irq_iterator *v)
--
1.5.4.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [PATCH 1/2] cpuset: cpuset irq affinities
2008-03-11 5:57 ` [PATCH 1/2] cpuset: cpuset irq affinities maxk
2008-03-11 5:57 ` [PATCH 2/2] cpusets: Improved irq affinity handling maxk
@ 2008-03-11 6:35 ` Christoph Hellwig
2008-03-11 17:05 ` Max Krasnyansky
2008-03-11 6:57 ` Paul Jackson
2 siblings, 1 reply; 11+ messages in thread
From: Christoph Hellwig @ 2008-03-11 6:35 UTC (permalink / raw)
To: maxk; +Cc: mingo, pj, a.p.zijlstra, linux-kernel, menage
On Mon, Mar 10, 2008 at 10:57:25PM -0700, maxk@qualcomm.com wrote:
> From: Peter Zijlstra <a.p.zijlstra@chello.nl>
>
> Hi Paul,
>
> How about something like this; along with the in-kernel version
> of /cgroup/boot this could also provide the desired semantics.
>
> Another benefit of this approach would be that it no longer requires
> PF_THREAD_BIND, as we'd only stick unbounded kthreads into that cgroup.
Looks like my comments to the last posting of this are still not
addressed.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 1/2] cpuset: cpuset irq affinities
2008-03-11 5:57 ` [PATCH 1/2] cpuset: cpuset irq affinities maxk
2008-03-11 5:57 ` [PATCH 2/2] cpusets: Improved irq affinity handling maxk
2008-03-11 6:35 ` [PATCH 1/2] cpuset: cpuset irq affinities Christoph Hellwig
@ 2008-03-11 6:57 ` Paul Jackson
2008-03-11 17:25 ` Max Krasnyansky
2 siblings, 1 reply; 11+ messages in thread
From: Paul Jackson @ 2008-03-11 6:57 UTC (permalink / raw)
To: maxk; +Cc: mingo, a.p.zijlstra, linux-kernel, menage
Max K wrote:
> this could also provide the desired semantics.
Could you spell out what you mean by "the desired semantics" ?
I don't see any Documentation or much comments, which would
help understand this. It helps to describe both what has
changed, and, from the top, the why, what and how of what
you're doing, in part as Documentation or code comments,
for the benefit of future readers.
Did you see my discussion of this with Peter on March 6 and 7
in the lkml "[RFC/PATCH] cpuset: cpuset irq affinities" thread?
This latest patch of yours seems, offhand, to predate that discussion.
I don't see any explanation of what locking is needed when.
What semantics to you impose on irqs in overlapping cpusets,
which would seem to lead to conflicting directives as to
whether one set or another of irqs was to be applied to the
CPUs in the overlap?
--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <pj@sgi.com> 1.940.382.4214
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 1/2] cpuset: cpuset irq affinities
2008-03-11 6:35 ` [PATCH 1/2] cpuset: cpuset irq affinities Christoph Hellwig
@ 2008-03-11 17:05 ` Max Krasnyansky
2008-03-11 18:58 ` Paul Jackson
0 siblings, 1 reply; 11+ messages in thread
From: Max Krasnyansky @ 2008-03-11 17:05 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: mingo, pj, a.p.zijlstra, linux-kernel, menage
Christoph Hellwig wrote:
> On Mon, Mar 10, 2008 at 10:57:25PM -0700, maxk@qualcomm.com wrote:
>> From: Peter Zijlstra <a.p.zijlstra@chello.nl>
>>
>> Hi Paul,
>>
>> How about something like this; along with the in-kernel version
>> of /cgroup/boot this could also provide the desired semantics.
>>
>> Another benefit of this approach would be that it no longer requires
>> PF_THREAD_BIND, as we'd only stick unbounded kthreads into that cgroup.
>
> Looks like my comments to the last posting of this are still not
> addressed.
Hi Christoph,
Sorry I missed your comment. Actually I just looked at the archive of the
original thread and I do not see any comments from you. Were they maybe sent
in private to Peter Z ? Can you please resend them ?
Also btw can you comment on the 2/2 patch of this series instead. 1/2 is just
a resend of the original Peter's patch. 2/2 is more complete and tested version.
Thanx
Max
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 1/2] cpuset: cpuset irq affinities
2008-03-11 6:57 ` Paul Jackson
@ 2008-03-11 17:25 ` Max Krasnyansky
2008-03-11 19:08 ` Paul Jackson
0 siblings, 1 reply; 11+ messages in thread
From: Max Krasnyansky @ 2008-03-11 17:25 UTC (permalink / raw)
To: Paul Jackson; +Cc: mingo, a.p.zijlstra, linux-kernel, menage
Paul Jackson wrote:
> Max K wrote:
>> this could also provide the desired semantics.
>
> Could you spell out what you mean by "the desired semantics" ?
>
> I don't see any Documentation or much comments, which would
> help understand this. It helps to describe both what has
> changed, and, from the top, the why, what and how of what
> you're doing, in part as Documentation or code comments,
> for the benefit of future readers.
>
> Did you see my discussion of this with Peter on March 6 and 7
> in the lkml "[RFC/PATCH] cpuset: cpuset irq affinities" thread?
> This latest patch of yours seems, offhand, to predate that discussion.
Paul, can you please comment on 2/2 patch instead. 1/2 is just a resend of the
Peter's original patch that I was building on top. So yes it predates that
discussion. I used it as the baseline.
> I don't see any explanation of what locking is needed when.
There are more comments in 2/2. There is one spot in there where I'm not sure
about the locking (look for FIXME comment). Everything else seems to be
protected correctly by callback_lock. I may have missed things of course.
> What semantics do you impose on irqs in overlapping cpusets,
> which would seem to lead to conflicting directives as to
> whether one set or another of irqs was to be applied to the
> CPUs in the overlap?
Please take a look at
[PATCH 2/2] cpusets: Improved irq affinity handling
I'm treating irqs just like tasks (at least I think I'm :).
Max
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 1/2] cpuset: cpuset irq affinities
2008-03-11 17:05 ` Max Krasnyansky
@ 2008-03-11 18:58 ` Paul Jackson
2008-03-11 19:50 ` Max Krasnyansky
0 siblings, 1 reply; 11+ messages in thread
From: Paul Jackson @ 2008-03-11 18:58 UTC (permalink / raw)
To: Max Krasnyansky; +Cc: hch, mingo, a.p.zijlstra, linux-kernel, menage
Max wrote:
> Sorry I missed your comment. Actually I just looked at the archive of the
> original thread and I do not see any comments from you. Were they maybe sent
> in private to Peter Z ? Can you please resend them ?
I suspect Christoph is referring to a comment he made on lkml, at:
http://lkml.org/lkml/2008/3/2/3
--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <pj@sgi.com> 1.940.382.4214
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 1/2] cpuset: cpuset irq affinities
2008-03-11 17:25 ` Max Krasnyansky
@ 2008-03-11 19:08 ` Paul Jackson
2008-03-11 21:31 ` Max Krasnyansky
0 siblings, 1 reply; 11+ messages in thread
From: Paul Jackson @ 2008-03-11 19:08 UTC (permalink / raw)
To: Max Krasnyansky; +Cc: mingo, a.p.zijlstra, linux-kernel, menage
Max wrote:
> Please take a look at
> [PATCH 2/2] cpusets: Improved irq affinity handling
> I'm treating irqs just like tasks (at least I think I'm :).
Well, I see the one comment in your Patch 2/2 noting you're unsure
of the locking in one place.
I don't see any further comments on or additional code involving
locking.
I don't see where you respond to my discussion with Peter of March
6 and 7, where I expressed some doubts about Peters patch (which you
built on in your patch 1/2 in this series).
I see only a little bit of additional comments in your patch 2/2
regarding handling of moving irqs to higher non-empty cpusets if a
cpuset is emptied of its CPUs.
I don't see any explanation of what you mean by "desired semantics."
I don't see any response to the alternatives to Peter's earlier patch
(your Patch 1/2 here) that Peter and I discussed in that discussion of
March 6 and 7.
And, in particular, could you respond to the question in my last
message:
> What semantics do you impose on irqs in overlapping cpusets,
> which would seem to lead to conflicting directives as to
> whether one set or another of irqs was to be applied to the
> CPUs in the overlap?
--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <pj@sgi.com> 1.940.382.4214
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 1/2] cpuset: cpuset irq affinities
2008-03-11 18:58 ` Paul Jackson
@ 2008-03-11 19:50 ` Max Krasnyansky
0 siblings, 0 replies; 11+ messages in thread
From: Max Krasnyansky @ 2008-03-11 19:50 UTC (permalink / raw)
To: Paul Jackson; +Cc: hch, mingo, a.p.zijlstra, linux-kernel, menage
Paul Jackson wrote:
> Max wrote:
>> Sorry I missed your comment. Actually I just looked at the archive of the
>> original thread and I do not see any comments from you. Were they maybe sent
>> in private to Peter Z ? Can you please resend them ?
>
> I suspect Christoph is referring to a comment he made on lkml, at:
>
> http://lkml.org/lkml/2008/3/2/3
Got it. The comment was
> linux/irq.h must not be included in generic code, it's actually more
> and asm-generic/hw_irq.h. Please restructure the code so that the
> cpuset code calls into an arch interface which will then be implemented
> by arch code (which in most cases will be genirq, the other can be left
> stubbed out for now)
I'm not sure I agree with that though. We need access to irq_set_affinity()
from the cpuset code. It does not seem to make sense to call some arch
specific code if that functionality is provided by the generic layer.
Also note that IRQ functionality in the cpuset is ifdefed by
CONFIG_GENERIC_HARDIRQS, so we are guaranteed to have access to the generic code.
In addition to irq_set_affinity we need irq_iterator(), which is added by
the patch and which, again, is generic.
So the question is why would we want to call into an arch interface if we can
call the generic one directly ?
Max
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 1/2] cpuset: cpuset irq affinities
2008-03-11 19:08 ` Paul Jackson
@ 2008-03-11 21:31 ` Max Krasnyansky
0 siblings, 0 replies; 11+ messages in thread
From: Max Krasnyansky @ 2008-03-11 21:31 UTC (permalink / raw)
To: Paul Jackson; +Cc: mingo, a.p.zijlstra, linux-kernel, menage
Paul Jackson wrote:
> Max wrote:
>> Please take a look at
>> [PATCH 2/2] cpusets: Improved irq affinity handling
>> I'm treating irqs just like tasks (at least I think I'm :).
>
> Well, I see the one comment in your Patch 2/2 noting you're unsure
> of the locking in one place.
>
> I don't see any further comments on or additional code involving
> locking.
I did not think they were needed. I'll take another look at it.
Do you have anything in particular where you think that locking needs to be
clarified/explained?
> I don't see where you respond to my discussion with Peter of March
> 6 and 7, where I expressed some doubts about Peters patch (which you
> built on in your patch 1/2 in this series).
>
> I see only a little bit of additional comments in your patch 2/2
> regarding handling of moving irqs to higher non-empty cpusets if a
> cpuset is emptied of its CPUs.
>
> I don't see any explanation of what you mean by "desired semantics."
>
> I don't see any response to the alternatives to Peter's earlier patch
> (your Patch 1/2 here) that Peter and I discussed in that discussion of
> March 6 and 7.
Ok. I must admit that I tuned out of that discussion. I read it again just now
and my impression is that you guys went a bit off road :). I mean it seems to
me that we're making it more complicated than it needs to be.
I'm thinking of irqs as tasks (in the cpuset context). Think of an irq number
as task pid. Just like we assign a task to a cpuset we can (with this patch)
assign an irq to a cpuset.
Yes, some HW may not map nicely into that kind of view, but so far nobody has
provided any real examples of such hw. I'm sure it exists but as I suggested
before (I believe Ingo suggested that too) we can just return an error if irq
move failed. The patches already handle these scenarios (for example you won't
be able to assign an irq to the cpuset if irq_set_affinity(irq,
cs->cpus_allowed) fails, and if move to parent fails we move it all the way up).
I think this simple concept works for most use cases and is familiar for
people who deal with /proc/irq/N/smp_affinity. Currently we assign each
individual irq to a list of cpus (represented by a mask). With the patch we
can now assign an irq to a list of cpus represented by the cpuset.
So the "desired semantics" in my mind is to be able to assign an IRQ to a cpuset
in the same way we do with tasks.
> And, in particular, could you respond to the question in my last
> message:
>
>> What semantics do you impose on irqs in overlapping cpusets,
>> which would seem to lead to conflicting directives as to
>> whether one set or another of irqs was to be applied to the
>> CPUs in the overlap?
I thought I did, respond that is. That's what I meant by "I treat them just
like tasks". A task can be assigned to only one cpuset, and so is the irq. If the
cpuset that it's assigned to is overlapping with some other cpuset it does not
change that task's behavior. It's still constrained to the cpus allotted for
that cpuset. Same exact thing for the irq.
To give you an example, let's say we have:
/dev/cpuset/set0
cpus=0-1
cpu_exclusive=0
/dev/cpuset/set1
cpus=1-2
cpu_exclusive=0
Tasks assigned to set0 will only run on cpu0 and cpu1, so will the irqs. It
does not matter that set1 is overlapping with set0.
----
My personal use case for the cpu isolation goes like this:
2way box
/dev/cpuset/
cpuset.sched_load_balance=0
/boot
cpus=0
cpu_exclusive=0
irqs=(all irqs)
tasks=(all user and kernel tasks)
/myapp0
cpus=0,1
cpu_exclusive=0
irqs=(rt irqs)
tasks=(myapp tasks)
The patches have been tested on that exact 2way setup and a couple of other
different combination with child cpusets under myapp0 for testing irq
migration due to hotplug events.
4way box
/dev/cpuset/
cpuset.sched_load_balance=0
/boot
cpus=0,1
cpu_exclusive=0
sched_load_balance=1
irqs=(all irqs)
tasks=(all user and kernel tasks)
/myapp0
cpus=0,1,2,3
cpu_exclusive=0
sched_load_balance=0
irqs=(none)
tasks=(myapp tasks)
/myapp1
cpus=2,3
cpu_exclusive=0
sched_load_balance=0
irqs=(rt irqs)
tasks=(none)
Disclaimer: I have not tried the 4way setup above yet. I do not see why it
would not work though.
In case you're wondering why I'm assigning all cpus to '/myapp0': it is because, as
I mentioned before in my case some threads need to run along with other
regular apps on the cpus that provide full kernel services and other threads
must run on the isolated cpus. '/myapp1' is used only for (rt irqs).
Max
^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2008-03-11 21:31 UTC | newest]
Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-03-11 5:57 [PATCH 0/2] cpusets: support for irqs maxk
2008-03-11 5:57 ` [PATCH 1/2] cpuset: cpuset irq affinities maxk
2008-03-11 5:57 ` [PATCH 2/2] cpusets: Improved irq affinity handling maxk
2008-03-11 6:35 ` [PATCH 1/2] cpuset: cpuset irq affinities Christoph Hellwig
2008-03-11 17:05 ` Max Krasnyansky
2008-03-11 18:58 ` Paul Jackson
2008-03-11 19:50 ` Max Krasnyansky
2008-03-11 6:57 ` Paul Jackson
2008-03-11 17:25 ` Max Krasnyansky
2008-03-11 19:08 ` Paul Jackson
2008-03-11 21:31 ` Max Krasnyansky
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).