LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
From: Max Krasnyansky <maxk@qualcomm.com>
To: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Christoph Lameter <clameter@sgi.com>, Ingo Molnar <mingo@elte.hu>,
Srivatsa Vaddagiri <vatsa@in.ibm.com>,
"Pallipadi, Venkatesh" <venkatesh.pallipadi@intel.com>,
Gautham shenoy <ego@in.ibm.com>, Andrew Morton <akpm@osdl.org>,
linux-kernel@vger.kernel.org
Subject: Re: slab: start_cpu_timer/cache_reap CONFIG_HOTPLUG_CPU problems
Date: Tue, 20 Feb 2007 10:39:08 -0800 [thread overview]
Message-ID: <45DB404C.4070305@qualcomm.com> (raw)
In-Reply-To: <20070129182742.GA158@tv-sign.ru>
Folks,
Oleg Nesterov wrote:
>>> Even if smp_processor_id() was stable during the execution of cache_reap(),
>>> this work_struct can be moved to another CPU if CPU_DEAD happens. We can't
>>> avoid this, and this is correct.
>> Uhh.... This may not be correct in terms of how the slab operates.
>
> But this is practically impossible to avoid. We can't delay CPU_DOWN until all
> workqueues flush their cwq->worklist. This is livelockable, the work can re-queue
> itself, and new works can be added since the dying CPU is still on cpu_online_map.
> This means that some pending works will be processed on another CPU.
>
> delayed_work is even worse, the timer can migrate as well.
>
> The first problem (smp_processor_id() is not stable) could be solved if we
> use freezer or with the help of not-yet-implemented scalable lock_cpu_hotplug.
>
>>> This means that __get_cpu_var(reap_work) returns a "wrong" struct delayed_work.
>>> This is absolutely harmless right now, but may be it's better to use
>>> container_of(unused, struct delayed_work, work).
>> Well seems that we have a set of unresolved issues with workqueues and cpu
>> hotplug.
How about storing 'cpu' explicitly in the work structure instead of relying on
smp_processor_id() and friends? That way there is no ambiguity when threads/timers get
moved around.
I'm cooking up a set of patches to extend the CPU isolation concept a bit. With those, I'd
like one CPU to run the cache_reap timer on behalf of another CPU. See the patch below.
diff --git a/mm/slab.c b/mm/slab.c
index c610062..0f46d11 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -766,7 +766,17 @@ int slab_is_available(void)
return g_cpucache_up == FULL;
}
-static DEFINE_PER_CPU(struct delayed_work, reap_work);
+struct slab_work {
+ struct delayed_work dw;
+ unsigned int cpu;
+};
+
+static DEFINE_PER_CPU(struct slab_work, reap_work);
+
+static inline struct array_cache *cpu_cache_get_on(struct kmem_cache *cachep, unsigned int cpu)
+{
+ return cachep->array[cpu];
+}
static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
{
@@ -915,9 +925,9 @@ static void init_reap_node(int cpu)
per_cpu(reap_node, cpu) = node;
}
-static void next_reap_node(void)
+static void next_reap_node(unsigned int cpu)
{
- int node = __get_cpu_var(reap_node);
+ int node = per_cpu(reap_node, cpu);
/*
* Also drain per cpu pages on remote zones
@@ -928,12 +938,12 @@ static void next_reap_node(void)
node = next_node(node, node_online_map);
if (unlikely(node >= MAX_NUMNODES))
node = first_node(node_online_map);
- __get_cpu_var(reap_node) = node;
+ per_cpu(reap_node, cpu) = node;
}
#else
#define init_reap_node(cpu) do { } while (0)
-#define next_reap_node(void) do { } while (0)
+#define next_reap_node(cpu) do { } while (0)
#endif
/*
@@ -945,17 +955,18 @@ static void next_reap_node(void)
*/
static void __devinit start_cpu_timer(int cpu)
{
- struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
+ struct slab_work *reap_work = &per_cpu(reap_work, cpu);
/*
* When this gets called from do_initcalls via cpucache_init(),
* init_workqueues() has already run, so keventd will be setup
* at that time.
*/
- if (keventd_up() && reap_work->work.func == NULL) {
+ if (keventd_up() && reap_work->dw.work.func == NULL) {
init_reap_node(cpu);
- INIT_DELAYED_WORK(reap_work, cache_reap);
- schedule_delayed_work_on(cpu, reap_work,
+ INIT_DELAYED_WORK(&reap_work->dw, cache_reap);
+ reap_work->cpu = cpu;
+ schedule_delayed_work_on(cpu, &reap_work->dw,
__round_jiffies_relative(HZ, cpu));
}
}
@@ -1004,7 +1015,7 @@ static int transfer_objects(struct array_cache *to,
#ifndef CONFIG_NUMA
#define drain_alien_cache(cachep, alien) do { } while (0)
-#define reap_alien(cachep, l3) do { } while (0)
+#define reap_alien(cachep, l3, cpu) do { } while (0)
static inline struct array_cache **alloc_alien_cache(int node, int limit)
{
@@ -1099,9 +1110,9 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
/*
* Called from cache_reap() to regularly drain alien caches round robin.
*/
-static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
+static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3, unsigned int cpu)
{
- int node = __get_cpu_var(reap_node);
+ int node = per_cpu(reap_node, cpu);
if (l3->alien) {
struct array_cache *ac = l3->alien[node];
@@ -4017,16 +4028,17 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
* If we cannot acquire the cache chain mutex then just give up - we'll try
* again on the next iteration.
*/
-static void cache_reap(struct work_struct *unused)
+static void cache_reap(struct work_struct *_work)
{
struct kmem_cache *searchp;
struct kmem_list3 *l3;
int node = numa_node_id();
+ struct slab_work *work = (struct slab_work *) _work;
+
if (!mutex_trylock(&cache_chain_mutex)) {
/* Give up. Setup the next iteration. */
- schedule_delayed_work(&__get_cpu_var(reap_work),
- round_jiffies_relative(REAPTIMEOUT_CPUC));
+ schedule_delayed_work(&work->dw, round_jiffies_relative(REAPTIMEOUT_CPUC));
return;
}
@@ -4040,9 +4052,9 @@ static void cache_reap(struct work_struct *unused)
*/
l3 = searchp->nodelists[node];
- reap_alien(searchp, l3);
+ reap_alien(searchp, l3, work->cpu);
- drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
+ drain_array(searchp, l3, cpu_cache_get_on(searchp, work->cpu), 0, node);
/*
* These are racy checks but it does not matter
@@ -4069,11 +4081,11 @@ next:
}
check_irq_on();
mutex_unlock(&cache_chain_mutex);
- next_reap_node();
- refresh_cpu_vm_stats(smp_processor_id());
+ next_reap_node(work->cpu);
+ refresh_cpu_vm_stats(work->cpu);
+
/* Set up the next iteration */
- schedule_delayed_work(&__get_cpu_var(reap_work),
- round_jiffies_relative(REAPTIMEOUT_CPUC));
+ schedule_delayed_work(&work->dw, round_jiffies_relative(REAPTIMEOUT_CPUC));
}
#ifdef CONFIG_PROC_FS
Max
next prev parent reply other threads:[~2007-02-20 19:06 UTC|newest]
Thread overview: 25+ messages / expand[flat|nested] mbox.gz Atom feed top
2007-01-29 1:13 Oleg Nesterov
2007-01-29 16:54 ` Christoph Lameter
2007-01-29 17:19 ` Oleg Nesterov
2007-01-29 17:27 ` Christoph Lameter
2007-01-29 18:27 ` Oleg Nesterov
2007-01-29 19:09 ` Christoph Lameter
2007-01-29 19:29 ` Oleg Nesterov
2007-01-29 19:25 ` Christoph Lameter
2007-01-29 19:49 ` Oleg Nesterov
2007-01-29 20:29 ` Christoph Lameter
2007-01-29 21:05 ` Oleg Nesterov
2007-01-29 21:48 ` Christoph Lameter
2007-01-29 22:14 ` Oleg Nesterov
2007-02-20 18:39 ` Max Krasnyansky [this message]
2007-02-20 18:45 ` Christoph Lameter
2007-02-20 20:05 ` Oleg Nesterov
2007-02-20 21:22 ` Max Krasnyansky
2007-02-20 21:35 ` Christoph Lameter
2007-02-20 22:01 ` Max Krasnyansky
2007-02-20 22:14 ` Christoph Lameter
2007-02-20 22:48 ` SLAB cache reaper on isolated cpus Max Krasnyansky
2007-02-20 23:19 ` Christoph Lameter
2007-02-21 3:41 ` Max Krasnyansky
2007-02-20 21:05 ` slab: start_cpu_timer/cache_reap CONFIG_HOTPLUG_CPU problems Max Krasnyansky
2007-02-20 21:34 ` Christoph Lameter
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=45DB404C.4070305@qualcomm.com \
--to=maxk@qualcomm.com \
--cc=akpm@osdl.org \
--cc=clameter@sgi.com \
--cc=ego@in.ibm.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@elte.hu \
--cc=oleg@tv-sign.ru \
--cc=vatsa@in.ibm.com \
--cc=venkatesh.pallipadi@intel.com \
--subject='Re: slab: start_cpu_timer/cache_reap CONFIG_HOTPLUG_CPU problems' \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).