LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
From: Max Krasnyansky <maxk@qualcomm.com>
To: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Christoph Lameter <clameter@sgi.com>, Ingo Molnar <mingo@elte.hu>,
	Srivatsa Vaddagiri <vatsa@in.ibm.com>,
	"Pallipadi, Venkatesh" <venkatesh.pallipadi@intel.com>,
	Gautham shenoy <ego@in.ibm.com>, Andrew Morton <akpm@osdl.org>,
	linux-kernel@vger.kernel.org
Subject: Re: slab: start_cpu_timer/cache_reap CONFIG_HOTPLUG_CPU problems
Date: Tue, 20 Feb 2007 10:39:08 -0800	[thread overview]
Message-ID: <45DB404C.4070305@qualcomm.com> (raw)
In-Reply-To: <20070129182742.GA158@tv-sign.ru>

Folks,

Oleg Nesterov wrote:
>>> Even if smp_processor_id() was stable during the execution of cache_reap(),
>>> this work_struct can be moved to another CPU if CPU_DEAD happens. We can't
>>> avoid this, and this is correct.
>> Uhh.... This may not be correct in terms of how the slab operates.
> 
> But this is practically impossible to avoid. We can't delay CPU_DOWN until all
> workqueues flush their cwq->worklist. This is livelockable, the work can re-queue
> itself, and new works can be added since the dying CPU is still on cpu_online_map.
> This means that some pending works will be processed on another CPU.
> 
> delayed_work is even worse, the timer can migrate as well.
> 
> The first problem (smp_processor_id() is not stable) could be solved if we
> use freezer or with the help of not-yet-implemented scalable lock_cpu_hotplug.
> 
>>> This means that __get_cpu_var(reap_work) returns a "wrong" struct delayed_work.
>>> This is absolutely harmless right now, but may be it's better to use
>>> container_of(unused, struct delayed_work, work).
>> Well seems that we have a set of unresolved issues with workqueues and cpu 
>> hotplug.

How about storing 'cpu' explicitly in the work queue instead of relying on
smp_processor_id() and friends? That way there is no ambiguity when threads/timers get
moved around.
I'm cooking a set of patches to extend the CPU isolation concept a bit, in which case I'd
like one CPU to run the cache_reap timer on behalf of another CPU. See the patch below.

diff --git a/mm/slab.c b/mm/slab.c
index c610062..0f46d11 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -766,7 +766,17 @@ int slab_is_available(void)
  	return g_cpucache_up == FULL;
  }

-static DEFINE_PER_CPU(struct delayed_work, reap_work);
+struct slab_work {
+	struct delayed_work dw;
+	unsigned int       cpu;
+};
+
+static DEFINE_PER_CPU(struct slab_work, reap_work);
+
+static inline struct array_cache *cpu_cache_get_on(struct kmem_cache *cachep, unsigned int cpu)
+{
+	return cachep->array[cpu];
+}

  static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
  {
@@ -915,9 +925,9 @@ static void init_reap_node(int cpu)
  	per_cpu(reap_node, cpu) = node;
  }

-static void next_reap_node(void)
+static void next_reap_node(unsigned int cpu)
  {
-	int node = __get_cpu_var(reap_node);
+	int node = per_cpu(reap_node, cpu);

  	/*
  	 * Also drain per cpu pages on remote zones
@@ -928,12 +938,12 @@ static void next_reap_node(void)
  	node = next_node(node, node_online_map);
  	if (unlikely(node >= MAX_NUMNODES))
  		node = first_node(node_online_map);
-	__get_cpu_var(reap_node) = node;
+	per_cpu(reap_node, cpu) = node;
  }

  #else
  #define init_reap_node(cpu) do { } while (0)
-#define next_reap_node(void) do { } while (0)
+#define next_reap_node(cpu) do { } while (0)
  #endif

  /*
@@ -945,17 +955,18 @@ static void next_reap_node(void)
   */
  static void __devinit start_cpu_timer(int cpu)
  {
-	struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
+	struct slab_work *reap_work = &per_cpu(reap_work, cpu);

  	/*
  	 * When this gets called from do_initcalls via cpucache_init(),
  	 * init_workqueues() has already run, so keventd will be setup
  	 * at that time.
  	 */
-	if (keventd_up() && reap_work->work.func == NULL) {
+	if (keventd_up() && reap_work->dw.work.func == NULL) {
  		init_reap_node(cpu);
-		INIT_DELAYED_WORK(reap_work, cache_reap);
-		schedule_delayed_work_on(cpu, reap_work,
+		INIT_DELAYED_WORK(&reap_work->dw, cache_reap);
+		reap_work->cpu = cpu;
+		schedule_delayed_work_on(cpu, &reap_work->dw,
  					__round_jiffies_relative(HZ, cpu));
  	}
  }
@@ -1004,7 +1015,7 @@ static int transfer_objects(struct array_cache *to,
  #ifndef CONFIG_NUMA

  #define drain_alien_cache(cachep, alien) do { } while (0)
-#define reap_alien(cachep, l3) do { } while (0)
+#define reap_alien(cachep, l3, cpu) do { } while (0)

  static inline struct array_cache **alloc_alien_cache(int node, int limit)
  {
@@ -1099,9 +1110,9 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
  /*
   * Called from cache_reap() to regularly drain alien caches round robin.
   */
-static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
+static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3, unsigned int cpu)
  {
-	int node = __get_cpu_var(reap_node);
+	int node = per_cpu(reap_node, cpu);

  	if (l3->alien) {
  		struct array_cache *ac = l3->alien[node];
@@ -4017,16 +4028,17 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
   * If we cannot acquire the cache chain mutex then just give up - we'll try
   * again on the next iteration.
   */
-static void cache_reap(struct work_struct *unused)
+static void cache_reap(struct work_struct *_work)
  {
  	struct kmem_cache *searchp;
  	struct kmem_list3 *l3;
  	int node = numa_node_id();

+	struct slab_work *work = (struct slab_work *) _work;
+
  	if (!mutex_trylock(&cache_chain_mutex)) {
  		/* Give up. Setup the next iteration. */
-		schedule_delayed_work(&__get_cpu_var(reap_work),
-				      round_jiffies_relative(REAPTIMEOUT_CPUC));
+		schedule_delayed_work(&work->dw, round_jiffies_relative(REAPTIMEOUT_CPUC));
  		return;
  	}

@@ -4040,9 +4052,9 @@ static void cache_reap(struct work_struct *unused)
  		 */
  		l3 = searchp->nodelists[node];

-		reap_alien(searchp, l3);
+		reap_alien(searchp, l3, work->cpu);

-		drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
+		drain_array(searchp, l3, cpu_cache_get_on(searchp, work->cpu), 0, node);

  		/*
  		 * These are racy checks but it does not matter
@@ -4069,11 +4081,11 @@ next:
  	}
  	check_irq_on();
  	mutex_unlock(&cache_chain_mutex);
-	next_reap_node();
-	refresh_cpu_vm_stats(smp_processor_id());
+	next_reap_node(work->cpu);
+	refresh_cpu_vm_stats(work->cpu);
+
  	/* Set up the next iteration */
-	schedule_delayed_work(&__get_cpu_var(reap_work),
-		round_jiffies_relative(REAPTIMEOUT_CPUC));
+	schedule_delayed_work(&work->dw, round_jiffies_relative(REAPTIMEOUT_CPUC));
  }

  #ifdef CONFIG_PROC_FS


Max




  parent reply	other threads:[~2007-02-20 19:06 UTC|newest]

Thread overview: 25+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-01-29  1:13 Oleg Nesterov
2007-01-29 16:54 ` Christoph Lameter
2007-01-29 17:19   ` Oleg Nesterov
2007-01-29 17:27     ` Christoph Lameter
2007-01-29 18:27       ` Oleg Nesterov
2007-01-29 19:09         ` Christoph Lameter
2007-01-29 19:29           ` Oleg Nesterov
2007-01-29 19:25         ` Christoph Lameter
2007-01-29 19:49           ` Oleg Nesterov
2007-01-29 20:29             ` Christoph Lameter
2007-01-29 21:05               ` Oleg Nesterov
2007-01-29 21:48                 ` Christoph Lameter
2007-01-29 22:14                   ` Oleg Nesterov
2007-02-20 18:39         ` Max Krasnyansky [this message]
2007-02-20 18:45           ` Christoph Lameter
2007-02-20 20:05             ` Oleg Nesterov
2007-02-20 21:22               ` Max Krasnyansky
2007-02-20 21:35                 ` Christoph Lameter
2007-02-20 22:01                   ` Max Krasnyansky
2007-02-20 22:14                     ` Christoph Lameter
2007-02-20 22:48                       ` SLAB cache reaper on isolated cpus Max Krasnyansky
2007-02-20 23:19                         ` Christoph Lameter
2007-02-21  3:41                           ` Max Krasnyansky
2007-02-20 21:05             ` slab: start_cpu_timer/cache_reap CONFIG_HOTPLUG_CPU problems Max Krasnyansky
2007-02-20 21:34               ` Christoph Lameter

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=45DB404C.4070305@qualcomm.com \
    --to=maxk@qualcomm.com \
    --cc=akpm@osdl.org \
    --cc=clameter@sgi.com \
    --cc=ego@in.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=oleg@tv-sign.ru \
    --cc=vatsa@in.ibm.com \
    --cc=venkatesh.pallipadi@intel.com \
    --subject='Re: slab: start_cpu_timer/cache_reap CONFIG_HOTPLUG_CPU problems' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).