LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
From: Kirill Tkhai <ktkhai@virtuozzo.com>
To: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: akpm@linux-foundation.org, shakeelb@google.com,
	viro@zeniv.linux.org.uk, hannes@cmpxchg.org, mhocko@kernel.org,
	tglx@linutronix.de, pombredanne@nexb.com,
	stummala@codeaurora.org, gregkh@linuxfoundation.org,
	sfr@canb.auug.org.au, guro@fb.com, mka@chromium.org,
	penguin-kernel@I-love.SAKURA.ne.jp, chris@chris-wilson.co.uk,
	longman@redhat.com, minchan@kernel.org, hillf.zj@alibaba-inc.com,
	ying.huang@intel.com, mgorman@techsingularity.net, jbacik@fb.com,
	linux@roeck-us.net, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org, willy@infradead.org, lirongqing@baidu.com,
	aryabinin@virtuozzo.com
Subject: Re: [PATCH v2 04/12] mm: Assign memcg-aware shrinkers bitmap to memcg
Date: Mon, 23 Apr 2018 13:54:50 +0300	[thread overview]
Message-ID: <14ebcccf-3ea8-59f4-d7ea-793aaba632c0@virtuozzo.com> (raw)
In-Reply-To: <20180422175900.dsjmm7gt2nsqj3er@esperanza>

On 22.04.2018 20:59, Vladimir Davydov wrote:
> On Tue, Apr 17, 2018 at 09:53:31PM +0300, Kirill Tkhai wrote:
>> Imagine a big node with many cpus, memory cgroups and containers.
>> Let we have 200 containers, every container has 10 mounts,
>> and 10 cgroups. All container tasks don't touch foreign
>> containers mounts. If there is intensive pages write,
>> and global reclaim happens, a writing task has to iterate
>> over all memcgs to shrink slab, before it's able to go
>> to shrink_page_list().
>>
>> Iteration over all the memcg slabs is very expensive:
>> the task has to visit 200 * 10 = 2000 shrinkers
>> for every memcg, and since there are 2000 memcgs,
>> the total calls are 2000 * 2000 = 4000000.
>>
>> So, the shrinker makes 4 million do_shrink_slab() calls
>> just to try to isolate SWAP_CLUSTER_MAX pages in one
>> of the actively writing memcg via shrink_page_list().
>> I've observed a node spending almost 100% in kernel,
>> making useless iteration over already shrinked slab.
>>
>> This patch adds bitmap of memcg-aware shrinkers to memcg.
>> The size of the bitmap depends on bitmap_nr_ids, and during
>> memcg life it's maintained to be enough to fit bitmap_nr_ids
>> shrinkers. Every bit in the map is related to corresponding
>> shrinker id.
>>
>> Next patches will maintain set bit only for really charged
>> memcg. This will allow shrink_slab() to increase its
>> performance in significant way. See the last patch for
>> the numbers.
>>
>> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
>> ---
>>  include/linux/memcontrol.h |   15 +++++
>>  mm/memcontrol.c            |  125 ++++++++++++++++++++++++++++++++++++++++++++
>>  mm/vmscan.c                |   21 +++++++
>>  3 files changed, 160 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>> index af9eed2e3e04..2ec96ab46b01 100644
>> --- a/include/linux/memcontrol.h
>> +++ b/include/linux/memcontrol.h
>> @@ -115,6 +115,7 @@ struct mem_cgroup_per_node {
>>  	unsigned long		lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
>>  
>>  	struct mem_cgroup_reclaim_iter	iter[DEF_PRIORITY + 1];
> 
>> +	struct memcg_shrinker_map __rcu	*shrinkers_map;
> 
> shrinker_map
> 
>>  
>>  	struct rb_node		tree_node;	/* RB tree node */
>>  	unsigned long		usage_in_excess;/* Set to the value by which */
>> @@ -153,6 +154,11 @@ struct mem_cgroup_thresholds {
>>  	struct mem_cgroup_threshold_ary *spare;
>>  };
>>  
>> +struct memcg_shrinker_map {
>> +	struct rcu_head rcu;
>> +	unsigned long map[0];
>> +};
>> +
> 
> This struct should be defined before struct mem_cgroup_per_node.
> 
> A comment explaining what this map is for and what it maps would be
> really helpful.
> 
>>  enum memcg_kmem_state {
>>  	KMEM_NONE,
>>  	KMEM_ALLOCATED,
>> @@ -1200,6 +1206,8 @@ extern int memcg_nr_cache_ids;
>>  void memcg_get_cache_ids(void);
>>  void memcg_put_cache_ids(void);
>>  
>> +extern int shrinkers_max_nr;
>> +
> 
> memcg_shrinker_id_max?

memcg_shrinker_id_max sounds like an inclusive value, doesn't it?
While shrinker->id < shrinkers_max_nr.

Let's better use memcg_shrinker_nr_max.

>>  /*
>>   * Helper macro to loop through all memcg-specific caches. Callers must still
>>   * check if the cache is valid (it is either valid or NULL).
>> @@ -1223,6 +1231,13 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
>>  	return memcg ? memcg->kmemcg_id : -1;
>>  }
>>  
>> +extern struct memcg_shrinker_map __rcu *root_shrinkers_map[];
>> +#define SHRINKERS_MAP(memcg, nid)					\
>> +	(memcg == root_mem_cgroup || !memcg ?				\
>> +	 root_shrinkers_map[nid] : memcg->nodeinfo[nid]->shrinkers_map)
>> +
>> +extern int expand_shrinker_maps(int old_id, int id);
>> +
> 
> I'm strongly against using a special map for the root cgroup. I'd prefer
> to disable this optimization for the root cgroup altogether and simply
> iterate over all registered shrinkers when it comes to global reclaim.
> It shouldn't degrade performance as the root cgroup is singular.
>
>>  #else
>>  #define for_each_memcg_cache_index(_idx)	\
>>  	for (; NULL; )
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index 2959a454a072..562dfb1be9ef 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -305,6 +305,113 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key);
>>  
>>  struct workqueue_struct *memcg_kmem_cache_wq;
>>  
>> +static DECLARE_RWSEM(shrinkers_max_nr_rwsem);
> 
> Why rwsem? AFAIU you want to synchronize between two code paths: when a
> memory cgroup is allocated (or switched online?) to allocate a shrinker
> map for it and in the function growing shrinker maps for all cgroups.
> A mutex should be enough for this.
> 
>> +struct memcg_shrinker_map __rcu *root_shrinkers_map[MAX_NUMNODES] = { 0 };
>> +
>> +static void get_shrinkers_max_nr(void)
>> +{
>> +	down_read(&shrinkers_max_nr_rwsem);
>> +}
>> +
>> +static void put_shrinkers_max_nr(void)
>> +{
>> +	up_read(&shrinkers_max_nr_rwsem);
>> +}
>> +
>> +static void kvfree_map_rcu(struct rcu_head *head)
> 
> free_shrinker_map_rcu
> 
>> +{
>> +	kvfree(container_of(head, struct memcg_shrinker_map, rcu));
>> +}
>> +
>> +static int memcg_expand_maps(struct mem_cgroup *memcg, int nid,
> 
> Bad name: the function reallocates just one map, not many maps; the name
> doesn't indicate that it is about shrinkers; the name is inconsistent
> with alloc_shrinker_maps and free_shrinker_maps. Please fix.
> 
>> +			     int size, int old_size)
>> +{
>> +	struct memcg_shrinker_map *new, *old;
>> +
>> +	lockdep_assert_held(&shrinkers_max_nr_rwsem);
>> +
>> +	new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
>> +	if (!new)
>> +		return -ENOMEM;
>> +
>> +	/* Set all old bits, clear all new bits */
>> +	memset(new->map, (int)0xff, old_size);
>> +	memset((void *)new->map + old_size, 0, size - old_size);
>> +
>> +	old = rcu_dereference_protected(SHRINKERS_MAP(memcg, nid), true);
>> +
>> +	if (memcg)
>> +		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinkers_map, new);
>> +	else
>> +		rcu_assign_pointer(root_shrinkers_map[nid], new);
>> +
>> +	if (old)
>> +		call_rcu(&old->rcu, kvfree_map_rcu);
>> +
>> +	return 0;
>> +}
>> +
>> +static int alloc_shrinker_maps(struct mem_cgroup *memcg, int nid)
>> +{
>> +	/* Skip allocation, when we're initializing root_mem_cgroup */
>> +	if (!root_mem_cgroup)
>> +		return 0;
>> +
>> +	return memcg_expand_maps(memcg, nid, shrinkers_max_nr/BITS_PER_BYTE, 0);
>> +}
>> +
>> +static void free_shrinker_maps(struct mem_cgroup *memcg,
>> +			       struct mem_cgroup_per_node *pn)
>> +{
>> +	struct memcg_shrinker_map *map;
>> +
>> +	if (memcg == root_mem_cgroup)
>> +		return;
>> +
>> +	/* IDR unhashed long ago, and expand_shrinker_maps can't race with us */
>> +	map = rcu_dereference_protected(pn->shrinkers_map, true);
>> +	kvfree_map_rcu(&map->rcu);
>> +}
>> +
>> +static struct idr mem_cgroup_idr;
>> +
>> +int expand_shrinker_maps(int old_nr, int nr)
>> +{
>> +	int id, size, old_size, node, ret;
>> +	struct mem_cgroup *memcg;
>> +
>> +	old_size = old_nr / BITS_PER_BYTE;
>> +	size = nr / BITS_PER_BYTE;
>> +
>> +	down_write(&shrinkers_max_nr_rwsem);
>> +	for_each_node(node) {
> 
> Iterating over cgroups first, numa nodes second seems like a better idea
> to me. I think you should fold for_each_node in memcg_expand_maps.
>
>> +		idr_for_each_entry(&mem_cgroup_idr, memcg, id) {
> 
> Iterating over mem_cgroup_idr looks strange. Why don't you use
> for_each_mem_cgroup?

We want to allocate shrinkers maps in mem_cgroup_css_alloc(), since
mem_cgroup_css_online() mustn't fail (it's a requirement of currently
existing design of memcg_cgroup::id).

A new memcg is added to parent's list between two of these calls:

css_create()
  ss->css_alloc()
  list_add_tail_rcu(&css->sibling, &parent_css->children)
  ss->css_online()


for_each_mem_cgroup() does not see children that are already allocated but not yet linked.
 
>> +			if (id == 1)
>> +				memcg = NULL;
>> +			ret = memcg_expand_maps(memcg, node, size, old_size);
>> +			if (ret)
>> +				goto unlock;
>> +		}
>> +
>> +		/* root_mem_cgroup is not initialized yet */
>> +		if (id == 0)
>> +			ret = memcg_expand_maps(NULL, node, size, old_size);
>> +	}
>> +unlock:
>> +	up_write(&shrinkers_max_nr_rwsem);
>> +	return ret;
>> +}
>> +#else /* CONFIG_SLOB */
>> +static void get_shrinkers_max_nr(void) { }
>> +static void put_shrinkers_max_nr(void) { }
>> +
>> +static int alloc_shrinker_maps(struct mem_cgroup *memcg, int nid)
>> +{
>> +	return 0;
>> +}
>> +static void free_shrinker_maps(struct mem_cgroup *memcg,
>> +			       struct mem_cgroup_per_node *pn) { }
>> +
>>  #endif /* !CONFIG_SLOB */
>>  
>>  /**
>> @@ -3002,6 +3109,8 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
>>  }
>>  
>>  #ifndef CONFIG_SLOB
>> +int shrinkers_max_nr;
>> +
>>  static int memcg_online_kmem(struct mem_cgroup *memcg)
>>  {
>>  	int memcg_id;
>> @@ -4266,7 +4375,10 @@ static DEFINE_IDR(mem_cgroup_idr);
>>  static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
>>  {
>>  	if (memcg->id.id > 0) {
>> +		/* Removing IDR must be visible for expand_shrinker_maps() */
>> +		get_shrinkers_max_nr();
>>  		idr_remove(&mem_cgroup_idr, memcg->id.id);
>> +		put_shrinkers_max_nr();
>>  		memcg->id.id = 0;
>>  	}
>>  }
>> @@ -4333,12 +4445,17 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
>>  	if (!pn->lruvec_stat_cpu)
>>  		goto err_pcpu;
>>  
>> +	if (alloc_shrinker_maps(memcg, node))
>> +		goto err_maps;
>> +
>>  	lruvec_init(&pn->lruvec);
>>  	pn->usage_in_excess = 0;
>>  	pn->on_tree = false;
>>  	pn->memcg = memcg;
>>  	return 0;
>>  
>> +err_maps:
>> +	free_percpu(pn->lruvec_stat_cpu);
>>  err_pcpu:
>>  	memcg->nodeinfo[node] = NULL;
>>  	kfree(pn);
>> @@ -4352,6 +4469,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
>>  	if (!pn)
>>  		return;
>>  
>> +	free_shrinker_maps(memcg, pn);
>>  	free_percpu(pn->lruvec_stat_cpu);
>>  	kfree(pn);
>>  }
>> @@ -4407,13 +4525,18 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
>>  #ifdef CONFIG_CGROUP_WRITEBACK
>>  	INIT_LIST_HEAD(&memcg->cgwb_list);
>>  #endif
>> +
>> +	get_shrinkers_max_nr();
>>  	for_each_node(node)
>> -		if (alloc_mem_cgroup_per_node_info(memcg, node))
>> +		if (alloc_mem_cgroup_per_node_info(memcg, node)) {
>> +			put_shrinkers_max_nr();
>>  			goto fail;
>> +		}
>>  
>>  	memcg->id.id = idr_alloc(&mem_cgroup_idr, memcg,
>>  				 1, MEM_CGROUP_ID_MAX,
>>  				 GFP_KERNEL);
>> +	put_shrinkers_max_nr();
>>  	if (memcg->id.id < 0)
>>  		goto fail;
>>  
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index 4f02fe83537e..f63eb5596c35 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -172,6 +172,22 @@ static DECLARE_RWSEM(shrinker_rwsem);
>>  #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
>>  static DEFINE_IDR(shrinkers_id_idr);
>>  
>> +static int expand_shrinker_id(int id)
>> +{
>> +	if (likely(id < shrinkers_max_nr))
>> +		return 0;
>> +
>> +	id = shrinkers_max_nr * 2;
>> +	if (id == 0)
>> +		id = BITS_PER_BYTE;
>> +
>> +	if (expand_shrinker_maps(shrinkers_max_nr, id))
>> +		return -ENOMEM;
>> +
>> +	shrinkers_max_nr = id;
>> +	return 0;
>> +}
>> +
> 
> I think this function should live in memcontrol.c and shrinkers_max_nr
> should be private to memcontrol.c.
> 
>>  static int add_memcg_shrinker(struct shrinker *shrinker)
>>  {
>>  	int id, ret;
>> @@ -180,6 +196,11 @@ static int add_memcg_shrinker(struct shrinker *shrinker)
>>  	ret = id = idr_alloc(&shrinkers_id_idr, shrinker, 0, 0, GFP_KERNEL);
>>  	if (ret < 0)
>>  		goto unlock;
>> +	ret = expand_shrinker_id(id);
>> +	if (ret < 0) {
>> +		idr_remove(&shrinkers_id_idr, id);
>> +		goto unlock;
>> +	}
>>  	shrinker->id = id;
>>  	ret = 0;
>>  unlock:
>>

  reply	other threads:[~2018-04-23 10:54 UTC|newest]

Thread overview: 37+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-04-17 18:52 [PATCH v2 00/12] Improve shrink_slab() scalability (old complexity was O(n^2), new is O(n)) Kirill Tkhai
2018-04-17 18:53 ` [PATCH v2 01/12] mm: Assign id to every memcg-aware shrinker Kirill Tkhai
2018-04-18 14:14   ` Tetsuo Handa
2018-04-18 14:27     ` Kirill Tkhai
2018-04-18 14:32       ` Tetsuo Handa
2018-04-18 15:02         ` Kirill Tkhai
2018-04-22 17:16   ` Vladimir Davydov
2018-04-17 18:53 ` [PATCH v2 02/12] memcg: Refactoring in mem_cgroup_alloc() Kirill Tkhai
2018-04-17 18:53 ` [PATCH v2 03/12] memcg: Refactoring in alloc_mem_cgroup_per_node_info() Kirill Tkhai
2018-04-17 18:53 ` [PATCH v2 04/12] mm: Assign memcg-aware shrinkers bitmap to memcg Kirill Tkhai
2018-04-18 12:55   ` kbuild test robot
2018-04-18 13:05     ` Kirill Tkhai
2018-04-22 17:59   ` Vladimir Davydov
2018-04-23 10:54     ` Kirill Tkhai [this message]
2018-04-24 11:28       ` Vladimir Davydov
2018-04-24 11:38         ` Kirill Tkhai
2018-04-24 12:15           ` Vladimir Davydov
2018-04-24 12:24             ` Kirill Tkhai
2018-04-28 15:08               ` Vladimir Davydov
2018-05-03 11:15                 ` Kirill Tkhai
2018-04-24 12:13         ` Kirill Tkhai
2018-04-23 11:02     ` Kirill Tkhai
2018-04-23 11:06     ` Kirill Tkhai
2018-04-24 11:08       ` Vladimir Davydov
2018-04-17 18:53 ` [PATCH v2 05/12] fs: Propagate shrinker::id to list_lru Kirill Tkhai
2018-04-22 18:03   ` Vladimir Davydov
2018-04-17 18:53 ` [PATCH v2 06/12] list_lru: Add memcg argument to list_lru_from_kmem() Kirill Tkhai
2018-04-17 18:54 ` [PATCH v2 07/12] list_lru: Pass dst_memcg argument to memcg_drain_list_lru_node() Kirill Tkhai
2018-04-17 18:54 ` [PATCH v2 08/12] list_lru: Pass lru " Kirill Tkhai
2018-04-17 18:54 ` [PATCH v2 09/12] mm: Set bit in memcg shrinker bitmap on first list_lru item apearance Kirill Tkhai
2018-04-17 18:54 ` [PATCH v2 10/12] mm: Iterate only over charged shrinkers during memcg shrink_slab() Kirill Tkhai
2018-04-22 18:19   ` Vladimir Davydov
2018-04-17 18:54 ` [PATCH v2 11/12] mm: Add SHRINK_EMPTY shrinker methods return value Kirill Tkhai
2018-04-17 18:54 ` [PATCH v2 12/12] mm: Clear shrinker bit if there are no objects related to memcg Kirill Tkhai
2018-04-22 18:21   ` Vladimir Davydov
2018-04-23 10:01     ` Kirill Tkhai
2018-04-24 10:56       ` Vladimir Davydov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=14ebcccf-3ea8-59f4-d7ea-793aaba632c0@virtuozzo.com \
    --to=ktkhai@virtuozzo.com \
    --cc=akpm@linux-foundation.org \
    --cc=aryabinin@virtuozzo.com \
    --cc=chris@chris-wilson.co.uk \
    --cc=gregkh@linuxfoundation.org \
    --cc=guro@fb.com \
    --cc=hannes@cmpxchg.org \
    --cc=hillf.zj@alibaba-inc.com \
    --cc=jbacik@fb.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux@roeck-us.net \
    --cc=lirongqing@baidu.com \
    --cc=longman@redhat.com \
    --cc=mgorman@techsingularity.net \
    --cc=mhocko@kernel.org \
    --cc=minchan@kernel.org \
    --cc=mka@chromium.org \
    --cc=penguin-kernel@I-love.SAKURA.ne.jp \
    --cc=pombredanne@nexb.com \
    --cc=sfr@canb.auug.org.au \
    --cc=shakeelb@google.com \
    --cc=stummala@codeaurora.org \
    --cc=tglx@linutronix.de \
    --cc=vdavydov.dev@gmail.com \
    --cc=viro@zeniv.linux.org.uk \
    --cc=willy@infradead.org \
    --cc=ying.huang@intel.com \
    --subject='Re: [PATCH v2 04/12] mm: Assign memcg-aware shrinkers bitmap to memcg' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).