LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
From: David Hildenbrand <david@redhat.com>
To: Anshuman Khandual <anshuman.khandual@arm.com>,
	linux-kernel@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org, akpm@linux-foundation.org,
	catalin.marinas@arm.com, will.deacon@arm.com
Cc: mhocko@suse.com, mgorman@techsingularity.net,
	james.morse@arm.com, mark.rutland@arm.com, robin.murphy@arm.com,
	cpandya@codeaurora.org, arunks@codeaurora.org,
	dan.j.williams@intel.com, osalvador@suse.de, cai@lca.pw,
	logang@deltatee.com, ira.weiny@intel.com
Subject: Re: [PATCH V3 4/4] arm64/mm: Enable memory hot remove
Date: Tue, 14 May 2019 11:08:54 +0200	[thread overview]
Message-ID: <d996d839-40da-6b7b-d12a-844e4fa66f92@redhat.com> (raw)
In-Reply-To: <1557824407-19092-5-git-send-email-anshuman.khandual@arm.com>

On 14.05.19 11:00, Anshuman Khandual wrote:
> Memory removal from an arch perspective involves tearing down two different
> kernel based mappings i.e vmemmap and linear while releasing related page
> table and any mapped pages allocated for given physical memory range to be
> removed.
> 
> Define a common kernel page table tear down helper remove_pagetable() which
> can be used to unmap given kernel virtual address range. In effect it can
> tear down both vmemap or kernel linear mappings. This new helper is called
> from both vmemamp_free() and ___remove_pgd_mapping() during memory removal.
> 
> For linear mapping there are no actual allocated pages which are mapped to
> create the translation. Any pfn on a given entry is derived from physical
> address (__va(PA) --> PA) whose linear translation is to be created. They
> need not be freed as they were never allocated in the first place. But for
> vmemmap which is a real virtual mapping (like vmalloc) physical pages are
> allocated either from buddy or memblock which get mapped in the kernel page
> table. These allocated and mapped pages need to be freed during translation
> tear down. But page table pages need to be freed in both these cases.
> 
> These mappings need to be differentiated while deciding if a mapped page at
> any level i.e [pte|pmd|pud]_page() should be freed or not. Callers for the
> mapping tear down process should pass on 'sparse_vmap' variable identifying
> kernel vmemmap mappings.
> 
> While here update arch_add_mempory() to handle __add_pages() failures by
> just unmapping recently added kernel linear mapping. Now enable memory hot
> remove on arm64 platforms by default with ARCH_ENABLE_MEMORY_HOTREMOVE.
> 
> This implementation is overall inspired from kernel page table tear down
> procedure on X86 architecture.
> 
> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
> ---
>  arch/arm64/Kconfig  |   3 +
>  arch/arm64/mm/mmu.c | 204 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 205 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 1c0cb51..bb4e571 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -268,6 +268,9 @@ config HAVE_GENERIC_GUP
>  config ARCH_ENABLE_MEMORY_HOTPLUG
>  	def_bool y
>  
> +config ARCH_ENABLE_MEMORY_HOTREMOVE
> +	def_bool y
> +
>  config SMP
>  	def_bool y
>  
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 37a902c..bd2d003 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -733,6 +733,177 @@ int kern_addr_valid(unsigned long addr)
>  
>  	return pfn_valid(pte_pfn(pte));
>  }
> +
> +#ifdef CONFIG_MEMORY_HOTPLUG
> +static void free_hotplug_page_range(struct page *page, ssize_t size)
> +{
> +	WARN_ON(PageReserved(page));
> +	free_pages((unsigned long)page_address(page), get_order(size));
> +}
> +
> +static void free_hotplug_pgtable_page(struct page *page)
> +{
> +	free_hotplug_page_range(page, PAGE_SIZE);
> +}
> +
> +static void free_pte_table(pte_t *ptep, pmd_t *pmdp, unsigned long addr)
> +{
> +	struct page *page;
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PTE; i++) {
> +		if (!pte_none(ptep[i]))
> +			return;
> +	}
> +
> +	page = pmd_page(*pmdp);
> +	pmd_clear(pmdp);
> +	__flush_tlb_kernel_pgtable(addr);
> +	free_hotplug_pgtable_page(page);
> +}
> +
> +#if (CONFIG_PGTABLE_LEVELS > 2)
> +static void free_pmd_table(pmd_t *pmdp, pud_t *pudp, unsigned long addr)
> +{
> +	struct page *page;
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PMD; i++) {
> +		if (!pmd_none(pmdp[i]))
> +			return;
> +	}
> +
> +	page = pud_page(*pudp);
> +	pud_clear(pudp);
> +	__flush_tlb_kernel_pgtable(addr);
> +	free_hotplug_pgtable_page(page);
> +}
> +#else
> +static void free_pmd_table(pmd_t *pmdp, pud_t *pudp, unsigned long addr) { }
> +#endif
> +
> +#if (CONFIG_PGTABLE_LEVELS > 3)
> +static void free_pud_table(pud_t *pudp, pgd_t *pgdp, unsigned long addr)
> +{
> +	struct page *page;
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PUD; i++) {
> +		if (!pud_none(pudp[i]))
> +			return;
> +	}
> +
> +	page = pgd_page(*pgdp);
> +	pgd_clear(pgdp);
> +	__flush_tlb_kernel_pgtable(addr);
> +	free_hotplug_pgtable_page(page);
> +}
> +#else
> +static void free_pud_table(pud_t *pudp, pgd_t *pgdp, unsigned long addr) { }
> +#endif
> +
> +static void
> +remove_pte_table(pmd_t *pmdp, unsigned long addr,
> +			unsigned long end, bool sparse_vmap)
> +{
> +	struct page *page;
> +	pte_t *ptep;
> +	unsigned long start = addr;
> +
> +	for (; addr < end; addr += PAGE_SIZE) {
> +		ptep = pte_offset_kernel(pmdp, addr);
> +		if (!pte_present(*ptep))
> +			continue;
> +
> +		if (sparse_vmap) {
> +			page = pte_page(READ_ONCE(*ptep));
> +			free_hotplug_page_range(page, PAGE_SIZE);
> +		}
> +		pte_clear(&init_mm, addr, ptep);
> +	}
> +	flush_tlb_kernel_range(start, end);
> +}
> +
> +static void
> +remove_pmd_table(pud_t *pudp, unsigned long addr,
> +			unsigned long end, bool sparse_vmap)
> +{
> +	unsigned long next;
> +	struct page *page;
> +	pte_t *ptep_base;
> +	pmd_t *pmdp;
> +
> +	for (; addr < end; addr = next) {
> +		next = pmd_addr_end(addr, end);
> +		pmdp = pmd_offset(pudp, addr);
> +		if (!pmd_present(*pmdp))
> +			continue;
> +
> +		if (pmd_sect(*pmdp)) {
> +			if (sparse_vmap) {
> +				page = pmd_page(READ_ONCE(*pmdp));
> +				free_hotplug_page_range(page, PMD_SIZE);
> +			}
> +			pmd_clear(pmdp);
> +			continue;
> +		}
> +		ptep_base = pte_offset_kernel(pmdp, 0UL);
> +		remove_pte_table(pmdp, addr, next, sparse_vmap);
> +		free_pte_table(ptep_base, pmdp, addr);
> +	}
> +}
> +
> +static void
> +remove_pud_table(pgd_t *pgdp, unsigned long addr,
> +			unsigned long end, bool sparse_vmap)
> +{
> +	unsigned long next;
> +	struct page *page;
> +	pmd_t *pmdp_base;
> +	pud_t *pudp;
> +
> +	for (; addr < end; addr = next) {
> +		next = pud_addr_end(addr, end);
> +		pudp = pud_offset(pgdp, addr);
> +		if (!pud_present(*pudp))
> +			continue;
> +
> +		if (pud_sect(*pudp)) {
> +			if (sparse_vmap) {
> +				page = pud_page(READ_ONCE(*pudp));
> +				free_hotplug_page_range(page, PUD_SIZE);
> +			}
> +			pud_clear(pudp);
> +			continue;
> +		}
> +		pmdp_base = pmd_offset(pudp, 0UL);
> +		remove_pmd_table(pudp, addr, next, sparse_vmap);
> +		free_pmd_table(pmdp_base, pudp, addr);
> +	}
> +}
> +
> +static void
> +remove_pagetable(unsigned long start, unsigned long end, bool sparse_vmap)
> +{
> +	unsigned long addr, next;
> +	pud_t *pudp_base;
> +	pgd_t *pgdp;
> +
> +	spin_lock(&init_mm.page_table_lock);
> +	for (addr = start; addr < end; addr = next) {
> +		next = pgd_addr_end(addr, end);
> +		pgdp = pgd_offset_k(addr);
> +		if (!pgd_present(*pgdp))
> +			continue;
> +
> +		pudp_base = pud_offset(pgdp, 0UL);
> +		remove_pud_table(pgdp, addr, next, sparse_vmap);
> +		free_pud_table(pudp_base, pgdp, addr);
> +	}
> +	spin_unlock(&init_mm.page_table_lock);
> +}
> +#endif
> +
>  #ifdef CONFIG_SPARSEMEM_VMEMMAP
>  #if !ARM64_SWAPPER_USES_SECTION_MAPS
>  int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
> @@ -780,6 +951,9 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
>  void vmemmap_free(unsigned long start, unsigned long end,
>  		struct vmem_altmap *altmap)
>  {
> +#ifdef CONFIG_MEMORY_HOTPLUG
> +	remove_pagetable(start, end, true);
> +#endif
>  }
>  #endif	/* CONFIG_SPARSEMEM_VMEMMAP */
>  
> @@ -1070,10 +1244,16 @@ int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
>  }
>  
>  #ifdef CONFIG_MEMORY_HOTPLUG
> +static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
> +{
> +	WARN_ON(pgdir != init_mm.pgd);
> +	remove_pagetable(start, start + size, false);
> +}
> +
>  int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
>  		    bool want_memblock)
>  {
> -	int flags = 0;
> +	int ret, flags = 0;
>  
>  	if (rodata_full || debug_pagealloc_enabled())
>  		flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
> @@ -1081,7 +1261,27 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
>  	__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
>  			     size, PAGE_KERNEL, __pgd_pgtable_alloc, flags);
>  
> -	return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
> +	ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
>  			   altmap, want_memblock);
> +	if (ret)
> +		__remove_pgd_mapping(swapper_pg_dir,
> +					__phys_to_virt(start), size);
> +	return ret;
>  }
> +
> +#ifdef CONFIG_MEMORY_HOTREMOVE
> +int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)

You need to rebase to -next (linus master soon). This function is now a
void ...

> +{
> +	unsigned long start_pfn = start >> PAGE_SHIFT;
> +	unsigned long nr_pages = size >> PAGE_SHIFT;
> +	struct zone *zone = page_zone(pfn_to_page(start_pfn));
> +	int ret = 0;
> +
> +	ret = __remove_pages(zone, start_pfn, nr_pages, altmap);

.. and this call can no longer fail :) Which simplifies this patch.

> +	if (!ret)
> +		__remove_pgd_mapping(swapper_pg_dir,
> +					__phys_to_virt(start), size);
> +	return ret;
> +}
> +#endif
>  #endif
> 


-- 

Thanks,

David / dhildenb

  reply	other threads:[~2019-05-14  9:09 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-05-14  9:00 [PATCH V3 0/4] " Anshuman Khandual
2019-05-14  9:00 ` [PATCH V3 1/4] mm/hotplug: Reorder arch_remove_memory() call in __remove_memory() Anshuman Khandual
2019-05-14  9:05   ` David Hildenbrand
2019-05-14  9:00 ` [PATCH V3 2/4] arm64/mm: Hold memory hotplug lock while walking for kernel page table dump Anshuman Khandual
2019-05-14  9:16   ` David Hildenbrand
2019-05-14 15:40   ` Mark Rutland
2019-05-15  1:56     ` Anshuman Khandual
2019-05-15 16:58   ` Michal Hocko
2019-05-16 10:23     ` Mark Rutland
2019-05-16 11:05       ` Michal Hocko
2019-05-22 16:42         ` Mark Rutland
2019-05-24  6:06           ` Anshuman Khandual
2019-05-27  7:20           ` Michal Hocko
2019-05-28 14:09             ` Mark Rutland
2019-05-16 11:06       ` Anshuman Khandual
2019-05-16 11:16         ` Michal Hocko
2019-05-23  8:40           ` David Hildenbrand
2019-05-24  5:43             ` Anshuman Khandual
2019-05-14  9:00 ` [PATCH V3 3/4] arm64/mm: Inhibit huge-vmap with ptdump Anshuman Khandual
2019-05-14  9:20   ` David Hildenbrand
2019-05-16  8:38   ` Ard Biesheuvel
2019-05-14  9:00 ` [PATCH V3 4/4] arm64/mm: Enable memory hot remove Anshuman Khandual
2019-05-14  9:08   ` David Hildenbrand [this message]
2019-05-15 11:49   ` Mark Rutland
2019-05-16  5:34     ` Anshuman Khandual
2019-05-16 10:57       ` Mark Rutland
2019-05-17  3:15         ` Anshuman Khandual
2019-05-14  9:10 ` [PATCH V3 0/4] " David Hildenbrand

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=d996d839-40da-6b7b-d12a-844e4fa66f92@redhat.com \
    --to=david@redhat.com \
    --cc=akpm@linux-foundation.org \
    --cc=anshuman.khandual@arm.com \
    --cc=arunks@codeaurora.org \
    --cc=cai@lca.pw \
    --cc=catalin.marinas@arm.com \
    --cc=cpandya@codeaurora.org \
    --cc=dan.j.williams@intel.com \
    --cc=ira.weiny@intel.com \
    --cc=james.morse@arm.com \
    --cc=linux-arm-kernel@lists.infradead.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=logang@deltatee.com \
    --cc=mark.rutland@arm.com \
    --cc=mgorman@techsingularity.net \
    --cc=mhocko@suse.com \
    --cc=osalvador@suse.de \
    --cc=robin.murphy@arm.com \
    --cc=will.deacon@arm.com \
    --subject='Re: [PATCH V3 4/4] arm64/mm: Enable memory hot remove' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).