Linux-Fsdevel Archive on lore.kernel.org
help / color / mirror / Atom feed
From: Anthony Yznaga <anthony.yznaga@oracle.com>
To: linux-mm@kvack.org, linux-kernel@vger.kernel.org
Cc: willy@infradead.org, corbet@lwn.net, tglx@linutronix.de,
	mingo@redhat.com, bp@alien8.de, x86@kernel.org, hpa@zytor.com,
	dave.hansen@linux.intel.com, luto@kernel.org,
	peterz@infradead.org, rppt@linux.ibm.com,
	akpm@linux-foundation.org, hughd@google.com,
	ebiederm@xmission.com, masahiroy@kernel.org, ardb@kernel.org,
	ndesaulniers@google.com, dima@golovin.in,
	daniel.kiper@oracle.com, nivedita@alum.mit.edu,
	rafael.j.wysocki@intel.com, dan.j.williams@intel.com,
	zhenzhong.duan@oracle.com, jroedel@suse.de, bhe@redhat.com,
	guro@fb.com, Thomas.Lendacky@amd.com,
	andriy.shevchenko@linux.intel.com, keescook@chromium.org,
	hannes@cmpxchg.org, minchan@kernel.org, mhocko@kernel.org,
	ying.huang@intel.com, yang.shi@linux.alibaba.com,
	gustavo@embeddedor.com, ziqian.lzq@antfin.com,
	vdavydov.dev@gmail.com, jason.zeng@intel.com,
	kevin.tian@intel.com, zhiyuan.lv@intel.com, lei.l.li@intel.com,
	paul.c.lai@intel.com, ashok.raj@intel.com,
	linux-fsdevel@vger.kernel.org, linux-doc@vger.kernel.org,
	kexec@lists.infradead.org
Subject: [RFC 15/43] PKRAM: provide a way to ban pages from use by PKRAM
Date: Wed,  6 May 2020 17:41:41 -0700	[thread overview]
Message-ID: <1588812129-8596-16-git-send-email-anthony.yznaga@oracle.com> (raw)
In-Reply-To: <1588812129-8596-1-git-send-email-anthony.yznaga@oracle.com>

Not all memory ranges can be used for saving preserved over-kexec data.
For example, a kexec kernel may be loaded before pages are preserved.
The memory regions where the kexec segments will be copied to on kexec
must not contain preserved pages or else they will be clobbered.

Originally-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Anthony Yznaga <anthony.yznaga@oracle.com>
---
 include/linux/pkram.h |   2 +
 mm/pkram.c            | 210 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 212 insertions(+)

diff --git a/include/linux/pkram.h b/include/linux/pkram.h
index 409022e1472f..1ba48442ef8e 100644
--- a/include/linux/pkram.h
+++ b/include/linux/pkram.h
@@ -69,10 +69,12 @@ phys_addr_t pkram_memblock_find_in_range(phys_addr_t start, phys_addr_t end,
 extern unsigned long pkram_reserved_pages;
 void pkram_reserve(void);
 void pkram_free_pgt(void);
+void pkram_ban_region(unsigned long start, unsigned long end);
 #else
 #define pkram_reserved_pages 0UL
 static inline void pkram_reserve(void) { }
 static inline void pkram_free_pgt(void) { }
+static inline void pkram_ban_region(unsigned long start, unsigned long end) { }
 #endif
 
 #endif /* _LINUX_PKRAM_H */
diff --git a/mm/pkram.c b/mm/pkram.c
index e49c9bcd3854..60863c8ecbab 100644
--- a/mm/pkram.c
+++ b/mm/pkram.c
@@ -119,6 +119,28 @@ unsigned long __initdata pkram_reserved_pages;
 static bool pkram_reservation_in_progress;
 
 /*
+ * For tracking a region of memory that PKRAM is not allowed to use.
+ */
+struct banned_region {
+	unsigned long start, end;		/* pfn, inclusive */
+};
+
+#define MAX_NR_BANNED		(32 + MAX_NUMNODES * 2)
+
+static unsigned int nr_banned;			/* number of banned regions */
+
+/* banned regions; arranged in ascending order, do not overlap */
+static struct banned_region banned[MAX_NR_BANNED];
+/*
+ * If a page allocated for PKRAM turns out to belong to a banned region,
+ * it is placed on the banned_pages list so subsequent allocation attempts
+ * do not encounter it again. The list is shrunk when system memory is low.
+ */
+static LIST_HEAD(banned_pages);			/* linked through page::lru */
+static DEFINE_SPINLOCK(banned_pages_lock);
+static unsigned long nr_banned_pages;
+
+/*
  * The PKRAM super block pfn, see above.
  */
 static int __init parse_pkram_sb_pfn(char *arg)
@@ -223,12 +245,120 @@ void __init pkram_reserve(void)
 	pr_info("PKRAM: %lu pages reserved\n", pkram_reserved_pages);
 }
 
+/*
+ * Ban pfn range [start..end] (inclusive) from use in PKRAM.
+ */
+void pkram_ban_region(unsigned long start, unsigned long end)
+{
+	int i, merged = -1;
+
+	if (pkram_reservation_in_progress)
+		return;
+
+	/* first try to merge the region with an existing one */
+	for (i = nr_banned - 1; i >= 0 && start <= banned[i].end + 1; i--) {
+		if (end + 1 >= banned[i].start) {
+			start = min(banned[i].start, start);
+			end = max(banned[i].end, end);
+			if (merged < 0)
+				merged = i;
+		} else
+			/*
+			 * Regions are arranged in ascending order and do not
+			 * intersect so the merged region cannot jump over its
+			 * predecessors.
+			 */
+			BUG_ON(merged >= 0);
+	}
+
+	i++;
+
+	if (merged >= 0) {
+		banned[i].start = start;
+		banned[i].end = end;
+		/* shift if merged with more than one region */
+		memmove(banned + i + 1, banned + merged + 1,
+			sizeof(*banned) * (nr_banned - merged - 1));
+		nr_banned -= merged - i;
+		return;
+	}
+
+	/*
+	 * The region does not intersect with an existing one;
+	 * try to create a new one.
+	 */
+	if (nr_banned == MAX_NR_BANNED) {
+		pr_err("PKRAM: Failed to ban %lu-%lu: "
+		       "Too many banned regions\n", start, end);
+		return;
+	}
+
+	memmove(banned + i + 1, banned + i,
+		sizeof(*banned) * (nr_banned - i));
+	banned[i].start = start;
+	banned[i].end = end;
+	nr_banned++;
+}
+
+static void pkram_show_banned(void)
+{
+	int i;
+	unsigned long n, total = 0;
+
+	pr_info("PKRAM: banned regions:\n");
+	for (i = 0; i < nr_banned; i++) {
+		n = banned[i].end - banned[i].start + 1;
+		pr_info("%4d: [%08lx - %08lx] %ld pages\n",
+			i, banned[i].start, banned[i].end, n);
+		total += n;
+	}
+	pr_info("Total banned: %ld pages in %d regions\n",
+		total, nr_banned);
+}
+
+/*
+ * Returns true if the page may not be used for storing preserved data.
+ */
+static bool pkram_page_banned(struct page *page)
+{
+	unsigned long epfn, pfn = page_to_pfn(page);
+	int l = 0, r = nr_banned - 1, m;
+
+	epfn = pfn + compound_nr(page) - 1;
+
+	/* do binary search */
+	while (l <= r) {
+		m = (l + r) / 2;
+		if (epfn < banned[m].start)
+			r = m - 1;
+		else if (pfn > banned[m].end)
+			l = m + 1;
+		else
+			return true;
+	}
+	return false;
+}
+
 static inline struct page *__pkram_alloc_page(gfp_t gfp_mask, bool add_to_map)
 {
 	struct page *page;
+	LIST_HEAD(list);
+	unsigned long len = 0;
 	int err;
 
 	page = alloc_page(gfp_mask);
+	while (page && pkram_page_banned(page)) {
+		len++;
+		list_add(&page->lru, &list);
+		page = alloc_page(gfp_mask);
+	}
+	if (len > 0) {
+		spin_lock(&banned_pages_lock);
+		nr_banned_pages += len;
+		list_splice(&list, &banned_pages);
+		spin_unlock(&banned_pages_lock);
+	}
+
 	if (page && add_to_map) {
 		err = pkram_add_identity_map(page);
 		if (err) {
@@ -256,6 +386,53 @@ static inline void pkram_free_page(void *addr)
 	free_page((unsigned long)addr);
 }
 
+static void __banned_pages_shrink(unsigned long nr_to_scan)
+{
+	struct page *page;
+
+	if (nr_to_scan <= 0)
+		return;
+
+	while (nr_banned_pages > 0) {
+		BUG_ON(list_empty(&banned_pages));
+		page = list_first_entry(&banned_pages, struct page, lru);
+		list_del(&page->lru);
+		__free_page(page);
+		nr_banned_pages--;
+		nr_to_scan--;
+		if (!nr_to_scan)
+			break;
+	}
+}
+
+static unsigned long
+banned_pages_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+	return nr_banned_pages;
+}
+
+static unsigned long
+banned_pages_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+	int nr_left = nr_banned_pages;
+
+	if (!sc->nr_to_scan || !nr_left)
+		return nr_left;
+
+	spin_lock(&banned_pages_lock);
+	__banned_pages_shrink(sc->nr_to_scan);
+	nr_left = nr_banned_pages;
+	spin_unlock(&banned_pages_lock);
+
+	return nr_left;
+}
+
+static struct shrinker banned_pages_shrinker = {
+	.count_objects = banned_pages_count,
+	.scan_objects = banned_pages_scan,
+	.seeks = DEFAULT_SEEKS,
+};
+
 static inline void pkram_insert_node(struct pkram_node *node)
 {
 	list_add(&virt_to_page(node)->lru, &pkram_nodes);
@@ -665,6 +842,32 @@ static int __pkram_save_page(struct pkram_stream *ps,
 	return 0;
 }
 
+static int __pkram_save_page_copy(struct pkram_stream *ps, struct page *page,
+				short flags)
+{
+	int nr_pages = compound_nr(page);
+	pgoff_t index = page->index;
+	int i, err;
+
+	for (i = 0; i < nr_pages; i++, index++) {
+		struct page *p = page + i;
+		struct page *new;
+
+		new = pkram_alloc_page(ps->gfp_mask);
+		if (!new)
+			return -ENOMEM;
+
+		copy_highpage(new, p);
+		err = __pkram_save_page(ps, new, flags, index);
+		put_page(new);
+
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
 /**
  * Save page @page to the preserved memory node and object associated with
  * stream @ps. The stream must have been initialized with pkram_prepare_save()
@@ -688,6 +891,10 @@ int pkram_save_page(struct pkram_stream *ps, struct page *page, short flags)
 
 	BUG_ON((node->flags & PKRAM_ACCMODE_MASK) != PKRAM_SAVE);
 
+	/* if page is banned, relocate it */
+	if (pkram_page_banned(page))
+		return __pkram_save_page_copy(ps, page, flags);
+
 	err = __pkram_save_page(ps, page, flags, page->index);
 	if (!err)
 		err = pkram_add_identity_map(page);
@@ -891,6 +1098,7 @@ static void __pkram_reboot(void)
 	unsigned long pgd_pfn = 0;
 
 	if (pkram_pgd) {
+		pkram_show_banned();
 		list_for_each_entry_reverse(page, &pkram_nodes, lru) {
 			node = page_address(page);
 			if (WARN_ON(node->flags & PKRAM_ACCMODE_MASK))
@@ -957,6 +1165,7 @@ static int __init pkram_init_sb(void)
 		page = __pkram_alloc_page(GFP_KERNEL | __GFP_ZERO, false);
 		if (!page) {
 			pr_err("PKRAM: Failed to allocate super block\n");
+			__banned_pages_shrink(ULONG_MAX);
 			return 0;
 		}
 		pkram_sb = page_address(page);
@@ -979,6 +1188,7 @@ static int __init pkram_init(void)
 {
 	if (pkram_init_sb()) {
 		register_reboot_notifier(&pkram_reboot_notifier);
+		register_shrinker(&banned_pages_shrinker);
 		sysfs_update_group(kernel_kobj, &pkram_attr_group);
 	}
 	return 0;
-- 
2.13.3


  parent reply	other threads:[~2020-05-07  0:46 UTC|newest]

Thread overview: 50+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-05-07  0:41 [RFC 00/43] PKRAM: Preserved-over-Kexec RAM Anthony Yznaga
2020-05-07  0:41 ` [RFC 01/43] mm: add PKRAM API stubs and Kconfig Anthony Yznaga
2020-05-07  0:41 ` [RFC 02/43] mm: PKRAM: implement node load and save functions Anthony Yznaga
2020-05-07  0:41 ` [RFC 03/43] mm: PKRAM: implement object " Anthony Yznaga
2020-05-07  0:41 ` [RFC 04/43] mm: PKRAM: implement page stream operations Anthony Yznaga
2020-05-07  0:41 ` [RFC 05/43] mm: PKRAM: support preserving transparent hugepages Anthony Yznaga
2020-05-07  0:41 ` [RFC 06/43] mm: PKRAM: implement byte stream operations Anthony Yznaga
2020-05-07  0:41 ` [RFC 07/43] mm: PKRAM: link nodes by pfn before reboot Anthony Yznaga
2020-05-07  0:41 ` [RFC 08/43] mm: PKRAM: introduce super block Anthony Yznaga
2020-05-07  0:41 ` [RFC 09/43] PKRAM: build a physical mapping pagetable of pages to be preserved Anthony Yznaga
2020-05-07  0:41 ` [RFC 10/43] PKRAM: add code for walking the preserved pages pagetable Anthony Yznaga
2020-05-07  0:41 ` [RFC 11/43] PKRAM: pass the preserved pages pagetable to the next kernel Anthony Yznaga
2020-05-07  0:41 ` [RFC 12/43] mm: PKRAM: reserve preserved memory at boot Anthony Yznaga
2020-05-07  0:41 ` [RFC 13/43] mm: PKRAM: free preserved pages pagetable Anthony Yznaga
2020-05-07  0:41 ` [RFC 14/43] mm: memblock: PKRAM: prevent memblock resize from clobbering preserved pages Anthony Yznaga
2020-05-11 13:57   ` Mike Rapoport
2020-05-11 23:29     ` Anthony Yznaga
2020-05-07  0:41 ` Anthony Yznaga [this message]
2020-05-07  0:41 ` [RFC 16/43] kexec: PKRAM: prevent kexec clobbering preserved pages in some cases Anthony Yznaga
2020-05-07  0:41 ` [RFC 17/43] PKRAM: provide a way to check if a memory range has preserved pages Anthony Yznaga
2020-05-07  0:41 ` [RFC 18/43] kexec: PKRAM: avoid clobbering already " Anthony Yznaga
2020-05-07  0:41 ` [RFC 19/43] mm: PKRAM: allow preserved memory to be freed from userspace Anthony Yznaga
2020-05-07  0:41 ` [RFC 20/43] PKRAM: disable feature when running the kdump kernel Anthony Yznaga
2020-05-07  0:41 ` [RFC 21/43] x86/KASLR: PKRAM: support physical kaslr Anthony Yznaga
2020-05-07 17:51   ` Kees Cook
2020-05-07 18:41     ` Anthony Yznaga
2020-05-07  0:41 ` [RFC 22/43] mm: shmem: introduce shmem_insert_page Anthony Yznaga
2020-05-07  0:41 ` [RFC 23/43] mm: shmem: enable saving to PKRAM Anthony Yznaga
2020-05-07  0:41 ` [RFC 24/43] mm: shmem: prevent swapping of PKRAM-enabled tmpfs pages Anthony Yznaga
2020-05-07  0:41 ` [RFC 25/43] mm: shmem: specify the mm to use when inserting pages Anthony Yznaga
2020-05-07  0:41 ` [RFC 26/43] mm: shmem: when inserting, handle pages already charged to a memcg Anthony Yznaga
2020-05-07  0:41 ` [RFC 27/43] x86/mm/numa: add numa_isolate_memblocks() Anthony Yznaga
2020-05-07  0:41 ` [RFC 28/43] PKRAM: ensure memblocks with preserved pages init'd for numa Anthony Yznaga
2020-05-07  0:41 ` [RFC 29/43] memblock: PKRAM: mark memblocks that contain preserved pages Anthony Yznaga
2020-05-07  0:41 ` [RFC 30/43] memblock: add for_each_reserved_mem_range() Anthony Yznaga
2020-05-07  0:41 ` [RFC 31/43] memblock, mm: defer initialization of preserved pages Anthony Yznaga
2020-05-07  0:41 ` [RFC 32/43] shmem: PKRAM: preserve shmem files a chunk at a time Anthony Yznaga
2020-05-07  0:41 ` [RFC 33/43] PKRAM: atomically add and remove link pages Anthony Yznaga
2020-05-07  0:42 ` [RFC 34/43] shmem: PKRAM: multithread preserving and restoring shmem pages Anthony Yznaga
2020-05-07 16:30   ` Randy Dunlap
2020-05-07 17:59     ` Anthony Yznaga
2020-05-07  0:42 ` [RFC 35/43] shmem: introduce shmem_insert_pages() Anthony Yznaga
2020-05-07  0:42 ` [RFC 36/43] PKRAM: add support for loading pages in bulk Anthony Yznaga
2020-05-07  0:42 ` [RFC 37/43] shmem: PKRAM: enable bulk loading of preserved pages into shmem Anthony Yznaga
2020-05-07  0:42 ` [RFC 38/43] mm: implement splicing a list of pages to the LRU Anthony Yznaga
2020-05-07  0:42 ` [RFC 39/43] shmem: optimize adding pages to the LRU in shmem_insert_pages() Anthony Yznaga
2020-05-07  0:42 ` [RFC 40/43] shmem: initial support for adding multiple pages to pagecache Anthony Yznaga
2020-05-07  0:42 ` [RFC 41/43] XArray: add xas_export_node() and xas_import_node() Anthony Yznaga
2020-05-07  0:42 ` [RFC 42/43] shmem: reduce time holding xa_lock when inserting pages Anthony Yznaga
2020-05-07  0:42 ` [RFC 43/43] PKRAM: improve index alignment of pkram_link entries Anthony Yznaga

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1588812129-8596-16-git-send-email-anthony.yznaga@oracle.com \
    --to=anthony.yznaga@oracle.com \
    --cc=Thomas.Lendacky@amd.com \
    --cc=akpm@linux-foundation.org \
    --cc=andriy.shevchenko@linux.intel.com \
    --cc=ardb@kernel.org \
    --cc=ashok.raj@intel.com \
    --cc=bhe@redhat.com \
    --cc=bp@alien8.de \
    --cc=corbet@lwn.net \
    --cc=dan.j.williams@intel.com \
    --cc=daniel.kiper@oracle.com \
    --cc=dave.hansen@linux.intel.com \
    --cc=dima@golovin.in \
    --cc=ebiederm@xmission.com \
    --cc=guro@fb.com \
    --cc=gustavo@embeddedor.com \
    --cc=hannes@cmpxchg.org \
    --cc=hpa@zytor.com \
    --cc=hughd@google.com \
    --cc=jason.zeng@intel.com \
    --cc=jroedel@suse.de \
    --cc=keescook@chromium.org \
    --cc=kevin.tian@intel.com \
    --cc=kexec@lists.infradead.org \
    --cc=lei.l.li@intel.com \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=luto@kernel.org \
    --cc=masahiroy@kernel.org \
    --cc=mhocko@kernel.org \
    --cc=minchan@kernel.org \
    --cc=mingo@redhat.com \
    --cc=ndesaulniers@google.com \
    --cc=nivedita@alum.mit.edu \
    --cc=paul.c.lai@intel.com \
    --cc=peterz@infradead.org \
    --cc=rafael.j.wysocki@intel.com \
    --cc=rppt@linux.ibm.com \
    --cc=tglx@linutronix.de \
    --cc=vdavydov.dev@gmail.com \
    --cc=willy@infradead.org \
    --cc=x86@kernel.org \
    --cc=yang.shi@linux.alibaba.com \
    --cc=ying.huang@intel.com \
    --cc=zhenzhong.duan@oracle.com \
    --cc=zhiyuan.lv@intel.com \
    --cc=ziqian.lzq@antfin.com \
    --subject='Re: [RFC 15/43] PKRAM: provide a way to ban pages from use by PKRAM' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).