LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
From: Greg KH <gregkh@suse.de>
To: linux-kernel@vger.kernel.org, stable@kernel.org
Cc: Justin Forbes <jmforbes@linuxtx.org>,
	Zwane Mwaikambo <zwane@arm.linux.org.uk>,
	"Theodore Ts'o" <tytso@mit.edu>,
	Randy Dunlap <rdunlap@xenotime.net>,
	Dave Jones <davej@redhat.com>,
	Chuck Wolber <chuckw@quantumlinux.com>,
	Chris Wedgwood <reviews@ml.cw.f00f.org>,
	Michael Krufky <mkrufky@linuxtv.org>,
	Chuck Ebbert <cebbert@redhat.com>,
	Domenico Andreoli <cavokz@gmail.com>, Willy Tarreau <w@1wt.eu>,
	Rodrigo Rubira Branco <rbranco@la.checkpoint.com>,
	Jake Edge <jake@lwn.net>, Eugene Teo <eteo@redhat.com>,
	torvalds@linux-foundation.org, akpm@linux-foundation.org,
	alan@lxorguk.ukuu.org.uk, Andy Whitcroft <apw@shadowen.org>,
	Jon Tollefson <kniht@linux.vnet.ibm.com>,
	Mel Gorman <mel@csn.ul.ie>, Nick Piggin <nickpiggin@yahoo.com.au>,
	Christoph Lameter <cl@linux-foundation.org>
Subject: [patch 09/49] hugetlbfs: handle pages higher order than MAX_ORDER
Date: Tue, 11 Nov 2008 16:23:09 -0800	[thread overview]
Message-ID: <20081112002309.GJ10989@kroah.com> (raw)
In-Reply-To: <20081112002215.GA10989@kroah.com>

[-- Attachment #1: hugetlbfs-handle-pages-higher-order-than-max_order.patch --]
[-- Type: text/plain, Size: 4570 bytes --]

2.6.27-stable review patch.  If anyone has any objections, please let us know.

------------------

From: Andy Whitcroft <apw@shadowen.org>

commit 69d177c2fc702d402b17fdca2190d5a7e3ca55c5 upstream

When working with hugepages, hugetlbfs assumes that those hugepages are
smaller than MAX_ORDER.  Specifically it assumes that the mem_map is
contiguous and uses that to optimise access to the elements of the mem_map
that represent the hugepage.  Gigantic pages (such as 16GB pages on
powerpc) by definition are of greater order than MAX_ORDER (larger than
MAX_ORDER_NR_PAGES in size).  This means that we can no longer make use of
the buddy allocator guarantees for the contiguity of the mem_map, which
ensures that the mem_map is at least contiguous for maximally aligned
areas of MAX_ORDER_NR_PAGES pages.

This patch adds new mem_map accessors and iterator helpers which handle
any discontiguity at MAX_ORDER_NR_PAGES boundaries.  It then uses these to
implement gigantic page versions of copy_huge_page and clear_huge_page,
and to allow follow_hugetlb_page to handle gigantic pages.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Cc: Jon Tollefson <kniht@linux.vnet.ibm.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

---
 mm/hugetlb.c  |   37 ++++++++++++++++++++++++++++++++++++-
 mm/internal.h |   28 ++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+), 1 deletion(-)

--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -353,11 +353,26 @@ static int vma_has_reserves(struct vm_ar
 	return 0;
 }
 
+static void clear_gigantic_page(struct page *page,
+			unsigned long addr, unsigned long sz)
+{
+	int i;
+	struct page *p = page;
+
+	might_sleep();
+	for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) {
+		cond_resched();
+		clear_user_highpage(p, addr + i * PAGE_SIZE);
+	}
+}
 static void clear_huge_page(struct page *page,
 			unsigned long addr, unsigned long sz)
 {
 	int i;
 
+	if (unlikely(sz > MAX_ORDER_NR_PAGES))
+		return clear_gigantic_page(page, addr, sz);
+
 	might_sleep();
 	for (i = 0; i < sz/PAGE_SIZE; i++) {
 		cond_resched();
@@ -365,12 +380,32 @@ static void clear_huge_page(struct page 
 	}
 }
 
+static void copy_gigantic_page(struct page *dst, struct page *src,
+			   unsigned long addr, struct vm_area_struct *vma)
+{
+	int i;
+	struct hstate *h = hstate_vma(vma);
+	struct page *dst_base = dst;
+	struct page *src_base = src;
+	might_sleep();
+	for (i = 0; i < pages_per_huge_page(h); ) {
+		cond_resched();
+		copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
+
+		i++;
+		dst = mem_map_next(dst, dst_base, i);
+		src = mem_map_next(src, src_base, i);
+	}
+}
 static void copy_huge_page(struct page *dst, struct page *src,
 			   unsigned long addr, struct vm_area_struct *vma)
 {
 	int i;
 	struct hstate *h = hstate_vma(vma);
 
+	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES))
+		return copy_gigantic_page(dst, src, addr, vma);
+
 	might_sleep();
 	for (i = 0; i < pages_per_huge_page(h); i++) {
 		cond_resched();
@@ -2113,7 +2148,7 @@ int follow_hugetlb_page(struct mm_struct
 same_page:
 		if (pages) {
 			get_page(page);
-			pages[i] = page + pfn_offset;
+			pages[i] = mem_map_offset(page, pfn_offset);
 		}
 
 		if (vmas)
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -54,6 +54,34 @@ static inline unsigned long page_order(s
 }
 
 /*
+ * Return the mem_map entry representing the 'offset' subpage within
+ * the maximally aligned gigantic page 'base'.  Handle any discontiguity
+ * in the mem_map at MAX_ORDER_NR_PAGES boundaries.
+ */
+static inline struct page *mem_map_offset(struct page *base, int offset)
+{
+	if (unlikely(offset >= MAX_ORDER_NR_PAGES))
+		return pfn_to_page(page_to_pfn(base) + offset);
+	return base + offset;
+}
+
+/*
+ * Iterator over all subpages within the maximally aligned gigantic
+ * page 'base'.  Handle any discontiguity in the mem_map.
+ */
+static inline struct page *mem_map_next(struct page *iter,
+						struct page *base, int offset)
+{
+	if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0)) {
+		unsigned long pfn = page_to_pfn(base) + offset;
+		if (!pfn_valid(pfn))
+			return NULL;
+		return pfn_to_page(pfn);
+	}
+	return iter + 1;
+}
+
+/*
  * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
  * so all functions starting at paging_init should be marked __init
  * in those cases. SPARSEMEM, however, allows for memory hotplug,

-- 

  parent reply	other threads:[~2008-11-12  0:29 UTC|newest]

Thread overview: 56+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <20081112001401.926965113@mini.kroah.org>
2008-11-12  0:22 ` [patch 00/49] 2.6.27.5 stable review Greg KH
2008-11-12  0:22   ` [patch 01/49] ext3: wait on all pending commits in ext3_sync_fs Greg KH
2008-11-12  0:22   ` [patch 02/49] x86: add DMI quirk for AMI BIOS which corrupts address 0xc000 during resume Greg KH
2008-11-12  0:22   ` [patch 03/49] x86: reserve low 64K on AMI and Phoenix BIOS boxen Greg KH
2008-11-12  0:22   ` [patch 04/49] x86: add X86_RESERVE_LOW_64K Greg KH
2008-11-12  0:23   ` [patch 05/49] x86: fix CONFIG_X86_RESERVE_LOW_64K=y Greg KH
2008-11-12  0:23   ` [patch 06/49] x86: fix macro with bad_bios_dmi_table Greg KH
2008-11-12  0:23   ` [patch 07/49] cgroups: fix invalid cgrp->dentry before cgroup has been completely removed Greg KH
2008-11-12  0:23   ` [patch 08/49] hugetlb: pull gigantic page initialisation out of the default path Greg KH
2008-11-12  0:23   ` Greg KH [this message]
2008-11-12  0:23   ` [patch 10/49] cciss: fix regression firmware not displayed in procfs Greg KH
2008-11-12  0:23   ` [patch 11/49] cciss: fix sysfs broken symlink regression Greg KH
2008-11-12  0:23   ` [patch 12/49] cciss: new hardware support Greg KH
2008-11-12  0:23   ` [patch 13/49] md: linear: Fix a division by zero bug for very small arrays Greg KH
2008-11-12  0:23   ` [patch 14/49] md: fix bug in raid10 recovery Greg KH
2008-11-12  0:23   ` [patch 15/49] JFFS2: fix race condition in jffs2_lzo_compress() Greg KH
2008-11-12  0:23   ` [patch 16/49] JFFS2: Fix lack of locking in thread_should_wake() Greg KH
2008-11-12  0:23   ` [patch 17/49] ARM: xsc3: fix xsc3_l2_inv_range Greg KH
2008-11-12  0:23   ` [patch 18/49] MTD: Fix cfi_send_gen_cmd handling of x16 devices in x8 mode (v4) Greg KH
2008-11-12  0:23   ` [patch 19/49] x86: dont use tsc_khz to calculate lpj if notsc is passed Greg KH
2008-11-12  0:23   ` [patch 20/49] net: unix: fix inflight counting bug in garbage collector Greg KH
2008-11-12  0:23   ` [patch 21/49] r8169: get ethtool settings through the generic mii helper Greg KH
2008-11-12  0:23   ` [patch 22/49] r8169: fix RxMissed register access Greg KH
2008-11-12  0:23   ` [patch 23/49] r8169: wake up the PHY of the 8168 Greg KH
2008-11-12  0:23   ` [patch 24/49] I/OAT: fix channel resources free for not allocated channels Greg KH
2008-11-12  0:23   ` [patch 25/49] I/OAT: fix dma_pin_iovec_pages() error handling Greg KH
2008-11-12  0:23   ` [patch 26/49] I/OAT: fix async_tx.callback checking Greg KH
2008-11-12  0:23   ` [patch 27/49] dca: fixup initialization dependency Greg KH
2008-11-12  0:23   ` [patch 28/49] iwlwifi: allow consecutive scans in unassociated state Greg KH
2008-11-12  0:23   ` [patch 29/49] iwlwifi: allow association on radar channel in power save Greg KH
2008-11-12  0:23   ` [patch 30/49] iwlwifi: remove HT flags from RXON when not in HT anymore Greg KH
2008-11-12  0:23   ` [patch 31/49] iwlwifi: dont fail if scan is issued too early Greg KH
2008-11-12  0:24   ` [patch 32/49] iwlwifi: use correct DMA_MASK Greg KH
2008-11-12  0:24   ` [patch 33/49] iwlwifi: fix suspend to RAM in iwlwifi Greg KH
2008-11-12  0:24   ` [patch 34/49] iwlwifi: generic init calibrations framework Greg KH
2008-11-12  0:24   ` [patch 35/49] zd1211rw: Add 2 device IDs Greg KH
2008-11-12  0:24   ` [patch 36/49] iwl3945: fix deadlock on suspend Greg KH
2008-11-12  0:24   ` [patch 37/49] iwl3945: do not send scan command if channel count zero Greg KH
2008-11-12  0:24   ` [patch 38/49] cpqarry: fix return value of cpqarray_init() Greg KH
2008-11-12  0:24   ` [patch 39/49] ACPI: dock: avoid check _STA method Greg KH
2008-11-12  0:24   ` [patch 40/49] ARM: 5300/1: fixup spitz reset during boot Greg KH
2008-11-12  0:24   ` [patch 41/49] KEYS: Make request key instantiate the per-user keyrings Greg KH
2008-11-12  0:24   ` [patch 42/49] libata: fix last_reset timestamp handling Greg KH
2008-11-12  0:24   ` [patch 43/49] ALSA: hda: make a STAC_DELL_EQ option Greg KH
2008-11-12  0:24   ` [patch 44/49] Fix __pfn_to_page(pfn) for CONFIG_DISCONTIGMEM=y Greg KH
2008-11-12  0:24   ` [patch 45/49] mmc: increase SD write timeout for crappy cards Greg KH
2008-11-12  0:24   ` [patch 46/49] hfsplus: fix Buffer overflow with a corrupted image (CVE-2008-4933) Greg KH
2008-11-12  0:24   ` [patch 47/49] hfsplus: check read_mapping_page() return value (CVE-2008-4934) Greg KH
2008-11-12  0:24   ` [patch 48/49] hfs: fix namelength memory corruption (CVE-2008-5025) Greg KH
2008-11-12  0:24   ` [patch 49/49] HID: fix incorrent length condition in hidraw_write() Greg KH
2008-11-12  0:44   ` [patch 00/49] 2.6.27.5 stable review Gabriel C
2008-11-12  1:07     ` Greg KH
2008-11-12  0:54   ` Willy Tarreau
2008-11-12 14:08   ` Frans Pop
2008-11-12 17:03     ` [stable] " Greg KH
2008-11-13 22:07     ` Greg KH

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20081112002309.GJ10989@kroah.com \
    --to=gregkh@suse.de \
    --cc=akpm@linux-foundation.org \
    --cc=alan@lxorguk.ukuu.org.uk \
    --cc=apw@shadowen.org \
    --cc=cavokz@gmail.com \
    --cc=cebbert@redhat.com \
    --cc=chuckw@quantumlinux.com \
    --cc=cl@linux-foundation.org \
    --cc=davej@redhat.com \
    --cc=eteo@redhat.com \
    --cc=jake@lwn.net \
    --cc=jmforbes@linuxtx.org \
    --cc=kniht@linux.vnet.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mel@csn.ul.ie \
    --cc=mkrufky@linuxtv.org \
    --cc=nickpiggin@yahoo.com.au \
    --cc=rbranco@la.checkpoint.com \
    --cc=rdunlap@xenotime.net \
    --cc=reviews@ml.cw.f00f.org \
    --cc=stable@kernel.org \
    --cc=torvalds@linux-foundation.org \
    --cc=tytso@mit.edu \
    --cc=w@1wt.eu \
    --cc=zwane@arm.linux.org.uk \
    --subject='Re: [patch 09/49] hugetlbfs: handle pages higher order than MAX_ORDER' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).