LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH 3/3] Fix the wrong corrupted page information of hugetlb page.
@ 2011-01-21  1:24 Jin Dongming
  2011-01-21  5:12 ` Naoya Horiguchi
  0 siblings, 1 reply; 5+ messages in thread
From: Jin Dongming @ 2011-01-21  1:24 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Naoya Horiguchi, Huang Ying, Hidetoshi Seto, LKLM

When more than on tail page of the same mapped hugetlb page are
poisoned at the same time, the sum of corrupted page should be
the size of one hugetlb page(512 pages). But HardwareCorrupted
information in /proc/meminfo is 1024 pages or more.

That is because when the above condition happened,
__memory_failure() will be called many times and pages in hugetlb
page are accounted more than one time.

This patch fixed it by incrementing the number of corrupted page
one by one.

Signed-off-by: Jin Dongming <jin.dongming@np.css.fujitsu.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
---
 mm/memory-failure.c |   78 +++++++++++++++++++++++++++++++-------------------
 1 files changed, 48 insertions(+), 30 deletions(-)

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 824850a..56d1dfe 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -931,16 +931,22 @@ static void set_page_hwpoison_huge_page(struct page *hpage)
 {
 	int i;
 	int nr_pages = 1 << compound_trans_order(hpage);
-	for (i = 0; i < nr_pages; i++)
-		SetPageHWPoison(hpage + i);
+	for (i = 0; i < nr_pages; i++) {
+		if (TestSetPageHWPoison(hpage + i))
+			continue;
+
+		atomic_long_inc(&mce_bad_pages);
+	}
 }
 
 static void clear_page_hwpoison_huge_page(struct page *hpage)
 {
 	int i;
 	int nr_pages = 1 << compound_trans_order(hpage);
-	for (i = 0; i < nr_pages; i++)
-		ClearPageHWPoison(hpage + i);
+	for (i = 0; i < nr_pages; i++) {
+		if (TestClearPageHWPoison(hpage + i))
+			atomic_long_dec(&mce_bad_pages);
+	}
 }
 
 int __memory_failure(unsigned long pfn, int trapno, int flags)
@@ -949,7 +955,6 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	struct page *p;
 	struct page *hpage;
 	int res;
-	unsigned int nr_pages;
 
 	if (!sysctl_memory_failure_recovery)
 		panic("Memory failure from trap %d on page %lx", trapno, pfn);
@@ -968,8 +973,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 		return 0;
 	}
 
-	nr_pages = 1 << compound_trans_order(hpage);
-	atomic_long_add(nr_pages, &mce_bad_pages);
+	atomic_long_inc(&mce_bad_pages);
 
 	/*
 	 * We need/can do nothing about count=0 pages.
@@ -991,18 +995,32 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 			action_result(pfn, "free buddy", DELAYED);
 			return 0;
 		} else if (PageHuge(hpage)) {
-			/*
-			 * Check "just unpoisoned", "filter hit", and
-			 * "race with other subpage."
-			 */
 			lock_page_nosync(hpage);
-			if (!PageHWPoison(p)
-			    || (hwpoison_filter(p) && TestClearPageHWPoison(p))
-			    || (p != hpage && TestSetPageHWPoison(hpage))) {
-				atomic_long_sub(nr_pages, &mce_bad_pages);
+			/* Check "just unpoisoned". */
+			if (!PageHWPoison(hpage)) {
 				unlock_page(hpage);
 				return 0;
 			}
+
+			/* Check "filter hit". */
+			if (hwpoison_filter(p)) {
+				if (TestClearPageHWPoison(p))
+					atomic_long_dec(&mce_bad_pages);
+
+				unlock_page(hpage);
+				return 0;
+			}
+
+			/* Check "race with other subpage". */
+			if (p != hpage) {
+				if (TestSetPageHWPoison(hpage)) {
+					unlock_page(hpage);
+					return 0;
+				}
+
+				atomic_long_inc(&mce_bad_pages);
+			}
+
 			set_page_hwpoison_huge_page(hpage);
 			res = dequeue_hwpoisoned_huge_page(hpage);
 			action_result(pfn, "free huge",
@@ -1055,7 +1073,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	}
 	if (hwpoison_filter(p)) {
 		if (TestClearPageHWPoison(p))
-			atomic_long_sub(nr_pages, &mce_bad_pages);
+			atomic_long_dec(&mce_bad_pages);
 		unlock_page(hpage);
 		put_page(hpage);
 		return 0;
@@ -1065,12 +1083,17 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	 * For error on the tail page, we should set PG_hwpoison
 	 * on the head page to show that the hugepage is hwpoisoned
 	 */
-	if (PageTail(p) && TestSetPageHWPoison(hpage)) {
-		action_result(pfn, "hugepage already hardware poisoned",
-				IGNORED);
-		unlock_page(hpage);
-		put_page(hpage);
-		return 0;
+	if (PageTail(p)) {
+		if (TestSetPageHWPoison(hpage)) {
+			action_result(pfn,
+					"hugepage already hardware poisoned",
+					IGNORED);
+			unlock_page(hpage);
+			put_page(hpage);
+			return 0;
+		}
+
+		atomic_long_inc(&mce_bad_pages);
 	}
 	/*
 	 * Set PG_hwpoison on all pages in an error hugepage,
@@ -1154,7 +1177,6 @@ int unpoison_memory(unsigned long pfn)
 	struct page *page;
 	struct page *p;
 	int freeit = 0;
-	unsigned int nr_pages;
 
 	if (!pfn_valid(pfn))
 		return -ENXIO;
@@ -1167,8 +1189,6 @@ int unpoison_memory(unsigned long pfn)
 		return 0;
 	}
 
-	nr_pages = 1 << compound_trans_order(page);
-
 	if (!get_page_unless_zero(page)) {
 		/*
 		 * Since HWPoisoned hugepage should have non-zero refcount,
@@ -1181,7 +1201,7 @@ int unpoison_memory(unsigned long pfn)
 			return 0;
 		}
 		if (TestClearPageHWPoison(p))
-			atomic_long_sub(nr_pages, &mce_bad_pages);
+			atomic_long_dec(&mce_bad_pages);
 		pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
 		return 0;
 	}
@@ -1195,7 +1215,7 @@ int unpoison_memory(unsigned long pfn)
 	 */
 	if (TestClearPageHWPoison(page)) {
 		pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
-		atomic_long_sub(nr_pages, &mce_bad_pages);
+		atomic_long_dec(&mce_bad_pages);
 		freeit = 1;
 		if (PageHuge(page))
 			clear_page_hwpoison_huge_page(page);
@@ -1304,8 +1324,6 @@ static int soft_offline_huge_page(struct page *page, int flags)
 		return ret;
 	}
 done:
-	if (!PageHWPoison(hpage))
-		atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages);
 	set_page_hwpoison_huge_page(hpage);
 	dequeue_hwpoisoned_huge_page(hpage);
 	/* keep elevated page count for bad page */
@@ -1433,7 +1451,7 @@ int soft_offline_page(struct page *page, int flags)
 		return ret;
 
 done:
-	atomic_long_add(1, &mce_bad_pages);
+	atomic_long_inc(&mce_bad_pages);
 	SetPageHWPoison(page);
 	/* keep elevated page count for bad page */
 	return ret;
-- 
1.7.2.2



^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 3/3] Fix the wrong corrupted page information of hugetlb page.
  2011-01-21  1:24 [PATCH 3/3] Fix the wrong corrupted page information of hugetlb page Jin Dongming
@ 2011-01-21  5:12 ` Naoya Horiguchi
  2011-01-21  6:05   ` Jin Dongming
  2011-01-21  8:17   ` [PATCH 3/3 -v2] " Jin Dongming
  0 siblings, 2 replies; 5+ messages in thread
From: Naoya Horiguchi @ 2011-01-21  5:12 UTC (permalink / raw)
  To: Jin Dongming; +Cc: Andi Kleen, Huang Ying, Hidetoshi Seto, LKLM

Hi,

> @@ -991,18 +995,32 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
>  			action_result(pfn, "free buddy", DELAYED);
>  			return 0;
>  		} else if (PageHuge(hpage)) {
> -			/*
> -			 * Check "just unpoisoned", "filter hit", and
> -			 * "race with other subpage."
> -			 */
>  			lock_page_nosync(hpage);
> -			if (!PageHWPoison(p)
> -			    || (hwpoison_filter(p) && TestClearPageHWPoison(p))
> -			    || (p != hpage && TestSetPageHWPoison(hpage))) {
> -				atomic_long_sub(nr_pages, &mce_bad_pages);
> +			/* Check "just unpoisoned". */
> +			if (!PageHWPoison(hpage)) {
                                          ^^^^^
Isn't this "p" (as you changed in patch 2/3) ?

-- Naoya Horiguchi

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 3/3] Fix the wrong corrupted page information of hugetlb page.
  2011-01-21  5:12 ` Naoya Horiguchi
@ 2011-01-21  6:05   ` Jin Dongming
  2011-01-21  8:17   ` [PATCH 3/3 -v2] " Jin Dongming
  1 sibling, 0 replies; 5+ messages in thread
From: Jin Dongming @ 2011-01-21  6:05 UTC (permalink / raw)
  To: Naoya Horiguchi; +Cc: Andi Kleen, Huang Ying, Hidetoshi Seto, LKLM

Hi, Naoya-san

(2011/01/21 14:12), Naoya Horiguchi wrote:
> Hi,
> 
>> @@ -991,18 +995,32 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
>>  			action_result(pfn, "free buddy", DELAYED);
>>  			return 0;
>>  		} else if (PageHuge(hpage)) {
>> -			/*
>> -			 * Check "just unpoisoned", "filter hit", and
>> -			 * "race with other subpage."
>> -			 */
>>  			lock_page_nosync(hpage);
>> -			if (!PageHWPoison(p)
>> -			    || (hwpoison_filter(p) && TestClearPageHWPoison(p))
>> -			    || (p != hpage && TestSetPageHWPoison(hpage))) {
>> -				atomic_long_sub(nr_pages, &mce_bad_pages);
>> +			/* Check "just unpoisoned". */
>> +			if (!PageHWPoison(hpage)) {
>                                           ^^^^^
> Isn't this "p" (as you changed in patch 2/3) ?
Yes, you are right.
I will send a new one as soon as possible.

Thanks.

Best Regards,
Jin Dongming
> 
> -- Naoya Horiguchi
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 
> 



^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH 3/3 -v2] Fix the wrong corrupted page information of hugetlb page.
  2011-01-21  5:12 ` Naoya Horiguchi
  2011-01-21  6:05   ` Jin Dongming
@ 2011-01-21  8:17   ` Jin Dongming
  2011-01-21  9:56     ` Naoya Horiguchi
  1 sibling, 1 reply; 5+ messages in thread
From: Jin Dongming @ 2011-01-21  8:17 UTC (permalink / raw)
  To: Naoya Horiguchi; +Cc: Andi Kleen, Huang Ying, Hidetoshi Seto, LKLM

When more than on tail page of the same mapped hugetlb page are
poisoned at the same time, the sum of corrupted page should be
the size of one hugetlb page(512 pages). But HardwareCorrupted
information in /proc/meminfo is 1024 pages or more.

That is because when the above condition happened,
__memory_failure() will be called many times and pages in hugetlb
page are accounted more than one time.

This patch fixed it by incrementing the number of corrupted page
one by one.

-v2 fix checking for "just unpoisoned".

Signed-off-by: Jin Dongming <jin.dongming@np.css.fujitsu.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
---
 mm/memory-failure.c |   78 +++++++++++++++++++++++++++++++-------------------
 1 files changed, 48 insertions(+), 30 deletions(-)

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 824850a..9789834 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -931,16 +931,22 @@ static void set_page_hwpoison_huge_page(struct page *hpage)
 {
 	int i;
 	int nr_pages = 1 << compound_trans_order(hpage);
-	for (i = 0; i < nr_pages; i++)
-		SetPageHWPoison(hpage + i);
+	for (i = 0; i < nr_pages; i++) {
+		if (TestSetPageHWPoison(hpage + i))
+			continue;
+
+		atomic_long_inc(&mce_bad_pages);
+	}
 }
 
 static void clear_page_hwpoison_huge_page(struct page *hpage)
 {
 	int i;
 	int nr_pages = 1 << compound_trans_order(hpage);
-	for (i = 0; i < nr_pages; i++)
-		ClearPageHWPoison(hpage + i);
+	for (i = 0; i < nr_pages; i++) {
+		if (TestClearPageHWPoison(hpage + i))
+			atomic_long_dec(&mce_bad_pages);
+	}
 }
 
 int __memory_failure(unsigned long pfn, int trapno, int flags)
@@ -949,7 +955,6 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	struct page *p;
 	struct page *hpage;
 	int res;
-	unsigned int nr_pages;
 
 	if (!sysctl_memory_failure_recovery)
 		panic("Memory failure from trap %d on page %lx", trapno, pfn);
@@ -968,8 +973,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 		return 0;
 	}
 
-	nr_pages = 1 << compound_trans_order(hpage);
-	atomic_long_add(nr_pages, &mce_bad_pages);
+	atomic_long_inc(&mce_bad_pages);
 
 	/*
 	 * We need/can do nothing about count=0 pages.
@@ -991,18 +995,32 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 			action_result(pfn, "free buddy", DELAYED);
 			return 0;
 		} else if (PageHuge(hpage)) {
-			/*
-			 * Check "just unpoisoned", "filter hit", and
-			 * "race with other subpage."
-			 */
 			lock_page_nosync(hpage);
-			if (!PageHWPoison(p)
-			    || (hwpoison_filter(p) && TestClearPageHWPoison(p))
-			    || (p != hpage && TestSetPageHWPoison(hpage))) {
-				atomic_long_sub(nr_pages, &mce_bad_pages);
+			/* Check "just unpoisoned". */
+			if (!PageHWPoison(p)) {
 				unlock_page(hpage);
 				return 0;
 			}
+
+			/* Check "filter hit". */
+			if (hwpoison_filter(p)) {
+				if (TestClearPageHWPoison(p))
+					atomic_long_dec(&mce_bad_pages);
+
+				unlock_page(hpage);
+				return 0;
+			}
+
+			/* Check "race with other subpage". */
+			if (p != hpage) {
+				if (TestSetPageHWPoison(hpage)) {
+					unlock_page(hpage);
+					return 0;
+				}
+
+				atomic_long_inc(&mce_bad_pages);
+			}
+
 			set_page_hwpoison_huge_page(hpage);
 			res = dequeue_hwpoisoned_huge_page(hpage);
 			action_result(pfn, "free huge",
@@ -1055,7 +1073,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	}
 	if (hwpoison_filter(p)) {
 		if (TestClearPageHWPoison(p))
-			atomic_long_sub(nr_pages, &mce_bad_pages);
+			atomic_long_dec(&mce_bad_pages);
 		unlock_page(hpage);
 		put_page(hpage);
 		return 0;
@@ -1065,12 +1083,17 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	 * For error on the tail page, we should set PG_hwpoison
 	 * on the head page to show that the hugepage is hwpoisoned
 	 */
-	if (PageTail(p) && TestSetPageHWPoison(hpage)) {
-		action_result(pfn, "hugepage already hardware poisoned",
-				IGNORED);
-		unlock_page(hpage);
-		put_page(hpage);
-		return 0;
+	if (PageTail(p)) {
+		if (TestSetPageHWPoison(hpage)) {
+			action_result(pfn,
+					"hugepage already hardware poisoned",
+					IGNORED);
+			unlock_page(hpage);
+			put_page(hpage);
+			return 0;
+		}
+
+		atomic_long_inc(&mce_bad_pages);
 	}
 	/*
 	 * Set PG_hwpoison on all pages in an error hugepage,
@@ -1154,7 +1177,6 @@ int unpoison_memory(unsigned long pfn)
 	struct page *page;
 	struct page *p;
 	int freeit = 0;
-	unsigned int nr_pages;
 
 	if (!pfn_valid(pfn))
 		return -ENXIO;
@@ -1167,8 +1189,6 @@ int unpoison_memory(unsigned long pfn)
 		return 0;
 	}
 
-	nr_pages = 1 << compound_trans_order(page);
-
 	if (!get_page_unless_zero(page)) {
 		/*
 		 * Since HWPoisoned hugepage should have non-zero refcount,
@@ -1181,7 +1201,7 @@ int unpoison_memory(unsigned long pfn)
 			return 0;
 		}
 		if (TestClearPageHWPoison(p))
-			atomic_long_sub(nr_pages, &mce_bad_pages);
+			atomic_long_dec(&mce_bad_pages);
 		pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
 		return 0;
 	}
@@ -1195,7 +1215,7 @@ int unpoison_memory(unsigned long pfn)
 	 */
 	if (TestClearPageHWPoison(page)) {
 		pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
-		atomic_long_sub(nr_pages, &mce_bad_pages);
+		atomic_long_dec(&mce_bad_pages);
 		freeit = 1;
 		if (PageHuge(page))
 			clear_page_hwpoison_huge_page(page);
@@ -1304,8 +1324,6 @@ static int soft_offline_huge_page(struct page *page, int flags)
 		return ret;
 	}
 done:
-	if (!PageHWPoison(hpage))
-		atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages);
 	set_page_hwpoison_huge_page(hpage);
 	dequeue_hwpoisoned_huge_page(hpage);
 	/* keep elevated page count for bad page */
@@ -1433,7 +1451,7 @@ int soft_offline_page(struct page *page, int flags)
 		return ret;
 
 done:
-	atomic_long_add(1, &mce_bad_pages);
+	atomic_long_inc(&mce_bad_pages);
 	SetPageHWPoison(page);
 	/* keep elevated page count for bad page */
 	return ret;
-- 
1.7.2.2


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 3/3 -v2] Fix the wrong corrupted page information of hugetlb page.
  2011-01-21  8:17   ` [PATCH 3/3 -v2] " Jin Dongming
@ 2011-01-21  9:56     ` Naoya Horiguchi
  0 siblings, 0 replies; 5+ messages in thread
From: Naoya Horiguchi @ 2011-01-21  9:56 UTC (permalink / raw)
  To: Jin Dongming; +Cc: Andi Kleen, Huang Ying, Hidetoshi Seto, LKLM

On Fri, Jan 21, 2011 at 05:17:36PM +0900, Jin Dongming wrote:
> When more than on tail page of the same mapped hugetlb page are
> poisoned at the same time, the sum of corrupted page should be
> the size of one hugetlb page(512 pages). But HardwareCorrupted
> information in /proc/meminfo is 1024 pages or more.
> 
> That is because when the above condition happened,
> __memory_failure() will be called many times and pages in hugetlb
> page are accounted more than one time.
> 
> This patch fixed it by incrementing the number of corrupted page
> one by one.
> 
> -v2 fix checking for "just unpoisoned".
> 
> Signed-off-by: Jin Dongming <jin.dongming@np.css.fujitsu.com>
> Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>

Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2011-01-21  9:58 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-01-21  1:24 [PATCH 3/3] Fix the wrong corrupted page information of hugetlb page Jin Dongming
2011-01-21  5:12 ` Naoya Horiguchi
2011-01-21  6:05   ` Jin Dongming
2011-01-21  8:17   ` [PATCH 3/3 -v2] " Jin Dongming
2011-01-21  9:56     ` Naoya Horiguchi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).