LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
From: Tony Luck <tony.luck@intel.com>
To: Borislav Petkov <bp@alien8.de>
Cc: Jue Wang <juew@google.com>, Ding Hui <dinghui@sangfor.com.cn>,
	naoya.horiguchi@nec.com, osalvador@suse.de,
	Youquan Song <youquan.song@intel.com>,
	huangcun@sangfor.com.cn, x86@kernel.org,
	linux-edac@vger.kernel.org, linux-mm@kvack.org,
	linux-kernel@vger.kernel.org, Tony Luck <tony.luck@intel.com>
Subject: [PATCH v2 1/3] x86/mce: Avoid infinite loop for copy from user recovery
Date: Tue, 17 Aug 2021 17:29:40 -0700	[thread overview]
Message-ID: <20210818002942.1607544-2-tony.luck@intel.com> (raw)
In-Reply-To: <20210818002942.1607544-1-tony.luck@intel.com>

Recovery action when get_user() triggers a machine check uses the fixup
path to make get_user() return -EFAULT.  Also queue_task_work() sets up
so that kill_me_maybe() will be called on return to user mode to send
a SIGBUS to the current process.

But there are places in the kernel where the code assumes that this
EFAULT return was simply because of a page fault. The code takes some
action to fix that, and then retries the access. This results in a second
machine check.

While processing this second machine check queue_task_work() is called
again. But since this uses the same callback_head structure that was used
in the first call, the net result is an entry on the current->task_works
list that points to itself. When task_work_run() is called it loops
forever in this code:

        do {
                next = work->next;
                work->func(work);
                work = next;
                cond_resched();
        } while (work);

Add a counter (current->mce_count) to keep track of repeated machine
checks before task_work() is called. The first machine check saves the
address information and calls task_work_add(). Subsequent machine checks
that arrive before that task_work callback is executed verify that the
address is in the same page as the first machine check (since the
callback will offline exactly one page).

Expected worst case is two machine checks before moving on (e.g. one user
access with page faults disabled, then a repeat to the same address with
page faults enabled). Just in case there is some code that loops forever,
enforce a limit of 10.

Cc: <stable@vger.kernel.org>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/kernel/cpu/mce/core.c | 43 +++++++++++++++++++++++++---------
 include/linux/sched.h          |  1 +
 2 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 22791aadc085..94830ee9581c 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1250,6 +1250,9 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin
 
 static void kill_me_now(struct callback_head *ch)
 {
+	struct task_struct *p = container_of(ch, struct task_struct, mce_kill_me);
+
+	p->mce_count = 0;
 	force_sig(SIGBUS);
 }
 
@@ -1259,6 +1262,7 @@ static void kill_me_maybe(struct callback_head *cb)
 	int flags = MF_ACTION_REQUIRED;
 	int ret;
 
+	p->mce_count = 0;
 	pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
 
 	if (!p->mce_ripv)
@@ -1287,17 +1291,34 @@ static void kill_me_maybe(struct callback_head *cb)
 	}
 }
 
-static void queue_task_work(struct mce *m, int kill_current_task)
+static void queue_task_work(struct mce *m, char *msg, int kill_current_task)
 {
-	current->mce_addr = m->addr;
-	current->mce_kflags = m->kflags;
-	current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
-	current->mce_whole_page = whole_page(m);
+	int count = ++current->mce_count;
 
-	if (kill_current_task)
-		current->mce_kill_me.func = kill_me_now;
-	else
-		current->mce_kill_me.func = kill_me_maybe;
+	/* First call, save all the details */
+	if (count == 1) {
+		current->mce_addr = m->addr;
+		current->mce_kflags = m->kflags;
+		current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
+		current->mce_whole_page = whole_page(m);
+
+		if (kill_current_task)
+			current->mce_kill_me.func = kill_me_now;
+		else
+			current->mce_kill_me.func = kill_me_maybe;
+	}
+
+	/* Ten is likely overkill. Don't expect more than two faults before task_work() */
+	if (count > 10)
+		mce_panic("Too many machine checks while accessing user data", m, msg);
+
+	/* Second or later call, make sure page address matches the one from first call */
+	if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT))
+		mce_panic("Machine checks to different user pages", m, msg);
+
+	/* Do not call task_work_add() more than once */
+	if (count > 1)
+		return;
 
 	task_work_add(current, &current->mce_kill_me, TWA_RESUME);
 }
@@ -1435,7 +1456,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
 		/* If this triggers there is no way to recover. Die hard. */
 		BUG_ON(!on_thread_stack() || !user_mode(regs));
 
-		queue_task_work(&m, kill_current_task);
+		queue_task_work(&m, msg, kill_current_task);
 
 	} else {
 		/*
@@ -1453,7 +1474,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
 		}
 
 		if (m.kflags & MCE_IN_KERNEL_COPYIN)
-			queue_task_work(&m, kill_current_task);
+			queue_task_work(&m, msg, kill_current_task);
 	}
 out:
 	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ec8d07d88641..f6935787e7e8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1394,6 +1394,7 @@ struct task_struct {
 					mce_whole_page : 1,
 					__mce_reserved : 62;
 	struct callback_head		mce_kill_me;
+	int				mce_count;
 #endif
 
 #ifdef CONFIG_KRETPROBES
-- 
2.29.2


  reply	other threads:[~2021-08-18  0:30 UTC|newest]

Thread overview: 47+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-07-06 19:06 [PATCH 0/3] More machine check recovery fixes Tony Luck
2021-07-06 19:06 ` [PATCH 1/3] x86/mce: Change to not send SIGBUS error during copy from user Tony Luck
2021-07-06 19:06 ` [PATCH 2/3] x86/mce: Avoid infinite loop for copy from user recovery Tony Luck
2021-07-06 19:06 ` [PATCH 3/3] x86/mce: Drop copyin special case for #MC Tony Luck
2021-08-18  0:29 ` [PATCH v2 0/3] More machine check recovery fixes Tony Luck
2021-08-18  0:29   ` Tony Luck [this message]
2021-08-20 17:31     ` [PATCH v2 1/3] x86/mce: Avoid infinite loop for copy from user recovery Borislav Petkov
2021-08-20 18:59       ` Luck, Tony
2021-08-20 19:27         ` Borislav Petkov
2021-08-20 20:23           ` Luck, Tony
2021-08-21  4:51             ` Tony Luck
2021-08-21 21:51               ` Al Viro
2021-08-22 14:36             ` Borislav Petkov
2021-08-20 20:33           ` Luck, Tony
2021-08-22 14:46             ` Borislav Petkov
2021-08-23 15:24               ` Luck, Tony
2021-09-13  9:24     ` Borislav Petkov
2021-09-13 21:52       ` [PATCH v3] " Luck, Tony
2021-09-14  8:28         ` Borislav Petkov
2021-08-18  0:29   ` [PATCH v2 2/3] x86/mce: Change to not send SIGBUS error during copy from user Tony Luck
2021-09-21  7:52     ` [tip: ras/core] " tip-bot2 for Tony Luck
2021-08-18  0:29   ` [PATCH v2 3/3] x86/mce: Drop copyin special case for #MC Tony Luck
2021-09-20  9:13     ` Borislav Petkov
2021-09-20 16:18       ` Luck, Tony
2021-09-20 16:37         ` Borislav Petkov
2021-09-20 16:43           ` Luck, Tony
2021-09-21  7:52     ` [tip: ras/core] " tip-bot2 for Tony Luck
2021-08-18 16:14   ` [PATCH v2 0/3] More machine check recovery fixes Luck, Tony
  -- strict thread matches above, loose matches on Subject: below --
2021-01-08 22:22 [PATCH 0/2] Fix infinite machine check loop in futex_wait_setup() Tony Luck
2021-01-11 21:44 ` [PATCH v2 0/3] " Tony Luck
2021-01-11 21:44   ` [PATCH v2 1/3] x86/mce: Avoid infinite loop for copy from user recovery Tony Luck
2021-01-11 22:11     ` Andy Lutomirski
2021-01-11 22:20       ` Luck, Tony
2021-01-12 17:00         ` Andy Lutomirski
2021-01-12 17:16           ` Luck, Tony
2021-01-12 17:21             ` Andy Lutomirski
2021-01-12 18:23               ` Luck, Tony
2021-01-12 18:57                 ` Andy Lutomirski
2021-01-12 20:52                   ` Luck, Tony
2021-01-12 22:04                     ` Andy Lutomirski
2021-01-13  1:50                       ` Luck, Tony
2021-01-13  4:15                         ` Andy Lutomirski
2021-01-13 10:00                           ` Borislav Petkov
2021-01-13 16:06                             ` Luck, Tony
2021-01-13 16:19                               ` Borislav Petkov
2021-01-13 16:32                                 ` Luck, Tony
2021-01-13 17:35                                   ` Borislav Petkov
2021-01-14 20:22     ` Borislav Petkov
2021-01-14 21:05       ` Luck, Tony

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210818002942.1607544-2-tony.luck@intel.com \
    --to=tony.luck@intel.com \
    --cc=bp@alien8.de \
    --cc=dinghui@sangfor.com.cn \
    --cc=huangcun@sangfor.com.cn \
    --cc=juew@google.com \
    --cc=linux-edac@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=naoya.horiguchi@nec.com \
    --cc=osalvador@suse.de \
    --cc=x86@kernel.org \
    --cc=youquan.song@intel.com \
    --subject='Re: [PATCH v2 1/3] x86/mce: Avoid infinite loop for copy from user recovery' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).