LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [patch] my mmu notifiers
@ 2008-02-19  8:43 Nick Piggin
  2008-02-19  8:44 ` [patch] my mmu notifier sample driver Nick Piggin
                   ` (2 more replies)
  0 siblings, 3 replies; 120+ messages in thread
From: Nick Piggin @ 2008-02-19  8:43 UTC (permalink / raw)
  To: akpm, Andrea Arcangeli, Robin Holt, Avi Kivity, Izik Eidus,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, steiner, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

Well I started reviewing the mmu notifier code, but it is kind of hard to
know what you're talking about just by reading through code and not trying
your suggestions for yourself...

So I implemented mmu notifiers slightly differently. Andrea's mmu notifiers
are rather similar. However I have tried to make a point of minimising the
impact the the core mm/. I don't see why we need to invalidate or flush
anything when changing the pte to be _more_ permissive, and I don't
understand the need for invalidate_begin/invalidate_end pairs at all.
What I have done is basically create it so that the notifiers get called
basically in the same place as the normal TLB flushing is done, and nowhere
else.

I also wanted to avoid calling notifier code from inside eg. hardware TLB
or pte manipulation primitives. These things are already pretty well
spaghetti, so I'd like to just place them right where needed first... I
think eventually it will need a bit of a rethink to make it more consistent
and more general. But I prefer to do put them in the caller for the moment.

I have also attempted to write a skeleton driver. Not like Christoph's
drivers, but one that actually does something. This one can mmap a
window into its own virtual address space. It's not perfect yet (I need
to replace page_mkwrite with ->fault in the core mm before I can get
enough information to do protection properly I think). However I think it
may be race-free in the fault vs unmap paths. It's pretty complex, I must
say.

---

Index: linux-2.6/include/linux/mm_types.h
===================================================================
--- linux-2.6.orig/include/linux/mm_types.h
+++ linux-2.6/include/linux/mm_types.h
@@ -228,6 +228,9 @@ struct mm_struct {
 #ifdef CONFIG_CGROUP_MEM_CONT
 	struct mem_cgroup *mem_cgroup;
 #endif
+#ifdef CONFIG_MMU_NOTIFIER
+	struct hlist_head mmu_notifier_list;
+#endif
 };
 
 #endif /* _LINUX_MM_TYPES_H */
Index: linux-2.6/include/linux/mmu_notifier.h
===================================================================
--- /dev/null
+++ linux-2.6/include/linux/mmu_notifier.h
@@ -0,0 +1,69 @@
+#ifndef _LINUX_MMU_NOTIFIER_H
+#define _LINUX_MMU_NOTIFIER_H
+
+#include <linux/list.h>
+#include <linux/mm_types.h>
+
+struct mmu_notifier;
+struct mmu_notifier_operations;
+
+#ifdef CONFIG_MMU_NOTIFIER
+
+struct mmu_notifier {
+	struct hlist_node hlist;
+	const struct mmu_notifier_operations *ops;
+	struct mm_struct *mm;
+};
+
+struct mmu_notifier_operations {
+	void (*release)(struct mmu_notifier *mn);
+	int (*clear_young)(struct mmu_notifier *mn, unsigned long address);
+	void (*unmap)(struct mmu_notifier *mn, unsigned long address);
+	void (*invalidate_range)(struct mmu_notifier *mn, unsigned long start, unsigned long end);
+};
+
+static inline void mmu_notifier_init_mm(struct mm_struct *mm)
+{
+	INIT_HLIST_HEAD(&mm->mmu_notifier_list);
+}
+
+static inline void mmu_notifier_init(struct mmu_notifier *mn, const struct mmu_notifier_operations *ops, struct mm_struct *mm)
+{
+	INIT_HLIST_NODE(&mn->hlist);
+	mn->ops = ops;
+	mn->mm = mm;
+}
+
+extern void mmu_notifier_register(struct mmu_notifier *mn);
+extern void mmu_notifier_unregister(struct mmu_notifier *mn);
+
+extern void mmu_notifier_exit_mm(struct mm_struct *mm);
+extern int mmu_notifier_clear_young(struct mm_struct *mm, unsigned long address);
+extern void mmu_notifier_unmap(struct mm_struct *mm, unsigned long address);
+extern void mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end);
+
+#else /* CONFIG_MMU_NOTIFIER */
+
+static inline void mmu_notifier_init_mm(struct mm_struct *mm)
+{
+}
+
+static inline void mmu_notifier_exit_mm(struct mm_struct *mm)
+{
+}
+
+static inline int mmu_notifier_clear_young(struct mm_struct *mm, unsigned long address)
+{
+	return 0;
+}
+
+static inline void mmu_notifier_unmap(struct mm_struct *mm, unsigned long address)
+{
+}
+
+static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end)
+{
+}
+#endif /* CONFIG_MMU_NOTIFIER */
+
+#endif
Index: linux-2.6/kernel/fork.c
===================================================================
--- linux-2.6.orig/kernel/fork.c
+++ linux-2.6/kernel/fork.c
@@ -43,6 +43,7 @@
 #include <linux/memcontrol.h>
 #include <linux/profile.h>
 #include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
 #include <linux/acct.h>
 #include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
@@ -358,6 +359,7 @@ static struct mm_struct * mm_init(struct
 	mm->ioctx_list = NULL;
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
 	mm->cached_hole_size = ~0UL;
+	mmu_notifier_init_mm(mm);
 	mm_init_cgroup(mm, p);
 
 	if (likely(!mm_alloc_pgd(mm))) {
Index: linux-2.6/mm/filemap_xip.c
===================================================================
--- linux-2.6.orig/mm/filemap_xip.c
+++ linux-2.6/mm/filemap_xip.c
@@ -195,6 +195,7 @@ __xip_unmap (struct address_space * mapp
 			/* Nuke the page table entry. */
 			flush_cache_page(vma, address, pte_pfn(*pte));
 			pteval = ptep_clear_flush(vma, address, pte);
+			mmu_notifier_unmap(mm, address);
 			page_remove_rmap(page, vma);
 			dec_mm_counter(mm, file_rss);
 			BUG_ON(pte_dirty(pteval));
Index: linux-2.6/mm/fremap.c
===================================================================
--- linux-2.6.orig/mm/fremap.c
+++ linux-2.6/mm/fremap.c
@@ -15,6 +15,7 @@
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/mmu_context.h>
 #include <asm/cacheflush.h>
@@ -34,6 +35,7 @@ static void zap_pte(struct mm_struct *mm
 		if (page) {
 			if (pte_dirty(pte))
 				set_page_dirty(page);
+			mmu_notifier_unmap(mm, addr);
 			page_remove_rmap(page, vma);
 			page_cache_release(page);
 			update_hiwater_rss(mm);
Index: linux-2.6/mm/hugetlb.c
===================================================================
--- linux-2.6.orig/mm/hugetlb.c
+++ linux-2.6/mm/hugetlb.c
@@ -14,6 +14,7 @@
 #include <linux/mempolicy.h>
 #include <linux/cpuset.h>
 #include <linux/mutex.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -771,10 +772,12 @@ void __unmap_hugepage_range(struct vm_ar
 		page = pte_page(pte);
 		if (pte_dirty(pte))
 			set_page_dirty(page);
+		mmu_notifier_unmap(mm, address);
 		list_add(&page->lru, &page_list);
 	}
 	spin_unlock(&mm->page_table_lock);
 	flush_tlb_range(vma, start, end);
+	mmu_notifier_invalidate_range(mm, start, end);
 	list_for_each_entry_safe(page, tmp, &page_list, lru) {
 		list_del(&page->lru);
 		put_page(page);
@@ -1048,6 +1051,7 @@ void hugetlb_change_protection(struct vm
 			continue;
 		if (!pte_none(*ptep)) {
 			pte = huge_ptep_get_and_clear(mm, address, ptep);
+			mmu_notifier_unmap(mm, address);
 			pte = pte_mkhuge(pte_modify(pte, newprot));
 			set_huge_pte_at(mm, address, ptep, pte);
 		}
@@ -1056,6 +1060,7 @@ void hugetlb_change_protection(struct vm
 	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
 
 	flush_tlb_range(vma, start, end);
+	mmu_notifier_invalidate_range(mm, start, end);
 }
 
 struct file_region {
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c
+++ linux-2.6/mm/memory.c
@@ -51,6 +51,7 @@
 #include <linux/init.h>
 #include <linux/writeback.h>
 #include <linux/memcontrol.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -626,9 +627,10 @@ int copy_page_range(struct mm_struct *ds
 
 static unsigned long zap_pte_range(struct mmu_gather *tlb,
 				struct vm_area_struct *vma, pmd_t *pmd,
-				unsigned long addr, unsigned long end,
+				unsigned long start, unsigned long end,
 				long *zap_work, struct zap_details *details)
 {
+	unsigned long addr = start;
 	struct mm_struct *mm = tlb->mm;
 	pte_t *pte;
 	spinlock_t *ptl;
@@ -670,6 +672,7 @@ static unsigned long zap_pte_range(struc
 			}
 			ptent = ptep_get_and_clear_full(mm, addr, pte,
 							tlb->fullmm);
+			mmu_notifier_unmap(mm, addr);
 			tlb_remove_tlb_entry(tlb, pte, addr);
 			if (unlikely(!page))
 				continue;
@@ -702,6 +705,7 @@ static unsigned long zap_pte_range(struc
 		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
 	} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
 
+	mmu_notifier_invalidate_range(mm, start, end);
 	add_mm_rss(mm, file_rss, anon_rss);
 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);
@@ -981,6 +985,7 @@ no_page_table:
 	}
 	return page;
 }
+EXPORT_SYMBOL(follow_page);
 
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		unsigned long start, int len, int write, int force,
@@ -1676,6 +1681,7 @@ gotten:
 		 * thread doing COW.
 		 */
 		ptep_clear_flush(vma, address, page_table);
+		mmu_notifier_unmap(mm, address);
 		set_pte_at(mm, address, page_table, entry);
 		update_mmu_cache(vma, address, entry);
 		lru_cache_add_active(new_page);
@@ -2200,7 +2206,7 @@ static int __do_fault(struct mm_struct *
 	vmf.flags = flags;
 	vmf.page = NULL;
 
-	BUG_ON(vma->vm_flags & VM_PFNMAP);
+	/* BUG_ON(vma->vm_flags & VM_PFNMAP); */
 
 	if (likely(vma->vm_ops->fault)) {
 		ret = vma->vm_ops->fault(vma, &vmf);
@@ -2498,8 +2504,10 @@ static inline int handle_pte_fault(struc
 		 * This still avoids useless tlb flushes for .text page faults
 		 * with threads.
 		 */
-		if (write_access)
+		if (write_access) {
 			flush_tlb_page(vma, address);
+			mmu_notifier_invalidate_range(mm, address, address+PAGE_SIZE);
+		}
 	}
 unlock:
 	pte_unmap_unlock(pte, ptl);
Index: linux-2.6/mm/mmap.c
===================================================================
--- linux-2.6.orig/mm/mmap.c
+++ linux-2.6/mm/mmap.c
@@ -26,6 +26,7 @@
 #include <linux/mount.h>
 #include <linux/mempolicy.h>
 #include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -2037,6 +2038,7 @@ void exit_mmap(struct mm_struct *mm)
 	unsigned long end;
 
 	/* mm's last user has gone, and its about to be pulled down */
+	mmu_notifier_exit_mm(mm);
 	arch_exit_mmap(mm);
 
 	lru_add_drain();
Index: linux-2.6/mm/mmu_notifier.c
===================================================================
--- /dev/null
+++ linux-2.6/mm/mmu_notifier.c
@@ -0,0 +1,85 @@
+#include <linux/mmu_notifier.h>
+#include <linux/module.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
+
+
+#define __mmu_notifier_for_each(mm, mn, hnode)			\
+	hlist_for_each_entry_rcu(mn, hnode, &(mm)->mmu_notifier_list, hlist)
+
+#define do_mmu_notifier_for_each(mm, mn)			\
+	do {							\
+		if (unlikely(!hlist_empty(&(mm)->mmu_notifier_list))) { \
+			struct hlist_node *__do_for_each_node;	\
+			rcu_read_lock();			\
+			__mmu_notifier_for_each(mm, mn, __do_for_each_node) {
+
+#define while_mmu_notifier_for_each				\
+			}					\
+			rcu_read_unlock();			\
+		}						\
+	} while (0)
+
+
+void mmu_notifier_register(struct mmu_notifier *mn)
+{
+	hlist_add_head_rcu(&mn->hlist, &mn->mm->mmu_notifier_list);
+	synchronize_rcu();
+}
+EXPORT_SYMBOL(mmu_notifier_register);
+
+void mmu_notifier_unregister(struct mmu_notifier *mn)
+{
+	hlist_del_rcu(&mn->hlist);
+	synchronize_rcu();
+}
+
+void mmu_notifier_exit_mm(struct mm_struct *mm)
+{
+	if (unlikely(!hlist_empty(&mm->mmu_notifier_list))) {
+		struct mmu_notifier *mn;
+		struct hlist_node *n, *t;
+
+		hlist_for_each_entry_safe(mn, n, t,
+				&mm->mmu_notifier_list, hlist) {
+			hlist_del_rcu(&mn->hlist);
+			if (mn->ops->release)
+				mn->ops->release(mn);
+		}
+	}
+}
+
+int mmu_notifier_clear_young(struct mm_struct *mm, unsigned long address)
+{
+	struct mmu_notifier *mn;
+	int ret = 0;
+
+	do_mmu_notifier_for_each(mm, mn) {
+		if (mn->ops->clear_young) {
+			if (mn->ops->clear_young(mn, address))
+				ret = 1;
+		}
+	} while_mmu_notifier_for_each;
+
+	return ret;
+}
+
+void mmu_notifier_unmap(struct mm_struct *mm, unsigned long address)
+{
+	struct mmu_notifier *mn;
+
+	do_mmu_notifier_for_each(mm, mn) {
+		if (mn->ops->unmap)
+			mn->ops->unmap(mn, address);
+	} while_mmu_notifier_for_each;
+}
+
+void mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end)
+{
+	struct mmu_notifier *mn;
+
+	do_mmu_notifier_for_each(mm, mn) {
+		if (mn->ops->invalidate_range)
+			mn->ops->invalidate_range(mn, start, end);
+	} while_mmu_notifier_for_each;
+}
Index: linux-2.6/mm/mprotect.c
===================================================================
--- linux-2.6.orig/mm/mprotect.c
+++ linux-2.6/mm/mprotect.c
@@ -21,6 +21,7 @@
 #include <linux/syscalls.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
+#include <linux/mmu_notifier.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
@@ -45,6 +46,8 @@ static void change_pte_range(struct mm_s
 			 * into place.
 			 */
 			ptent = ptep_get_and_clear(mm, addr, pte);
+			mmu_notifier_unmap(mm, addr);
+
 			ptent = pte_modify(ptent, newprot);
 			/*
 			 * Avoid taking write faults for pages we know to be
@@ -125,6 +128,7 @@ static void change_protection(struct vm_
 		change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable);
 	} while (pgd++, addr = next, addr != end);
 	flush_tlb_range(vma, start, end);
+	mmu_notifier_invalidate_range(mm, start, end);
 }
 
 int
Index: linux-2.6/mm/mremap.c
===================================================================
--- linux-2.6.orig/mm/mremap.c
+++ linux-2.6/mm/mremap.c
@@ -18,6 +18,7 @@
 #include <linux/highmem.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -105,6 +106,7 @@ static void move_ptes(struct vm_area_str
 		if (pte_none(*old_pte))
 			continue;
 		pte = ptep_clear_flush(vma, old_addr, old_pte);
+		mmu_notifier_unmap(mm, old_addr);
 		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
 		set_pte_at(mm, new_addr, new_pte, pte);
 	}
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c
+++ linux-2.6/mm/rmap.c
@@ -49,6 +49,7 @@
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/memcontrol.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/tlbflush.h>
 
@@ -287,8 +288,12 @@ static int page_referenced_one(struct pa
 	if (vma->vm_flags & VM_LOCKED) {
 		referenced++;
 		*mapcount = 1;	/* break early from loop */
-	} else if (ptep_clear_flush_young(vma, address, pte))
-		referenced++;
+	} else {
+		if (ptep_clear_flush_young(vma, address, pte))
+			referenced++;
+		if (mmu_notifier_clear_young(mm, address))
+			referenced++;
+	}
 
 	/* Pretend the page is referenced if the task has the
 	   swap token and is in the middle of a page fault. */
@@ -455,6 +460,7 @@ static int page_mkclean_one(struct page 
 
 		flush_cache_page(vma, address, pte_pfn(*pte));
 		entry = ptep_clear_flush(vma, address, pte);
+		mmu_notifier_unmap(mm, address);
 		entry = pte_wrprotect(entry);
 		entry = pte_mkclean(entry);
 		set_pte_at(mm, address, pte, entry);
@@ -711,10 +717,21 @@ static int try_to_unmap_one(struct page 
 	 * If it's recently referenced (perhaps page_referenced
 	 * skipped over this mm) then we should reactivate it.
 	 */
-	if (!migration && ((vma->vm_flags & VM_LOCKED) ||
-			(ptep_clear_flush_young(vma, address, pte)))) {
-		ret = SWAP_FAIL;
-		goto out_unmap;
+	if (!migration) {
+		int referenced;
+
+		if (vma->vm_flags & VM_LOCKED) {
+fail:
+			ret = SWAP_FAIL;
+			goto out_unmap;
+		}
+		referenced = 0;
+		if (ptep_clear_flush_young(vma, address, pte))
+			referenced = 1;
+		if (mmu_notifier_clear_young(mm, address))
+			referenced = 1;
+		if (referenced)
+			goto fail;
 	}
 
 	/* Nuke the page table entry. */
@@ -724,6 +741,7 @@ static int try_to_unmap_one(struct page 
 	/* Move the dirty bit to the physical page now the pte is gone. */
 	if (pte_dirty(pteval))
 		set_page_dirty(page);
+	mmu_notifier_unmap(mm, address);
 
 	/* Update high watermark before we lower rss */
 	update_hiwater_rss(mm);
@@ -839,12 +857,19 @@ static void try_to_unmap_cluster(unsigne
 	update_hiwater_rss(mm);
 
 	for (; address < end; pte++, address += PAGE_SIZE) {
+		int referenced;
+
 		if (!pte_present(*pte))
 			continue;
 		page = vm_normal_page(vma, address, *pte);
 		BUG_ON(!page || PageAnon(page));
 
+		referenced = 0;
 		if (ptep_clear_flush_young(vma, address, pte))
+			referenced = 1;
+		if (mmu_notifier_clear_young(mm, address))
+			referenced = 1;
+		if (referenced)
 			continue;
 
 		/* Nuke the page table entry. */
@@ -858,6 +883,7 @@ static void try_to_unmap_cluster(unsigne
 		/* Move the dirty bit to the physical page now the pte is gone. */
 		if (pte_dirty(pteval))
 			set_page_dirty(page);
+		mmu_notifier_unmap(mm, address);
 
 		page_remove_rmap(page, vma);
 		page_cache_release(page);
Index: linux-2.6/mm/Makefile
===================================================================
--- linux-2.6.orig/mm/Makefile
+++ linux-2.6/mm/Makefile
@@ -33,4 +33,4 @@ obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_CGROUP_MEM_CONT) += memcontrol.o
-
+obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
Index: linux-2.6/mm/Kconfig
===================================================================
--- linux-2.6.orig/mm/Kconfig
+++ linux-2.6/mm/Kconfig
@@ -193,3 +193,7 @@ config NR_QUICK
 config VIRT_TO_BUS
 	def_bool y
 	depends on !ARCH_NO_VIRT_TO_BUS
+
+config MMU_NOTIFIER
+	bool "MMU notifiers"
+	def_bool y

^ permalink raw reply	[flat|nested] 120+ messages in thread

* [patch] my mmu notifier sample driver
  2008-02-19  8:43 [patch] my mmu notifiers Nick Piggin
@ 2008-02-19  8:44 ` Nick Piggin
  2008-02-19 11:59 ` [patch] my mmu notifiers Robin Holt
  2008-02-19 13:58 ` Andrea Arcangeli
  2 siblings, 0 replies; 120+ messages in thread
From: Nick Piggin @ 2008-02-19  8:44 UTC (permalink / raw)
  To: akpm, Andrea Arcangeli, Robin Holt, Avi Kivity, Izik Eidus,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, steiner, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter


Index: linux-2.6/drivers/char/mmu_notifier_skel.c
===================================================================
--- /dev/null
+++ linux-2.6/drivers/char/mmu_notifier_skel.c
@@ -0,0 +1,255 @@
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/mmu_notifier.h>
+#include <linux/radix-tree.h>
+#include <linux/seqlock.h>
+#include <asm/tlbflush.h>
+
+static DEFINE_SPINLOCK(mmn_lock);
+static RADIX_TREE(rmap_tree, GFP_ATOMIC);
+static seqcount_t rmap_seq = SEQCNT_ZERO;
+
+static int __rmap_add(unsigned long mem, unsigned long vaddr)
+{
+	int err;
+
+	err = radix_tree_insert(&rmap_tree, mem >> PAGE_SHIFT, (void *)vaddr);
+
+	return err;
+}
+
+static void __rmap_del(unsigned long mem)
+{
+	void *ret;
+
+	ret = radix_tree_delete(&rmap_tree, mem >> PAGE_SHIFT);
+	BUG_ON(!ret);
+}
+
+static unsigned long rmap_find(unsigned long mem)
+{
+	unsigned long vaddr;
+
+	rcu_read_lock();
+	vaddr = (unsigned long)radix_tree_lookup(&rmap_tree, mem >> PAGE_SHIFT);
+	rcu_read_unlock();
+
+	return vaddr;
+}
+
+static struct page *follow_page_atomic(struct mm_struct *mm, unsigned long address, int write)
+{
+	struct vm_area_struct *vma;
+
+	vma = find_vma(mm, address);
+        if (!vma || (vma->vm_start > address))
+                return NULL;
+
+	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+		return NULL;
+
+	return follow_page(vma, address, FOLL_GET|(write ? FOLL_WRITE : 0));
+}
+
+static int mmn_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long source_vaddr = (unsigned long)vmf->pgoff << PAGE_SHIFT;
+	unsigned long dest_vaddr = (unsigned long)vmf->virtual_address;
+	unsigned long pfn;
+	struct page *page;
+	pgprot_t prot;
+	int write = vmf->flags & FAULT_FLAG_WRITE;
+	int ret;
+
+	printk("mmn_vm_fault %s@vaddr=%lx sourcing from %lx\n", write ? "write" : "read", dest_vaddr, source_vaddr);
+
+	BUG_ON(mm != current->mm); /* disallow get_user_pages */
+
+again:
+	spin_lock(&mmn_lock);
+	write_seqcount_begin(&rmap_seq);
+	page = follow_page_atomic(mm, source_vaddr, write);
+	if (unlikely(!page)) {
+		write_seqcount_end(&rmap_seq);
+		spin_unlock(&mmn_lock);
+		ret = get_user_pages(current, mm, source_vaddr,
+					1, write, 0, &page, NULL);
+		if (ret != 1)
+			goto out_err;
+		put_page(page);
+		goto again;
+	}
+
+	ret = __rmap_add(source_vaddr, dest_vaddr);
+	if (ret)
+		goto out_lock;
+
+	pfn = page_to_pfn(page);
+	prot = vma->vm_page_prot;
+	if (!write)
+		vma->vm_page_prot = vm_get_page_prot(vma->vm_flags & ~(VM_WRITE|VM_MAYWRITE));
+	ret = vm_insert_pfn(vma, dest_vaddr, pfn);
+	vma->vm_page_prot = prot;
+	if (ret) {
+		if (ret == -EBUSY)
+			WARN_ON(1);
+		goto out_rmap;
+	}
+	write_seqcount_end(&rmap_seq);
+	spin_unlock(&mmn_lock);
+	put_page(page);
+
+        return VM_FAULT_NOPAGE;
+
+out_rmap:
+	__rmap_del(source_vaddr);
+out_lock:
+	write_seqcount_end(&rmap_seq);
+	spin_unlock(&mmn_lock);
+	put_page(page);
+out_err:
+	switch (ret) {
+	case -EFAULT:
+	case -EEXIST:
+	case -EBUSY:
+		return VM_FAULT_SIGBUS;
+	case -ENOMEM:
+		return VM_FAULT_OOM;
+	default:
+		BUG();
+	}
+}
+
+struct vm_operations_struct mmn_vm_ops = {
+        .fault = mmn_vm_fault,
+};
+
+static int mmu_notifier_busy;
+static struct mmu_notifier mmu_notifier;
+
+static int mmn_clear_young(struct mmu_notifier *mn, unsigned long address)
+{
+	unsigned long vaddr;
+	unsigned seq;
+	struct mm_struct *mm = mn->mm;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *ptep, pte;
+
+	do {
+		seq = read_seqcount_begin(&rmap_seq);
+		vaddr = rmap_find(address);
+	} while (read_seqcount_retry(&rmap_seq, seq));
+
+	if (vaddr == 0)
+		return 0;
+
+	printk("mmn_clear_young@vaddr=%lx sourced from %lx\n", vaddr, address);
+
+	spin_lock(&mmn_lock);
+        pgd = pgd_offset(mm, vaddr);
+        pud = pud_offset(pgd, vaddr);
+	if (pud) {
+		pmd = pmd_offset(pud, vaddr);
+		if (pmd) {
+			ptep = pte_offset_map(pmd, vaddr);
+			if (ptep) {
+				pte = *ptep;
+				if (!pte_present(pte)) {
+					/* x86 specific, don't have a vma */
+					ptep_get_and_clear(mm, vaddr, ptep);
+					__flush_tlb_one(vaddr);
+				}
+				pte_unmap(ptep);
+			}
+		}
+	}
+	__rmap_del(address);
+	spin_unlock(&mmn_lock);
+
+        return 1;
+}
+
+static void mmn_unmap(struct mmu_notifier *mn, unsigned long address)
+{
+	mmn_clear_young(mn, address);
+}
+
+static void mmn_release(struct mmu_notifier *mn)
+{
+	mmu_notifier_busy = 0;
+}
+
+static struct mmu_notifier_operations mmn_ops = {
+	.clear_young = mmn_clear_young,
+	.unmap = mmn_unmap,
+	.release = mmn_release,
+};
+
+static int mmn_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	int busy;
+
+	if ((vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE)
+		return -EINVAL;
+
+	spin_lock(&mmn_lock);
+	busy = mmu_notifier_busy;
+	if (!busy)
+		mmu_notifier_busy = 1;
+	spin_unlock(&mmn_lock);
+	if (busy)
+		return -EBUSY;
+
+	vma->vm_flags |= VM_PFNMAP;
+	vma->vm_ops = &mmn_vm_ops;
+
+	mmu_notifier_init(&mmu_notifier, &mmn_ops, current->mm);
+	mmu_notifier_register(&mmu_notifier);
+
+	return 0;
+}
+
+static const struct file_operations mmn_fops =
+{
+	.owner		= THIS_MODULE,
+	.llseek		= no_llseek,
+	.mmap		= mmn_mmap,
+};
+
+static struct miscdevice mmn_miscdev =
+{
+	.minor	= MISC_DYNAMIC_MINOR,
+	.name	= "mmn",
+	.fops	= &mmn_fops
+};
+
+static int __init mmn_init(void)
+{
+	if (misc_register(&mmn_miscdev)) {
+		printk(KERN_ERR "mmn: unable to register device\n");
+		return -EIO;
+	}
+	return 0;
+}
+
+static void __exit mmn_exit(void)
+{
+	misc_deregister(&mmn_miscdev);
+}
+
+MODULE_DESCRIPTION("mmu_notifier skeleton driver");
+MODULE_LICENSE("GPL");
+
+module_init(mmn_init);
+module_exit(mmn_exit);
+
Index: linux-2.6/drivers/char/Kconfig
===================================================================
--- linux-2.6.orig/drivers/char/Kconfig
+++ linux-2.6/drivers/char/Kconfig
@@ -4,6 +4,10 @@
 
 menu "Character devices"
 
+config MMU_NOTIFIER_SKEL
+	tristate "MMU Notifier skeleton driver"
+	default n
+
 config VT
 	bool "Virtual terminal" if EMBEDDED
 	depends on !S390
Index: linux-2.6/drivers/char/Makefile
===================================================================
--- linux-2.6.orig/drivers/char/Makefile
+++ linux-2.6/drivers/char/Makefile
@@ -97,6 +97,7 @@ obj-$(CONFIG_CS5535_GPIO)	+= cs5535_gpio
 obj-$(CONFIG_GPIO_VR41XX)	+= vr41xx_giu.o
 obj-$(CONFIG_GPIO_TB0219)	+= tb0219.o
 obj-$(CONFIG_TELCLOCK)		+= tlclk.o
+obj-$(CONFIG_MMU_NOTIFIER_SKEL) += mmu_notifier_skel.o
 
 obj-$(CONFIG_MWAVE)		+= mwave/
 obj-$(CONFIG_AGP)		+= agp/

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [patch] my mmu notifiers
  2008-02-19  8:43 [patch] my mmu notifiers Nick Piggin
  2008-02-19  8:44 ` [patch] my mmu notifier sample driver Nick Piggin
@ 2008-02-19 11:59 ` Robin Holt
  2008-02-19 13:58 ` Andrea Arcangeli
  2 siblings, 0 replies; 120+ messages in thread
From: Robin Holt @ 2008-02-19 11:59 UTC (permalink / raw)
  To: Nick Piggin
  Cc: akpm, Andrea Arcangeli, Robin Holt, Avi Kivity, Izik Eidus,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, steiner, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Tue, Feb 19, 2008 at 09:43:57AM +0100, Nick Piggin wrote:
> So I implemented mmu notifiers slightly differently. Andrea's mmu notifiers
> are rather similar. However I have tried to make a point of minimising the
> impact the the core mm/. I don't see why we need to invalidate or flush
> anything when changing the pte to be _more_ permissive, and I don't
> understand the need for invalidate_begin/invalidate_end pairs at all.
> What I have done is basically create it so that the notifiers get called
> basically in the same place as the normal TLB flushing is done, and nowhere
> else.

Because XPMEM needs to be able to sleep during its callout.  For that,
we need to move this outside of the page table lock and suddenly we
need the begin/end pair again.  There was considerable discussion about
this exact point numerous times.  We tried to develop the most inclusive
design possible.  Our design would even be extendable to IB, assuming they
made some very disruptive changes to their MPI and communication libraries.
IB would suffer the same problems XPMEM does in that the TLB entries
need to be removed on a remote host which is operating completely
independently.

Thanks,
Robin

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [patch] my mmu notifiers
  2008-02-19  8:43 [patch] my mmu notifiers Nick Piggin
  2008-02-19  8:44 ` [patch] my mmu notifier sample driver Nick Piggin
  2008-02-19 11:59 ` [patch] my mmu notifiers Robin Holt
@ 2008-02-19 13:58 ` Andrea Arcangeli
  2008-02-19 14:27   ` Jack Steiner
                     ` (2 more replies)
  2 siblings, 3 replies; 120+ messages in thread
From: Andrea Arcangeli @ 2008-02-19 13:58 UTC (permalink / raw)
  To: Nick Piggin
  Cc: akpm, Robin Holt, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	steiner, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Tue, Feb 19, 2008 at 09:43:57AM +0100, Nick Piggin wrote:
> are rather similar. However I have tried to make a point of minimising the
> impact the the core mm/. I don't see why we need to invalidate or flush

I also tried hard to minimise the impact of the core mm/, I also
argued with Christoph that cluttering mm/ wasn't a good idea for
things like age_page that could be a 1 liner change instead of a
multiple-liner change, without any loss of flexibility or readability.

> anything when changing the pte to be _more_ permissive, and I don't

Note that in my patch the invalidate_pages in mprotect can be
trivially switched to a mprotect_pages with proper params. This will
prevent page faults completely in the secondary MMU (there will only
be tlb misses after the tlb flush just like for the core linux pte),
and it'll allow all the secondary MMU pte blocks (512/1024 at time
with my PT lock design) to be updated to have proper permissions
matching the core linux pte.

> understand the need for invalidate_begin/invalidate_end pairs at all.

The need of the pairs is crystal clear to me: range_begin is needed
for GRU _but_only_if_ range_end is called after releasing the
reference that the VM holds on the page. _begin will flush the GRU tlb
and at the same time it will take a mutex that will block further GRU
tlb-miss-interrupts (no idea how they manange those nightmare locking,
I didn't even try to add more locking to KVM and I get away with the
fact KVM takes the pin on the page itself).

My patch calls invalidate_page/pages before the reference is released
on the page, so GRU will work fine despite lack of
range_begin. Furthermore with my patch GRU will be auto-serialized by
the PT lock w/o the need of any additional locking.

> What I have done is basically create it so that the notifiers get called
> basically in the same place as the normal TLB flushing is done, and nowhere
> else.

That was one of my objectives too.

> I also wanted to avoid calling notifier code from inside eg. hardware TLB
> or pte manipulation primitives. These things are already pretty well
> spaghetti, so I'd like to just place them right where needed first... I
> think eventually it will need a bit of a rethink to make it more consistent
> and more general. But I prefer to do put them in the caller for the moment.

Your patch should also work for KVM but it's suboptimal, my patch can
be orders of magnitude more efficient for GRU thanks to the
invalidate_pages optimization. Christoph complained about having to
call one method per pte.

And adding invalidate_range is useless unless you fully support
xpmem. You're calling invalidate_range in places that can't sleep...

No idea why xpmem needs range_begin, I perfectly understand why GRU
needs _begin with Chrisotph's patch (gru lacks the page pin) but I
dunno why xpmem needs range_begin (xpmem has the page pin so I also
think it could avoid using range_begin). Still to support GRU you need
both to call invalidate_range in places that can sleep and you need
the external rmap notifier. The moment you add xpmem into the equation
your and my clean patches become Christoph's one...

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [patch] my mmu notifiers
  2008-02-19 13:58 ` Andrea Arcangeli
@ 2008-02-19 14:27   ` Jack Steiner
  2008-02-19 23:04     ` Nick Piggin
  2008-02-27 22:50     ` Christoph Lameter
  2008-02-19 22:59   ` Nick Piggin
  2008-02-19 23:11   ` Nick Piggin
  2 siblings, 2 replies; 120+ messages in thread
From: Jack Steiner @ 2008-02-19 14:27 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Nick Piggin, akpm, Robin Holt, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	linux-kernel, linux-mm, daniel.blueman, Christoph Lameter

> On Tue, Feb 19, 2008 at 02:58:51PM +0100, Andrea Arcangeli wrote:
> > understand the need for invalidate_begin/invalidate_end pairs at all.
> 
> The need of the pairs is crystal clear to me: range_begin is needed
> for GRU _but_only_if_ range_end is called after releasing the
> reference that the VM holds on the page. _begin will flush the GRU tlb
> and at the same time it will take a mutex that will block further GRU
> tlb-miss-interrupts (no idea how they manange those nightmare locking,
> I didn't even try to add more locking to KVM and I get away with the
> fact KVM takes the pin on the page itself).

As it turns out, no actual mutex is required. _begin_ simply increments a
count of active range invalidates, _end_ decrements the count. New TLB
dropins are deferred while range callouts are active.

This would appear to be racy but the GRU has special hardware that
simplifies locking. When the GRU sees a TLB invalidate, all outstanding
misses & potentially inflight TLB dropins are marked by the GRU with a
"kill" bit. When the dropin finally occurs, the dropin is ignored & the
instruction is simply restarted. The instruction will fault again & the TLB
dropin will be repeated.  This is optimized for the case where invalidates
are rare - true for users of the GRU.


In general, though, I agree. Most users of mmu_notifiers would likely
required a mutex or something equivalent.


--- jack




^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [patch] my mmu notifiers
  2008-02-19 13:58 ` Andrea Arcangeli
  2008-02-19 14:27   ` Jack Steiner
@ 2008-02-19 22:59   ` Nick Piggin
  2008-02-20  0:46     ` Andrea Arcangeli
  2008-02-27 22:55     ` Christoph Lameter
  2008-02-19 23:11   ` Nick Piggin
  2 siblings, 2 replies; 120+ messages in thread
From: Nick Piggin @ 2008-02-19 22:59 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: akpm, Robin Holt, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	steiner, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Tue, Feb 19, 2008 at 02:58:51PM +0100, Andrea Arcangeli wrote:
> On Tue, Feb 19, 2008 at 09:43:57AM +0100, Nick Piggin wrote:
> > are rather similar. However I have tried to make a point of minimising the
> > impact the the core mm/. I don't see why we need to invalidate or flush
> 
> I also tried hard to minimise the impact of the core mm/, I also
> argued with Christoph that cluttering mm/ wasn't a good idea for
> things like age_page that could be a 1 liner change instead of a
> multiple-liner change, without any loss of flexibility or readability.
> 
> > anything when changing the pte to be _more_ permissive, and I don't
> 
> Note that in my patch the invalidate_pages in mprotect can be
> trivially switched to a mprotect_pages with proper params. This will
> prevent page faults completely in the secondary MMU (there will only
> be tlb misses after the tlb flush just like for the core linux pte),
> and it'll allow all the secondary MMU pte blocks (512/1024 at time
> with my PT lock design) to be updated to have proper permissions
> matching the core linux pte.
> 
> > understand the need for invalidate_begin/invalidate_end pairs at all.
> 
> The need of the pairs is crystal clear to me: range_begin is needed
> for GRU _but_only_if_ range_end is called after releasing the
> reference that the VM holds on the page. _begin will flush the GRU tlb
> and at the same time it will take a mutex that will block further GRU
> tlb-miss-interrupts (no idea how they manange those nightmare locking,
> I didn't even try to add more locking to KVM and I get away with the
> fact KVM takes the pin on the page itself).
> 
> My patch calls invalidate_page/pages before the reference is released
> on the page, so GRU will work fine despite lack of
> range_begin. Furthermore with my patch GRU will be auto-serialized by
> the PT lock w/o the need of any additional locking.

That's why I don't understand the need for the pairs: it should be
done like this.


> > What I have done is basically create it so that the notifiers get called
> > basically in the same place as the normal TLB flushing is done, and nowhere
> > else.
> 
> That was one of my objectives too.
> 
> > I also wanted to avoid calling notifier code from inside eg. hardware TLB
> > or pte manipulation primitives. These things are already pretty well
> > spaghetti, so I'd like to just place them right where needed first... I
> > think eventually it will need a bit of a rethink to make it more consistent
> > and more general. But I prefer to do put them in the caller for the moment.
> 
> Your patch should also work for KVM but it's suboptimal, my patch can
> be orders of magnitude more efficient for GRU thanks to the
> invalidate_pages optimization. Christoph complained about having to
> call one method per pte.

OK, I didn't see the invalidate_pages call...

 
> And adding invalidate_range is useless unless you fully support
> xpmem. You're calling invalidate_range in places that can't sleep...

I thought that could be used by a non-sleeping user (not intending
to try supporting sleeping users). If it is useless then it should
go away (BTW. I didn't see your recent patch, some of my confusion
I think stems from Christoph's novel way of merging and splitting
patches).


> No idea why xpmem needs range_begin, I perfectly understand why GRU
> needs _begin with Chrisotph's patch (gru lacks the page pin) but I
> dunno why xpmem needs range_begin (xpmem has the page pin so I also
> think it could avoid using range_begin). Still to support GRU you need
> both to call invalidate_range in places that can sleep and you need
> the external rmap notifier. The moment you add xpmem into the equation
> your and my clean patches become Christoph's one...

Sorry, I kind of didn't have time to follow the conversation so well
before; are there patches posted for gru and/or xpmem?


^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [patch] my mmu notifiers
  2008-02-19 14:27   ` Jack Steiner
@ 2008-02-19 23:04     ` Nick Piggin
  2008-02-20  0:52       ` Andrea Arcangeli
  2008-02-27 22:50     ` Christoph Lameter
  1 sibling, 1 reply; 120+ messages in thread
From: Nick Piggin @ 2008-02-19 23:04 UTC (permalink / raw)
  To: Jack Steiner
  Cc: Andrea Arcangeli, akpm, Robin Holt, Avi Kivity, Izik Eidus,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Tue, Feb 19, 2008 at 08:27:25AM -0600, Jack Steiner wrote:
> > On Tue, Feb 19, 2008 at 02:58:51PM +0100, Andrea Arcangeli wrote:
> > > understand the need for invalidate_begin/invalidate_end pairs at all.
> > 
> > The need of the pairs is crystal clear to me: range_begin is needed
> > for GRU _but_only_if_ range_end is called after releasing the
> > reference that the VM holds on the page. _begin will flush the GRU tlb
> > and at the same time it will take a mutex that will block further GRU
> > tlb-miss-interrupts (no idea how they manange those nightmare locking,
> > I didn't even try to add more locking to KVM and I get away with the
> > fact KVM takes the pin on the page itself).
> 
> As it turns out, no actual mutex is required. _begin_ simply increments a
> count of active range invalidates, _end_ decrements the count. New TLB
> dropins are deferred while range callouts are active.
> 
> This would appear to be racy but the GRU has special hardware that
> simplifies locking. When the GRU sees a TLB invalidate, all outstanding
> misses & potentially inflight TLB dropins are marked by the GRU with a
> "kill" bit. When the dropin finally occurs, the dropin is ignored & the
> instruction is simply restarted. The instruction will fault again & the TLB
> dropin will be repeated.  This is optimized for the case where invalidates
> are rare - true for users of the GRU.

OK (thanks to Robin as well). Now I understand why you are using it,
but I don't understand why you don't defer new TLBs after the point
where the linux pte changes. If you can do that, then you look and
act much more like a TLB from the point of view of the Linux vm.



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [patch] my mmu notifiers
  2008-02-19 13:58 ` Andrea Arcangeli
  2008-02-19 14:27   ` Jack Steiner
  2008-02-19 22:59   ` Nick Piggin
@ 2008-02-19 23:11   ` Nick Piggin
  2008-02-19 23:40     ` Jack Steiner
                       ` (3 more replies)
  2 siblings, 4 replies; 120+ messages in thread
From: Nick Piggin @ 2008-02-19 23:11 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: akpm, Robin Holt, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	steiner, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Tue, Feb 19, 2008 at 02:58:51PM +0100, Andrea Arcangeli wrote:
> On Tue, Feb 19, 2008 at 09:43:57AM +0100, Nick Piggin wrote:
> > anything when changing the pte to be _more_ permissive, and I don't
> 
> Note that in my patch the invalidate_pages in mprotect can be
> trivially switched to a mprotect_pages with proper params. This will
> prevent page faults completely in the secondary MMU (there will only
> be tlb misses after the tlb flush just like for the core linux pte),
> and it'll allow all the secondary MMU pte blocks (512/1024 at time
> with my PT lock design) to be updated to have proper permissions
> matching the core linux pte.

Sorry, I realise I still didn't get this through my head yet (and also
have not seen your patch recently). So I don't know exactly what you
are doing...

But why does _anybody_ (why does Christoph's patches) need to invalidate
when they are going to be more permissive? This should be done lazily by
the driver, I would have thought.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [patch] my mmu notifiers
  2008-02-19 23:11   ` Nick Piggin
@ 2008-02-19 23:40     ` Jack Steiner
  2008-02-21  4:42       ` Nick Piggin
  2008-02-20  1:09     ` Andrea Arcangeli
                       ` (2 subsequent siblings)
  3 siblings, 1 reply; 120+ messages in thread
From: Jack Steiner @ 2008-02-19 23:40 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrea Arcangeli, akpm, Robin Holt, Avi Kivity, Izik Eidus,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Wed, Feb 20, 2008 at 12:11:57AM +0100, Nick Piggin wrote:
> On Tue, Feb 19, 2008 at 02:58:51PM +0100, Andrea Arcangeli wrote:
> > On Tue, Feb 19, 2008 at 09:43:57AM +0100, Nick Piggin wrote:
> > > anything when changing the pte to be _more_ permissive, and I don't
> > 
> > Note that in my patch the invalidate_pages in mprotect can be
> > trivially switched to a mprotect_pages with proper params. This will
> > prevent page faults completely in the secondary MMU (there will only
> > be tlb misses after the tlb flush just like for the core linux pte),
> > and it'll allow all the secondary MMU pte blocks (512/1024 at time
> > with my PT lock design) to be updated to have proper permissions
> > matching the core linux pte.
> 
> Sorry, I realise I still didn't get this through my head yet (and also
> have not seen your patch recently). So I don't know exactly what you
> are doing...
> 
> But why does _anybody_ (why does Christoph's patches) need to invalidate
> when they are going to be more permissive? This should be done lazily by
> the driver, I would have thought.


Agree. Although for most real applications, the performance difference
is probably negligible.

--- jack

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [patch] my mmu notifiers
  2008-02-19 22:59   ` Nick Piggin
@ 2008-02-20  0:46     ` Andrea Arcangeli
  2008-02-27 22:55     ` Christoph Lameter
  1 sibling, 0 replies; 120+ messages in thread
From: Andrea Arcangeli @ 2008-02-20  0:46 UTC (permalink / raw)
  To: Nick Piggin
  Cc: akpm, Robin Holt, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	steiner, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Tue, Feb 19, 2008 at 11:59:23PM +0100, Nick Piggin wrote:
> That's why I don't understand the need for the pairs: it should be
> done like this.

Yes, except it can't be done like this for xpmem.

> OK, I didn't see the invalidate_pages call...

See the last patch I posted to Andrew, you've probably looked at the
old patches, the old patches didn't work for GRU and didn't work for
xpmem and they weren't optimized to cluster the invalidates for each
4k-large-pte.

> I thought that could be used by a non-sleeping user (not intending
> to try supporting sleeping users). If it is useless then it should
> go away (BTW. I didn't see your recent patch, some of my confusion
> I think stems from Christoph's novel way of merging and splitting
> patches).

I kept improving my patch in case the VM maintainers would consider
xpmem requirements not workable from a linux-VM point of view, and
they preferred to have something obviously safe, strightforward and
non intrusive, despite it doesn't support the only sleeping user out
there I know of (xpmem). My patch supports KVM and GRU (and any other
not sleeping user).

> > No idea why xpmem needs range_begin, I perfectly understand why GRU
> > needs _begin with Chrisotph's patch (gru lacks the page pin) but I
> > dunno why xpmem needs range_begin (xpmem has the page pin so I also
> > think it could avoid using range_begin). Still to support GRU you need
> > both to call invalidate_range in places that can sleep and you need
> > the external rmap notifier. The moment you add xpmem into the equation
> > your and my clean patches become Christoph's one...
> 
> Sorry, I kind of didn't have time to follow the conversation so well
> before; are there patches posted for gru and/or xpmem?

There's some xpmem code posted but the posted one isn't using the mmu
notifiers yet. GRU code may be available from Jack. I only know for
sure their requirements in terms of mmu notifiers.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [patch] my mmu notifiers
  2008-02-19 23:04     ` Nick Piggin
@ 2008-02-20  0:52       ` Andrea Arcangeli
  2008-02-20  2:46         ` Robin Holt
  0 siblings, 1 reply; 120+ messages in thread
From: Andrea Arcangeli @ 2008-02-20  0:52 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Jack Steiner, akpm, Robin Holt, Avi Kivity, Izik Eidus,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Wed, Feb 20, 2008 at 12:04:27AM +0100, Nick Piggin wrote:
> On Tue, Feb 19, 2008 at 08:27:25AM -0600, Jack Steiner wrote:
> > > On Tue, Feb 19, 2008 at 02:58:51PM +0100, Andrea Arcangeli wrote:
> > > > understand the need for invalidate_begin/invalidate_end pairs at all.
> > > 
> > > The need of the pairs is crystal clear to me: range_begin is needed
> > > for GRU _but_only_if_ range_end is called after releasing the
> > > reference that the VM holds on the page. _begin will flush the GRU tlb
> > > and at the same time it will take a mutex that will block further GRU
> > > tlb-miss-interrupts (no idea how they manange those nightmare locking,
> > > I didn't even try to add more locking to KVM and I get away with the
> > > fact KVM takes the pin on the page itself).
> > 
> > As it turns out, no actual mutex is required. _begin_ simply increments a
> > count of active range invalidates, _end_ decrements the count. New TLB
> > dropins are deferred while range callouts are active.
> > 
> > This would appear to be racy but the GRU has special hardware that
> > simplifies locking. When the GRU sees a TLB invalidate, all outstanding
> > misses & potentially inflight TLB dropins are marked by the GRU with a
> > "kill" bit. When the dropin finally occurs, the dropin is ignored & the
> > instruction is simply restarted. The instruction will fault again & the TLB
> > dropin will be repeated.  This is optimized for the case where invalidates
> > are rare - true for users of the GRU.
> 
> OK (thanks to Robin as well). Now I understand why you are using it,
> but I don't understand why you don't defer new TLBs after the point
> where the linux pte changes. If you can do that, then you look and
> act much more like a TLB from the point of view of the Linux vm.

Christoph was forced to put the invalidate_range callback _after_
dropping the PT lock because xpmem has to wait I/O there. But
invalidate_range is called after freeing the VM reference on the pages
so then GRU needed a _range_begin too because GRU has to flush the tlb
before the VM reference on the page is released (xpmem and KVM pin the
pages mapped by the secondary mmu, GRU doesn't). So then
invalidate_range was renamed to invalidate_range_end.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [patch] my mmu notifiers
  2008-02-19 23:11   ` Nick Piggin
  2008-02-19 23:40     ` Jack Steiner
@ 2008-02-20  1:09     ` Andrea Arcangeli
  2008-02-20 10:39       ` [PATCH] mmu notifiers #v6 Andrea Arcangeli
  2008-02-21  4:47       ` [patch] my mmu notifiers Nick Piggin
  2008-02-20  2:49     ` Robin Holt
  2008-02-27 22:56     ` Christoph Lameter
  3 siblings, 2 replies; 120+ messages in thread
From: Andrea Arcangeli @ 2008-02-20  1:09 UTC (permalink / raw)
  To: Nick Piggin
  Cc: akpm, Robin Holt, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	steiner, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Wed, Feb 20, 2008 at 12:11:57AM +0100, Nick Piggin wrote:
> Sorry, I realise I still didn't get this through my head yet (and also
> have not seen your patch recently). So I don't know exactly what you
> are doing...

The last version was posted here:

http://marc.info/?l=kvm-devel&m=120321732521533&w=2

> But why does _anybody_ (why does Christoph's patches) need to invalidate
> when they are going to be more permissive? This should be done lazily by
> the driver, I would have thought.

This can be done lazily by the driver yes. The place where I've an
invalidate_pages in mprotect however can also become less permissive.
It's simpler to invalidate always and it's not guaranteed the
secondary mmu page fault is capable of refreshing the spte across a
writeprotect fault. In the future this can be changed to
mprotect_pages though, so no page fault will happen in the secondary
mmu.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [patch] my mmu notifiers
  2008-02-20  0:52       ` Andrea Arcangeli
@ 2008-02-20  2:46         ` Robin Holt
  0 siblings, 0 replies; 120+ messages in thread
From: Robin Holt @ 2008-02-20  2:46 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Nick Piggin, Jack Steiner, akpm, Robin Holt, Avi Kivity,
	Izik Eidus, kvm-devel, Peter Zijlstra, general, Steve Wise,
	Roland Dreier, Kanoj Sarcar, linux-kernel, linux-mm,
	daniel.blueman, Christoph Lameter

On Wed, Feb 20, 2008 at 01:52:06AM +0100, Andrea Arcangeli wrote:
> On Wed, Feb 20, 2008 at 12:04:27AM +0100, Nick Piggin wrote:
> > OK (thanks to Robin as well). Now I understand why you are using it,
> > but I don't understand why you don't defer new TLBs after the point
> > where the linux pte changes. If you can do that, then you look and
> > act much more like a TLB from the point of view of the Linux vm.
> 
> Christoph was forced to put the invalidate_range callback _after_
> dropping the PT lock because xpmem has to wait I/O there. But
> invalidate_range is called after freeing the VM reference on the pages
> so then GRU needed a _range_begin too because GRU has to flush the tlb
> before the VM reference on the page is released (xpmem and KVM pin the
> pages mapped by the secondary mmu, GRU doesn't). So then
> invalidate_range was renamed to invalidate_range_end.

Currently, xpmem blocks faults for the range specified at the _begin
callout, then shoots down remote TLBs and does the put_page for all the
pages in the specified range.  The _end callout merely removes the block.
We do not do any wait for I/O.  By the time we return from the _begin
callout, all activity by the remotes is stopped, pages are dereferenced,
and future faults are blocked until released by the _end callout.

Thanks,
Robin

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [patch] my mmu notifiers
  2008-02-19 23:11   ` Nick Piggin
  2008-02-19 23:40     ` Jack Steiner
  2008-02-20  1:09     ` Andrea Arcangeli
@ 2008-02-20  2:49     ` Robin Holt
  2008-02-27 22:56     ` Christoph Lameter
  3 siblings, 0 replies; 120+ messages in thread
From: Robin Holt @ 2008-02-20  2:49 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrea Arcangeli, akpm, Robin Holt, Avi Kivity, Izik Eidus,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, steiner, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Wed, Feb 20, 2008 at 12:11:57AM +0100, Nick Piggin wrote:
> On Tue, Feb 19, 2008 at 02:58:51PM +0100, Andrea Arcangeli wrote:
> > On Tue, Feb 19, 2008 at 09:43:57AM +0100, Nick Piggin wrote:
> > > anything when changing the pte to be _more_ permissive, and I don't
> > 
> > Note that in my patch the invalidate_pages in mprotect can be
> > trivially switched to a mprotect_pages with proper params. This will
> > prevent page faults completely in the secondary MMU (there will only
> > be tlb misses after the tlb flush just like for the core linux pte),
> > and it'll allow all the secondary MMU pte blocks (512/1024 at time
> > with my PT lock design) to be updated to have proper permissions
> > matching the core linux pte.
> 
> Sorry, I realise I still didn't get this through my head yet (and also
> have not seen your patch recently). So I don't know exactly what you
> are doing...
> 
> But why does _anybody_ (why does Christoph's patches) need to invalidate
> when they are going to be more permissive? This should be done lazily by
> the driver, I would have thought.

I don't believe it should, but it probably does right now.  I do know
the case where a write fault where there is no need for a COW does
not call out on the PTE change.  I see no reason the others should not
handle this as well.  Just off the top of my head, I can only think of
the mprotect case needing to special case the more permissive state and
I don't think that changes PTEs at all, merely updates the VMA.

Thanks,
Robin

^ permalink raw reply	[flat|nested] 120+ messages in thread

* [PATCH] mmu notifiers #v6
  2008-02-20  1:09     ` Andrea Arcangeli
@ 2008-02-20 10:39       ` Andrea Arcangeli
  2008-02-20 10:45         ` [PATCH] KVM swapping (+ seqlock fix) with " Andrea Arcangeli
                           ` (4 more replies)
  2008-02-21  4:47       ` [patch] my mmu notifiers Nick Piggin
  1 sibling, 5 replies; 120+ messages in thread
From: Andrea Arcangeli @ 2008-02-20 10:39 UTC (permalink / raw)
  To: Nick Piggin
  Cc: akpm, Robin Holt, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	steiner, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

Given Nick's comments I ported my version of the mmu notifiers to
latest mainline. There are no known bugs AFIK and it's obviously safe
(nothing is allowed to schedule inside rcu_read_lock taken by
mmu_notifier() with my patch).

XPMEM simply can't use RCU for the registration locking if it wants to
schedule inside the mmu notifier calls. So I guess it's better to add
the XPMEM invalidate_range_end/begin/external-rmap as a whole
different subsystem that will have to use a mutex (not RCU) to
serialize, and at the same time that CONFIG_XPMEM will also have to
switch the i_mmap_lock to a mutex. I doubt xpmem fits inside a
CONFIG_MMU_NOTIFIER anymore, or we'll all run a bit slower because of
it. It's really a call of how much we want to optimize the MMU
notifier, by keeping things like RCU for the registration.

Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>

diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -46,6 +46,7 @@
 	__young = ptep_test_and_clear_young(__vma, __address, __ptep);	\
 	if (__young)							\
 		flush_tlb_page(__vma, __address);			\
+	__young |= mmu_notifier_age_page((__vma)->vm_mm, __address);	\
 	__young;							\
 })
 #endif
@@ -86,6 +87,7 @@ do {									\
 	pte_t __pte;							\
 	__pte = ptep_get_and_clear((__vma)->vm_mm, __address, __ptep);	\
 	flush_tlb_page(__vma, __address);				\
+	mmu_notifier(invalidate_page, (__vma)->vm_mm, __address);	\
 	__pte;								\
 })
 #endif
diff --git a/include/asm-s390/pgtable.h b/include/asm-s390/pgtable.h
--- a/include/asm-s390/pgtable.h
+++ b/include/asm-s390/pgtable.h
@@ -735,6 +735,7 @@ static inline pte_t ptep_clear_flush(str
 {
 	pte_t pte = *ptep;
 	ptep_invalidate(vma->vm_mm, address, ptep);
+	mmu_notifier(invalidate_page, vma->vm_mm, address);
 	return pte;
 }
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -10,6 +10,7 @@
 #include <linux/rbtree.h>
 #include <linux/rwsem.h>
 #include <linux/completion.h>
+#include <linux/mmu_notifier.h>
 #include <asm/page.h>
 #include <asm/mmu.h>
 
@@ -228,6 +229,8 @@ struct mm_struct {
 #ifdef CONFIG_CGROUP_MEM_CONT
 	struct mem_cgroup *mem_cgroup;
 #endif
+
+	struct mmu_notifier_head mmu_notifier; /* MMU notifier list */
 };
 
 #endif /* _LINUX_MM_TYPES_H */
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
new file mode 100644
--- /dev/null
+++ b/include/linux/mmu_notifier.h
@@ -0,0 +1,132 @@
+#ifndef _LINUX_MMU_NOTIFIER_H
+#define _LINUX_MMU_NOTIFIER_H
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+struct mmu_notifier;
+
+struct mmu_notifier_ops {
+	/*
+	 * Called when nobody can register any more notifier in the mm
+	 * and after the "mn" notifier has been disarmed already.
+	 */
+	void (*release)(struct mmu_notifier *mn,
+			struct mm_struct *mm);
+
+	/*
+	 * invalidate_page[s] is called in atomic context
+	 * after any pte has been updated and before
+	 * dropping the PT lock required to update any Linux pte.
+	 * Once the PT lock will be released the pte will have its
+	 * final value to export through the secondary MMU.
+	 * Before this is invoked any secondary MMU is still ok
+	 * to read/write to the page previously pointed by the
+	 * Linux pte because the old page hasn't been freed yet.
+	 * If required set_page_dirty has to be called internally
+	 * to this method.
+	 */
+	void (*invalidate_page)(struct mmu_notifier *mn,
+				struct mm_struct *mm,
+				unsigned long address);
+	void (*invalidate_pages)(struct mmu_notifier *mn,
+				 struct mm_struct *mm,
+				 unsigned long start, unsigned long end);
+
+	/*
+	 * Age page is called in atomic context inside the PT lock
+	 * right after the VM is test-and-clearing the young/accessed
+	 * bitflag in the pte. This way the VM will provide proper aging
+	 * to the accesses to the page through the secondary MMUs
+	 * and not only to the ones through the Linux pte.
+	 */
+	int (*age_page)(struct mmu_notifier *mn,
+			struct mm_struct *mm,
+			unsigned long address);
+};
+
+struct mmu_notifier {
+	struct hlist_node hlist;
+	const struct mmu_notifier_ops *ops;
+};
+
+#ifdef CONFIG_MMU_NOTIFIER
+
+struct mmu_notifier_head {
+	struct hlist_head head;
+	spinlock_t lock;
+};
+
+#include <linux/mm_types.h>
+
+/*
+ * RCU is used to traverse the list. A quiescent period needs to pass
+ * before the notifier is guaranteed to be visible to all threads.
+ */
+extern void mmu_notifier_register(struct mmu_notifier *mn,
+				  struct mm_struct *mm);
+/*
+ * RCU is used to traverse the list. A quiescent period needs to pass
+ * before the "struct mmu_notifier" can be freed. Alternatively it
+ * can be synchronously freed inside ->release when the list can't
+ * change anymore and nobody could possibly walk it.
+ */
+extern void mmu_notifier_unregister(struct mmu_notifier *mn,
+				    struct mm_struct *mm);
+extern void mmu_notifier_release(struct mm_struct *mm);
+extern int mmu_notifier_age_page(struct mm_struct *mm,
+				 unsigned long address);
+
+static inline void mmu_notifier_head_init(struct mmu_notifier_head *mnh)
+{
+	INIT_HLIST_HEAD(&mnh->head);
+	spin_lock_init(&mnh->lock);
+}
+
+#define mmu_notifier(function, mm, args...)				\
+	do {								\
+		struct mmu_notifier *__mn;				\
+		struct hlist_node *__n;					\
+									\
+		if (unlikely(!hlist_empty(&(mm)->mmu_notifier.head))) { \
+			rcu_read_lock();				\
+			hlist_for_each_entry_rcu(__mn, __n,		\
+						 &(mm)->mmu_notifier.head, \
+						 hlist)			\
+				if (__mn->ops->function)		\
+					__mn->ops->function(__mn,	\
+							    mm,		\
+							    args);	\
+			rcu_read_unlock();				\
+		}							\
+	} while (0)
+
+#else /* CONFIG_MMU_NOTIFIER */
+
+struct mmu_notifier_head {};
+
+#define mmu_notifier_register(mn, mm) do {} while(0)
+#define mmu_notifier_unregister(mn, mm) do {} while (0)
+#define mmu_notifier_release(mm) do {} while (0)
+#define mmu_notifier_age_page(mm, address) ({ 0; })
+#define mmu_notifier_head_init(mmh) do {} while (0)
+
+/*
+ * Notifiers that use the parameters that they were passed so that the
+ * compiler does not complain about unused variables but does proper
+ * parameter checks even if !CONFIG_MMU_NOTIFIER.
+ * Macros generate no code.
+ */
+#define mmu_notifier(function, mm, args...)			       \
+	do {							       \
+		if (0) {					       \
+			struct mmu_notifier *__mn;		       \
+								       \
+			__mn = (struct mmu_notifier *)(0x00ff);	       \
+			__mn->ops->function(__mn, mm, args);	       \
+		};						       \
+	} while (0)
+
+#endif /* CONFIG_MMU_NOTIFIER */
+
+#endif /* _LINUX_MMU_NOTIFIER_H */
diff --git a/kernel/fork.c b/kernel/fork.c
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -362,6 +362,7 @@ static struct mm_struct * mm_init(struct
 
 	if (likely(!mm_alloc_pgd(mm))) {
 		mm->def_flags = 0;
+		mmu_notifier_head_init(&mm->mmu_notifier);
 		return mm;
 	}
 
diff --git a/mm/Kconfig b/mm/Kconfig
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -193,3 +193,7 @@ config VIRT_TO_BUS
 config VIRT_TO_BUS
 	def_bool y
 	depends on !ARCH_NO_VIRT_TO_BUS
+
+config MMU_NOTIFIER
+	def_bool y
+	bool "MMU notifier, for paging KVM/RDMA"
diff --git a/mm/Makefile b/mm/Makefile
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -33,4 +33,4 @@ obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_CGROUP_MEM_CONT) += memcontrol.o
-
+obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -768,6 +768,7 @@ void __unmap_hugepage_range(struct vm_ar
 		if (pte_none(pte))
 			continue;
 
+		mmu_notifier(invalidate_page, mm, address);
 		page = pte_page(pte);
 		if (pte_dirty(pte))
 			set_page_dirty(page);
diff --git a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -504,6 +504,7 @@ static int copy_pte_range(struct mm_stru
 	spinlock_t *src_ptl, *dst_ptl;
 	int progress = 0;
 	int rss[2];
+	unsigned long start;
 
 again:
 	rss[1] = rss[0] = 0;
@@ -515,6 +516,7 @@ again:
 	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
 	arch_enter_lazy_mmu_mode();
 
+	start = addr;
 	do {
 		/*
 		 * We are holding two locks at this point - either of them
@@ -535,6 +537,8 @@ again:
 	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
 
 	arch_leave_lazy_mmu_mode();
+	if (is_cow_mapping(vma->vm_flags))
+		mmu_notifier(invalidate_pages, vma->vm_mm, start, addr);
 	spin_unlock(src_ptl);
 	pte_unmap_nested(src_pte - 1);
 	add_mm_rss(dst_mm, rss[0], rss[1]);
@@ -670,6 +674,7 @@ static unsigned long zap_pte_range(struc
 			}
 			ptent = ptep_get_and_clear_full(mm, addr, pte,
 							tlb->fullmm);
+			mmu_notifier(invalidate_page, mm, addr);
 			tlb_remove_tlb_entry(tlb, pte, addr);
 			if (unlikely(!page))
 				continue;
@@ -1269,6 +1274,7 @@ static int remap_pte_range(struct mm_str
 {
 	pte_t *pte;
 	spinlock_t *ptl;
+	unsigned long start = addr;
 
 	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
 	if (!pte)
@@ -1280,6 +1286,7 @@ static int remap_pte_range(struct mm_str
 		pfn++;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	arch_leave_lazy_mmu_mode();
+	mmu_notifier(invalidate_pages, mm, start, addr);
 	pte_unmap_unlock(pte - 1, ptl);
 	return 0;
 }
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2048,6 +2048,7 @@ void exit_mmap(struct mm_struct *mm)
 	vm_unacct_memory(nr_accounted);
 	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
 	tlb_finish_mmu(tlb, 0, end);
+	mmu_notifier_release(mm);
 
 	/*
 	 * Walk the list again, actually closing and freeing it,
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
new file mode 100644
--- /dev/null
+++ b/mm/mmu_notifier.c
@@ -0,0 +1,73 @@
+/*
+ *  linux/mm/mmu_notifier.c
+ *
+ *  Copyright (C) 2008  Qumranet, Inc.
+ *  Copyright (C) 2008  SGI
+ *             Christoph Lameter <clameter@sgi.com>
+ *
+ *  This work is licensed under the terms of the GNU GPL, version 2. See
+ *  the COPYING file in the top-level directory.
+ */
+
+#include <linux/mmu_notifier.h>
+#include <linux/module.h>
+#include <linux/rcupdate.h>
+
+/*
+ * No synchronization. This function can only be called when only a single
+ * process remains that performs teardown.
+ */
+void mmu_notifier_release(struct mm_struct *mm)
+{
+	struct mmu_notifier *mn;
+	struct hlist_node *n, *tmp;
+
+	if (unlikely(!hlist_empty(&mm->mmu_notifier.head))) {
+		hlist_for_each_entry_safe(mn, n, tmp,
+					  &mm->mmu_notifier.head, hlist) {
+			hlist_del(&mn->hlist);
+			if (mn->ops->release)
+				mn->ops->release(mn, mm);
+		}
+	}
+}
+
+/*
+ * If no young bitflag is supported by the hardware, ->age_page can
+ * unmap the address and return 1 or 0 depending if the mapping previously
+ * existed or not.
+ */
+int mmu_notifier_age_page(struct mm_struct *mm, unsigned long address)
+{
+	struct mmu_notifier *mn;
+	struct hlist_node *n;
+	int young = 0;
+
+	if (unlikely(!hlist_empty(&mm->mmu_notifier.head))) {
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(mn, n,
+					 &mm->mmu_notifier.head, hlist) {
+			if (mn->ops->age_page)
+				young |= mn->ops->age_page(mn, mm, address);
+		}
+		rcu_read_unlock();
+	}
+
+	return young;
+}
+
+void mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+	spin_lock(&mm->mmu_notifier.lock);
+	hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier.head);
+	spin_unlock(&mm->mmu_notifier.lock);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_register);
+
+void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+	spin_lock(&mm->mmu_notifier.lock);
+	hlist_del_rcu(&mn->hlist);
+	spin_unlock(&mm->mmu_notifier.lock);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
diff --git a/mm/mprotect.c b/mm/mprotect.c
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -32,6 +32,7 @@ static void change_pte_range(struct mm_s
 {
 	pte_t *pte, oldpte;
 	spinlock_t *ptl;
+	unsigned long start = addr;
 
 	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	arch_enter_lazy_mmu_mode();
@@ -71,6 +72,7 @@ static void change_pte_range(struct mm_s
 
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	arch_leave_lazy_mmu_mode();
+	mmu_notifier(invalidate_pages, mm, start, addr);
 	pte_unmap_unlock(pte - 1, ptl);
 }
 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* [PATCH] KVM swapping (+ seqlock fix) with mmu notifiers #v6
  2008-02-20 10:39       ` [PATCH] mmu notifiers #v6 Andrea Arcangeli
@ 2008-02-20 10:45         ` Andrea Arcangeli
  2008-02-27 22:06           ` [PATCH] KVM swapping with mmu notifiers #v7 Andrea Arcangeli
  2008-02-20 11:33         ` [PATCH] mmu notifiers #v6 Robin Holt
                           ` (3 subsequent siblings)
  4 siblings, 1 reply; 120+ messages in thread
From: Andrea Arcangeli @ 2008-02-20 10:45 UTC (permalink / raw)
  To: Nick Piggin
  Cc: akpm, Robin Holt, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	steiner, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

This is the same as before but against the mmu notifier #v6 patch,
running on top of 2.6.25-rc latest, and in this last update I fixed
the last race condition with a seqlock. I described the exact fix in a
earlier email, in short the seqlock-write is in the
invalidate_page/pages, and the reader will re-issue gfn_to_page if it
finds a seqlock read failure (see the change to paging_tmpl.h). With
this on top of mmu notifier #v6 there are no more practical or
theoretical known problems, nor in the kvm swapping, nor in the mmu
notifier patch (which also supports all sleeping users not just KVM,
without requiring a page pin).

Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 41962e7..e1287ab 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -21,6 +21,7 @@ config KVM
 	tristate "Kernel-based Virtual Machine (KVM) support"
 	depends on HAVE_KVM && EXPERIMENTAL
 	select PREEMPT_NOTIFIERS
+	select MMU_NOTIFIER
 	select ANON_INODES
 	---help---
 	  Support hosting fully virtualized guest machines using hardware
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6656efa..9151d64 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -533,6 +533,110 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
 		kvm_flush_remote_tlbs(kvm);
 }
 
+static void kvm_unmap_spte(struct kvm *kvm, u64 *spte)
+{
+	struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
+	get_page(page);
+	rmap_remove(kvm, spte);
+	set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+	kvm_flush_remote_tlbs(kvm);
+	__free_page(page);
+}
+
+static void kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+	u64 *spte, *curr_spte;
+
+	spte = rmap_next(kvm, rmapp, NULL);
+	while (spte) {
+		BUG_ON(!(*spte & PT_PRESENT_MASK));
+		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
+		curr_spte = spte;
+		spte = rmap_next(kvm, rmapp, spte);
+		kvm_unmap_spte(kvm, curr_spte);
+	}
+}
+
+void kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+	int i;
+
+	/*
+	 * If mmap_sem isn't taken, we can look the memslots with only
+	 * the mmu_lock by skipping over the slots with userspace_addr == 0.
+	 */
+	spin_lock(&kvm->mmu_lock);
+	for (i = 0; i < kvm->nmemslots; i++) {
+		struct kvm_memory_slot *memslot = &kvm->memslots[i];
+		unsigned long start = memslot->userspace_addr;
+		unsigned long end;
+
+		/* mmu_lock protects userspace_addr */
+		if (!start)
+			continue;
+
+		end = start + (memslot->npages << PAGE_SHIFT);
+		if (hva >= start && hva < end) {
+			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+			kvm_unmap_rmapp(kvm, &memslot->rmap[gfn_offset]);
+		}
+	}
+	spin_unlock(&kvm->mmu_lock);
+}
+
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+	u64 *spte;
+	int young = 0;
+
+	spte = rmap_next(kvm, rmapp, NULL);
+	while (spte) {
+		int _young;
+		u64 _spte = *spte;
+		BUG_ON(!(_spte & PT_PRESENT_MASK));
+		_young = _spte & PT_ACCESSED_MASK;
+		if (_young) {
+			young = !!_young;
+			set_shadow_pte(spte, _spte & ~PT_ACCESSED_MASK);
+		}
+		spte = rmap_next(kvm, rmapp, spte);
+	}
+	return young;
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+{
+	int i;
+	int young = 0;
+
+	/*
+	 * If mmap_sem isn't taken, we can look the memslots with only
+	 * the mmu_lock by skipping over the slots with userspace_addr == 0.
+	 */
+	spin_lock(&kvm->mmu_lock);
+	for (i = 0; i < kvm->nmemslots; i++) {
+		struct kvm_memory_slot *memslot = &kvm->memslots[i];
+		unsigned long start = memslot->userspace_addr;
+		unsigned long end;
+
+		/* mmu_lock protects userspace_addr */
+		if (!start)
+			continue;
+
+		end = start + (memslot->npages << PAGE_SHIFT);
+		if (hva >= start && hva < end) {
+			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+			young |= kvm_age_rmapp(kvm, &memslot->rmap[gfn_offset]);
+		}
+	}
+	spin_unlock(&kvm->mmu_lock);
+
+	if (young)
+		kvm_flush_remote_tlbs(kvm);
+
+	return young;
+}
+
 #ifdef MMU_DEBUG
 static int is_empty_shadow_page(u64 *spt)
 {
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index cdafce3..6d09d13 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -370,6 +370,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	int write_pt = 0;
 	int r;
 	struct page *page;
+	unsigned mmu_seq;
 
 	pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
 	kvm_mmu_audit(vcpu, "pre page fault");
@@ -397,6 +398,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	}
 
 	down_read(&current->mm->mmap_sem);
+	mmu_seq = read_seqbegin(&vcpu->kvm->arch.mmu_notifier_invalidate_lock);
 	page = gfn_to_page(vcpu->kvm, walker.gfn);
 	up_read(&current->mm->mmap_sem);
 
@@ -421,6 +423,15 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	++vcpu->stat.pf_fixed;
 	kvm_mmu_audit(vcpu, "post page fault (fixed)");
 	spin_unlock(&vcpu->kvm->mmu_lock);
+
+	if (read_seqretry(&vcpu->kvm->arch.mmu_notifier_invalidate_lock, mmu_seq)) {
+		down_read(&current->mm->mmap_sem);
+		if (page != gfn_to_page(vcpu->kvm, walker.gfn))
+			BUG();
+		up_read(&current->mm->mmap_sem);
+		kvm_release_page_clean(page);
+	}
+
 	up_read(&vcpu->kvm->slots_lock);
 
 	return write_pt;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index dc8d538..f2594be 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3279,6 +3279,47 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 	free_page((unsigned long)vcpu->arch.pio_data);
 }
 
+static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
+{
+	struct kvm_arch *kvm_arch;
+	kvm_arch = container_of(mn, struct kvm_arch, mmu_notifier);
+	return container_of(kvm_arch, struct kvm, arch);
+}
+
+void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
+				      struct mm_struct *mm,
+				      unsigned long address)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	BUG_ON(mm != kvm->mm);
+	write_seqlock(&kvm->arch.mmu_notifier_invalidate_lock);
+	kvm_unmap_hva(kvm, address);
+	write_sequnlock(&kvm->arch.mmu_notifier_invalidate_lock);
+}
+
+void kvm_mmu_notifier_invalidate_pages(struct mmu_notifier *mn,
+				       struct mm_struct *mm,
+				       unsigned long start, unsigned long end)
+{
+	for (; start < end; start += PAGE_SIZE)
+		kvm_mmu_notifier_invalidate_page(mn, mm, start);
+}
+
+int kvm_mmu_notifier_age_page(struct mmu_notifier *mn,
+			      struct mm_struct *mm,
+			      unsigned long address)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	BUG_ON(mm != kvm->mm);
+	return kvm_age_hva(kvm, address);
+}
+
+static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
+	.invalidate_page	= kvm_mmu_notifier_invalidate_page,
+	.invalidate_pages	= kvm_mmu_notifier_invalidate_pages,
+	.age_page		= kvm_mmu_notifier_age_page,
+};
+
 struct  kvm *kvm_arch_create_vm(void)
 {
 	struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
@@ -3288,6 +3329,10 @@ struct  kvm *kvm_arch_create_vm(void)
 
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
 
+	kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops;
+	mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm);
+	seqlock_init(&kvm->arch.mmu_notifier_invalidate_lock);
+
 	return kvm;
 }
 
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 0c429c8..306beaa 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -13,6 +13,7 @@
 
 #include <linux/types.h>
 #include <linux/mm.h>
+#include <linux/mmu_notifier.h>
 
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>
@@ -294,6 +295,9 @@ struct kvm_arch{
 	struct page *apic_access_page;
 
 	gpa_t wall_clock;
+
+	struct mmu_notifier mmu_notifier;
+	seqlock_t mmu_notifier_invalidate_lock;
 };
 
 struct kvm_vm_stat {
@@ -411,6 +415,8 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu);
 int kvm_mmu_setup(struct kvm_vcpu *vcpu);
 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
 
+void kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+int kvm_age_hva(struct kvm *kvm, unsigned long hva);
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
 void kvm_mmu_zap_all(struct kvm *kvm);


This as usual is the KVM locking patch to browse memslots without the
memslot lock mutex.

Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0c910c7..80b719d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3245,16 +3245,23 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 	 */
 	if (!user_alloc) {
 		if (npages && !old.rmap) {
+			unsigned long userspace_addr;
+
 			down_write(&current->mm->mmap_sem);
-			memslot->userspace_addr = do_mmap(NULL, 0,
-						     npages * PAGE_SIZE,
-						     PROT_READ | PROT_WRITE,
-						     MAP_SHARED | MAP_ANONYMOUS,
-						     0);
+			userspace_addr = do_mmap(NULL, 0,
+						 npages * PAGE_SIZE,
+						 PROT_READ | PROT_WRITE,
+						 MAP_SHARED | MAP_ANONYMOUS,
+						 0);
 			up_write(&current->mm->mmap_sem);
 
-			if (IS_ERR((void *)memslot->userspace_addr))
-				return PTR_ERR((void *)memslot->userspace_addr);
+			if (IS_ERR((void *)userspace_addr))
+				return PTR_ERR((void *)userspace_addr);
+
+			/* set userspace_addr atomically for kvm_hva_to_rmapp */
+			spin_lock(&kvm->mmu_lock);
+			memslot->userspace_addr = userspace_addr;
+			spin_unlock(&kvm->mmu_lock);
 		} else {
 			if (!old.user_alloc && old.rmap) {
 				int ret;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index cf6df51..743c5c5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -299,7 +299,15 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		memset(new.rmap, 0, npages * sizeof(*new.rmap));
 
 		new.user_alloc = user_alloc;
-		new.userspace_addr = mem->userspace_addr;
+		/*
+		 * hva_to_rmmap() serialzies with the mmu_lock and to be
+		 * safe it has to ignore memslots with !user_alloc &&
+		 * !userspace_addr.
+		 */
+		if (user_alloc)
+			new.userspace_addr = mem->userspace_addr;
+		else
+			new.userspace_addr = 0;
 	}
 
 	/* Allocate page dirty bitmap if needed */
@@ -312,14 +320,18 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		memset(new.dirty_bitmap, 0, dirty_bytes);
 	}
 
+	spin_lock(&kvm->mmu_lock);
 	if (mem->slot >= kvm->nmemslots)
 		kvm->nmemslots = mem->slot + 1;
 
 	*memslot = new;
+	spin_unlock(&kvm->mmu_lock);
 
 	r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc);
 	if (r) {
+		spin_lock(&kvm->mmu_lock);
 		*memslot = old;
+		spin_unlock(&kvm->mmu_lock);
 		goto out_free;
 	}
 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v6
  2008-02-20 10:39       ` [PATCH] mmu notifiers #v6 Andrea Arcangeli
  2008-02-20 10:45         ` [PATCH] KVM swapping (+ seqlock fix) with " Andrea Arcangeli
@ 2008-02-20 11:33         ` Robin Holt
  2008-02-20 12:03           ` Andrea Arcangeli
  2008-02-20 14:41         ` Robin Holt
                           ` (2 subsequent siblings)
  4 siblings, 1 reply; 120+ messages in thread
From: Robin Holt @ 2008-02-20 11:33 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Nick Piggin, akpm, Robin Holt, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	steiner, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Wed, Feb 20, 2008 at 11:39:42AM +0100, Andrea Arcangeli wrote:
> Given Nick's comments I ported my version of the mmu notifiers to
> latest mainline. There are no known bugs AFIK and it's obviously safe
> (nothing is allowed to schedule inside rcu_read_lock taken by
> mmu_notifier() with my patch).
> 
> XPMEM simply can't use RCU for the registration locking if it wants to
> schedule inside the mmu notifier calls. So I guess it's better to add
> the XPMEM invalidate_range_end/begin/external-rmap as a whole
> different subsystem that will have to use a mutex (not RCU) to
> serialize, and at the same time that CONFIG_XPMEM will also have to
> switch the i_mmap_lock to a mutex. I doubt xpmem fits inside a
> CONFIG_MMU_NOTIFIER anymore, or we'll all run a bit slower because of
> it. It's really a call of how much we want to optimize the MMU
> notifier, by keeping things like RCU for the registration.

But won't that other "subsystem" cause us to have two seperate callouts
that do equivalent things and therefore force a removal of this and go
back to what Christoph has currently proposed?

Robin

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v6
  2008-02-20 11:33         ` [PATCH] mmu notifiers #v6 Robin Holt
@ 2008-02-20 12:03           ` Andrea Arcangeli
  2008-02-20 12:24             ` Robin Holt
  2008-02-21  5:02             ` Nick Piggin
  0 siblings, 2 replies; 120+ messages in thread
From: Andrea Arcangeli @ 2008-02-20 12:03 UTC (permalink / raw)
  To: Robin Holt
  Cc: Nick Piggin, akpm, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	steiner, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Wed, Feb 20, 2008 at 05:33:13AM -0600, Robin Holt wrote:
> But won't that other "subsystem" cause us to have two seperate callouts
> that do equivalent things and therefore force a removal of this and go
> back to what Christoph has currently proposed?

The point is that a new kind of notifier that only supports sleeping
users will allow to keep optimizing the mmu notifier patch for the
non-sleeping users. If we keep going Christoph's way of having a
single notifier that fits all he will have to:

1) drop the entire RCU locking from its patches (making all previous
   rcu discussions and fixes void) those discussions only made sense
   if applied to _my_ patch, not Christoph's patches as long as you
   pretend to sleep in any of his mmu notifier methods like invalidate_range_*.

2) probably modify the linux VM to replace the i_mmap_lock and perhaps
   PT lock with a mutex (see Nick's comments for details)

I'm unconvinced both the main linux VM and the mmu notifier should be
changed like this just to support xpmem. All non-sleeping users don't
need that. Nevertheless I'm fully welcome to support xpmem (and it's
not my call nor my interest to comment if allocating skbs in
try_to_unmap in order to unpin pages is workable, let's assume it's
workable for the sake of this discussion) with a new config option
that will also alter how the core VM works, in order to fully support
the sleeping users for filebacked mappings.

This will also create less confusion in the registration. With
Christoph's one-config-option-fits-all you had to half register into
the mmu notifier (the sleeping calls, so not invalidate_page) and full
register in the external rmap notifier, and I had to only half
register into the mmu notifier (not range_begin) and not register in
the rmap external notifier.

With two separate config options for sleeping and non sleeping users,
I'll 100% register in the mmu notifier methods, and the non-sleeping
users will 100% register the xpmem methods. You won't have to have
designed the mmu notifier patches to understand how to use it.

In theory both KVM and GRU are free to use the xpmem methods too (the
invalidate_page will be page_t based instead of [mm,addr] based, but
that's possible to handle with KVM changes if one wants to), but if a
distro only wants to support the sleeping users in their binary kernel
images, they won't be forced to alter how the VM works to do
that.

If there's agreement that the VM should alter its locking from
spinlock to mutex for its own good, then Christoph's
one-config-option-fits-all becomes a lot more appealing (replacing RCU
with a mutex in the mmu notifier list registration locking isn't my
main worry and the non-sleeping-users may be ok to live with it).

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v6
  2008-02-20 12:03           ` Andrea Arcangeli
@ 2008-02-20 12:24             ` Robin Holt
  2008-02-20 12:32               ` Andrea Arcangeli
  2008-02-21  5:02             ` Nick Piggin
  1 sibling, 1 reply; 120+ messages in thread
From: Robin Holt @ 2008-02-20 12:24 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Robin Holt, Nick Piggin, akpm, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	steiner, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Wed, Feb 20, 2008 at 01:03:24PM +0100, Andrea Arcangeli wrote:
> I'm unconvinced both the main linux VM and the mmu notifier should be
> changed like this just to support xpmem. All non-sleeping users don't
> need that. Nevertheless I'm fully welcome to support xpmem (and it's
> not my call nor my interest to comment if allocating skbs in
> try_to_unmap in order to unpin pages is workable, let's assume it's
> workable for the sake of this discussion) with a new config option
> that will also alter how the core VM works, in order to fully support
> the sleeping users for filebacked mappings.

We do not need to do any allocation in the messaging layer, all
structures used for messaging are allocated at module load time.
The allocation discussions we had early on were about trying to
rearrange you notifiers to allow a seperate worker thread to do the
invalidate and then the main thread would spin waiting for the worker to
complete.  That was canned by the moving your notifier to before the
lock was grabbed which led us to the point of needing a _begin and _end.

> This will also create less confusion in the registration. With
> Christoph's one-config-option-fits-all you had to half register into
> the mmu notifier (the sleeping calls, so not invalidate_page) and full
> register in the external rmap notifier, and I had to only half
> register into the mmu notifier (not range_begin) and not register in
> the rmap external notifier.
> 
> With two separate config options for sleeping and non sleeping users,
> I'll 100% register in the mmu notifier methods, and the non-sleeping
> users will 100% register the xpmem methods. You won't have to have
> designed the mmu notifier patches to understand how to use it.

So, fundamentally, how would they be different?  Would we be required to
add another notifier list to the mm and have two seperate callout
points?  Reduction would end up with the same half-registered
half-not-registered situation you point out above.  Then further
reduction would lead to the elimination of the callouts you have just
proposed and using the _begin/_end callouts and we are back to
Christoph's current patch.

Robin

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v6
  2008-02-20 12:24             ` Robin Holt
@ 2008-02-20 12:32               ` Andrea Arcangeli
  2008-02-20 13:15                 ` Robin Holt
  0 siblings, 1 reply; 120+ messages in thread
From: Andrea Arcangeli @ 2008-02-20 12:32 UTC (permalink / raw)
  To: Robin Holt
  Cc: Nick Piggin, akpm, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	steiner, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Wed, Feb 20, 2008 at 06:24:24AM -0600, Robin Holt wrote:
> We do not need to do any allocation in the messaging layer, all
> structures used for messaging are allocated at module load time.
> The allocation discussions we had early on were about trying to
> rearrange you notifiers to allow a seperate worker thread to do the
> invalidate and then the main thread would spin waiting for the worker to
> complete.  That was canned by the moving your notifier to before the
> lock was grabbed which led us to the point of needing a _begin and _end.

I thought you called some net/* function inside the mmu notifier
methods. Those always require several ram allocations internally.

> So, fundamentally, how would they be different?  Would we be required to
> add another notifier list to the mm and have two seperate callout
> points?  Reduction would end up with the same half-registered
> half-not-registered situation you point out above.  Then further
> reduction would lead to the elimination of the callouts you have just
> proposed and using the _begin/_end callouts and we are back to
> Christoph's current patch.

Did you miss Nick's argument that we'd need to change some VM lock to
mutex and solve lock issues first? Are you implying mutex are more
efficient for the VM? (you may seek support from preempt-rt folks at
least) or are you implying the VM would better run slower with mutex
in order to have a single config option?

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v6
  2008-02-20 12:32               ` Andrea Arcangeli
@ 2008-02-20 13:15                 ` Robin Holt
  0 siblings, 0 replies; 120+ messages in thread
From: Robin Holt @ 2008-02-20 13:15 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Robin Holt, Nick Piggin, akpm, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	steiner, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Wed, Feb 20, 2008 at 01:32:36PM +0100, Andrea Arcangeli wrote:
> On Wed, Feb 20, 2008 at 06:24:24AM -0600, Robin Holt wrote:
> > We do not need to do any allocation in the messaging layer, all
> > structures used for messaging are allocated at module load time.
> > The allocation discussions we had early on were about trying to
> > rearrange you notifiers to allow a seperate worker thread to do the
> > invalidate and then the main thread would spin waiting for the worker to
> > complete.  That was canned by the moving your notifier to before the
> > lock was grabbed which led us to the point of needing a _begin and _end.
> 
> I thought you called some net/* function inside the mmu notifier
> methods. Those always require several ram allocations internally.

Nope, that was the discussions with the IB folks.  We only use XPC and
both the messages we send and the XPC internals do not need to allocate.

> > So, fundamentally, how would they be different?  Would we be required to
> > add another notifier list to the mm and have two seperate callout
> > points?  Reduction would end up with the same half-registered
> > half-not-registered situation you point out above.  Then further
> > reduction would lead to the elimination of the callouts you have just
> > proposed and using the _begin/_end callouts and we are back to
> > Christoph's current patch.
> 
> Did you miss Nick's argument that we'd need to change some VM lock to
> mutex and solve lock issues first? Are you implying mutex are more
> efficient for the VM? (you may seek support from preempt-rt folks at
> least) or are you implying the VM would better run slower with mutex
> in order to have a single config option?

That would be if we needed to support file backed mappings and hugetlbfs
mappings.  Currently (and for the last 6 years), XPMEM has not supported
either of those.  I don't view either as being a realistic possibility,
but it is certainly something we would need to address before either
could be supported.

Robin

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v6
  2008-02-20 10:39       ` [PATCH] mmu notifiers #v6 Andrea Arcangeli
  2008-02-20 10:45         ` [PATCH] KVM swapping (+ seqlock fix) with " Andrea Arcangeli
  2008-02-20 11:33         ` [PATCH] mmu notifiers #v6 Robin Holt
@ 2008-02-20 14:41         ` Robin Holt
  2008-02-20 15:34           ` Andrea Arcangeli
  2008-02-20 21:03         ` Jack Steiner
  2008-02-21  4:54         ` Nick Piggin
  4 siblings, 1 reply; 120+ messages in thread
From: Robin Holt @ 2008-02-20 14:41 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Nick Piggin, akpm, Robin Holt, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	steiner, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Wed, Feb 20, 2008 at 11:39:42AM +0100, Andrea Arcangeli wrote:
> XPMEM simply can't use RCU for the registration locking if it wants to
> schedule inside the mmu notifier calls. So I guess it's better to add

Whoa there.  In Christoph's patch, we did not use rcu for the list.  It
was a simple hlist_head.  The list manipulations were done under
down_write(&current->mm->mmap_sem) and would therefore not be racy.  All
the callout locations are already acquiring the mmap_sem at least
readably, so we should be safe.  Maybe I missed a race somewhere.

Thanks,
Robin

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v6
  2008-02-20 14:41         ` Robin Holt
@ 2008-02-20 15:34           ` Andrea Arcangeli
  0 siblings, 0 replies; 120+ messages in thread
From: Andrea Arcangeli @ 2008-02-20 15:34 UTC (permalink / raw)
  To: Robin Holt
  Cc: Nick Piggin, akpm, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	steiner, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Wed, Feb 20, 2008 at 08:41:55AM -0600, Robin Holt wrote:
> On Wed, Feb 20, 2008 at 11:39:42AM +0100, Andrea Arcangeli wrote:
> > XPMEM simply can't use RCU for the registration locking if it wants to
> > schedule inside the mmu notifier calls. So I guess it's better to add
> 
> Whoa there.  In Christoph's patch, we did not use rcu for the list.  It
> was a simple hlist_head.  The list manipulations were done under
> down_write(&current->mm->mmap_sem) and would therefore not be racy.  All
> the callout locations are already acquiring the mmap_sem at least
> readably, so we should be safe.  Maybe I missed a race somewhere.

You missed quite a few, see when atomic=1 and when mmu_rmap_notifier
is invoked for example.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v6
  2008-02-20 10:39       ` [PATCH] mmu notifiers #v6 Andrea Arcangeli
                           ` (2 preceding siblings ...)
  2008-02-20 14:41         ` Robin Holt
@ 2008-02-20 21:03         ` Jack Steiner
  2008-02-21  4:54         ` Nick Piggin
  4 siblings, 0 replies; 120+ messages in thread
From: Jack Steiner @ 2008-02-20 21:03 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Nick Piggin, akpm, Robin Holt, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	linux-kernel, linux-mm, daniel.blueman, Christoph Lameter

On Wed, Feb 20, 2008 at 11:39:42AM +0100, Andrea Arcangeli wrote:
> Given Nick's comments I ported my version of the mmu notifiers to
> latest mainline. There are no known bugs AFIK and it's obviously safe
> (nothing is allowed to schedule inside rcu_read_lock taken by
> mmu_notifier() with my patch).
> ....

I ported the GRU driver to use the latest #v6 patch and ran a series of
tests on it using our system simulator. The simulator is slow so true
stress or swapping is not possible - at least within a finite amount of
time.

Functionally, the #v6 patch seems to work for the GRU. However, I did
notice two significant differences that make the #v6 performance worse for
the GRU than Christoph's patch.  I think one difference is easily fixable
but the other is more difficult:

	- the location of the mmu_notifier_release() callout is at a
	  different place in the 2 patches. Christoph has the callout
	  BEFORE the call to unmap_vmas() whereas you have it AFTER. The
	  net result is that the GRU does a LOT of 1-page TLB flushes
	  during process teardown.  These flushes are not done with
	  Christops's patch.

	- the range callouts in Christoph's patch benefit the GRU because
	  multiple TLB entries can be flushed with a single GRU
	  instruction (the GRU hardware supports a range flush using a
	  vaddr & length).  The #v6 patch does a TLB flush for each page in
	  the range.  Flushing on the GRU is slow so being able to flush
	  multiple pages with a single request is a benefit.

Seems like the latter difference could be significant for other users
of mmu notifiers.


--- jack

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [patch] my mmu notifiers
  2008-02-19 23:40     ` Jack Steiner
@ 2008-02-21  4:42       ` Nick Piggin
  2008-02-22 16:31         ` Jack Steiner
  0 siblings, 1 reply; 120+ messages in thread
From: Nick Piggin @ 2008-02-21  4:42 UTC (permalink / raw)
  To: Jack Steiner
  Cc: Andrea Arcangeli, akpm, Robin Holt, Avi Kivity, Izik Eidus,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Tue, Feb 19, 2008 at 05:40:50PM -0600, Jack Steiner wrote:
> On Wed, Feb 20, 2008 at 12:11:57AM +0100, Nick Piggin wrote:
> > On Tue, Feb 19, 2008 at 02:58:51PM +0100, Andrea Arcangeli wrote:
> > > On Tue, Feb 19, 2008 at 09:43:57AM +0100, Nick Piggin wrote:
> > > > anything when changing the pte to be _more_ permissive, and I don't
> > > 
> > > Note that in my patch the invalidate_pages in mprotect can be
> > > trivially switched to a mprotect_pages with proper params. This will
> > > prevent page faults completely in the secondary MMU (there will only
> > > be tlb misses after the tlb flush just like for the core linux pte),
> > > and it'll allow all the secondary MMU pte blocks (512/1024 at time
> > > with my PT lock design) to be updated to have proper permissions
> > > matching the core linux pte.
> > 
> > Sorry, I realise I still didn't get this through my head yet (and also
> > have not seen your patch recently). So I don't know exactly what you
> > are doing...
> > 
> > But why does _anybody_ (why does Christoph's patches) need to invalidate
> > when they are going to be more permissive? This should be done lazily by
> > the driver, I would have thought.
> 
> 
> Agree. Although for most real applications, the performance difference
> is probably negligible.

But importantly, doing it that way means you share test coverage with
the CPU TLB flushing code, and you don't introduce a new concept to the
VM.

So, it _has_ to be lazy flushing, IMO (as there doesn't seem to be a
good reason otherwise). mprotect shouldn't really be a special case,
because it still has to flush the CPU tlbs as well when restricting
access.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [patch] my mmu notifiers
  2008-02-20  1:09     ` Andrea Arcangeli
  2008-02-20 10:39       ` [PATCH] mmu notifiers #v6 Andrea Arcangeli
@ 2008-02-21  4:47       ` Nick Piggin
  1 sibling, 0 replies; 120+ messages in thread
From: Nick Piggin @ 2008-02-21  4:47 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: akpm, Robin Holt, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	steiner, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Wed, Feb 20, 2008 at 02:09:41AM +0100, Andrea Arcangeli wrote:
> On Wed, Feb 20, 2008 at 12:11:57AM +0100, Nick Piggin wrote:
> > Sorry, I realise I still didn't get this through my head yet (and also
> > have not seen your patch recently). So I don't know exactly what you
> > are doing...
> 
> The last version was posted here:
> 
> http://marc.info/?l=kvm-devel&m=120321732521533&w=2
> 
> > But why does _anybody_ (why does Christoph's patches) need to invalidate
> > when they are going to be more permissive? This should be done lazily by
> > the driver, I would have thought.
> 
> This can be done lazily by the driver yes. The place where I've an
> invalidate_pages in mprotect however can also become less permissive.

That's OK, because we have to flush tlbs there too.


> It's simpler to invalidate always and it's not guaranteed the
> secondary mmu page fault is capable of refreshing the spte across a
> writeprotect fault.

I think we just have to make sure that it _can_ do writeprotect
faults. AFAIKS, that will be possible if the driver registers a
.page_mkwrite handler (actually not quite -- page_mkwrite is fairly
crap, so I have a patch to merge it together with .fault so we get
address information as well). Anyway, I really think we should do
it that way.

> In the future this can be changed to
> mprotect_pages though, so no page fault will happen in the secondary
> mmu.

Possibly, but hopefully not needed for performance. Let's wait and
see.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v6
  2008-02-20 10:39       ` [PATCH] mmu notifiers #v6 Andrea Arcangeli
                           ` (3 preceding siblings ...)
  2008-02-20 21:03         ` Jack Steiner
@ 2008-02-21  4:54         ` Nick Piggin
  2008-02-21 14:40           ` Andrea Arcangeli
  4 siblings, 1 reply; 120+ messages in thread
From: Nick Piggin @ 2008-02-21  4:54 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: akpm, Robin Holt, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	steiner, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Wed, Feb 20, 2008 at 11:39:42AM +0100, Andrea Arcangeli wrote:
> Given Nick's comments I ported my version of the mmu notifiers to
> latest mainline. There are no known bugs AFIK and it's obviously safe
> (nothing is allowed to schedule inside rcu_read_lock taken by
> mmu_notifier() with my patch).

Thanks! Yes the seqlock you are using now ends up looking similar
to what I did and I couldn't find a hole in that either. So I
think this is going to work.

I do prefer some parts of my patch, however for everyone's sanity,
I think you should be the maintainer of the mmu notifiers, and I
will send you incremental changes that can be discussed more easily
that way (nothing major, mainly style and minor things).


> XPMEM simply can't use RCU for the registration locking if it wants to
> schedule inside the mmu notifier calls. So I guess it's better to add
> the XPMEM invalidate_range_end/begin/external-rmap as a whole
> different subsystem that will have to use a mutex (not RCU) to
> serialize, and at the same time that CONFIG_XPMEM will also have to
> switch the i_mmap_lock to a mutex. I doubt xpmem fits inside a
> CONFIG_MMU_NOTIFIER anymore, or we'll all run a bit slower because of
> it. It's really a call of how much we want to optimize the MMU
> notifier, by keeping things like RCU for the registration.

I agree: your coherent, non-sleeping mmu notifiers are pretty simple
and unintrusive. The sleeping version is fundamentally going to either
need to change VM locks, or be non-coherent, so I don't think there is
a question of making one solution fit everybody. So the sleeping /
xrmap patch should be kept either completely independent, or as an
add-on to this one.

I will post some suggestions to you when I get a chance.

 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v6
  2008-02-20 12:03           ` Andrea Arcangeli
  2008-02-20 12:24             ` Robin Holt
@ 2008-02-21  5:02             ` Nick Piggin
  1 sibling, 0 replies; 120+ messages in thread
From: Nick Piggin @ 2008-02-21  5:02 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Robin Holt, akpm, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	steiner, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Wed, Feb 20, 2008 at 01:03:24PM +0100, Andrea Arcangeli wrote:
> If there's agreement that the VM should alter its locking from
> spinlock to mutex for its own good, then Christoph's
> one-config-option-fits-all becomes a lot more appealing (replacing RCU
> with a mutex in the mmu notifier list registration locking isn't my
> main worry and the non-sleeping-users may be ok to live with it).

Just from a high level view, in some cases we can just say that no we
aren't going to support this. And this may well be one of those cases.

The more constraints placed on the VM, the harder it becomes to
improve and adapt in future. And this seems like a pretty big restriction.
(especially if we can eg. work around it completely by having a special
purpose driver to get_user_pages on comm buffers as I suggested in the
other mail).

At any rate, I believe Andrea's patch really places minimal or no further
constraints than a regular CPU TLB (or the hash tables that some archs
implement). So we're kind of in 2 different leagues here.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v6
  2008-02-21  4:54         ` Nick Piggin
@ 2008-02-21 14:40           ` Andrea Arcangeli
  2008-02-21 16:10             ` Jack Steiner
  0 siblings, 1 reply; 120+ messages in thread
From: Andrea Arcangeli @ 2008-02-21 14:40 UTC (permalink / raw)
  To: Nick Piggin
  Cc: akpm, Robin Holt, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	steiner, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Thu, Feb 21, 2008 at 05:54:30AM +0100, Nick Piggin wrote:
> will send you incremental changes that can be discussed more easily
> that way (nothing major, mainly style and minor things).

I don't need to say you're very welcome ;).

> I agree: your coherent, non-sleeping mmu notifiers are pretty simple
> and unintrusive. The sleeping version is fundamentally going to either
> need to change VM locks, or be non-coherent, so I don't think there is
> a question of making one solution fit everybody. So the sleeping /
> xrmap patch should be kept either completely independent, or as an
> add-on to this one.

The need to change the VM locks to fit the sleepable "mmu notifier"
needs, I think is the major reason why the sleeping patch should be a
separate config option unless you think the i_mmap_lock will benefit
the VM for its own good regardless of the sleepable mmu
notifiers. Otherwise we'll end up merging in mainline an API that can
only satisfy the needs of the "sleeping users" that are only
interested about anonymous memory. While the basic concept of the mmu
notifiers is to cover the whole user visible address space, not just
anonymous memory! Furthermore XPMEM users already asked to work on
tmpfs/MAP_SHARED too...

Originally the trick that I was trying to remove the "atomic" param,
was to defer the invalidate_range after dropping the i_mmap_lock. But
clearly in truncate we'll have no more guarantees that nor the vma nor
the MM still exists after spin_unlock(i_mmap_lock) is called... So
it's simply impossible to call the mmu notifier out of the i_mmap_lock
for truncate, and Christoph's patch looks unfixable without altering
the VM core locking. Christoph's API one-config-fits-all can't really
fit-all, but only the anonymous memory.

However if I wear a KVM hat, I cannot care less what is merged as long
as .25 will be able to fully swap reliably a virtualized guest OS ;).
This is why I'm totally willing to support any decision in favor of
anything (including your own patch that would only work for KVM) that
can be merged.

> I will post some suggestions to you when I get a chance.

I really want suggestions on Jack's concern about issuing an
invalidate per pte entry or per-pte instead of per-range. I'll answer
that in a separate email. For KVM my patch is already close to optimal
because each single spte invalidate requires a fixed amount of work,
but for GRU a large invalidate-range would be more efficient.

To address the GRU _valid_ concern, I can create a second version of
my patch with range_begin/end instead of invalidate_pages, that still
won't support sleeping users like XPMEM but only KVM and GRU. Then
it's up to Christoph when he comes back to alter the vm locking so
that those calls can sleep too... But that will require a much bigger
change and then perhaps xpmem can share the same mmu notifiers when
the config option to make the mmu notifier sleepable is enabled. But
that part would better be incremental as it's not so obviously safe to
merge as the mmu notifier themself.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v6
  2008-02-21 14:40           ` Andrea Arcangeli
@ 2008-02-21 16:10             ` Jack Steiner
  2008-02-27 19:26               ` [PATCH] mmu notifiers #v7 Andrea Arcangeli
  0 siblings, 1 reply; 120+ messages in thread
From: Jack Steiner @ 2008-02-21 16:10 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Nick Piggin, akpm, Robin Holt, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	linux-kernel, linux-mm, daniel.blueman, Christoph Lameter

> I really want suggestions on Jack's concern about issuing an
> invalidate per pte entry or per-pte instead of per-range. I'll answer
> that in a separate email. For KVM my patch is already close to optimal
> because each single spte invalidate requires a fixed amount of work,
> but for GRU a large invalidate-range would be more efficient.
>
> To address the GRU _valid_ concern, I can create a second version of
> my patch with range_begin/end instead of invalidate_pages, that still

I don't know how much significance to place on this data, but it is
a real data point.

I ran the GRU regression test suite on kernels with both types of
mmu_notifiers. The kernel/driver using Christoph's patch had
1/7 the number of TLB invalidates as Andrea's patch.

This reduction is due to both differences I mentioned yesterday:
	- different location of callout for address space teardown
	- range callouts

Unfortunately, the current driver does not allow me to quantify
which of the differences is most significant.

Also, I'll try to post the driver within the next few days. It is
still in development but it compiles and can successfully run most
workloads on a system simulator.

--- jack

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [patch] my mmu notifiers
  2008-02-21  4:42       ` Nick Piggin
@ 2008-02-22 16:31         ` Jack Steiner
  0 siblings, 0 replies; 120+ messages in thread
From: Jack Steiner @ 2008-02-22 16:31 UTC (permalink / raw)
  To: Nick Piggin, andrea
  Cc: akpm, Robin Holt, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	linux-kernel, linux-mm, daniel.blueman, Christoph Lameter

> Also, I'll try to post the driver within the next few days. It is
> still in development but it compiles and can successfully run most
> workloads on a system simulator.

Here is the source of the GRU driver. It is still in development but
it compiles & runs (on IA64) in a system simulator.

The GRU is a hardware resource located in the chipset. It is
mmaped into the user address space. The GRU contains functions such
as load/store, scatter/gather, bcopy, etc. It is directly accessed
by user instructions using user virtual addresses. GRU instructions
(ex., bcopy) use user virtual addresses for operands. The GRU
contains a large TLB that is functionally very similar to processor TLBs.


This version uses the V7 mmu notifier patch from Christoph. The changes
to switch to Andrea's patch are trivial. (Note, however, that XPMEM still
requires Christoph's patch).


The interesting parts relating to mmu_notifiers are in the
following functions:
	gru_try_dropin() - does TLB dropins
	gru_flush_tlb_range() - TLB flushing
	gru_mmuops_...() - all functions starting with "gru_mmuops_"
	gru_register_mmu_notifier() - registers notifiers


I have no doubt that there are bugs in the code. If you find them, please
let me know where they are ....    :-)

Other comments appreciated, too.




Portions are rough but this 
 arch/ia64/sn/kernel/sn2/sn2_smp.c |    5 
 drivers/Makefile                  |    1 
 drivers/gru/Makefile              |    4 
 drivers/gru/gru.h                 |  348 +++++++++++++
 drivers/gru/gru_instructions.h    |  502 +++++++++++++++++++
 drivers/gru/grufault.c            |  557 ++++++++++++++++++++++
 drivers/gru/grufile.c             |  453 +++++++++++++++++
 drivers/gru/gruhandles.h          |  655 +++++++++++++++++++++++++
 drivers/gru/grukservices.c        |  129 +++++
 drivers/gru/grulib.h              |   84 +++
 drivers/gru/grumain.c             |  958 ++++++++++++++++++++++++++++++++++++++
 drivers/gru/grummuops.c           |  376 ++++++++++++++
 drivers/gru/gruprocfs.c           |  309 ++++++++++++
 drivers/gru/grutables.h           |  517 ++++++++++++++++++++
 drivers/sn/Kconfig                |    7 
 15 files changed, 4905 insertions(+)



Index: linux/drivers/Makefile
===================================================================
--- linux.orig/drivers/Makefile	2008-02-22 09:37:21.759206853 -0600
+++ linux/drivers/Makefile	2008-02-22 09:37:51.722947267 -0600
@@ -5,6 +5,7 @@
 # Rewritten to use lists instead of if-statements.
 #
 
+obj-$(CONFIG_GRU)		+= gru/
 obj-$(CONFIG_PCI)		+= pci/
 obj-$(CONFIG_PARISC)		+= parisc/
 obj-$(CONFIG_RAPIDIO)		+= rapidio/
Index: linux/drivers/gru/Makefile
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux/drivers/gru/Makefile	2008-02-22 09:37:51.742949764 -0600
@@ -0,0 +1,4 @@
+#
+EXTRA_CFLAGS += -Werror -Wall
+obj-$(CONFIG_GRU) := gru.o
+gru-y := grufile.o grumain.o grufault.o grummuops.o gruprocfs.o grukservices.o
Index: linux/drivers/sn/Kconfig
===================================================================
--- linux.orig/drivers/sn/Kconfig	2008-02-22 09:37:21.803212347 -0600
+++ linux/drivers/sn/Kconfig	2008-02-22 09:37:51.774953759 -0600
@@ -18,4 +18,11 @@ config SGI_IOC3
 	I/O controller or a PCI IOC3 serial card say Y.
 	Otherwise say N.
 
+config GRU
+	tristate "SGI GRU driver"
+	default y
+	---help---
+	This option enables basic support for the SGI UV GRU driver.
+
+
 endmenu
Index: linux/arch/ia64/sn/kernel/sn2/sn2_smp.c
===================================================================
--- linux.orig/arch/ia64/sn/kernel/sn2/sn2_smp.c	2008-02-22 09:37:21.831215842 -0600
+++ linux/arch/ia64/sn/kernel/sn2/sn2_smp.c	2008-02-22 09:37:51.838961749 -0600
@@ -113,6 +113,11 @@ void sn_migrate(struct task_struct *task
 	pda_t *last_pda = pdacpu(task_thread_info(task)->last_cpu);
 	volatile unsigned long *adr = last_pda->pio_write_status_addr;
 	unsigned long val = last_pda->pio_write_status_val;
+	extern void gru_migrate_task(int, int);
+
+	if (current->mm && hlist_empty(&current->mm->mmu_notifier.head) &&
+	    task_thread_info(current)->last_cpu != task_cpu(current))
+		gru_migrate_task(task_thread_info(current)->last_cpu, task_cpu(current));
 
 	/* Drain PIO writes from old CPU's Shub */
 	while (unlikely((*adr & SH_PIO_WRITE_STATUS_PENDING_WRITE_COUNT_MASK)
Index: linux/drivers/gru/gru.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux/drivers/gru/gru.h	2008-02-11 11:22:32.000000000 -0600
@@ -0,0 +1,348 @@
+/*
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2005-2008 Silicon Graphics, Inc.  All rights reserved.
+ */
+
+#ifndef _GRU_H_
+#define _GRU_H_
+
+#ifdef EMUSUPPORT
+#define _EMUSUPPORT 1
+#else
+#define _EMUSUPPORT 0
+#endif
+
+#ifndef __KERNEL__
+#include <stdlib.h>
+#else
+#include <linux/types.h>
+#endif
+
+/*
+ * Maximum number of GRU segments that a user can have open
+ * ZZZ temp - set higher for testing. Revisit.
+ */
+#define GRU_MAX_OPEN_CONTEXTS		32
+
+/*
+ * Constants for addressing user Gseg
+ */
+#define GRU_CB_BASE             0
+#define GRU_DS_BASE             0x20000
+#define GRU_HANDLE_STRIDE       256
+#define GRU_CACHE_LINE_BYTES	64
+
+
+/*
+ * GRU Segment limits
+ */
+#define GRU_MAX_CB		(128 - 16)
+#define GRU_DS_BYTES		(32768 - 1024)
+
+/*
+ * Pagesize used to map GRU GSeg
+ */
+#ifdef __ia64__
+#define GRU_GSEG_PAGESIZE	(256 * 1024)
+#define GRU_GSEG_PAGESIZE_SHIFT 18
+#else
+#define GRU_GSEG_PAGESIZE	(2 * 1024 * 1024UL)
+#endif
+
+
+/* Basic types  - improve type checking */
+typedef struct { void *cookie; } gru_cookie_t;
+typedef struct gru_control_segment_s gru_segment_t;
+typedef struct gru_control_block_s gru_control_block_t;
+
+/* Flags for GRU options on the gru_create_context() call */
+/* Select one of the follow 4 options to specify how TLB misses are handled */
+#define GRU_OPT_MISS_DEFAULT	0x0000	/* Use default mode */
+#define GRU_OPT_MISS_USER_POLL	0x0001	/* User will poll CB for faults */
+#define GRU_OPT_MISS_FMM_INTR	0x0002	/* Send interrut to cpu to
+					   handle fault */
+#define GRU_OPT_MISS_FMM_POLL	0x0003	/* Use system polling thread */
+#define GRU_OPT_MISS_MASK	0x0003	/* Mask for TLB MISS option */
+
+/*
+ * Ugly testing hack!! - if set, GRU thinks all pages are 1 TB.
+ * Works on emulator only
+ */
+#define GRU_OPT_FAKE_TB_PAGES	0x8000	/* EMU testing only - GRU uses
+					   1 TB pages */
+/*
+ * Get exception detail for CB that failed.
+ */
+
+/*
+ * Structure used to fetch exception detail for CBs that terminate with
+ * CBS_EXCEPTION
+ */
+struct control_block_extended_exc_detail {
+	unsigned long	cb;
+	int		opc;
+	int		ecause;
+	int		exopc;
+	long		exceptdet0;
+	int		exceptdet1;
+};
+
+
+
+/*----------------------------------------------------------------------------
+ * Inline functions for waiting for CB completion & checking CB status
+ */
+
+/*
+ * Control block status and exception codes
+ */
+#define CBS_IDLE			0
+#define CBS_EXCEPTION			1
+#define CBS_ACTIVE			2
+#define CBS_CALL_OS			3
+
+/* CB substatus bitmasks */
+#define CBSS_MSG_QUEUE_MASK		7
+#define CBSS_IMPLICIT_ABORT_ACTIVE_MASK	8
+
+/* CB substatus message queue values (low 3 bits of substatus) */
+#define CBSS_LB_OVERFLOWED		1
+#define CBSS_QLIMIT_REACHED		2
+#define CBSS_PAGE_OVERFLOW		3
+#define CBSS_AMO_NACKED			4
+#define CBSS_PUT_NACKED			5
+
+/*
+ * Control block definition for checking status
+ */
+struct gru_control_block_status {
+	volatile unsigned int	icmd		:1;
+	unsigned int		unused1		:31;
+	unsigned int		unused2		:24;
+	volatile unsigned int	istatus		:2;
+	volatile unsigned int	isubstatus	:4;
+	unsigned int		inused3		:2;
+};
+
+/* Get CB status */
+static inline int gru_get_cb_status(gru_control_block_t *cb)
+{
+	struct gru_control_block_status *cbs = (void *)cb;
+
+	return cbs->istatus;
+}
+
+/* Get CB message queue substatus */
+static inline int gru_get_cb_message_queue_substatus(gru_control_block_t *cb)
+{
+	struct gru_control_block_status *cbs = (void *)cb;
+
+	return cbs->isubstatus & CBSS_MSG_QUEUE_MASK;
+}
+
+/* Get CB substatus */
+static inline int gru_get_cb_substatus(gru_control_block_t *cb)
+{
+	struct gru_control_block_status *cbs = (void *)cb;
+
+	return cbs->isubstatus;
+}
+
+extern int gru_check_status_proc(gru_control_block_t *cb);
+extern int gru_wait_proc(gru_control_block_t *cb);
+extern void gru_wait_abort_proc(gru_control_block_t *cb);
+extern void gru_abort(int, gru_control_block_t *cb, char *str);
+
+/* Check the status of a CB. If the CB is in UPM mode, call the
+ * OS to handle the UPM status.
+ * Returns the CB status field value (0 for normal completion)
+ */
+static inline int gru_check_status(gru_control_block_t *cb)
+{
+	struct gru_control_block_status *cbs = (void *)cb;
+	int ret = cbs->istatus;
+
+	if (_EMUSUPPORT || ret == CBS_CALL_OS)
+		ret = gru_check_status_proc(cb);
+	return ret;
+}
+
+/* Wait for CB to complete.
+ * Returns the CB status field value (0 for normal completion)
+ */
+static inline int gru_wait(gru_control_block_t *cb)
+{
+	struct gru_control_block_status *cbs = (void *)cb;
+
+	if (cbs->istatus != CBS_IDLE)
+		return gru_wait_proc(cb);
+	return cbs->istatus;
+}
+
+/* Wait for CB to complete. Aborts program if error. (Note: error does NOT
+ * mean TLB mis - only fatal errors such as memory parity error or user
+ * bugs will cause termination.
+ */
+static inline void gru_wait_abort(gru_control_block_t *cb)
+{
+	struct gru_control_block_status *cbs = (void *)cb;
+
+	if (cbs->istatus != CBS_IDLE)
+		gru_wait_abort_proc(cb);
+}
+
+#ifndef __KERNEL__
+/* Name of DSO library */
+#define LIBGRU_SO		"libgru.so"
+
+/* Environment variables for controlling behavior*/
+
+/*
+ * Override TLBMISS fault map mode
+ * 	- "user_polling", "interrupt", "os_polling"
+ */
+#define GRU_TLBMISS_MODE_ENV	"GRU_TLBMISS_MODE"
+
+/* Set exception retry count for numalink timeout & memory parity */
+#define GRU_EXCEPTION_RETRY_ENV	"GRU_EXCEPTION_RETRY"
+#define GRU_EXCEPTION_RETRY_DEFAULT	3
+
+
+
+/*
+ * Create a new GRU context
+ *	cookie		- (OUT): magic identifier of the GRU segment
+ *	start 		- starting address for mmaped segments (NULL means
+ *			  OS picks address).
+ *	ctlblks 	- number of active control blocks
+ *	dataseg_bytes 	- number of data segment bytes
+ *	max_threads 	- maximum number of threads that will use the context
+ *	options 	- specifies various options
+ *			  (see constants below)
+ *
+ *  Returns 0 if successful, else error code returned in errno
+ */
+extern int gru_create_context(gru_cookie_t *cookie, void *start,
+			      unsigned int ctlblks, unsigned int dataseg_bytes,
+			      unsigned int max_threads, unsigned int options);
+
+
+/*
+ * Destroy a GRU context
+ * 	cookie	- cookie returned from gru_create_context()
+ *
+ * Returns:
+ * 	 0 - success
+ * 	-1 - failure. See errno for additional status
+ */
+extern int gru_destroy_context(gru_cookie_t cookie);
+
+
+/*
+ * Get the handle to a thread's private GRU context
+ * 	cookie		- cookie returned from gru_create_context()
+ * 	threadnum	- thread number (0 .. #threads-1)
+ *
+ * Returns pointer to GSeg if successful, else returns NULL.
+ * Error code returned in errno
+ */
+gru_segment_t *gru_get_thread_gru_segment(gru_cookie_t cookie, int threadnum);
+
+/*
+ * Flush a range of virtual addresses from the GRU TLB (intended for testcases
+ * only)
+ */
+int gru_flush_tlb(gru_segment_t *gseg, void *vaddr, size_t len);
+
+/*
+ * Unload a GRU context & free GRU resource. Will be reloaded on next
+ * reference.
+ */
+int gru_unload_context(void *gseg);
+
+/*
+ * Get struct control_block_extended_exc_detail for CB.
+ */
+extern int gru_get_cb_exception_detail(gru_control_block_t *cb,
+		       struct control_block_extended_exc_detail *excdet);
+
+/* Get a string that describes the CB exception detail. */
+extern char *gru_get_cb_exception_detail_str(int ret, gru_control_block_t *cb);
+
+
+/*
+ * Get a pointer to a control block
+ * 	gseg	- GSeg address returned from gru_get_thread_gru_segment()
+ * 	index	- index of desired CB
+ */
+static inline gru_control_block_t *gru_get_cb_pointer(gru_segment_t *gseg,
+						      int index)
+{
+	return (gru_control_block_t *)((void *)gseg + GRU_CB_BASE +
+				      index * GRU_HANDLE_STRIDE);
+}
+
+/*
+ * Get a pointer to a cacheline in the data segment portion of a GSeg
+ * 	gseg	- GSeg address returned from gru_get_thread_gru_segment()
+ * 	index	- index of desired cache line
+ */
+static inline void *gru_get_data_pointer(gru_segment_t *gseg, int index)
+{
+	return (void *)((void *)gseg + GRU_DS_BASE +
+			index * GRU_CACHE_LINE_BYTES);
+}
+
+/*
+ * Convert a vaddr into the tri index within the GSEG
+ * 	vaddr		- virtual address of within gseg
+ */
+static inline int gru_get_tri(void *vaddr)
+{
+	return (((unsigned long)vaddr & (GRU_GSEG_PAGESIZE - 1)) - GRU_DS_BASE);
+}
+#endif		/* ! __KERNEL__ */
+
+#ifdef EMUSUPPORT
+/*
+ * Hooks for instruction emulator
+ */
+enum {EMU_ID_SIM2_CHET, EMU_ID_SIM2_SIM2, EMU_ID_MEDUSA};
+int gru_emulator_id(void);
+
+extern void emuloguser(char *fmt, ...);
+extern int is_emu(void);
+# ifdef __KERNEL__
+   extern void emu_writeback_hook(void *p);
+   extern void emu_kwait_hook(void *p, int wait);
+#  define gru_flush_cache_hook(p)	emu_writeback_hook(p)
+#  define gru_emulator_wait_hook(p, w)	emu_kwait_hook(p, w)
+# else
+   extern void lib_cb_wait_hook(void *p, int wait) __attribute__ ((weak));
+   extern void lib_writeback_hook(void *p) __attribute__ ((weak));
+
+#  define gru_flush_cache_hook(p)				\
+	do {							\
+		if (lib_writeback_hook)				\
+			lib_writeback_hook(p);			\
+	 } while (0)
+
+#  define gru_emulator_wait_hook(p, w)				\
+	   do {							\
+		   if (lib_cb_wait_hook)			\
+			lib_cb_wait_hook(p, w);			\
+	   } while (0)
+
+# endif
+#else
+#define emuloguser		printf
+#define gru_flush_cache_hook(p)
+#define gru_emulator_wait_hook(p, w)
+#define is_emu() 0
+#endif
+
+#endif				/* _GRU_H_ */
Index: linux/drivers/gru/gru_instructions.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux/drivers/gru/gru_instructions.h	2008-01-25 08:13:07.135721041 -0600
@@ -0,0 +1,502 @@
+/*
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2005-2008 Silicon Graphics, Inc.  All rights reserved.
+ */
+
+#ifndef _GRU_INSTRUCTIONS_H_
+#define _GRU_INSTRUCTIONS_H_
+
+/*
+ * Instruction formats
+ */
+
+/*
+ * Generic instruction format.
+ * This definition has precise bit field definitions.
+ */
+struct gru_instruction_bits {
+    /* DW 0  - low */
+    unsigned int		icmd:      1;
+    unsigned char		ima:	   3;	/* CB_DelRep, unmapped mode */
+    unsigned char		reserved0: 4;
+    unsigned int		xtype:     3;
+    unsigned int		iaa0:      2;
+    unsigned int		iaa1:      2;
+    unsigned char		reserved1: 1;
+    unsigned char		opc:       8;	/* opcode */
+    unsigned char		exopc:     8;	/* extended opcode */
+    /* DW 0  - high */
+    unsigned int		idef2:    22;	/* TRi0 */
+    unsigned char		reserved2: 2;
+    unsigned char		istatus:   2;
+    unsigned char		isubstatus:4;
+    unsigned char		reserved3: 2;
+    /* DW 1 */
+    unsigned long		idef4;		/* 42 bits: TRi1, BufSize */
+    /* DW 2-6 */
+    unsigned long		idef1;		/* BAddr0 */
+    unsigned long		idef5;		/* Nelem */
+    unsigned long		idef6;		/* Stride, Operand1 */
+    unsigned long		idef3;		/* BAddr1, Value, Operand2 */
+    unsigned long		reserved4;
+    /* DW 7 */
+    unsigned long		avalue;		 /* AValue */
+};
+
+/*
+ * Generic instruction with friendlier names. This format is used
+ * for inline instructions.
+ */
+struct gru_instruction {
+    /* DW 0 */
+    volatile unsigned int	op32;    /* icmd,xtype,iaa0,ima,opc */
+    unsigned int		tri0;
+    /* DW 1-7 */
+    unsigned long		tri1_bufsize;
+    unsigned long		baddr0;
+    unsigned long		nelem;
+    unsigned long		op1_stride;
+    unsigned long		op2_value_baddr1;
+    unsigned long		reserved0;
+    unsigned long		avalue;
+};
+
+/* Some shifts and masks for the low 32 bits of a GRU command */
+#define GRU_CB_ICMD_SHFT	0
+#define GRU_CB_ICMD_MASK	0x1
+#define GRU_CB_XTYPE_SHFT	8
+#define GRU_CB_XTYPE_MASK	0x7
+#define GRU_CB_IAA0_SHFT	11
+#define GRU_CB_IAA0_MASK	0x3
+#define GRU_CB_IAA1_SHFT	13
+#define GRU_CB_IAA1_MASK	0x3
+#define GRU_CB_IMA_SHFT		1
+#define GRU_CB_IMA_MASK		0x3
+#define GRU_CB_OPC_SHFT		16
+#define GRU_CB_OPC_MASK		0xff
+#define GRU_CB_EXOPC_SHFT	24
+#define GRU_CB_EXOPC_MASK	0xff
+
+/* GRU instruction opcodes (opc field) */
+#define OP_NOP		0x00
+#define OP_BCOPY	0x01
+#define OP_VLOAD	0x02
+#define OP_IVLOAD	0x03
+#define OP_VSTORE	0x04
+#define OP_IVSTORE	0x05
+#define OP_VSET		0x06
+#define OP_IVSET	0x07
+#define OP_MESQ		0x08
+#define OP_GAMXR	0x09
+#define OP_GAMIR	0x0a
+#define OP_GAMIRR	0x0b
+#define OP_GAMER	0x0c
+#define OP_GAMERR	0x0d
+#define OP_BSTORE	0x0e
+#define OP_VFLUSH	0x0f
+
+
+/* Extended opcodes values (exopc field) */
+
+/* GAMIR - AMOs with implicit operands */
+#define EOP_IR_FETCH	0x01 /* Plain fetch of memory */
+#define EOP_IR_CLR	0x02 /* Fetch and clear */
+#define EOP_IR_INC	0x05 /* Fetch and increment */
+#define EOP_IR_DEC	0x07 /* Fetch and decrement */
+#define EOP_IR_QCHK1	0x0d /* Queue check, 64 byte msg */
+#define EOP_IR_QCHK2	0x0e /* Queue check, 128 byte msg */
+
+/* GAMIRR - Registered AMOs with implicit operands */
+#define EOP_IRR_FETCH	0x01 /* Registered fetch of memory */
+#define EOP_IRR_CLR	0x02 /* Registered fetch and clear */
+#define EOP_IRR_INC	0x05 /* Registered fetch and increment */
+#define EOP_IRR_DEC	0x07 /* Registered fetch and decrement */
+#define EOP_IRR_DECZ	0x0f /* Registered fetch and decrement, update on zero*/
+
+/* GAMER - AMOs with explicit operands */
+#define EOP_ER_SWAP	0x00 /* Exchange argument and memory */
+#define EOP_ER_OR	0x01 /* Logical OR with memory */
+#define EOP_ER_AND	0x02 /* Logical AND with memory */
+#define EOP_ER_XOR	0x03 /* Logical XOR with memory */
+#define EOP_ER_ADD	0x04 /* Add value to memory */
+#define EOP_ER_CSWAP	0x08 /* Compare with operand2, write operand1 if match*/
+#define EOP_ER_CADD	0x0c /* Queue check, operand1*64 byte msg */
+
+/* GAMERR - Registered AMOs with explicit operands */
+#define EOP_ERR_SWAP	0x00 /* Exchange argument and memory */
+#define EOP_ERR_OR	0x01 /* Logical OR with memory */
+#define EOP_ERR_AND	0x02 /* Logical AND with memory */
+#define EOP_ERR_XOR	0x03 /* Logical XOR with memory */
+#define EOP_ERR_ADD	0x04 /* Add value to memory */
+#define EOP_ERR_CSWAP	0x08 /* Compare with operand2, write operand1 if match*/
+#define EOP_ERR_EPOLL	0x09 /* Poll for equality */
+#define EOP_ERR_NPOLL	0x0a /* Poll for inequality */
+
+/* GAMXR - SGI Arithmetic unit */
+
+
+/* Transfer types (xtype field) */
+#define XTYPE_B		0x0	/* byte */
+#define XTYPE_S		0x1	/* short (2-byte) */
+#define XTYPE_W		0x2	/* word (4-byte) */
+#define XTYPE_DW	0x3	/* doubleword (8-byte) */
+#define XTYPE_RSVD4	0x4
+#define XTYPE_RSVD5	0x5
+#define XTYPE_CL	0x6	/* cacheline (64-byte) */
+#define XTYPE_RSVD7	0x7
+
+
+/* Instruction access attributes (iaa0, iaa1 fields) */
+#define IAA_RAM		0x0	/* normal cached RAM access */
+#define IAA_NCRAM	0x2	/* noncoherent RAM access */
+#define IAA_MMIO	0x1	/* noncoherent memory-mapped I/O space */
+#define IAA_REGISTER	0x3	/* memory-mapped registers, etc. */
+
+
+/* Instruction mode attributes (ima field) */
+#define IMA_CB_DELAY	0x1	/* hold read responses until status changes */
+#define IMA_UNMAPPED	0x2	/* bypass the TLBs (OS only) */
+#define IMA_INTERRUPT	0x4	/* Interrupt when instruction completes */
+
+/* CBE ecause bits */
+#define CBE_CAUSE_RI_BIT					0
+#define CBE_CAUSE_INVALID_INSTRUCTION_BIT			1
+#define CBE_CAUSE_UNMAPPED_MODE_FORBIDDEN_BIT			2
+#define CBE_CAUSE_PE_CHECK_DATA_ERROR_BIT			3
+#define CBE_CAUSE_IAA_GAA_MISMATCH_BIT				4
+#define CBE_CAUSE_DATA_SEGMENT_LIMIT_EXCEPTION_BIT		5
+#define CBE_CAUSE_OS_FATAL_TLB_FAULT_BIT			6
+#define CBE_CAUSE_EXECUTION_HW_ERROR_BIT			7
+#define CBE_CAUSE_TLBHW_ERROR_BIT				8
+#define CBE_CAUSE_RA_REQUEST_TIMEOUT_BIT			9
+#define CBE_CAUSE_HA_REQUEST_TIMEOUT_BIT			10
+#define CBE_CAUSE_RA_RESPONSE_FATAL_BIT				11
+#define CBE_CAUSE_RA_RESPONSE_NON_FATAL_BIT			12
+#define CBE_CAUSE_HA_RESPONSE_FATAL_BIT				13
+#define CBE_CAUSE_HA_RESPONSE_NON_FATAL_BIT			14
+#define CBE_CAUSE_ADDRESS_SPACE_DECODE_ERROR_BIT		15
+#define CBE_CAUSE_RESPONSE_DATA_ERROR_BIT			16
+#define CBE_CAUSE_PROTOCOL_STATE_DATA_ERROR_BIT			17
+
+#define CBE_CAUSE_RI				(1 << CBE_CAUSE_RI_BIT)
+#define CBE_CAUSE_INVALID_INSTRUCTION		(1 << CBE_CAUSE_INVALID_INSTRUCTION_BIT)
+#define CBE_CAUSE_UNMAPPED_MODE_FORBIDDEN	(1 << CBE_CAUSE_UNMAPPED_MODE_FORBIDDEN_BIT)
+#define CBE_CAUSE_PE_CHECK_DATA_ERROR		(1 << CBE_CAUSE_PE_CHECK_DATA_ERROR_BIT)
+#define CBE_CAUSE_IAA_GAA_MISMATCH		(1 << CBE_CAUSE_IAA_GAA_MISMATCH_BIT)
+#define CBE_CAUSE_DATA_SEGMENT_LIMIT_EXCEPTION	(1 << CBE_CAUSE_DATA_SEGMENT_LIMIT_EXCEPTION_BIT)
+#define CBE_CAUSE_OS_FATAL_TLB_FAULT		(1 << CBE_CAUSE_OS_FATAL_TLB_FAULT_BIT)
+#define CBE_CAUSE_EXECUTION_HW_ERROR		(1 << CBE_CAUSE_EXECUTION_HW_ERROR_BIT)
+#define CBE_CAUSE_TLBHW_ERROR			(1 << CBE_CAUSE_TLBHW_ERROR_BIT)
+#define CBE_CAUSE_RA_REQUEST_TIMEOUT		(1 << CBE_CAUSE_RA_REQUEST_TIMEOUT_BIT)
+#define CBE_CAUSE_HA_REQUEST_TIMEOUT		(1 << CBE_CAUSE_HA_REQUEST_TIMEOUT_BIT)
+#define CBE_CAUSE_RA_RESPONSE_FATAL		(1 << CBE_CAUSE_RA_RESPONSE_FATAL_BIT)
+#define CBE_CAUSE_RA_RESPONSE_NON_FATAL		(1 << CBE_CAUSE_RA_RESPONSE_NON_FATAL_BIT)
+#define CBE_CAUSE_HA_RESPONSE_FATAL		(1 << CBE_CAUSE_HA_RESPONSE_FATAL_BIT)
+#define CBE_CAUSE_HA_RESPONSE_NON_FATAL		(1 << CBE_CAUSE_HA_RESPONSE_NON_FATAL_BIT)
+#define CBE_CAUSE_ADDRESS_SPACE_DECODE_ERROR	(1 << CBE_CAUSE_ADDRESS_SPACE_DECODE_ERROR_BIT)
+#define CBE_CAUSE_RESPONSE_DATA_ERROR		(1 << CBE_CAUSE_RESPONSE_DATA_ERROR_BIT)
+#define CBE_CAUSE_PROTOCOL_STATE_DATA_ERROR	(1 << CBE_CAUSE_PROTOCOL_STATE_DATA_ERROR_BIT)
+
+
+/* Message queue head structure */
+union gru_mesqhead {
+	unsigned long	val;
+	struct {
+		unsigned int	head;
+		unsigned int	limit;
+	} q;
+};
+
+
+/* Generate the low word of a GRU instruction */
+static inline unsigned int
+opword(unsigned char opcode, unsigned char exopc, unsigned char xtype,
+       unsigned char iaa0, unsigned char iaa1,
+       unsigned char ima)
+{
+    return ((1 << GRU_CB_ICMD_SHFT) |
+	    (iaa0 << GRU_CB_IAA0_SHFT) |
+	    (iaa1 << GRU_CB_IAA1_SHFT) |
+	    (xtype << GRU_CB_XTYPE_SHFT) |
+	    (ima << GRU_CB_IMA_SHFT) |
+	    (opcode << GRU_CB_OPC_SHFT) |
+	    (exopc << GRU_CB_EXOPC_SHFT));
+}
+
+/*
+ * Prefetch a cacheline
+ * 	??? should I use actual "load" or hardware prefetch???
+ */
+static inline void gru_prefetch(void *p)
+{
+	*(volatile char *)p;
+}
+
+
+/*
+ * Use the "fc" instruction as a hook into the emulator
+ * 	ZZZ serialization requirements here???
+ */
+static inline void gru_flush_cache(void *p)
+{
+#if defined(__ia64__)
+	asm volatile ("fc %0"::"r" (p):"memory");
+#elif defined(__x86_64__)
+	asm volatile("clflush %0" :: "m" (p));
+#else
+#error "bad arch"
+#endif
+	gru_flush_cache_hook(p);	/* No code generated unless -D EMUSUPPORT */
+}
+
+
+/* Values for the "hints" parameter of the GRU instruction functions */
+#define HINT_CB_UNMAPPED	IMA_UNMAPPED
+#define HINT_CB_DELAY		IMA_CB_DELAY
+
+/* Convert "hints" to IMA */
+#define CB_IMA(h)		((h) & (IMA_UNMAPPED | IMA_CB_DELAY))
+
+/* Convert data segment cache line index into TRI0 / TRI1 value */
+#define GRU_DINDEX(i)		((i) * GRU_CACHE_LINE_BYTES)
+
+/* Inline functions for GRU instructions.
+ *     Note:
+ *     	- nelem and stride are in elements
+ *     	- tri0/tri1 is in bytes for the beginning of the data segment.
+ */
+static inline void gru_vload(gru_control_block_t *cb, void *mem_addr, int iaa0,
+		unsigned int tri0, unsigned char xtype, unsigned long nelem,
+		unsigned long stride, unsigned long hints)
+{
+	struct gru_instruction *ins = (struct gru_instruction *)cb;
+
+	ins->baddr0 = (long)mem_addr;
+	ins->nelem = nelem;
+	ins->tri0 = tri0;
+	ins->op1_stride = stride;
+	ins->op32 = opword(OP_VLOAD, 0, xtype, iaa0, 0, CB_IMA(hints));
+	gru_flush_cache(ins);
+}
+
+static inline void gru_vstore(gru_control_block_t *cb, void *mem_addr, int iaa0,
+		unsigned int tri0, unsigned char xtype, unsigned long nelem,
+		unsigned long stride, unsigned long hints)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	ins->baddr0 = (long)mem_addr;
+	ins->nelem = nelem;
+	ins->tri0 = tri0;
+	ins->op1_stride = stride;
+	ins->op32 = opword(OP_VSTORE, 0, xtype, iaa0, 0, CB_IMA(hints));
+	gru_flush_cache(ins);
+}
+
+static inline void gru_ivload(gru_control_block_t *cb, void *mem_addr, int iaa0,
+		unsigned int tri0, unsigned int tri1, unsigned char xtype,
+		unsigned long nelem, unsigned long hints)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	ins->baddr0 = (long)mem_addr;
+	ins->nelem = nelem;
+	ins->tri0 = tri0;
+	ins->tri1_bufsize = tri1;
+	ins->op32 = opword(OP_IVLOAD, 0, xtype, iaa0, 0, CB_IMA(hints));
+	gru_flush_cache(ins);
+}
+
+static inline void gru_ivstore(gru_control_block_t *cb, void *mem_addr,
+		int iaa0, unsigned int tri0, unsigned int tri1,
+		unsigned char xtype, unsigned long nelem, unsigned long hints)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	ins->baddr0 = (long)mem_addr;
+	ins->nelem = nelem;
+	ins->tri0 = tri0;
+	ins->tri1_bufsize = tri1;
+	ins->op32 = opword(OP_IVSTORE, 0, xtype, iaa0, 0, CB_IMA(hints));
+	gru_flush_cache(ins);
+}
+
+static inline void gru_vset(gru_control_block_t *cb, void *mem_addr, int iaa0,
+		unsigned long value, unsigned char xtype, unsigned long nelem,
+		unsigned long stride, unsigned long hints)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	ins->baddr0 = (long)mem_addr;
+	ins->op2_value_baddr1 = value;
+	ins->nelem = nelem;
+	ins->op1_stride = stride;
+	ins->op32 = opword(OP_VSET, 0, xtype, iaa0, 0, CB_IMA(hints));
+	gru_flush_cache(ins);
+}
+
+static inline void gru_ivset(gru_control_block_t *cb, void *mem_addr, int iaa0,
+		unsigned long value, unsigned int tri1, unsigned char xtype,
+		unsigned long nelem, unsigned long hints)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	ins->baddr0 = (long)mem_addr;
+	ins->op2_value_baddr1 = value;
+	ins->nelem = nelem;
+	ins->tri1_bufsize = tri1;
+	ins->op32 = opword(OP_IVSET, 0, xtype, iaa0, 0, CB_IMA(hints));
+	gru_flush_cache(ins);
+}
+
+static inline void gru_vflush(gru_control_block_t *cb, void *mem_addr, int iaa0,
+		unsigned long nelem, unsigned char xtype, unsigned long stride,
+		unsigned long hints)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	ins->baddr0 = (long)mem_addr;
+	ins->op1_stride = stride;
+	ins->nelem = nelem;
+	ins->op32 = opword(OP_VFLUSH, 0, xtype, iaa0, 0, CB_IMA(hints));
+	gru_flush_cache(ins);
+}
+
+static inline void gru_nop(gru_control_block_t *cb, int hints)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	ins->op32 = opword(OP_NOP, 0, 0, 0, 0, CB_IMA(hints));
+	gru_flush_cache(ins);
+}
+
+
+static inline void gru_bcopy(gru_control_block_t *cb, const void *src,
+		int iaa0, void *dest, int iaa1,
+		unsigned long nelem, unsigned int xtype, unsigned int tri0,
+		unsigned int bufsize, unsigned long hints)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	ins->baddr0 = (long)src;
+	ins->op2_value_baddr1 = (long)dest;
+	ins->nelem = nelem;
+	ins->tri0 = tri0;
+	ins->tri1_bufsize = bufsize;
+	ins->op1_stride = 1;
+	ins->op32 = opword(OP_BCOPY, 0, xtype, iaa0, iaa1, CB_IMA(hints));
+	gru_flush_cache(ins);
+}
+
+static inline void gru_bstore(gru_control_block_t *cb, const void *src,
+		void *dest, int iaa0, unsigned long nelem, unsigned int xtype,
+		unsigned int tri0, unsigned int stride, unsigned long hints)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	ins->baddr0 = (long)src;
+	ins->op2_value_baddr1 = (long)dest;
+	ins->nelem = nelem;
+	ins->tri0 = tri0;
+	ins->op1_stride = stride;
+	ins->op32 = opword(OP_BSTORE, 0, xtype, iaa0, iaa0, CB_IMA(hints));
+	/* ZZZ iaa0 or iaa1 */
+	gru_flush_cache(ins);
+}
+
+static inline void gru_gamir(gru_control_block_t *cb, int exopc, void *src,
+		int iaa0, unsigned int xtype, unsigned long hints)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	ins->baddr0 = (long)src;
+	ins->op32 = opword(OP_GAMIR, exopc, xtype, iaa0, 0, CB_IMA(hints));
+	gru_flush_cache(ins);
+}
+
+static inline void gru_gamirr(gru_control_block_t *cb, int exopc, void *src,
+		int iaa0, unsigned int xtype, unsigned long hints)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	ins->baddr0 = (long)src;
+	ins->op32 = opword(OP_GAMIRR, exopc, xtype, iaa0, 0, CB_IMA(hints));
+	gru_flush_cache(ins);
+}
+
+static inline void gru_gamer(gru_control_block_t *cb, int exopc, void *src,
+		int iaa0, unsigned int xtype,
+		unsigned long operand1, unsigned long operand2,
+		unsigned long hints)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	ins->baddr0 = (long)src;
+	ins->op2_value_baddr1 = operand1;
+	ins->op1_stride = operand2;
+	ins->op32 = opword(OP_GAMER, exopc, xtype, iaa0, 0, CB_IMA(hints));
+	gru_flush_cache(ins);
+}
+
+static inline void gru_gamerr(gru_control_block_t *cb, int exopc, void *src,
+		int iaa0, unsigned int xtype, unsigned long operand1,
+		unsigned long operand2, unsigned long hints)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	ins->baddr0 = (long)src;
+	ins->op2_value_baddr1 = operand1;
+	ins->op1_stride = operand2;
+	ins->op32 = opword(OP_GAMERR, exopc, xtype, iaa0, 0, CB_IMA(hints));
+	gru_flush_cache(ins);
+}
+
+static inline void gru_mesq(gru_control_block_t *cb, void *queue, int iaa0,
+		unsigned long msg_bytes, unsigned long tri0,
+		unsigned long hints)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	ins->baddr0 = (long)queue;
+	ins->nelem = msg_bytes / GRU_CACHE_LINE_BYTES;
+	ins->tri0 = tri0;
+	ins->op32 = opword(OP_MESQ, 0, XTYPE_CL, iaa0, 0, CB_IMA(hints));
+	gru_flush_cache(ins);
+}
+
+static inline unsigned long gru_get_amo_value(gru_control_block_t *cb)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	return ins->avalue;
+}
+
+static inline int gru_get_amo_value_head(gru_control_block_t *cb)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	return (ins->avalue & 0xffffffff);
+}
+
+static inline int gru_get_amo_value_limit(gru_control_block_t *cb)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	return ins->avalue >> 32;
+}
+
+static inline union gru_mesqhead  gru_mesq_head(int head, int limit)
+{
+	union gru_mesqhead mqh;
+
+	mqh.q.head = head;
+	mqh.q.limit = limit;
+	return mqh;
+}
+
+
+#endif				/* _GRU_INSTRUCTIONS_H_ */
Index: linux/drivers/gru/grufault.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux/drivers/gru/grufault.c	2008-02-19 10:19:25.876327857 -0600
@@ -0,0 +1,557 @@
+/*
+ * SN Platform GRU Driver
+ *
+ *              FAULT HANDLER FOR GRU DETECTED TLB MISSES
+ *
+ * This file contains code that handles TLB misses within the GRU.
+ * These misses are reported either via interrupts or user polling of
+ * the user CB.
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2005-2008 Silicon Graphics, Inc.  All Rights Reserved.
+ */
+
+#ifdef EMU
+#include "preemu.h"
+#endif
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/device.h>
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include "gru.h"
+#include "grutables.h"
+#include "grulib.h"
+#include "gru_instructions.h"
+#ifdef EMU
+#include "emu.h"
+#endif
+
+/*
+ * Test if a physical address is a valid GRU GSEG address
+ */
+static inline int is_gru_paddr(unsigned long paddr)
+{
+	return (paddr >= gru_start_paddr && paddr < gru_end_paddr);
+}
+
+/*
+ * Find and lock the gts that contains the specified user vaddr.
+ *
+ * Returns:
+ * 	- *gts with the mmap_sem locked for read and the GTS locked.
+ *	- NULL if vaddr invalid OR is not a valid GSEG vaddr.
+ */
+
+static struct gru_thread_state *gru_find_and_lock_gts(unsigned long vaddr)
+{
+	struct vm_area_struct *vma;
+	struct gru_thread_state *gts;
+
+	down_read(&current->mm->mmap_sem);
+	vma = find_vma(current->mm, vaddr);
+	if (vma && vma->vm_start <= vaddr && vma->vm_ops == &gru_vm_ops) {
+		gts = gru_find_thread_state(vma, TSID(vaddr - vma->vm_start));
+		if (gts) {
+			down(&gts->ts_ctxsem);
+			return gts;
+		}
+	}
+	up_read(&current->mm->mmap_sem);
+	return NULL;
+}
+
+/*
+ * Unlock a GTS that was previously locked with gru_find_and_lock_gts().
+ */
+static void gru_unlock_gts(struct gru_thread_state *gts)
+{
+	up(&gts->ts_ctxsem);
+	up_read(&current->mm->mmap_sem);
+}
+
+/*
+ * Set a CB.istatus to active using a user virtual address. This must be done
+ * just prior to a TFH RESTART. The new cb.istatus is an in-cache status ONLY.
+ * If the line is evicted, the status may be lost. The in-cache update
+ * is necessary to prevent the user from seeing a stale cb.istatus that will
+ * change as soon as the TFH restart is complete. Races may cause an
+ * occasional failure to clear the cb.istatus, but that is ok.
+ */
+static void gru_cb_set_istatus_active(unsigned long __user *cb)
+{
+	union {
+		struct gru_instruction_bits bits;
+		unsigned long dw;
+	} u;
+
+	if (cb) {
+		get_user(u.dw, cb);
+		u.bits.istatus = CBS_ACTIVE;
+		put_user(u.dw, cb);
+	}
+}
+
+/*
+ * Convert a interrupt IRQ to a pointer to the GRU GTS that caused the
+ * interrupt. Interrupts are always sent to a cpu on the blade that contains the
+ * GRU (except for headless blades which are not currently supported). A blade
+ * has N grus; a block of N consecutive IRQs is assigned to the GRUs. The IRQ
+ * number uniquely identifies the GRU chipleton the local blade that caused the
+ * interrupt. Always called in interrupt context.
+ */
+static inline struct gru_state *irq_to_gru(int irq)
+{
+	return &gru_base[numa_blade_id()]->bs_grus[irq - IRQ_GRU];
+}
+
+/*
+ * Read & clear a TFM
+ *
+ * The GRU has an array of fault maps. A map is private to a cpu
+ * Only one cpu will be accessing a cpu's fault map.
+ *
+ * This function scans the cpu-private fault map & clears all bits that
+ * are set. The function returns a bitmap that indicates the bits that
+ * were cleared. Note that sense the maps may be updated asynchronously by
+ * the GRU, atomic operations must be used to clear bits.
+ */
+static void get_clear_fault_map(struct gru_state *gru,
+				struct gru_tlb_fault_map *map)
+{
+	unsigned long i, k;
+	struct gru_tlb_fault_map *tfm;
+
+	tfm = get_tfm_for_cpu(gru, gru_cpu_fault_map_id());
+	prefetchw(tfm);		/* Helps on hardware, required for emulator */
+	for (i = 0; i < BITS_TO_LONGS(GRU_NUM_CBE); i++) {
+		k = tfm->fault_bits[i];
+		if (k)
+			k = xchg(&tfm->fault_bits[i], 0UL);
+		map->fault_bits[i] = k;
+	}
+
+	/*
+	 * Not functionally required but helps performance. (Required
+	 * on emulator)
+	 */
+	gru_flush_cache(tfm);
+}
+
+/*
+ * Atomic (interrupt context) & non-atomic (user context) functions to
+ * convert a vaddr into a physical address & pagesize.
+ * 	returns:
+ * 		  0 - successful
+ * 		< 0 - error code
+ * 		  1 - (atomic only) try again in non-atomic context
+ */
+static int non_atomic_pte_lookup(struct vm_area_struct *vma,
+				 unsigned long vaddr, int write,
+				 unsigned long *paddr, int *pagesize)
+{
+	struct page *page;
+
+	if (get_user_pages
+	    (current, current->mm, vaddr, 1, write, 1, &page, NULL) <= 0)
+		return -EFAULT;
+	*paddr = page_to_phys(page);
+	*pagesize =
+	    is_vm_hugetlb_page(vma) ? GRU_PAGESIZE(HPAGE_SHIFT) :
+	    GRU_PAGESIZE(PAGE_SHIFT);
+	put_page(page);
+	return 0;
+}
+
+static int atomic_pte_lookup(struct vm_area_struct *vma, unsigned long vaddr,
+			     int write, unsigned long *paddr, int *pagesize)
+{
+	struct page *page;
+
+	page = follow_page(vma, vaddr, (write ? FOLL_WRITE : 0));
+	if (!page)
+		return 1;
+	*paddr = page_to_phys(page);
+	*pagesize =
+	    is_vm_hugetlb_page(vma) ? GRU_PAGESIZE(HPAGE_SHIFT) :
+	    GRU_PAGESIZE(PAGE_SHIFT);
+	return 0;
+}
+
+/*
+ * Drop a TLB entry into the GRU. The fault is described by info in an TFH.
+ *	Input:
+ *		cb    Address of user CBR. Null if not running in user context
+ * 	Return:
+ * 		  0 = dropin, exception, or switch to UPM successful
+ * 		  1 = range invalidate active
+ * 		  2 = asid == 0
+ * 		< 0 = error code
+ *
+ */
+static int gru_try_dropin(struct gru_thread_state *gts,
+			  struct gru_tlb_fault_handle *tfh,
+			  unsigned long __user *cb)
+{
+	struct mm_struct *mm = gts->ts_mm;
+	struct vm_area_struct *vma;
+	int pagesize, asid, write, ret;
+	unsigned long paddr, vaddr;
+
+	/*
+	 * NOTE: The GRU contains magic hardware that eliminates races between
+	 * TLB invalidates and TLB dropins. If an invalidate occurs
+	 * in the window between reading the TFH and the subsequent TLB dropin,
+	 * the dropin is ignored. This eliminates the need for additional locks.
+	 */
+	write = (tfh->cause & TFHCAUSE_TLB_MOD) != 0;
+	vaddr = tfh->missvaddr;
+	asid = tfh->missasid;
+	if (asid == 0)
+		goto failnoasid;
+
+	rmb();	/* TFH must be cache resident before reading ms_range_active */
+
+	/*
+	 * TFH is cache resident - at least briefly. Fail the dropin
+	 * if a range invalidate is active.
+	 */
+	if (atomic_read(&gts->ts_ms->ms_range_active))
+		goto failactive;
+
+	vma = find_vma(mm, vaddr);
+	if (!vma)
+		goto failinval;
+
+	/*
+	 * Atomic lookup is faster & usually works even if called in non-atomic
+	 * context.
+	 */
+	ret = atomic_pte_lookup(vma, vaddr, write, &paddr, &pagesize);
+	if (ret) {
+		if (!cb)
+			goto failupm;
+		if (non_atomic_pte_lookup(vma, vaddr, write, &paddr, &pagesize))
+			goto failinval;
+	}
+	if (is_gru_paddr(paddr))
+		goto failinval;
+	gru_cb_set_istatus_active(cb);
+	tfh_write_restart(tfh, paddr, GAA_RAM, vaddr, asid, write, pagesize);
+	STAT(tlb_dropin);
+	gru_dbg(grudev,
+		"%s: tfh 0x%p, vaddr 0x%lx, asid 0x%x, ps %d, paddr 0x%lx\n",
+		ret ? "non-atomic" : "atomic", tfh, vaddr, asid, pagesize,
+		paddr);
+	return 0;
+
+failnoasid:
+	/* No asid (delayed unload). */
+	STAT(tlb_dropin_fail_no_asid);
+	gru_dbg(grudev, "FAILED no_asid tfh: 0x%p, vaddr 0x%lx\n", tfh, vaddr);
+	if (!cb)
+		tfh_user_polling_mode(tfh);
+	return 2;
+
+failupm:
+	/* Atomic failure switch CBR to UPM */
+	STAT(tlb_dropin_fail_upm);
+	gru_dbg(grudev, "FAILED upm tfh: 0x%p, vaddr 0x%lx\n", tfh, vaddr);
+	tfh_user_polling_mode(tfh);
+	return 1;
+
+failinval:
+	/* All errors (atomic & non-atomic) switch CBR to EXCEPTION state */
+	STAT(tlb_dropin_fail_invalid);
+	gru_dbg(grudev, "FAILED inval tfh: 0x%p, vaddr 0x%lx\n", tfh, vaddr);
+	tfh_exception(tfh);
+	return -EFAULT;
+
+failactive:
+	/* Range invalidate active. Switch to UPM iff atomic */
+	STAT(tlb_dropin_fail_range_active);
+	gru_dbg(grudev, "FAILED range active: tfh 0x%p, vaddr 0x%lx\n",
+		tfh, vaddr);
+	if (!cb)
+		tfh_user_polling_mode(tfh);
+	return 1;
+}
+
+/*
+ * Process an external interrupt from the GRU. This interrupt is
+ * caused by a TLB miss.
+ * Note that this is the interrupt handler that is registered with linux
+ * interrupt handlers.
+ */
+irqreturn_t gru_intr(int irq, void *dev_id)
+{
+	struct gru_state *gru;
+	struct gru_tlb_fault_map map;
+	struct gru_thread_state *gts;
+	struct gru_tlb_fault_handle *tfh = NULL;
+	int cbrnum, ctxnum;
+
+	STAT(intr);
+
+	gru = irq_to_gru(irq);
+	if (!gru) {
+		dev_err(grudev, "GRU: invalid interrupt: cpu %d, irq %d\n",
+			raw_smp_processor_id(), irq);
+		return IRQ_NONE;
+	}
+	get_clear_fault_map(gru, &map);
+	gru_dbg(grudev, "irq %d, gru %x, map 0x%lx\n", irq, gru->gs_gid,
+		map.fault_bits[0]);
+
+	for_each_cbr_in_tfm(cbrnum, map.fault_bits) {
+		tfh = get_tfh_by_index(gru, cbrnum);
+		prefetchw(tfh);	/* Helps on hdw, required for emulator */
+
+		/*
+		 * When hardware sets a bit in the faultmap, it implicitly
+		 * locks the GRU context so that it cannot be unloaded.
+		 * gs_gts cannot change until a TFH start/writestart command
+		 * is issued
+		 */
+		ctxnum = tfh->ctxnum;
+		gts = gru->gs_gts[ctxnum];
+		if (down_read_trylock(&gts->ts_mm->mmap_sem)) {
+			gru_try_dropin(gts, tfh, NULL);
+			up_read(&gts->ts_mm->mmap_sem);
+		} else {
+			tfh_user_polling_mode(tfh);
+		}
+	}
+	return IRQ_HANDLED;
+}
+
+/*
+ * UPM call but nothing found in TFH. It _could_ be a race that was lost,
+ * a user bug, or a hardware bug. Try to determine which.
+ */
+static int gru_check_for_bug(unsigned long arg,
+			     struct gru_tlb_fault_handle *tfh)
+{
+	struct gru_instruction_bits ins, *cb = (void *)arg;
+
+	STAT(call_os_check_for_bug);
+	gru_dbg(grudev, "cb %p\n", cb);
+	if (copy_from_user(&ins, cb, sizeof(ins)))
+		return -EFAULT;
+	if (cb->istatus != CBS_CALL_OS)
+		return 0;
+	barrier();
+	gru_flush_cache(cb);
+	if (copy_from_user(&ins, cb, sizeof(ins)))
+		return -EFAULT;
+	if (cb->istatus != CBS_CALL_OS) {
+		dev_info(grudev, "cb %p: Possible coherency bug\n", cb);
+		return 0;
+	}
+
+	gru_flush_cache(tfh);
+	barrier();
+
+	if (tfh->state == TFHSTATE_MISS_UPM) {
+		dev_info(grudev, "tfh %p: Possible coherency bug\n", cb);
+		return -EAGAIN;
+	}
+	gru_dbg(grudev, "cb %p: CB in UPM state but no TFH fault\n", cb);
+	return -EIO;
+
+}
+
+static int gru_user_dropin(struct gru_thread_state *gts,
+			   struct gru_tlb_fault_handle *tfh,
+			   unsigned long __user *cb)
+{
+	struct gru_mm_struct *gms = gts->ts_ms;
+	int ret;
+
+	while (1) {
+		wait_event(gms->ms_wait_queue,
+			   atomic_read(&gms->ms_range_active) == 0);
+		prefetchw(tfh);	/* Helps on hdw, required for emulator */
+		ret = gru_try_dropin(gts, tfh, cb);
+		if (ret <= 0)
+			return ret;
+		STAT(call_os_wait_queue);
+	}
+}
+
+/*
+ * This interface is called as a result of a user detecting a "call OS" bit
+ * in a user CB. Normally means that a TLB fault has occurred.
+ * 	cb - user virtual address of the CB
+ */
+int gru_handle_user_call_os(unsigned long cb)
+{
+	struct gru_tlb_fault_handle *tfh;
+	struct gru_thread_state *gts;
+	unsigned long __user *cbp;
+	int ucbnum, cbrnum, ret = -EINVAL;
+
+	STAT(call_os);
+	gru_dbg(grudev, "address 0x%lx\n", cb);
+
+	/* sanity check the cb pointer */
+	ucbnum = UCBNUM(cb);
+	if ((cb & (GRU_HANDLE_STRIDE - 1)) || ucbnum >= GRU_NUM_CB)
+		return -EINVAL;
+	cbp = (unsigned long *)cb;
+
+	gts = gru_find_and_lock_gts(cb);
+	if (!gts)
+		return -EINVAL;
+
+	if (ucbnum >= gts->ts_cbr_au_count * GRU_CBR_AU_SIZE) {
+		ret = -EINVAL;
+		goto exit;
+	}
+
+	/*
+	 * If force_unload is set, the UPM TLB fault is phony. The task
+	 * has migrated to another node and the GSEG must be moved. Just
+	 * unload the context. The task will page fault and assign a new
+	 * context.
+	 */
+	ret = -EAGAIN;
+	cbrnum = thread_cbr_number(gts, ucbnum);
+	if (gts->ts_force_unload) {
+		gru_unload_context(gts, 1);
+	} else if (gts->ts_gru) {
+		tfh = get_tfh_by_index(gts->ts_gru, cbrnum);
+		prefetchw(tfh);	/* Helps on hdw, required for emulator */
+		if (tfh->state == TFHSTATE_IDLE) {
+			gru_dbg(grudev, "UNEXPECTED: tfh %p idle\n", tfh);
+			gru_flush_cache(tfh);
+			STAT(call_os_tfh_idle);
+		}
+		if (tfh->state == TFHSTATE_MISS_UPM)
+			ret = gru_user_dropin(gts, tfh, cbp);
+		else
+			ret = gru_check_for_bug(cb, tfh);
+	}
+exit:
+	gru_unlock_gts(gts);
+	return ret;
+}
+
+/*
+ * Fetch the exception detail information for a CB that terminated with
+ * an exception.
+ */
+int gru_get_exception_detail(unsigned long arg)
+{
+	struct control_block_extended_exc_detail excdet;
+	struct gru_control_block_extended *cbe;
+	struct gru_thread_state *gts;
+	int ucbnum, cbrnum, ret;
+
+	STAT(user_exception);
+	if (copy_from_user(&excdet, (void __user *)arg, sizeof(excdet)))
+		return -EFAULT;
+
+	gru_dbg(grudev, "address 0x%lx\n", excdet.cb);
+	gts = gru_find_and_lock_gts(excdet.cb);
+	if (!gts)
+		return -EINVAL;
+
+	if (gts->ts_gru) {
+		ucbnum = UCBNUM(excdet.cb);
+		cbrnum = thread_cbr_number(gts, ucbnum);
+		cbe = get_cbe_by_index(gts->ts_gru, cbrnum);
+		excdet.opc = cbe->opccpy;
+		excdet.exopc = cbe->exopccpy;
+		excdet.ecause = cbe->ecause;
+		excdet.exceptdet0 = cbe->idef1upd;
+		excdet.exceptdet1 = cbe->idef3upd;
+		ret = 0;
+	} else {
+		ret = -EAGAIN;
+	}
+	gru_unlock_gts(gts);
+
+	gru_dbg(grudev, "address 0x%lx, ecause 0x%x\n", excdet.cb,
+		excdet.ecause);
+	if (!ret && copy_to_user((void __user *)arg, &excdet, sizeof(excdet)))
+		ret = -EFAULT;
+	return ret;
+}
+
+/*
+ * User request to unload a context. Content is saved for possible reload.
+ */
+int gru_user_unload_context(unsigned long arg)
+{
+	struct gru_thread_state *gts;
+	struct gru_unload_context_req req;
+
+	STAT(user_unload_context);
+	if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
+		return -EFAULT;
+
+	gru_dbg(grudev, "vaddr 0x%lx\n", req.vaddr);
+
+	gts = gru_find_and_lock_gts(req.vaddr);
+	if (!gts)
+		return -EINVAL;
+
+	if (gts->ts_gru)
+		gru_unload_context(gts, 1);
+	gru_unlock_gts(gts);
+
+	return 0;
+}
+
+/*
+ * User request to flush a range of virtual addresses from the GRU TLB
+ * (Mainly for testing).
+ */
+int gru_user_flush_tlb(unsigned long arg)
+{
+	struct gru_thread_state *gts;
+	struct gru_flush_tlb_req req;
+
+	STAT(user_flush_tlb);
+	if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
+		return -EFAULT;
+
+	gru_dbg(grudev, "gseg 0x%lx, vaddr 0x%lx, len 0x%lx\n", req.gseg,
+		req.vaddr, req.len);
+
+	gts = gru_find_and_lock_gts(req.gseg);
+	if (!gts)
+		return -EINVAL;
+
+	gru_flush_tlb_range(gts->ts_ms, req.vaddr, req.vaddr + req.len);
+	gru_unlock_gts(gts);
+
+	return 0;
+}
+
+/*
+ * Register the current task as the user of the GSEG slice.
+ * Needed for TLB fault interrupt targeting.
+ */
+int gru_set_task_slice(long address)
+{
+	struct gru_thread_state *gts;
+
+	STAT(set_task_slice);
+	gru_dbg(grudev, "address 0x%lx\n", address);
+	gts = gru_find_and_lock_gts(address);
+	if (!gts)
+		return -EINVAL;
+
+	gts->ts_tgid_owner = current->tgid;
+	gru_unlock_gts(gts);
+
+	return 0;
+}
Index: linux/drivers/gru/grufile.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux/drivers/gru/grufile.c	2008-02-19 09:30:53.000000000 -0600
@@ -0,0 +1,453 @@
+/*
+ * SN Platform GRU Driver
+ *
+ *              FILE OPERATIONS & DRIVER INITIALIZATION
+ *
+ * This file supports the user system call for file open, close, mmap, etc.
+ * This also incudes the driver initialization code.
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2005-2008 Silicon Graphics, Inc.  All Rights Reserved.
+ */
+
+#ifdef EMU
+#include "preemu.h"
+#endif
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/smp_lock.h>
+#include <linux/spinlock.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/interrupt.h>
+#include <asm/uaccess.h>
+#include "gru.h"
+#include "grulib.h"
+#include "grutables.h"
+#ifdef __ia64__
+#include <asm/sn/addrs.h>
+#include <asm/sn/sn_cpuid.h>
+#else
+#define cnodeid_to_nasid(n)	0	/* ZZZ fixme */
+#endif
+#ifdef EMU
+#include "emu.h"
+#endif
+
+#ifndef EMU
+struct gru_stats_s gru_stats;
+struct gru_blade_state *gru_base[GRU_MAX_BLADES];
+unsigned long gru_start_paddr, gru_end_paddr;
+#endif
+
+static struct file_operations gru_fops;
+static struct miscdevice gru_miscdev;
+
+/*
+ * gru_vma_open
+ *
+ * Called when a device mapping is created by a means other than mmap
+ * (via fork, etc.).  Increments the reference count on the underlying
+ * gru data so it is not freed prematurely.
+ */
+STATIC void gru_vma_open(struct vm_area_struct *vma)
+{
+	struct gru_thread_state *gts;
+	struct gru_thread_data *gtd;
+
+	if (IS_THREAD_DATA(vma->vm_private_data)) {
+		gtd = vma->vm_private_data;
+	} else {
+		gts = gru_find_thread_state(vma, TSID(0));
+		down(&gts->ts_ctxsem);
+		zap_page_range(vma, UGRUADDR(gts), GRU_GSEG_PAGESIZE, NULL);
+		if (gts->ts_gru)
+			gru_unload_context(gts, 1);
+		gtd = gts->ts_td;
+		up(&gts->ts_ctxsem);
+	}
+
+	atomic_inc(&gtd->td_refcnt);
+	vma->vm_private_data = gtd;
+	gru_dbg(grudev, "vma %p, gtd %p, refcnt %d\n", vma, gtd,
+		atomic_read(&gtd->td_refcnt));
+}
+
+/*
+ * gru_vma_close
+ *
+ * Called when unmapping a device mapping. Frees all gru resources
+ * and tables belonging to the vma.
+ */
+STATIC void gru_vma_close(struct vm_area_struct *vma)
+{
+	struct gru_vma_data *vdata;
+	struct gru_thread_state *gts;
+	struct list_head *entry, *next;
+
+	if (IS_THREAD_DATA(vma->vm_private_data)) {
+		gru_dbg(grudev, "vma %p, td %p\n", vma, vma->vm_private_data);
+		gtd_drop(vma->vm_private_data);
+	} else {
+		vdata = vma->vm_private_data;
+		vma->vm_private_data = NULL;
+		gru_dbg(grudev, "vma %p, vdata %p\n", vma, vdata);
+		list_for_each_safe(entry, next, &vdata->vd_head) {
+			gts =
+			    list_entry(entry, struct gru_thread_state, ts_next);
+			list_del(&gts->ts_next);
+			down(&gts->ts_ctxsem);
+			if (gts->ts_gru)
+				gru_unload_context(gts, 0);
+			up(&gts->ts_ctxsem);
+			gtd_drop(gts->ts_td);
+			gts_drop(gts);
+		}
+		kfree(vdata);
+		STAT(vdata_free);
+	}
+}
+
+/*
+ * gru_file_open
+ *
+ * Called when the GRU is opened.
+ */
+STATIC int gru_file_open(struct inode *inode, struct file *file)
+{
+	struct gru_file_data *fdata;
+
+	fdata = kzalloc(sizeof(*fdata), GFP_KERNEL);
+	if (!fdata)
+		return -ENOMEM;
+
+	STAT(fdata_alloc);
+	file->private_data = (void *)fdata;
+	gru_dbg(grudev, "file %p, fdata %p\n", file, fdata);
+	return 0;
+}
+
+/*
+ * gru_file_release
+ *
+ * Called when the GRU is released - last "open" has been closed.
+ */
+STATIC int gru_file_release(struct inode *inode, struct file *file)
+{
+	gru_dbg(grudev, "file %p, fdata %p\n", file, file->private_data);
+	kfree(file->private_data);
+	STAT(fdata_free);
+	return 0;
+}
+
+/*
+ * gru_file_mmap
+ *
+ * Called when mmaping the device.  Initializes the vma with a fault handler
+ * and private data structure necessary to allocate, track, and free the
+ * underlying pages.
+ */
+STATIC int gru_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct gru_file_data *fdata = file->private_data;
+
+	if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) != (VM_SHARED | VM_WRITE))
+		return -EPERM;
+
+	if (vma->vm_start & (GRU_GSEG_PAGESIZE - 1) ||
+	    CONTEXT_WINDOW_BYTES(fdata->fd_thread_slices) !=
+	    vma->vm_end - vma->vm_start)
+		return -EINVAL;
+
+	vma->vm_flags |=
+	    (VM_IO | VM_LOCKED | VM_DONTEXPAND | VM_PFNMAP | VM_RESERVED);
+	vma->vm_page_prot = PAGE_SHARED;
+	vma->vm_ops = &gru_vm_ops;
+
+	vma->vm_private_data = gru_alloc_vma_data(vma, TSID(0), NULL);
+	if (!vma->vm_private_data)
+		return -ENOMEM;
+
+	gru_dbg(grudev, "file %p, fdata %p, vaddr 0x%lx, vma %p, vdata %p\n",
+		file, file->private_data, vma->vm_start, vma,
+		vma->vm_private_data);
+	return 0;
+}
+
+/*
+ * Create a new GRU context
+ */
+static int gru_create_new_context(unsigned long arg,
+				  struct gru_file_data *fdata)
+{
+	struct gru_create_context_req req;
+
+	if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
+		return -EFAULT;
+
+	if (req.data_segment_bytes == 0
+	    || req.data_segment_bytes > GRU_NUM_USER_DSR_BYTES)
+		return -EINVAL;
+	if (req.control_blocks == 0 || req.control_blocks > GRU_NUM_USER_CBR)
+		return -EINVAL;
+	if (req.maximum_thread_count == 0 || req.maximum_thread_count > NR_CPUS)
+		return -EINVAL;
+
+	if (!(req.options & GRU_OPT_MISS_MASK))
+		req.options |= GRU_OPT_MISS_USER_POLL;	/* ZZZ change default */
+
+	fdata->fd_dsr_au_count = GRU_DS_BYTES_TO_AU(req.data_segment_bytes);
+	fdata->fd_user_options = req.options;
+	fdata->fd_cbr_au_count = GRU_CB_COUNT_TO_AU(req.control_blocks);
+	fdata->fd_thread_slices = req.maximum_thread_count;
+
+	return 0;
+}
+
+/*
+ * Get GRU configuration info (temp - for emulator testing)
+ */
+static long gru_get_config_info(unsigned long arg)
+{
+	struct gru_config_info info;
+
+	info.cpus = num_online_cpus();
+	info.nodes = num_online_nodes();
+	info.blades = info.nodes / NODESPERBLADE;
+	info.chiplets = GRU_CHIPLETS_PER_BLADE * info.blades;
+
+	if (copy_to_user((void __user *)arg, &info, sizeof(info)))
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ * gru_file_unlocked_ioctl
+ *
+ * Called to update file attributes via IOCTL calls.
+ */
+STATIC long gru_file_unlocked_ioctl(struct file *file, unsigned int req,
+				    unsigned long arg)
+{
+	int err = -EBADRQC;
+
+	gru_dbg(grudev, "file %p, fdata %p\n", file, file->private_data);
+
+	switch (req) {
+	case GRU_CREATE_CONTEXT:
+		err = gru_create_new_context(arg, file->private_data);
+		break;
+	case GRU_SET_TASK_SLICE:
+		err = gru_set_task_slice(arg);
+		break;
+	case GRU_USER_GET_EXCEPTION_DETAIL:
+		err = gru_get_exception_detail(arg);
+		break;
+	case GRU_USER_UNLOAD_CONTEXT:
+		err = gru_user_unload_context(arg);
+		break;
+	case GRU_USER_FLUSH_TLB:
+		err = gru_user_flush_tlb(arg);
+		break;
+	case GRU_USER_CALL_OS:
+		err = gru_handle_user_call_os(arg);
+		break;
+	case GRU_GET_CONFIG_INFO:
+		err = gru_get_config_info(arg);
+		break;
+	}
+	return err;
+}
+
+/*
+ * Called at init time to build tables for all GRUs that are present in the
+ * system.
+ */
+static void gru_init_chiplet(struct gru_state *gru, unsigned long paddr,
+			void *vaddr, int base_nasid, int nid, int bid, int grunum)
+{
+	spin_lock_init(&gru->gs_lock);
+	spin_lock_init(&gru->gs_asid_lock);
+	gru->gs_gru_base_paddr = paddr;
+	gru->gs_gru_base_vaddr = vaddr;
+	gru->gs_gid = bid * GRUS_PER_HUB + grunum;
+	gru->gs_blade = gru_base[bid];
+	gru->gs_present = 1;
+	gru->gs_blade_id = bid;
+	gru->gs_cbr_map = (GRU_CBR_AU == 64) ? ~0 : (1UL << GRU_CBR_AU) - 1;
+	gru->gs_dsr_map = (1UL << GRU_DSR_AU) - 1;
+	gru_tgh_flush_init(gru);
+	gru_dbg(grudev, "bid %d, nid %d, gru %x, vaddr %p (0x%lx)\n",
+			bid, nid, gru->gs_gid, gru->gs_gru_base_vaddr,
+			gru->gs_gru_base_paddr);
+	gru_kservices_init(gru);
+}
+
+static int gru_init_tables(unsigned long gru_base_paddr, void *gru_base_vaddr,
+			   int base_nasid)
+{
+	int nasid, nid, bid, grunum;
+	int order = get_order(sizeof(struct gru_blade_state));
+	struct page *page;
+	struct gru_state *gru;
+	unsigned long paddr;
+	void *vaddr;
+
+	for_each_online_node(nid) {
+		bid = nid_to_blade(nid);
+		nasid = cnodeid_to_nasid(nid);
+		if (gru_base[bid])
+			continue;
+		page = alloc_pages_node(nid, GFP_KERNEL, order);
+		if (!page)
+			goto fail;
+		gru_base[bid] = page_address(page);
+		memset(gru_base[bid], 0, sizeof(struct gru_blade_state));
+		gru_base[bid]->bs_lru_gru = &gru_base[bid]->bs_grus[0];
+		spin_lock_init(&gru_base[bid]->bs_lock);
+
+		for (gru = gru_base[bid]->bs_grus, grunum = 0;
+		     		grunum < GRU_CHIPLETS_PER_BLADE; grunum++, gru++) {
+			paddr = gru_base_paddr + GRUCHIPOFFSET(nasid, base_nasid, grunum);
+			vaddr = gru_base_vaddr + GRUCHIPOFFSET(nasid, base_nasid, grunum);
+			gru_init_chiplet(gru, paddr, vaddr, nasid, bid, nid, grunum);
+		}
+	}
+
+	return 0;
+
+fail:
+	for (nid--; nid >= 0; nid--)
+		free_pages((unsigned long)gru_base[nid], order);
+	return -ENOMEM;
+}
+
+/*
+ * gru_init
+ *
+ * Called at boot or module load time to initialize the GRUs.
+ */
+STATIC int __init gru_init(void)
+{
+	int ret, irqno;
+	char id[10];
+	void *gru_start_vaddr;
+	int base_nasid;
+
+#ifdef EMU
+	gru_start_paddr = GRUPSEGBASE;
+	gru_end_paddr = GRUPSEGBASE + MAX_NUMNODES * GRU_SIZE;
+	gru_start_vaddr = GRUVSEGBASE;
+	base_nasid = 0;
+#else
+	/* Need real addresses from ACPI */
+	gru_start_paddr = 0xd000000000UL;
+	gru_end_paddr = 0xd000000000UL + MAX_NUMNODES * GRU_SIZE;
+	gru_start_vaddr = __va(gru_start_paddr);
+	base_nasid = 0;
+#endif
+	printk(KERN_INFO "GRU space: 0x%lx - 0x%lx\n",
+	       gru_start_paddr, gru_end_paddr);
+	for (irqno = 0; irqno < GRU_CHIPLETS_PER_BLADE; irqno++) {
+		ret = request_irq(IRQ_GRU + irqno, gru_intr, 0, id, NULL);
+		if (ret) {
+			printk(KERN_ERR "%s: request_irq failed\n",
+			       GRU_DRIVER_ID_STR);
+			goto exit1;
+		}
+	}
+
+	ret = misc_register(&gru_miscdev);
+	if (ret) {
+		printk(KERN_ERR "%s: misc_register failed\n",
+		       GRU_DRIVER_ID_STR);
+		goto exit1;
+	}
+
+	ret = gru_proc_init();
+	if (ret) {
+		printk(KERN_ERR "%s: proc init failed\n", GRU_DRIVER_ID_STR);
+		goto exit2;
+	}
+
+	ret = gru_init_tables(gru_start_paddr, gru_start_vaddr, base_nasid);
+	if (ret) {
+		printk(KERN_ERR "%s: init tables failed\n", GRU_DRIVER_ID_STR);
+		goto exit3;
+	}
+
+	printk(KERN_INFO "%s: v%s\n", GRU_DRIVER_ID_STR, REVISION);
+	return 0;
+
+exit3:
+	gru_proc_exit();
+exit2:
+	misc_deregister(&gru_miscdev);
+exit1:
+	for (--irqno; irqno >= 0; irqno--)
+		free_irq(IRQ_GRU + irqno, NULL);
+	return ret;
+
+}
+
+static void __exit gru_exit(void)
+{
+	int i, bid;
+	int order = get_order(sizeof(struct gru_state) * GRU_CHIPLETS_PER_BLADE);
+
+	for (i = 0; i < GRU_CHIPLETS_PER_BLADE; i++)
+		free_irq(IRQ_GRU + i, NULL);
+
+	for (bid = 0; bid < GRU_MAX_BLADES; bid++)
+		free_pages((unsigned long)gru_base[bid], order);
+
+	misc_deregister(&gru_miscdev);
+	gru_proc_exit();
+}
+
+static struct file_operations gru_fops = {
+	.owner = THIS_MODULE,
+	.open = gru_file_open,
+	.release = gru_file_release,
+	.unlocked_ioctl = gru_file_unlocked_ioctl,
+	.mmap = gru_file_mmap,
+};
+
+static struct miscdevice gru_miscdev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = "gru",
+	.fops = &gru_fops,
+};
+
+struct vm_operations_struct gru_vm_ops = {
+	.open = gru_vma_open,
+	.close = gru_vma_close,
+	.nopfn = gru_nopfn,
+};
+
+module_init(gru_init);
+module_exit(gru_exit);
+
+#ifndef MODULE
+static int set_debug_options(char *str)
+{
+	int val;
+
+	get_option(&str, &val);
+	options = val;
+	return 1;
+}
+
+__setup("gru_debug=", set_debug_options);
+#endif
+
+MODULE_AUTHOR("Silicon Graphics, Inc.");
+MODULE_DESCRIPTION("Driver for SGI GRU");
+MODULE_LICENSE("GPL");
Index: linux/drivers/gru/gruhandles.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux/drivers/gru/gruhandles.h	2008-02-19 09:30:53.000000000 -0600
@@ -0,0 +1,655 @@
+/*
+ * SN Platform GRU Driver
+ *
+ *              GRU HANDLE DEFINITION
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2005-2008 Silicon Graphics, Inc.  All Rights Reserved.
+ */
+
+#ifndef _ASM_IA64_SN_GRUHANDLES_H
+#define _ASM_IA64_SN_GRUHANDLES_H
+
+/*
+ * Manifest constants for GRU Memory Map
+ */
+#define GRU_GSEG0_BASE		0
+#define GRU_MCS_BASE		(64 * 1024 * 1024)
+#define GRU_SIZE		(128UL * 1024 * 1024)
+
+/* Handle & resource counts */
+#define GRU_NUM_CB		128
+#define GRU_NUM_DSR_BYTES	(32 * 1024)
+#define GRU_NUM_TFM		16
+#define GRU_NUM_TGH		24
+#define GRU_NUM_CBE		128
+#define GRU_NUM_TFH		128
+#define GRU_NUM_CCH		16
+#define GRU_NUM_GSH		1
+
+/* Resources PERMANENTLY reserved for kernel use */
+#define GRU_NUM_KERNEL_CBR	16
+#define GRU_NUM_KERNEL_DSR_BYTES 1024
+#define KERNEL_CTXNUM		15
+
+/* Maximum resource counts that can be reserved by user programs */
+#define GRU_NUM_USER_CBR	(GRU_NUM_CBE - GRU_NUM_KERNEL_CBR)
+#define GRU_NUM_USER_DSR_BYTES	(GRU_NUM_DSR_BYTES - GRU_NUM_KERNEL_DSR_BYTES)
+
+/* Bytes per handle & handle stride. Code assumes all cb, tfh, cbe handles
+ * are the same */
+#define GRU_HANDLE_BYTES	64
+#define GRU_HANDLE_STRIDE	256
+
+/* Base addresses of handles */
+#define GRU_TFM_BASE		(GRU_MCS_BASE + 0x00000)
+#define GRU_TGH_BASE		(GRU_MCS_BASE + 0x08000)
+#define GRU_CBE_BASE		(GRU_MCS_BASE + 0x10000)
+#define GRU_TFH_BASE		(GRU_MCS_BASE + 0x18000)
+#define GRU_CCH_BASE		(GRU_MCS_BASE + 0x20000)
+#define GRU_GSH_BASE		(GRU_MCS_BASE + 0x30000)
+
+/* User gseg constants */
+#define GRU_GSEG_STRIDE		(4 * 1024 * 1024)
+#ifdef __ia64__
+#define GRU_GSEG_PAGESIZE	(256 * 1024)
+#define GRU_GSEG_PAGESIZE_SHIFT	18
+#else
+#define GRU_GSEG_PAGESIZE	(2 * 1024 * 1024UL)
+#endif
+#define GSEG_BASE(a)		((a) & ~(GRU_GSEG_PAGESIZE - 1))
+
+/* Data segment constants */
+#define GRU_DSR_AU_BYTES	1024
+#define GRU_DSR_CL		(GRU_NUM_DSR_BYTES / GRU_CACHE_LINE_BYTES)
+#define GRU_DSR_AU_CL		(GRU_DSR_AU_BYTES / GRU_CACHE_LINE_BYTES)
+#define GRU_DSR_AU		(GRU_NUM_DSR_BYTES / GRU_DSR_AU_BYTES)
+
+/* Control block constants */
+#define GRU_CBR_AU_SIZE		2
+#define GRU_CBR_AU		(GRU_NUM_CBE / GRU_CBR_AU_SIZE)
+
+/* Convert resource counts to the number of AU */
+#define GRU_DS_BYTES_TO_AU(n)	(((n) + GRU_DSR_AU_BYTES - 1) / \
+				 GRU_DSR_AU_BYTES)
+#define GRU_CB_COUNT_TO_AU(n)	(((n) + GRU_CBR_AU_SIZE - 1) / 	\
+				 GRU_CBR_AU_SIZE)
+
+/* UV limits */
+#define GRUS_PER_HUB		2
+#define GRU_HUBS_PER_BLADE	1
+#define GRU_CHIPLETS_PER_BLADE	(GRU_HUBS_PER_BLADE * GRUS_PER_HUB)
+
+/* User GRU Gseg offsets */
+#define GRU_CB_BASE		0
+#define GRU_CB_LIMIT		(GRU_CB_BASE + GRU_HANDLE_STRIDE * GRU_NUM_CBE)
+#define GRU_DS_BASE		0x20000
+#define GRU_DS_LIMIT		(GRU_DS_BASE + GRU_NUM_DSR_BYTES)
+
+/* General addressing macros. b=grubase, c=ctxnum, i=cbnum, cl=cacheline#  */
+#define GRU_GSEG(b, c)		((void *)((b) + GRU_GSEG0_BASE + GRU_GSEG_STRIDE * (c)))
+#define GRU_GSEG_CB(b, c, i)	((void *)(GRU_GSEG((b), (c)) + GRU_CB_BASE + GRU_HANDLE_STRIDE * (i)))
+#define GRU_GSEG_DS(b, c, cl)	((void *)(GRU_GSEG((b), (c)) + GRU_DS_BASE + GRU_CACHE_LINE_BYTES * (cl)))
+#define GRU_TFM(b, c)		((struct gru_tlb_fault_map *)((unsigned long)(b) + GRU_TFM_BASE + (c) * GRU_HANDLE_STRIDE))
+#define GRU_TGH(b, c)		((struct gru_tlb_global_handle *)((unsigned long)(b) + GRU_TGH_BASE + (c) * GRU_HANDLE_STRIDE))
+#define GRU_CBE(b, n)		((struct gru_control_block_extended *)((unsigned long)(b) + GRU_CBE_BASE + (n) * GRU_HANDLE_STRIDE))
+#define GRU_TFH(b, n)		((struct gru_tlb_fault_handle *)((unsigned long)(b) + GRU_TFH_BASE + (n) * GRU_HANDLE_STRIDE))
+#define GRU_CCH(b, n)		((struct gru_context_configuration_handle *)((unsigned long)(b) + GRU_CCH_BASE + (n) * GRU_HANDLE_STRIDE))
+#define GRU_GSH(b)		((struct gru_global_status_handle *)((unsigned long)(b) + GRU_GSH_BASE))
+
+/* Test if an offset is a valid kernel handle address. Ex:  TYPE_IS(CBE, chiplet_offset) */
+#define TYPE_IS(hid, h)		((h) >= GRU_##hid##_BASE && (h) < GRU_##hid##_BASE + GRU_NUM_##hid * GRU_HANDLE_STRIDE	\
+				 && (((h) & (GRU_HANDLE_STRIDE - 1)) == 0))
+
+/* Test a GRU physical address to determine the type of address range (does NOT validate holes) */
+#define IS_MCS_PADDR(h)		(((h) & (GRU_SIZE - 1)) >= GRU_MCS_BASE)
+#define IS_CBR_PADDR(h)		(((h) & (GRU_SIZE - 1)) < GRU_MCS_BASE && (((h) & (GRU_GSEG_STRIDE - 1)) < GRU_DS_BASE))
+#define IS_DSR_PADDR(h)		(((h) & (GRU_SIZE - 1)) < GRU_MCS_BASE && (((h) & (GRU_GSEG_STRIDE - 1)) >= GRU_DS_BASE))
+
+/* Convert an arbitrary handle address to the beginning of the GRU segment */
+#ifndef __PLUGIN__
+#define GRUBASE(h)		((void *)((unsigned long)(h) & ~(GRU_SIZE - 1)))
+#else
+/* Emulator hack */
+extern void *gmu_grubase(void *h);
+#define GRUBASE(h)		gmu_grubase(h)
+#endif
+
+/* Convert a GRU physical address to the chiplet offset */
+#define GSEGPOFF(h) ((h) & (GRU_SIZE - 1))
+
+/* Convert a GSEG CB address to the relative CB number within the user gseg context */
+#define UCBNUM(cb) ((((unsigned long)(cb) - GRU_CB_BASE) % GRU_GSEG_PAGESIZE) / GRU_HANDLE_STRIDE)
+
+/* Convert a TFH address to the relative TFH number within the GRU*/
+#define TFHNUM(tfh) ((((unsigned long)(tfh) - GRU_TFH_BASE) % GRU_SIZE) / GRU_HANDLE_STRIDE)
+
+/* Convert a CCH address to the relative context number within the GRU*/
+#define CCHNUM(cch) ((((unsigned long)(cch) - GRU_CCH_BASE) % GRU_SIZE) / GRU_HANDLE_STRIDE)
+
+/* Convert a CBE address to the relative context number within the GRU*/
+#define CBENUM(cbe) ((((unsigned long)(cbe) - GRU_CBE_BASE) % GRU_SIZE) / GRU_HANDLE_STRIDE)
+
+/* Convert a TFM address to the relative context number within the GRU*/
+#define TFMNUM(tfm) ((((unsigned long)(tfm) - GRU_TFM_BASE) % GRU_SIZE) / GRU_HANDLE_STRIDE)
+
+/* byte offset to a specific GRU chiplet. (n=nasid, bn=base_nasid for first node, c=chiplet (0 or 1)*/
+#define GRUCHIPOFFSET(n, bn, c) (GRU_SIZE * ((n) - (bn) + (c)))
+
+#ifndef BITS_TO_LONGS
+#define BITS_TO_LONGS(bits)     (((bits)+64-1)/64)
+#endif
+
+/*
+ * GSH - GRU Status Handle
+ *
+ */
+struct gru_global_status_handle {
+	unsigned long bits[BITS_TO_LONGS(GRU_NUM_CBE) * 2];
+	unsigned long fill[4];
+};
+
+enum gru_gsh_status {
+	GSHSTATUS_INACTIVE,
+	GSHSTATUS_IDLE,
+	GSHSTATUS_ACTIVE,
+	GSHSTATUS_INTERRUPTED
+};
+
+/*
+ * Global TLB Fault Map
+ *
+ */
+struct gru_tlb_fault_map {
+	unsigned long fault_bits[BITS_TO_LONGS(GRU_NUM_CBE)];
+	unsigned long fill0[2];
+	unsigned long done_bits[BITS_TO_LONGS(GRU_NUM_CBE)];
+	unsigned long fill1[2];
+};
+
+/*
+ * TGH - TLB Global Handle
+ *
+ */
+struct gru_tlb_global_handle {
+	unsigned int cmd:1;		/* DW 0 */
+	unsigned int delresp:1;
+	unsigned int opc:1;
+	unsigned int fill1:5;
+
+	unsigned int fill2:8;
+
+	unsigned int status:2;
+	unsigned long fill3:2;
+	unsigned int state:3;
+	unsigned long fill4:1;
+
+	unsigned int cause:3;
+	unsigned long fill5:37;
+
+	unsigned long vaddr:64;		/* DW 1 */
+
+	unsigned int asid:24;		/* DW 2 */
+	unsigned int fill6:8;
+
+	unsigned int pagesize:5;
+	unsigned int fill7:11;
+
+	unsigned int global:1;
+	unsigned int fill8:15;
+
+	unsigned long vaddrmask:39;	/* DW 3 */
+	unsigned int fill9:9;
+	unsigned int n:10;
+	unsigned int fill10:6;
+
+	unsigned int ctxbitmap:16;	/* DW4 */
+	unsigned long fill11[3];
+};
+
+enum gru_tgh_cmd {
+	TGHCMD_START
+};
+
+enum gru_tgh_opc {
+	TGHOP_TLBNOP,
+	TGHOP_TLBINV
+};
+
+enum gru_tgh_status {
+	TGHSTATUS_IDLE,
+	TGHSTATUS_EXCEPTION,
+	TGHSTATUS_ACTIVE
+};
+
+enum gru_tgh_state {
+	TGHSTATE_IDLE,
+	TGHSTATE_PE_INVAL,
+	TGHSTATE_INTERRUPT_INVAL,
+	TGHSTATE_WAITDONE,
+	TGHSTATE_RESTART_CTX,
+};
+
+/*
+ * TFH - TLB Global Handle
+ *
+ */
+struct gru_tlb_fault_handle {
+	unsigned int cmd:1;		/* DW 0 - low 32*/
+	unsigned int delresp:1;
+	unsigned int fill0:2;
+	unsigned int opc:3;
+	unsigned int fill1:9;
+
+	unsigned int status:2;
+	unsigned int fill2:1;
+	unsigned int color:1;
+	unsigned int state:3;
+	unsigned int fill3:1;
+
+	unsigned int cause:7;		/* DW 0 - high 32 */
+	unsigned int fill4:1;
+
+	unsigned int indexway:12;
+	unsigned int fill5:4;
+
+	unsigned int ctxnum:4;
+	unsigned int fill6:12;
+
+	unsigned long missvaddr:64;	/* DW 1 */
+
+	unsigned int missasid:24;	/* DW 2 */
+	unsigned int fill7:8;
+	unsigned int fillasid:24;
+	unsigned int dirty:1;
+	unsigned int gaa:2;
+	unsigned long fill8:5;
+
+	unsigned long pfn:41;		/* DW 3 */
+	unsigned int fill9:7;
+	unsigned int pagesize:5;
+	unsigned int fill10:11;
+
+	unsigned long fillvaddr:64;	/* DW 4 */
+
+	unsigned long fill11[3];
+};
+
+enum gru_tfh_opc {
+	TFHOP_NOOP,
+	TFHOP_RESTART,
+	TFHOP_WRITE_ONLY,
+	TFHOP_WRITE_RESTART,
+	TFHOP_EXCEPTION,
+	TFHOP_USER_POLLING_MODE = 7,
+};
+
+enum tfh_status {
+	TFHSTATUS_IDLE,
+	TFHSTATUS_EXCEPTION,
+	TFHSTATUS_ACTIVE,
+};
+
+enum tfh_state {
+	TFHSTATE_INACTIVE,
+	TFHSTATE_IDLE,
+	TFHSTATE_MISS_UPM,
+	TFHSTATE_MISS_FMM,
+	TFHSTATE_HW_ERR,
+	TFHSTATE_WRITE_TLB,
+	TFHSTATE_RESTART_CBR,
+};
+
+/* TFH cause bits */
+enum tfh_cause {
+	TFHCAUSE_NONE,
+	TFHCAUSE_TLB_MISS,
+	TFHCAUSE_TLB_MOD,
+	TFHCAUSE_HW_ERROR_RR,
+	TFHCAUSE_HW_ERROR_MAIN_ARRAY,
+	TFHCAUSE_HW_ERROR_VALID,
+	TFHCAUSE_HW_ERROR_PAGESIZE,
+	TFHCAUSE_INSTRUCTION_EXCEPTION,
+	TFHCAUSE_UNCORRECTIBLE_ERROR,
+};
+
+/* GAA values */
+#define GAA_RAM				0x0
+#define GAA_NCRAM			0x2
+#define GAA_MMIO			0x1
+#define GAA_REGISTER			0x3
+
+/* GRU paddr shift for pfn. (NOTE: shift is NOT by actual pagesize) */
+#define GRU_PADDR_SHIFT			12
+
+/*
+ * Context Configuration handle
+ *
+ */
+struct gru_context_configuration_handle {
+	unsigned int cmd:1;			/* DW0 */
+	unsigned int delresp:1;
+	unsigned int opc:3;
+	unsigned int unmap_enable:1;
+	unsigned int req_slice_set_enable:1;
+	unsigned int req_slice:2;
+	unsigned int cb_int_enable:1;
+	unsigned int tlb_int_enable:1;
+	unsigned int tfm_fault_bit_enable:1;
+	unsigned int tlb_int_select:4;
+
+	unsigned int status:2;
+	unsigned int state:2;
+	unsigned int reserved2:4;
+
+	unsigned int cause:4;
+	unsigned int tfm_done_bit_enable:1;
+	unsigned int unused:3;
+
+	unsigned int dsr_allocation_map;
+
+	unsigned long cbr_allocation_map;	/* DW1 */
+
+	unsigned int asid[8];			/* DW 2 - 5 */
+	unsigned short sizeavail[8];		/* DW 6 - 7 */
+} __attribute__ ((packed));
+
+enum gru_cch_opc {
+	CCHOP_START = 1,
+	CCHOP_ALLOCATE,
+	CCHOP_INTERRUPT,
+	CCHOP_DEALLOCATE,
+	CCHOP_INTERRUPT_SYNC,
+};
+
+enum gru_cch_status {
+	CCHSTATUS_IDLE,
+	CCHSTATUS_EXCEPTION,
+	CCHSTATUS_ACTIVE,
+};
+
+enum gru_cch_state {
+	CCHSTATE_INACTIVE,
+	CCHSTATE_MAPPED,
+	CCHSTATE_ACTIVE,
+	CCHSTATE_INTERRUPTED,
+};
+
+/* CCH Exception cause */
+enum gru_cch_cause {
+	CCHCAUSE_REGION_REGISTER_WRITE_ERROR = 1,
+	CCHCAUSE_ILLEGAL_OPCODE = 2,
+	CCHCAUSE_INVALID_START_REQUEST = 3,
+	CCHCAUSE_INVALID_ALLOCATION_REQUEST = 4,
+	CCHCAUSE_INVALID_DEALLOCATION_REQUEST = 5,
+	CCHCAUSE_INVALID_INTERRUPT_REQUEST = 6,
+	CCHCAUSE_CCH_BUSY = 7,
+	CCHCAUSE_NO_CBRS_TO_ALLOCATE = 8,
+	CCHCAUSE_BAD_TFM_CONFIG = 9,
+	CCHCAUSE_CBR_RESOURCES_OVERSUBSCRIPED = 10,
+	CCHCAUSE_DSR_RESOURCES_OVERSUBSCRIPED = 11,
+	CCHCAUSE_CBR_DEALLOCATION_ERROR = 12,
+};
+/*
+ * CBE - Control Block Extended
+ *
+ */
+struct gru_control_block_extended {
+	unsigned int reserved0:1;	/* DW 0  - low */
+	unsigned int imacpy:3;
+	unsigned int reserved1:4;
+	unsigned int xtypecpy:3;
+	unsigned int iaa0cpy:2;
+	unsigned int iaa1cpy:2;
+	unsigned int reserved2:1;
+	unsigned int opccpy:8;
+	unsigned int exopccpy:8;
+
+	unsigned int idef2cpy:22;	/* DW 0  - high */
+	unsigned int reserved3:10;
+
+	unsigned int idef4cpy:22;	/* DW 1 */
+	unsigned int reserved4:10;
+	unsigned int idef4upd:22;
+	unsigned int reserved5:10;
+
+	unsigned long idef1upd:64;	/* DW 2 */
+
+	unsigned long idef5cpy:64;	/* DW 3 */
+
+	unsigned long idef6cpy:64;	/* DW 4 */
+
+	unsigned long idef3upd:64;	/* DW 5 */
+
+	unsigned long idef5upd:64;	/* DW 6 */
+
+	unsigned int idef2upd:22;	/* DW 7 */
+	unsigned int reserved6:10;
+
+	unsigned int ecause:20;
+	unsigned int cbrstate:4;
+	unsigned int cbrexecstatus:8;
+};
+
+enum gru_cbr_state {
+	CBRSTATE_INACTIVE,
+	CBRSTATE_IDLE,
+	CBRSTATE_PE_CHECK,
+	CBRSTATE_QUEUED,
+	CBRSTATE_WAIT_RESPONSE,
+	CBRSTATE_INTERRUPTED,
+	CBRSTATE_INTERRUPTED_MISS_FMM,
+	CBRSTATE_BUSY_INTERRUPT_MISS_FMM,
+	CBRSTATE_INTERRUPTED_MISS_UPM,
+	CBRSTATE_BUSY_INTERRUPTED_MISS_UPM,
+	CBRSTATE_REQUEST_ISSUE,
+	CBRSTATE_BUSY_INTERRUPT,
+};
+
+/* CBE cbrexecstatus bits */
+#define CBR_EXS_ABORT_OCC_BIT				0
+#define CBR_EXS_INT_OCC_BIT				1
+#define CBR_EXS_PENDING_BIT				2
+#define CBR_EXS_QUEUED_BIT				3
+#define CBR_EXS_TLBHW_BIT				4
+#define CBR_EXS_EXCEPTION_BIT				5
+
+#define CBR_EXS_ABORT_OCC				(1 << CBR_EXS_ABORT_OCC_BIT)
+#define CBR_EXS_INT_OCC					(1 << CBR_EXS_INT_OCC_BIT)
+#define CBR_EXS_PENDING					(1 << CBR_EXS_PENDING_BIT)
+#define CBR_EXS_QUEUED					(1 << CBR_EXS_QUEUED_BIT)
+#define CBR_EXS_TLBHW					(1 << CBR_EXS_TLBHW_BIT)
+#define CBR_EXS_EXCEPTION				(1 << CBR_EXS_EXCEPTION_BIT)
+
+/* CBE ecause bits  - defined in gru_instructions.h */
+
+/*
+ * Convert a processor pagesize into the strange encoded pagesize used by the GRU.
+ * Processor pagesize is encoded as log of bytes per page. (or PAGE_SHIFT)
+ * 	pagesize	log pagesize	grupagesize
+ * 	  4k			12	0
+ * 	  8k			13	1
+ * 	 16k 			14	2
+ * 	 64k			16	3
+ * 	256k			18	4
+ * 	...
+ */
+#define GRU_PAGESIZE(sh)		(((sh) <= 14) ? (sh) - 12 : ((sh) >> 1) - 5)
+#define GRU_SIZEAVAIL(sh)		(1UL << GRU_PAGESIZE(sh))
+
+/* minimum TLB purge count to ensure a full purge */
+#define GRUMAXINVAL			1024UL
+
+/* convert the weird GRU encoded pagesize to a pageshift or pagesize */
+#define GRUPAGESHIFT(e)			(((e) < 2) ? (12UL + (e)) : (14UL + 2UL * ((e) - 2)))
+#define GRUPAGESIZE(e)			(1UL << GRUPAGESHIFT(e))
+
+/*-----------------------------------------------------------------------------------------
+ *
+ * Handle operations
+ */
+
+#define cch_to_gsh(c)		GRU_GSH(GRUBASE(c))
+#define cch_to_tfh(c, i)	GRU_TFH(GRUBASE(c), (i))
+#define cch_to_cbe(c, i)	GRU_CBE(GRUBASE(c), (i))
+#define cbe_to_tfh(c)		GRU_TFH(GRUBASE(c), CBENUM(c))
+#define cbe_to_cch(c)		GRU_CCH(GRUBASE(c), CBENUM(c))
+#define tfh_to_cbe(c)		GRU_CBE(GRUBASE(c), TFHNUM(c))
+
+#ifdef __KERNEL__
+#include "gru_instructions.h"
+
+/* Extract the status field from a kernel handle */
+#define GET_MSEG_HANDLE_STATUS(h)	(((*(unsigned long*)(h)) >> 16) & 3)
+
+static inline void start_instruction(void *h)
+{
+	unsigned long *w0 = h;
+
+	wmb();		/* setting CMD bit must be last */
+	*w0 = *w0 | 1;
+	gru_flush_cache(h);
+}
+
+static inline int wait_instruction_complete(void *h)
+{
+	int status;
+
+	do {
+		gru_emulator_wait_hook(h, 1);	/* No code generated unless -D EMUSUPPORT */
+		cpu_relax();
+		barrier();
+		status = GET_MSEG_HANDLE_STATUS(h);
+	} while (status == CCHSTATUS_ACTIVE);
+	return status;
+}
+
+static inline int cch_allocate(struct gru_context_configuration_handle *cch,
+			       int asidval, unsigned long cbrmap,
+			       unsigned long dsrmap)
+{
+	int i;
+
+#if defined(__ia64__)
+	for (i = 0; i <= RGN_HPAGE; i++) {	/*  assume HPAGE is last region */
+		cch->asid[i] = (asidval++);
+		if (i == RGN_HPAGE)
+			cch->sizeavail[i] = GRU_SIZEAVAIL(hpage_shift);
+#ifdef EMU
+		else if (fake_tb_pages)
+			cch->sizeavail[i] = GRU_SIZEAVAIL(40);
+#endif
+		else
+			cch->sizeavail[i] = GRU_SIZEAVAIL(PAGE_SHIFT);
+	}
+#else
+	for (i = 0; i < 8; i++) {
+		cch->asid[i] = asidval++;
+		cch->sizeavail[i] = GRU_SIZEAVAIL(PAGE_SHIFT);	/* ZZZ hugepages??? */
+	}
+#endif
+
+	cch->dsr_allocation_map = dsrmap;
+	cch->cbr_allocation_map = cbrmap;
+	cch->opc = CCHOP_ALLOCATE;
+	start_instruction(cch);
+	return wait_instruction_complete(cch);
+}
+
+static inline int cch_start(struct gru_context_configuration_handle *cch)
+{
+	cch->opc = CCHOP_START;
+	start_instruction(cch);
+	return wait_instruction_complete(cch);
+}
+
+static inline int cch_interrupt(struct gru_context_configuration_handle *cch)
+{
+	cch->opc = CCHOP_INTERRUPT;
+	start_instruction(cch);
+	return wait_instruction_complete(cch);
+}
+
+static inline int cch_deallocate(struct gru_context_configuration_handle *cch)
+{
+	cch->opc = CCHOP_DEALLOCATE;
+	start_instruction(cch);
+	return wait_instruction_complete(cch);
+}
+
+static inline int cch_interrupt_sync(struct gru_context_configuration_handle
+				     *cch)
+{
+	cch->opc = CCHOP_INTERRUPT_SYNC;
+	start_instruction(cch);
+	return wait_instruction_complete(cch);
+}
+
+static inline int tgh_invalidate(struct gru_tlb_global_handle *tgh,
+				 unsigned long vaddr, unsigned long vaddrmask,
+				 int asid, int pagesize, int global, int n,
+				 unsigned short ctxbitmap)
+{
+	tgh->vaddr = vaddr;
+	tgh->asid = asid;
+	tgh->pagesize = pagesize;
+	tgh->n = n;
+	tgh->global = global;
+	tgh->vaddrmask = vaddrmask;
+	tgh->ctxbitmap = ctxbitmap;
+	tgh->opc = TGHOP_TLBINV;
+	start_instruction(tgh);
+	return wait_instruction_complete(tgh);
+}
+
+static inline void tfh_write_only(struct gru_tlb_fault_handle *tfh,
+				  unsigned long pfn, unsigned long vaddr,
+				  int asid, int dirty, int pagesize)
+{
+	tfh->fillasid = asid;
+	tfh->fillvaddr = vaddr;
+	tfh->pfn = pfn;
+	tfh->dirty = dirty;
+	tfh->pagesize = pagesize;
+	tfh->opc = TFHOP_WRITE_ONLY;
+	start_instruction(tfh);
+}
+
+static inline void tfh_write_restart(struct gru_tlb_fault_handle *tfh,
+				     unsigned long paddr, int gaa,
+				     unsigned long vaddr, int asid, int dirty,
+				     int pagesize)
+{
+	tfh->fillasid = asid;
+	tfh->fillvaddr = vaddr;
+	tfh->pfn = paddr >> GRU_PADDR_SHIFT;
+	tfh->gaa = gaa;
+	tfh->dirty = dirty;
+	tfh->pagesize = pagesize;
+	tfh->opc = TFHOP_WRITE_RESTART;
+	start_instruction(tfh);
+}
+
+static inline void tfh_restart(struct gru_tlb_fault_handle *tfh)
+{
+	tfh->opc = TFHOP_RESTART;
+	start_instruction(tfh);
+}
+
+static inline void tfh_user_polling_mode(struct gru_tlb_fault_handle *tfh)
+{
+	tfh->opc = TFHOP_USER_POLLING_MODE;
+	start_instruction(tfh);
+}
+
+static inline void tfh_exception(struct gru_tlb_fault_handle *tfh)
+{
+	tfh->opc = TFHOP_EXCEPTION;
+	start_instruction(tfh);
+}
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_IA64_SN_GRUHANDLES_H */
Index: linux/drivers/gru/grukservices.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux/drivers/gru/grukservices.c	2008-02-15 13:56:45.652296396 -0600
@@ -0,0 +1,129 @@
+/*
+ * SN Platform GRU Driver
+ *
+ *              KERNEL SERVICES THAT USE THE GRU
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2007-2008 Silicon Graphics, Inc.  All Rights Reserved.
+ */
+
+#ifdef EMU
+#include "preemu.h"
+#endif
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/smp_lock.h>
+#include <linux/spinlock.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/interrupt.h>
+#include <asm/uaccess.h>
+#include "gru.h"
+#include "grulib.h"
+#include "grutables.h"
+#include "gru_instructions.h"
+#ifdef __ia64__
+#include <asm/sn/addrs.h>
+#include <asm/sn/sn_cpuid.h>
+#endif
+#ifdef EMU
+#include "emu.h"
+#endif
+
+#ifdef EMU
+#define PADDR(v)	(emu_vtop((unsigned long)v))
+#elif defined(__ia64__)
+#define PADDR(v)	((void *)__pa(ia64_imva(v)))
+#else
+#define PADDR(v)	((void *)__pa(v))
+#endif
+
+#define MAGIC	0x1234567887654321UL
+
+static __cacheline_aligned unsigned long word0;
+static __cacheline_aligned unsigned long word1;
+
+static inline int gruwait(gru_control_block_t *cb)
+{
+	struct gru_control_block_status *cbs = (void *)cb;
+
+	while (cbs->istatus >= CBS_ACTIVE) {
+		gru_emulator_wait_hook(cb, 1); /* No code unless -DEMUSUPPORT */
+		cpu_relax();
+		barrier();
+	}
+	return cbs->istatus;
+}
+
+static int quicktest(struct gru_state *gru)
+{
+	void *cb;
+
+	cb = GRU_GSEG(gru->gs_gru_base_vaddr, KERNEL_CTXNUM);
+	word0 = MAGIC;
+
+	gru_vload(cb, (void *)PADDR(&word0), IAA_RAM, 0, XTYPE_DW, 1, 1,
+		  HINT_CB_UNMAPPED | HINT_CB_DELAY);
+	if (gruwait(cb) != CBS_IDLE)
+		BUG();
+
+	gru_vstore(cb, (void *)PADDR(&word1), IAA_RAM, 0, XTYPE_DW, 1, 1,
+		   HINT_CB_UNMAPPED | HINT_CB_DELAY);
+	if (gruwait(cb) != CBS_IDLE)
+		BUG();
+
+	if (word0 != word1 || word0 != MAGIC) {
+		printk
+		    ("GRU quicktest err: gru %d, found 0x%lx, expected 0x%lx\n",
+		     gru->gs_gid, word1, MAGIC);
+		BUG();		/* ZZZ should not be fatal */
+	}
+
+	return 0;
+}
+
+int gru_kservices_init(struct gru_state *gru)
+{
+	struct gru_context_configuration_handle *cch;
+	unsigned long cbr_map, dsr_map;
+	int err;
+
+	cbr_map =
+	    reserve_gru_cb_resources(gru,
+				     GRU_CB_COUNT_TO_AU(GRU_NUM_KERNEL_CBR),
+				     NULL);
+	dsr_map =
+	    reserve_gru_ds_resources(gru,
+				     GRU_DS_BYTES_TO_AU
+				     (GRU_NUM_KERNEL_DSR_BYTES), NULL);
+	__set_bit(KERNEL_CTXNUM, &gru->gs_context_map);
+	gru->gs_active_contexts++;
+	cch = GRU_CCH(gru->gs_gru_base_vaddr, KERNEL_CTXNUM);
+
+	lock_handle(cch);
+	cch->tfm_fault_bit_enable = 0;
+	cch->tlb_int_enable = 0;
+	cch->tfm_done_bit_enable = 0;
+	cch->unmap_enable = 1;
+	err = cch_allocate(cch, 0, cbr_map, dsr_map);
+	if (err) {
+		gru_dbg(grudev,
+			"Unable to allocate kernel CCH: gru %d, err %d\n",
+			gru->gs_gid, err);
+		BUG();
+	}
+	if (cch_start(cch)) {
+		gru_dbg(grudev, "Unable to start kernel CCH: gru %d, err %d\n",
+			gru->gs_gid, err);
+		BUG();
+	}
+	unlock_handle(cch);
+
+	return quicktest(gru);
+}
Index: linux/drivers/gru/grulib.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux/drivers/gru/grulib.h	2008-02-15 13:56:46.440393908 -0600
@@ -0,0 +1,84 @@
+/*
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2005-2008 Silicon Graphics, Inc.  All rights reserved.
+ */
+
+#ifndef _GRULIB_H_
+#define _GRULIB_H_
+
+#define GRU_BASENAME		"gru"
+#define GRU_FULLNAME		"/dev/gru"
+#define GRU_IOCTL_NUM 		 'G'
+#ifdef __ia64__
+#define GRU_GSEG_PAGESIZE	(256 * 1024)
+#define GRU_GSEG_PAGESIZE_SHIFT 18
+#else
+#define GRU_GSEG_PAGESIZE	(2 * 1024 * 1024UL)
+#endif
+
+/* Set Number of Request Blocks */
+#define GRU_CREATE_CONTEXT		_IOWR(GRU_IOCTL_NUM, 1, void *)
+
+/* Register task as using the slice */
+#define GRU_SET_TASK_SLICE		_IOWR(GRU_IOCTL_NUM, 5, void *)
+
+/* Fetch exception detail */
+#define GRU_USER_GET_EXCEPTION_DETAIL	_IOWR(GRU_IOCTL_NUM, 6, void *)
+
+/* For user call_os handling - normally a TLB fault */
+#define GRU_USER_CALL_OS		_IOWR(GRU_IOCTL_NUM, 8, void *)
+
+/* For user unload context */
+#define GRU_USER_UNLOAD_CONTEXT		_IOWR(GRU_IOCTL_NUM, 9, void *)
+
+/* For user TLB flushing (primarily for tests) */
+#define GRU_USER_FLUSH_TLB		_IOWR(GRU_IOCTL_NUM, 50, void *)
+
+/* Get some config options (primarily for tests & emulator) */
+#define GRU_GET_CONFIG_INFO		_IOWR(GRU_IOCTL_NUM, 51, void *)
+
+#define CONTEXT_WINDOW_BYTES(th)        (GRU_GSEG_PAGESIZE * (th))
+#define THREAD_POINTER(p, th)		(p + GRU_GSEG_PAGESIZE * (th))
+
+/*
+ * Structure used to pass TLB flush parameters to the driver
+ */
+struct gru_create_context_req {
+	unsigned int		data_segment_bytes;
+	unsigned int		control_blocks;
+	unsigned int		maximum_thread_count;
+	unsigned int		options;
+};
+
+/*
+ * Structure used to pass unload context parameters to the driver
+ */
+struct gru_unload_context_req {
+	unsigned long	vaddr;
+};
+
+/*
+ * Structure used to pass TLB flush parameters to the driver
+ */
+struct gru_flush_tlb_req {
+	unsigned long	gseg;
+	unsigned long	vaddr;
+	size_t		len;
+};
+
+/*
+ * GRU configuration info (temp - for testing)
+ */
+struct gru_config_info {
+	int		cpus;
+	int		blades;
+	int		nodes;
+	int		chiplets;
+	int		fill[16];
+};
+
+#endif /* _GRULIB_H_ */
Index: linux/drivers/gru/grumain.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux/drivers/gru/grumain.c	2008-02-19 09:30:53.000000000 -0600
@@ -0,0 +1,958 @@
+/*
+ * SN Platform GRU Driver
+ *
+ *            DRIVER TABLE MANAGER + GRU CONTEXT LOAD/UNLOAD
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2005-2008 Silicon Graphics, Inc.  All Rights Reserved.
+ */
+
+#ifdef EMU
+#include "preemu.h"
+#endif
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/device.h>
+#include <linux/list.h>
+#include "gru.h"
+#include "grutables.h"
+#include "gruhandles.h"
+#ifdef EMU
+#include "emu.h"
+#endif
+
+unsigned long options;
+
+static struct device_driver gru_driver = {
+	.name = "gru"
+};
+
+static struct device gru_device = {
+	.bus_id = {0},
+	.driver = &gru_driver,
+};
+
+struct device *grudev = &gru_device;
+
+/*
+ * Select a gru fault map to be used by the current cpu. Note that
+ * multiple cpus may be using the same map.
+ *	ZZZ should "shift" be used?? Depends on HT cpu numbering
+ *	ZZZ should be inline but did not work on emulator
+ */
+int gru_cpu_fault_map_id(void)
+{
+	return blade_processor_id() % GRU_NUM_TFM;
+}
+
+
+/*--------- ASID Management -------------------------------------------
+ *
+ *  Initially, assign asids sequentially from MIN_ASID .. MAX_ASID.
+ *  Once MAX is reached, flush the TLB & start over. However,
+ *  some asids may still be in use. There won't be many (percentage wise) still
+ *  in use. Search active contexts & determine the value of the first
+ *  asid in use ("x"s below). Set "limit" to this value.
+ *  This defines a block of assignable asids.
+ *
+ *  When "limit" is reached, search forward from limit+1 and determine the
+ *  next block of assignable asids.
+ *
+ *  Repeat until MAX_ASID is reached, then start over again.
+ *
+ *  Each time MAX_ASID is reached, increment the asid generation. Since
+ *  the search for in-use asids only checks contexts with GRUs currently
+ *  assigned, asids in some contexts will be missed. Prior to loading
+ *  a context, the asid generation of the GTS asid is rechecked. If it
+ *  doesn't match the current generation, a new asid will be assigned.
+ *
+ *   	0---------------x------------x---------------------x----|
+ *	  ^-next	^-limit	   				^-MAX_ASID
+ *
+ * All asid manipulation & context loading/unloading is protected by the
+ * gs_lock.
+ */
+
+/* Hit the asid limit. Start over */
+static int gru_wrap_asid(struct gru_state *gru)
+{
+	gru_dbg(grudev, "gru %p\n", gru);
+	STAT(asid_wrap);
+	gru->gs_asid_gen++;
+	gru_flush_all_tlb(gru);
+	return MIN_ASID;
+}
+
+/* Find the next chunk of unused asids */
+static int gru_reset_asid_limit(struct gru_state *gru, int asid)
+{
+	int i, gid, inuse_asid, limit;
+
+	gru_dbg(grudev, "gru %p, asid 0x%x\n", gru, asid);
+	STAT(asid_next);
+	limit = MAX_ASID;
+	if (asid >= limit)
+		asid = gru_wrap_asid(gru);
+	gid = gru->gs_gid;
+again:
+	for (i = 0; i < GRU_NUM_CCH; i++) {
+		if (!gru->gs_gts[i])
+			continue;
+		inuse_asid = gru->gs_gts[i]->ts_ms->ms_asids[gid].mt_asid;
+		gru_dbg(grudev, "gru %p, inuse_asid 0x%x, cxtnum %d, gts %p\n",
+			gru, inuse_asid, i, gru->gs_gts[i]);
+		if (inuse_asid == asid) {
+			asid += ASID_INC;
+			if (asid >= limit) {
+				/*
+				 * empty range: reset the range limit and
+				 * start over
+				 */
+				limit = MAX_ASID;
+				if (asid >= MAX_ASID)
+					asid = gru_wrap_asid(gru);
+				goto again;
+			}
+		}
+
+		if ((inuse_asid > asid) && (inuse_asid < limit))
+			limit = inuse_asid;
+	}
+	gru->gs_asid_limit = limit;
+	gru->gs_asid = asid;
+	gru_dbg(grudev, "gru %p, new asid 0x%x, new_limit 0x%x\n", gru, asid,
+		limit);
+	return asid;
+}
+
+/* Assign a new ASID to a thread context.  */
+static int gru_assign_asid(struct gru_state *gru)
+{
+	int asid;
+
+	spin_lock(&gru->gs_asid_lock);
+	gru->gs_asid += ASID_INC;
+	asid = gru->gs_asid;
+	if (asid >= gru->gs_asid_limit)
+		asid = gru_reset_asid_limit(gru, asid);
+	spin_unlock(&gru->gs_asid_lock);
+
+	gru_dbg(grudev, "gru %p, asid 0x%x\n", gru, asid);
+	return asid;
+}
+
+/*
+ * Clear n bits in a word. Return a word indicating the bits that were cleared.
+ * Optionally, build an array of chars that contain the bit numbers allocated.
+ */
+static unsigned long reserve_resources(unsigned long *p, int n, int mmax,
+				       char *idx)
+{
+	unsigned long bits = 0;
+	int i;
+
+	do {
+		i = find_first_bit(p, mmax);
+		if (i == mmax)
+			BUG();
+		__clear_bit(i, p);
+		__set_bit(i, &bits);
+		if (idx)
+			*idx++ = i;
+	} while (--n);
+	return bits;
+}
+
+unsigned long reserve_gru_cb_resources(struct gru_state *gru, int cbr_au_count,
+				       char *cbmap)
+{
+	return reserve_resources(&gru->gs_cbr_map, cbr_au_count, GRU_CBR_AU,
+				 cbmap);
+}
+
+unsigned long reserve_gru_ds_resources(struct gru_state *gru, int dsr_au_count,
+				       char *dsmap)
+{
+	return reserve_resources(&gru->gs_dsr_map, dsr_au_count, GRU_DSR_AU,
+				 dsmap);
+}
+
+static void reserve_gru_resources(struct gru_state *gru,
+				  struct gru_thread_state *gts)
+{
+	gru->gs_active_contexts++;
+	gts->ts_cbr_map =
+	    reserve_gru_cb_resources(gru, gts->ts_cbr_au_count,
+				     gts->ts_cbr_idx);
+	gts->ts_dsr_map =
+	    reserve_gru_ds_resources(gru, gts->ts_dsr_au_count, NULL);
+}
+
+static void free_gru_resources(struct gru_state *gru,
+			       struct gru_thread_state *gts)
+{
+	gru->gs_active_contexts--;
+	gru->gs_cbr_map |= gts->ts_cbr_map;
+	gru->gs_dsr_map |= gts->ts_dsr_map;
+}
+
+/*
+ * Check if a GRU has sufficient free resources to satisfy an allocation
+ * request. Note: GRU locks may or may not be held when this is called. If
+ * not held, recheck after acquiring the appropriate locks.
+ *
+ * Returns 1 if sufficient resources, 0 if not
+ */
+static int check_gru_resources(struct gru_state *gru, int cbr_au_count,
+			       int dsr_au_count, int max_active_contexts)
+{
+	return (hweight64(gru->gs_cbr_map) >= cbr_au_count
+		&& hweight64(gru->gs_dsr_map) >= dsr_au_count
+		&& gru->gs_active_contexts < max_active_contexts);
+}
+
+/*
+ * TLB manangment requires tracking all GRU chiplets that have loaded a GSEG
+ * context.
+ */
+static int gru_load_mm_tracker(struct gru_state *gru, struct gru_mm_struct *gms,
+			       int ctxnum)
+{
+	struct gru_mm_tracker *asids = &gms->ms_asids[gru->gs_gid];
+	unsigned short ctxbitmap = (1 << ctxnum);
+	int asid;
+
+	spin_lock(&gms->ms_asid_lock);
+	asid = asids->mt_asid;
+
+	if (asid == 0 || asids->mt_asid_gen != gru->gs_asid_gen) {
+		asid = gru_assign_asid(gru);
+		asids->mt_asid = asid;
+		asids->mt_asid_gen = gru->gs_asid_gen;
+		STAT(asid_new);
+	} else {
+		STAT(asid_reuse);
+	}
+
+	BUG_ON(asids->mt_ctxbitmap & ctxbitmap);
+	asids->mt_ctxbitmap |= ctxbitmap;
+	if (!test_bit(gru->gs_gid, gms->ms_asidmap))
+		__set_bit(gru->gs_gid, gms->ms_asidmap);
+	spin_unlock(&gms->ms_asid_lock);
+
+	gru_dbg(grudev,
+		"gru %x, gms %p, ctxnum 0x%d, asid 0x%x, asidmap 0x%lx\n",
+		gru->gs_gid, gms, ctxnum, asid, gms->ms_asidmap[0]);
+	return asid;
+}
+
+static void gru_unload_mm_tracker(struct gru_state *gru,
+				  struct gru_mm_struct *gms, int ctxnum)
+{
+	struct gru_mm_tracker *asids;
+	unsigned short ctxbitmap;
+
+	asids = &gms->ms_asids[gru->gs_gid];
+	ctxbitmap = (1 << ctxnum);
+	spin_lock(&gms->ms_asid_lock);
+	BUG_ON((asids->mt_ctxbitmap & ctxbitmap) != ctxbitmap);
+	asids->mt_ctxbitmap ^= ctxbitmap;
+	gru_dbg(grudev, "gru %x, gms %p, ctxnum 0x%d, asidmap 0x%lx\n",
+		gru->gs_gid, gms, ctxnum, gms->ms_asidmap[0]);
+	spin_unlock(&gms->ms_asid_lock);
+}
+
+/*
+ * Decrement the reference count on a GTD structure. Free the structure
+ * if the reference count goes to zero.
+ */
+void gtd_drop(struct gru_thread_data *gtd)
+{
+	if (gtd && atomic_dec_return(&gtd->td_refcnt) == 0) {
+		kfree(gtd);
+		STAT(gtd_free);
+	}
+}
+
+/*
+ * Decrement the reference count on a GTS structure. Free the structure
+ * if the reference count goes to zero.
+ */
+void gts_drop(struct gru_thread_state *gts)
+{
+	if (gts && atomic_dec_return(&gts->ts_refcnt) == 0) {
+		gru_drop_mmu_notifier(gts->ts_ms);
+		kfree(gts);
+		STAT(gts_free);
+	}
+}
+
+/*
+ * Locate the GTS structure for the current thread.
+ */
+static struct gru_thread_state *gru_find_current_gts_nolock(struct gru_vma_data
+							    *vdata, int tsid)
+{
+	struct gru_thread_state *gts;
+
+	list_for_each_entry(gts, &vdata->vd_head, ts_next)
+	    if (gts->ts_tsid == tsid)
+		return gts;
+	return NULL;
+}
+
+/*
+ * Break a copy-on-write reference to a gru thread data struct.
+ */
+static int gru_break_cow(struct vm_area_struct *vma,
+			 struct gru_thread_state *gts)
+{
+	struct gru_thread_data *gtd;
+	struct gru_vma_data *vdata = vma->vm_private_data;
+
+	gtd = kmalloc(THREADDATABYTES(vdata), GFP_KERNEL);
+	if (!gtd)
+		return 0;
+	STAT(gtd_alloc);
+	STAT(break_cow);
+	memcpy(gtd, gts->ts_td, THREADDATABYTES(vdata));
+	atomic_set(&gtd->td_refcnt, 1);
+	gtd_drop(gts->ts_td);
+	gts->ts_td = gtd;
+	gru_dbg(grudev, "alloc gts %p, new gtd %p\n", gts, gtd);
+	return 1;
+}
+
+/*
+ * Allocate a thread data structure.
+ */
+static struct gru_thread_data *gru_alloc_gtd(struct gru_vma_data *vdata,
+					     struct gru_thread_state *gts)
+{
+	struct gru_thread_data *gtd;
+	int bytes = THREADDATABYTES(vdata);
+
+	gtd = kzalloc(bytes, GFP_KERNEL);
+	if (!gtd)
+		return NULL;
+
+	STAT(gtd_alloc);
+	atomic_set(&gtd->td_refcnt, 1);
+	gtd->td_magic = TD_MAGIC;
+	gru_dbg(grudev, "alloc vdata %p, new gtd %p\n", vdata, gtd);
+	return gtd;
+}
+
+/*
+ * Allocate a thread state structure.
+ */
+static struct gru_thread_state *gru_alloc_gts(struct vm_area_struct *vma,
+					      struct gru_vma_data *vdata,
+					      int tsid,
+					      struct gru_thread_data *gtd)
+{
+	struct gru_thread_state *gts;
+
+	gts = kzalloc(sizeof(*gts), GFP_KERNEL);
+	if (!gts)
+		return NULL;
+
+	STAT(gts_alloc);
+	atomic_set(&gts->ts_refcnt, 1);
+	sema_init(&gts->ts_ctxsem, 1);
+	gts->ts_cbr_au_count = vdata->vd_cbr_au_count;
+	gts->ts_dsr_au_count = vdata->vd_dsr_au_count;
+	gts->ts_tsid = tsid;
+	gts->ts_user_options = vdata->vd_user_options;
+	gts->ts_ctxnum = NULLCTX;
+	gts->ts_mm = current->mm;
+	gts->ts_vma = vma;
+	gts->ts_tlb_int_select = -1;
+	gts->ts_ms = gru_register_mmu_notifier();
+	if (!gts->ts_ms)
+		goto err;
+
+	if (!gtd)
+		gtd = gru_alloc_gtd(vdata, gts);
+	if (!gtd)
+		goto err;
+
+	gts->ts_td = gtd;
+
+	gru_dbg(grudev, "alloc vdata %p, new gts %p, new gtd %p\n", vdata, gts,
+		gtd);
+	return gts;
+
+err:
+	gts_drop(gts);
+	return NULL;
+}
+
+/*
+ * Allocate a vma private data structure.
+ */
+struct gru_vma_data *gru_alloc_vma_data(struct vm_area_struct *vma, int tsid,
+					void *gtd)
+{
+	struct gru_file_data *fdata;
+	struct gru_vma_data *vdata = NULL;
+	struct gru_thread_state *gts = NULL;
+
+	vdata = kmalloc(sizeof(*vdata), GFP_KERNEL);
+	if (!vdata)
+		return NULL;
+
+	INIT_LIST_HEAD(&vdata->vd_head);
+	spin_lock_init(&vdata->vd_lock);
+	fdata = vma->vm_file->private_data;
+	vdata->vd_cbr_au_count = fdata->fd_cbr_au_count;
+	vdata->vd_dsr_au_count = fdata->fd_dsr_au_count;
+	vdata->vd_thread_slices = fdata->fd_thread_slices;
+	vdata->vd_user_options = fdata->fd_user_options;
+
+	gts = gru_alloc_gts(vma, vdata, TSID(0), gtd);
+	if (!gts) {
+		kfree(vdata);
+		return NULL;
+	}
+	gru_dbg(grudev, "alloc vdata %p, gts %p, gtd %p\n", vdata, gts, gtd);
+	list_add(&gts->ts_next, &vdata->vd_head);
+
+	mb();			/* Make sure head is visible */
+	if (cmpxchg(&vma->vm_private_data, gtd, vdata) != gtd) {
+		if (!gtd)
+			gtd_drop(gts->ts_td);
+		gts_drop(gts);
+		kfree(vdata);
+		STAT(vdata_double_alloc);
+	} else {
+		STAT(vdata_alloc);
+	}
+	return vma->vm_private_data;
+}
+
+/*
+ * Find the thread state structure for the current thread. If none
+ * exists, allocate one.
+ *
+ * Note that the vm_private structure in the vma _may_ be a pointer to
+ * a COW thread data structure. If so, create a vma structure, etc...
+ */
+struct gru_thread_state *gru_find_thread_state(struct vm_area_struct *vma,
+					       int tsid)
+{
+	struct gru_vma_data *vdata;
+	struct gru_thread_state *gts, *ngts;
+
+	vdata = vma->vm_private_data;
+	if (IS_THREAD_DATA(vdata)) {
+		vdata = gru_alloc_vma_data(vma, tsid, vdata);
+		if (!vdata)
+			return NULL;
+	}
+
+	spin_lock(&vdata->vd_lock);
+	gts = gru_find_current_gts_nolock(vdata, tsid);
+	if (gts) {
+		spin_unlock(&vdata->vd_lock);
+		gru_dbg(grudev, "vma %p, gts %p, gtd %p\n", vma, gts,
+			gts->ts_td);
+		return gts;
+	}
+	spin_unlock(&vdata->vd_lock);
+
+	gts = gru_alloc_gts(vma, vdata, tsid, NULL);
+	if (!gts)
+		return NULL;
+
+	spin_lock(&vdata->vd_lock);
+	ngts = gru_find_current_gts_nolock(vdata, tsid);
+	if (ngts) {
+		gts_drop(gts);
+		gts = ngts;
+		STAT(gts_double_allocate);
+	} else {
+		list_add(&gts->ts_next, &vdata->vd_head);
+	}
+	spin_unlock(&vdata->vd_lock);
+
+	gru_dbg(grudev, "vma %p, new gts %p, gtd %p\n", vma, gts, gts->ts_td);
+	return gts;
+}
+
+/*
+ * Free the GRU context assigned to the thread state.
+ */
+static void gru_free_gru_context(struct gru_thread_state *gts)
+{
+	struct gru_state *gru;
+
+	gru = gts->ts_gru;
+	gru_dbg(grudev, "gts %p, gru %p\n", gts, gru);
+
+	spin_lock(&gru->gs_lock);
+	gru->gs_gts[gts->ts_ctxnum] = NULL;
+	free_gru_resources(gru, gts);
+	BUG_ON(test_bit(gts->ts_ctxnum, &gru->gs_context_map) == 0);
+	__clear_bit(gts->ts_ctxnum, &gru->gs_context_map);
+	gts->ts_ctxnum = NULLCTX;
+	gts->ts_gru = NULL;
+	spin_unlock(&gru->gs_lock);
+
+	gts_drop(gts);
+	STAT(free_context);
+}
+
+/*
+ * Prefetching cachelines help hardware performance.
+ */
+static void prefetch_data(void *p, int num, int stride)
+{
+	while (num-- > 0) {
+		prefetchw(p);
+		p += stride;
+	}
+}
+
+static inline long gru_copy_handle(void *d, void *s)
+{
+	memcpy(d, s, GRU_HANDLE_BYTES);
+	return GRU_HANDLE_BYTES;
+}
+
+/* rewrite in assembly & use lots of prefetch */
+static void gru_load_context_data(void *save, void *grubase, int ctxnum,
+				  unsigned long cbrmap, unsigned long dsrmap)
+{
+	void *gseg, *cb, *cbe;
+	unsigned long length;
+	int i, scr;
+
+	gseg = grubase + ctxnum * GRU_GSEG_STRIDE;
+	length = hweight64(dsrmap) * GRU_DSR_AU_BYTES;
+	prefetch_data(gseg + GRU_DS_BASE, length / GRU_CACHE_LINE_BYTES,
+		      GRU_CACHE_LINE_BYTES);
+
+	cb = gseg + GRU_CB_BASE;
+	cbe = grubase + GRU_CBE_BASE;
+	for_each_cbr_in_allocation_map(i, &cbrmap, scr) {
+		prefetch_data(cb, 1, GRU_CACHE_LINE_BYTES);
+		prefetch_data(cbe + i * GRU_HANDLE_STRIDE, 1,
+			      GRU_CACHE_LINE_BYTES);
+		cb += GRU_HANDLE_STRIDE;
+	}
+
+	cb = gseg + GRU_CB_BASE;
+	for_each_cbr_in_allocation_map(i, &cbrmap, scr) {
+		save += gru_copy_handle(cb, save);
+		save += gru_copy_handle(cbe + i * GRU_HANDLE_STRIDE, save);
+		cb += GRU_HANDLE_STRIDE;
+	}
+
+	memcpy(gseg + GRU_DS_BASE, save, length);
+}
+
+static void gru_unload_context_data(void *save, void *grubase, int ctxnum,
+				    unsigned long cbrmap, unsigned long dsrmap)
+{
+	void *gseg, *cb, *cbe;
+	unsigned long length;
+	int i, scr;
+
+	gseg = grubase + ctxnum * GRU_GSEG_STRIDE;
+
+	cb = gseg + GRU_CB_BASE;
+	cbe = grubase + GRU_CBE_BASE;
+	for_each_cbr_in_allocation_map(i, &cbrmap, scr) {
+		save += gru_copy_handle(save, cb);
+		save += gru_copy_handle(save, cbe + i * GRU_HANDLE_STRIDE);
+		cb += GRU_HANDLE_STRIDE;
+	}
+	length = hweight64(dsrmap) * GRU_DSR_AU_BYTES;
+	memcpy(save, gseg + GRU_DS_BASE, length);
+}
+
+void gru_unload_context(struct gru_thread_state *gts, int savestate)
+{
+	struct gru_thread_data *gtd = gts->ts_td;
+	struct gru_state *gru = gts->ts_gru;
+	struct gru_context_configuration_handle *cch;
+	int ctxnum = gts->ts_ctxnum;
+
+	zap_page_range(gts->ts_vma, UGRUADDR(gts), GRU_GSEG_PAGESIZE, NULL);
+	cch = GRU_CCH(gru->gs_gru_base_vaddr, ctxnum);
+
+	lock_handle(cch);
+	if (cch_interrupt_sync(cch))
+		BUG();
+	gru_dbg(grudev, "gts %p, gtd %p\n", gts, gtd);
+
+	gru_unload_mm_tracker(gru, gts->ts_ms, gts->ts_ctxnum);
+	if (savestate)
+		gru_unload_context_data(gtd->td_gdata, gru->gs_gru_base_vaddr,
+					ctxnum, gts->ts_cbr_map,
+					gts->ts_dsr_map);
+
+	if (cch_deallocate(cch))
+		BUG();
+	gts->ts_force_unload = 0;	/* ts_force_unload locked by CCH lock */
+	unlock_handle(cch);
+
+	gru_free_gru_context(gts);
+	STAT(unload_context);
+}
+
+/*
+ * Load a GRU context by copying it from the thread data structure in memory
+ * to the GRU.
+ */
+static void gru_load_context(struct gru_thread_state *gts)
+{
+	struct gru_thread_data *gtd = gts->ts_td;
+	struct gru_state *gru = gts->ts_gru;
+	struct gru_context_configuration_handle *cch;
+	int err, asid, ctxnum = gts->ts_ctxnum;
+
+	gru_dbg(grudev, "gts %p, gtd %p\n", gts, gtd);
+	cch = GRU_CCH(gru->gs_gru_base_vaddr, ctxnum);
+
+	lock_handle(cch);
+	asid = gru_load_mm_tracker(gru, gts->ts_ms, gts->ts_ctxnum);
+	cch->tfm_fault_bit_enable =
+	    (gts->ts_user_options == GRU_OPT_MISS_FMM_POLL
+	     || gts->ts_user_options == GRU_OPT_MISS_FMM_INTR);
+	cch->tlb_int_enable = (gts->ts_user_options == GRU_OPT_MISS_FMM_INTR);
+	if (cch->tlb_int_enable) {
+		gts->ts_tlb_int_select = gru_cpu_fault_map_id();
+		cch->tlb_int_select = gts->ts_tlb_int_select;
+	}
+	cch->tfm_done_bit_enable = 0;
+	err = cch_allocate(cch, asid, gts->ts_cbr_map, gts->ts_dsr_map);
+	if (err) {
+		gru_dbg(grudev,
+			"err %d: cch %p, gts %p, cbr 0x%lx, dsr 0x%lx\n",
+			err, cch, gts, gts->ts_cbr_map, gts->ts_dsr_map);
+		BUG();
+	}
+
+	gru_load_context_data(gtd->td_gdata, gru->gs_gru_base_vaddr, ctxnum,
+			      gts->ts_cbr_map, gts->ts_dsr_map);
+
+	if (cch_start(cch))
+		BUG();
+	unlock_handle(cch);
+
+	STAT(load_context);
+}
+
+/*
+ * Update fields in an active CCH:
+ * 	- retarget interrupts on local blade
+ * 	- force a delayed context unload by clearing the CCH asids. This
+ * 	  forces TLB misses for new GRU instructions. The context is unloaded
+ * 	  when the next TLB miss occurs.
+ */
+static int gru_update_cch(struct gru_thread_state *gts, int int_select)
+{
+	struct gru_context_configuration_handle *cch;
+	struct gru_state *gru = gts->ts_gru;
+	int i, ctxnum = gts->ts_ctxnum, ret = 0;
+
+	cch = GRU_CCH(gru->gs_gru_base_vaddr, ctxnum);
+
+	lock_handle(cch);
+	if (cch->state == CCHSTATE_ACTIVE) {
+		if (gru->gs_gts[gts->ts_ctxnum] != gts)
+			goto exit;
+		if (cch_interrupt(cch))
+			BUG();
+		if (int_select >= 0) {
+			gts->ts_tlb_int_select = int_select;
+			cch->tlb_int_select = int_select;
+		} else {
+			for (i = 0; i < 8; i++)
+				cch->asid[i] = 0;
+			cch->tfm_fault_bit_enable = 0;
+			cch->tlb_int_enable = 0;
+			gts->ts_force_unload = 1;
+		}
+		if (cch_start(cch))
+			BUG();
+		ret = 1;
+	}
+exit:
+	unlock_handle(cch);
+	return ret;
+}
+
+/*
+ * Update CCH tlb interrupt select. Required when all the following is true:
+ * 	- task's GRU context is loaded into a GRU
+ * 	- task is using interrupt notification for TLB faults
+ * 	- task has migrated to a different cpu on the same blade where
+ * 	  it was previously running.
+ */
+static int gru_retarget_intr(struct gru_thread_state *gts)
+{
+	if (gts->ts_tlb_int_select < 0
+	    || gts->ts_tlb_int_select == gru_cpu_fault_map_id())
+		return 0;
+
+	gru_dbg(grudev, "retarget from %d to %d\n", gts->ts_tlb_int_select,
+		gru_cpu_fault_map_id());
+	return gru_update_cch(gts, gru_cpu_fault_map_id());
+}
+
+/*
+ * Try to unload the GRU context. Task has migrated to a different blade.
+ * Called on migration when locks could not be obtained to immediately unload
+ * the context.
+ */
+static int gru_delayed_unload_context(struct gru_thread_state *gts)
+{
+	gru_dbg(grudev, "migration unload context gts %p\n", gts);
+	return gru_update_cch(gts, -1);
+}
+
+/*
+ * All GRU contexts on the local blade are busy. Steal one from another process.
+ * This is a hack until a _real_ resource scheduler is written....
+ */
+#define next_ctxnum(n)		((n) <  GRU_NUM_CCH - 2 ? (n) + 1 : 0)
+#define next_gru(b, g)		(((g) < &(b)->bs_grus[GRU_CHIPLETS_PER_BLADE - 1]) ?  \
+				 ((g)+1) : &(b)->bs_grus[0])
+
+static void gru_steal_context(struct gru_thread_state *gts)
+{
+	struct gru_blade_state *blade;
+	struct gru_state *gru = NULL;
+	struct gru_thread_state *ngts = NULL;
+	int ctxnum, cbr, dsr, ok = 0;
+
+	cbr = gts->ts_cbr_au_count;
+	dsr = gts->ts_dsr_au_count;
+
+	preempt_disable();
+	blade = gru_base[numa_blade_id()];
+	spin_lock(&blade->bs_lock);
+
+	ctxnum = next_ctxnum(blade->bs_lru_ctxnum);
+	gru = blade->bs_lru_gru;
+	if (ctxnum == 0)
+		gru = next_gru(blade, gru);
+	while (1) {
+		spin_lock(&gru->gs_lock);
+		for (; ctxnum < GRU_NUM_CCH; ctxnum++) {
+			if (gru == blade->bs_lru_gru
+			    && ctxnum == blade->bs_lru_ctxnum)
+				break;
+			ok = check_gru_resources(gru, cbr, dsr, GRU_NUM_CCH);
+			if (ok)
+				break;
+			ngts = gru->gs_gts[ctxnum];
+			if (ngts && down_trylock(&ngts->ts_ctxsem) == 0)
+				break;
+			ngts = NULL;
+		}
+		spin_unlock(&gru->gs_lock);
+		if (ok || ngts
+		    || (gru == blade->bs_lru_gru
+			&& ctxnum == blade->bs_lru_ctxnum))
+			break;
+		ctxnum = 0;
+		gru = next_gru(blade, gru);
+	}
+	blade->bs_lru_gru = gru;
+	blade->bs_lru_ctxnum = ctxnum;
+	spin_unlock(&blade->bs_lock);
+	preempt_enable();
+
+	if (ngts) {
+		STAT(steal_context);
+		ngts->ts_steal_jiffies = jiffies;
+		gru_unload_context(ngts, 1);
+		up(&ngts->ts_ctxsem);
+	} else {
+		STAT(steal_context_failed);
+	}
+	gru_dbg(grudev,
+		"stole gru %x, ctxnum %d from gts %p. Need cb %d, ds %d;"
+		" avail cb %ld, ds %ld\n",
+		gru->gs_gid, ctxnum, ngts, cbr, dsr, hweight64(gru->gs_cbr_map),
+		hweight64(gru->gs_dsr_map));
+}
+
+/*
+ * Scan the GRUs on the local blade & assign a GRU context & ASID.
+ */
+static struct gru_state *gru_assign_gru_context(struct gru_thread_state *gts)
+{
+	struct gru_state *gru, *grux;
+	int i, max_active_contexts;
+
+	preempt_disable();
+
+again:
+	gru = NULL;
+	max_active_contexts = GRU_NUM_CCH;
+	for_each_gru_on_blade(grux, numa_blade_id(), i) {
+		if (check_gru_resources(grux, gts->ts_cbr_au_count,
+					gts->ts_dsr_au_count,
+					max_active_contexts)) {
+			gru = grux;
+			max_active_contexts = grux->gs_active_contexts;
+			if (max_active_contexts == 0)
+				break;
+		}
+	}
+
+	if (gru) {
+		spin_lock(&gru->gs_lock);
+		if (!check_gru_resources(gru, gts->ts_cbr_au_count,
+					 gts->ts_dsr_au_count, GRU_NUM_CCH)) {
+			spin_unlock(&gru->gs_lock);
+			goto again;
+		}
+		reserve_gru_resources(gru, gts);
+		gts->ts_gru = gru;
+		gts->ts_ctxnum =
+		    find_first_zero_bit(&gru->gs_context_map, GRU_NUM_CCH);
+		BUG_ON(gts->ts_ctxnum == GRU_NUM_CCH);
+		atomic_inc(&gts->ts_refcnt);
+		gru->gs_gts[gts->ts_ctxnum] = gts;
+		__set_bit(gts->ts_ctxnum, &gru->gs_context_map);
+		spin_unlock(&gru->gs_lock);
+
+		STAT(assign_context);
+		gru_dbg(grudev,
+			"gseg %p, gts %p, gru %x, ctx %d, cbr %d, dsr %d\n",
+			gseg_virtual_address(gts->ts_gru, gts->ts_ctxnum), gts,
+			gts->ts_gru->gs_gid, gts->ts_ctxnum,
+			gts->ts_cbr_au_count, gts->ts_dsr_au_count);
+	} else {
+		gru_dbg(grudev, "failed to allocate a GTS %s\n", "");
+		STAT(assign_context_failed);
+	}
+
+	preempt_enable();
+	return gru;
+}
+
+/*
+ * gru_nopage
+ *
+ * Map the user's GRU segment
+ */
+unsigned long gru_nopfn(struct vm_area_struct *vma, unsigned long address)
+{
+	struct gru_thread_state *gts;
+	unsigned long paddr;
+
+	gru_dbg(grudev, "vma %p, address 0x%lx (0x%lx)\n",
+		vma, address, GSEG_BASE(address));
+	STAT(nopfn);
+
+	gts = gru_find_thread_state(vma, TSID(address - vma->vm_start));
+	if (!gts)
+		return VM_FAULT_SIGBUS;
+
+again:
+	preempt_disable();
+	down(&gts->ts_ctxsem);
+	if (gts->ts_gru) {
+		if (gts->ts_gru->gs_blade_id != numa_blade_id()) {
+			STAT(migrated_nopfn_unload);
+			gru_unload_context(gts, 1);
+		} else {
+			if (gru_retarget_intr(gts))
+				STAT(migrated_nopfn_retarget);
+		}
+	}
+
+	if (!gts->ts_gru) {
+		while (!gru_assign_gru_context(gts)) {
+			up(&gts->ts_ctxsem);
+			preempt_enable();
+			schedule_timeout(GRU_ASSIGN_DELAY);  /* true hack ZZZ */
+			if (gts->ts_steal_jiffies + GRU_STEAL_DELAY < jiffies)
+				gru_steal_context(gts);
+			goto again;
+		}
+		if (atomic_read(&gts->ts_td->td_refcnt) > 1)
+			if (!gru_break_cow(vma, gts)) {
+				up(&gts->ts_ctxsem);
+				return VM_FAULT_SIGBUS;
+			}
+		gru_load_context(gts);
+		paddr = gseg_physical_address(gts->ts_gru, gts->ts_ctxnum);
+		remap_pfn_range(vma, address & ~(GRU_GSEG_PAGESIZE - 1),
+				paddr >> PAGE_SHIFT, GRU_GSEG_PAGESIZE,
+				vma->vm_page_prot);
+	}
+
+	up(&gts->ts_ctxsem);
+	preempt_enable();
+
+	return NOPFN_REFAULT;
+}
+
+/*
+ * gru_migrate_task
+ *
+ * Task has migrated to a different blade or a different cpu on the same blade
+ */
+static int do_migrate_gts(struct gru_state *gru, struct gru_thread_state *gts,
+			   int locked, int pbid, int bid)
+{
+	int again = 0;
+
+	if (pbid == bid) {
+		if (gru_retarget_intr(gts))
+			STAT(migrated_retarget);
+	} else if (locked && down_trylock(&gts->ts_ctxsem) == 0) {
+		spin_unlock(&gru->gs_lock);
+		gru_unload_context(gts, 1);
+		up(&gts->ts_ctxsem);
+		STAT(migrated_unload);
+		again = 1;
+	} else if (gru_delayed_unload_context(gts)) {
+		STAT(migrated_unload_delay);
+	}
+	return again;
+}
+
+void gru_migrate_task(int pcpu, int cpu)
+{
+	struct gru_state *gru;
+	struct gru_thread_state *gts;
+	struct gru_blade_state *blade;
+	struct mm_struct *mm = current->mm;
+	int pbid = cpu_to_blade(pcpu), bid = cpu_to_blade(cpu);
+	int locked = 0, ctxnum, scr;
+
+	STAT(migrate_check);
+	blade = gru_base[bid];
+	if (!blade || !mm)
+		return;
+
+again:
+	if (!locked)
+		locked= down_read_trylock(&mm->mmap_sem);
+	for_each_gru_on_blade(gru, pbid, scr) {
+		spin_lock(&gru->gs_lock);
+		for_each_gts_on_gru(gts, gru, ctxnum)
+			if (gts->ts_tgid_owner == current->tgid && gts->ts_gru)
+				if (do_migrate_gts(gru, gts, locked, pbid, bid))
+					goto again;
+		spin_unlock(&gru->gs_lock);
+	}
+
+	if (locked)
+		up_read(&mm->mmap_sem);
+}
Index: linux/drivers/gru/grummuops.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux/drivers/gru/grummuops.c	2008-02-19 09:30:53.000000000 -0600
@@ -0,0 +1,376 @@
+/*
+ * SN Platform GRU Driver
+ *
+ * 		MMUOPS callbacks  + TLB flushing
+ *
+ * This file handles mmuops callbacks from the core kernel. The callbacks
+ * are used to update the TLB in the GRU as a result of changes in the
+ * state of a process address space. This file also handles TLB invalidates
+ * from the GRU driver.
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2005-2008 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ */
+
+#ifdef EMU
+#include "preemu.h"
+#endif
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/mmu_notifier.h>
+#include <linux/device.h>
+#include <linux/hugetlb.h>
+#include <asm/timex.h>
+#include <asm/processor.h>
+#include <asm/delay.h>
+#include "gru.h"
+#include "grutables.h"
+#ifdef EMU
+#include "emu.h"
+#endif
+
+#define gru_random()	get_cycles()
+
+/* ---------------------------------- TLB Invalidation functions --------
+ * get_tgh_handle
+ *
+ * Find a TGH to use for issuing a TLB invalidate. For GRUs that are on the
+ * local blade, use a fixed TGH that is a function of the blade-local cpu
+ * number. Normally, this TGH is private to the cpu & no contention occurs for
+ * the TGH. For offblade GRUs, select a random TGH in the range above the
+ * private TGHs. A spinlock is required to access this TGH & the lock must be
+ * released when the invalidate is completes. This sucks, but it is the best we
+ * can do.
+ *
+ * Note that the spinlock is IN the TGH handle so locking does not involve
+ * additional cache lines.
+ *
+ */
+static inline int get_off_blade_tgh(struct gru_state *gru)
+{
+	int n;
+
+	n = GRU_NUM_TGH - gru->gs_tgh_first_remote;
+	n = gru_random() % n;
+	n += gru->gs_tgh_first_remote;
+	return n;
+}
+
+static inline int get_on_blade_tgh(struct gru_state *gru)
+{
+	return blade_processor_id() >> gru->gs_tgh_local_shift;
+}
+
+static struct gru_tlb_global_handle *get_lock_tgh_handle(struct gru_state
+							 *gru)
+{
+	struct gru_tlb_global_handle *tgh;
+	int n;
+
+	preempt_disable();
+	if (numa_blade_id() == gru->gs_blade_id)
+		n = get_on_blade_tgh(gru);
+	else
+		n = get_off_blade_tgh(gru);
+	tgh = get_tgh_by_index(gru, n);
+	lock_handle(tgh);
+
+	return tgh;
+}
+
+
+static void get_unlock_tgh_handle(struct gru_tlb_global_handle *tgh)
+{
+	unlock_handle(tgh);
+	preempt_enable();
+}
+
+/*
+ * gru_flush_tlb_range
+ *
+ * General purpose TLB invalidation function. This function scans every GRU in
+ * the ENTIRE system (partition) looking for GRUs where the specified MM has
+ * been accessed by the GRU. For each GRU found, the TLB must be invalidated OR
+ * the ASID invalidated. Invalidating an ASID causes a new ASID to be assigned
+ * on the next fault. This effectively flushes the ENTIRE TLB for the MM at the
+ * cost of (possibly) a large number of future TLBmisses.
+ *
+ * The current algorithm is optimized based on the following (somewhat true)
+ * assumptions:
+ * 	- GRU contexts are not loaded into a GRU unless a reference is made to
+ * 	  the data segment or control block (this is true, not an assumption).
+ * 	  If a DS/CB is referenced, the user will also issue instructions that
+ * 	  cause TLBmisses. It is not necessary to optimize for the case where
+ * 	  contexts are loaded but no instructions cause TLB misses. (I know
+ * 	  this will happen but I'm not optimizing for it).
+ * 	- GRU instructions to invalidate TLB entries are SLOOOOWWW - normally
+ * 	  a few usec but in unusual cases, it could be longer. Avoid if
+ * 	  possible.
+ * 	- intrablade process migration between cpus is not frequent but is
+ * 	  common.
+ * 	- a GRU context is not typically migrated to a different GRU on the
+ * 	  blade because of intrablade migration
+ *	- interblade migration is rare. Processes migrate their GRU context to
+ *	  the new blade.
+ *	- if interblade migration occurs, migration back to the original blade
+ *	  is very very rare (ie., no optimization for this case)
+ *	- most GRU instruction operate on a subset of the user REGIONS. Code
+ *	  & shared library regions are not likely targets of GRU instructions.
+ *
+ * To help improve the efficiency of TLB invalidation, the GMS data
+ * structure is maintained for EACH address space (MM struct). The GMS is
+ * also the structure that contains the pointer to the mmuops callout
+ * functions. This structure is linked to the mm_struct for the address space
+ * using the mmuops "register" function. The mmuops interfaces are used to
+ * provide the callbacks for TLB invalidation. The GMS contains:
+ *
+ * 	- asid[maxgrus] array. ASIDs are assigned to a GRU when a context is
+ * 	  loaded into the GRU.
+ * 	- asidmap[maxgrus]. bitmap to make it easier to find non-zero asids in
+ * 	  the above array
+ *	- ctxbitmap[maxgrus]. Indicates the contexts that are currently active
+ *	  in the GRU for the address space. This bitmap must be passed to the
+ *	  GRU to do an invalidate.
+ *
+ * The current algorithm for invalidating TLBs is:
+ * 	- scan the asidmap for GRUs where the context has been loaded, ie,
+ * 	  asid is non-zero.
+ * 	- for each gru found:
+ * 		- if the ctxtmap is non-zero, there are active contexts in the
+ * 		  GRU. TLB invalidate instructions must be issued to the GRU.
+ *		- if the ctxtmap is zero, no context is active. Set the ASID to
+ *		  zero to force a full TLB invalidation. This is fast but will
+ *		  cause a lot of TLB misses if the context is reloaded onto the
+ *		  GRU
+ *
+ */
+
+void gru_flush_tlb_range(struct gru_mm_struct *gms, unsigned long start,
+			 unsigned long len)
+{
+	struct gru_state *gru;
+	struct gru_mm_tracker *asids;
+	struct gru_tlb_global_handle *tgh;
+	unsigned long num;
+	int grupagesize, pagesize, pageshift, gid, asid;
+
+	pageshift = (is_hugepage(NULL, start) ? HPAGE_SHIFT : PAGE_SHIFT);
+	pagesize = (1UL << pageshift);
+	grupagesize = GRU_PAGESIZE(pageshift);
+	num = min(((len + pagesize - 1) >> pageshift), GRUMAXINVAL);
+
+	STAT(flush_tlb);
+	gru_dbg(grudev, "gms %p, start 0x%lx, len 0x%lx, asidmap 0x%lx\n", gms,
+		start, len, gms->ms_asidmap[0]);
+
+	spin_lock(&gms->ms_asid_lock);
+	for_each_gru_in_bitmap(gid, gms->ms_asidmap) {
+		STAT(flush_tlb_gru);
+		gru = GID_TO_GRU(gid);
+		asids = gms->ms_asids + gid;
+		asid = asids->mt_asid;
+		if (asids->mt_ctxbitmap && asid) {
+			STAT(flush_tlb_gru_tgh);
+			asid = GRUASID(asid, start);
+			gru_dbg(grudev,
+	"  FLUSH gruid %d, asid 0x%x, num %ld, cbmap 0x%x\n",
+				gid, asid, num, asids->mt_ctxbitmap);
+			tgh = get_lock_tgh_handle(gru);
+			tgh_invalidate(tgh, start, 0, asid, grupagesize, 0,
+				       num - 1, asids->mt_ctxbitmap);
+			get_unlock_tgh_handle(tgh);
+		} else {
+			STAT(flush_tlb_gru_zero_asid);
+			asids->mt_asid = 0;
+			__clear_bit(gru->gs_gid, gms->ms_asidmap);
+			gru_dbg(grudev,
+	"  CLEARASID gruid %d, asid 0x%x, cbtmap 0x%x, asidmap 0x%lx\n",
+				gid, asid, asids->mt_ctxbitmap,
+				gms->ms_asidmap[0]);
+		}
+	}
+	spin_unlock(&gms->ms_asid_lock);
+}
+
+/*
+ * Flush the entire TLB on a chiplet.
+ */
+void gru_flush_all_tlb(struct gru_state *gru)
+{
+	struct gru_tlb_global_handle *tgh;
+
+	gru_dbg(grudev, "gru %p, gid %d\n", gru, gru->gs_gid);
+	tgh = get_lock_tgh_handle(gru);
+	tgh_invalidate(tgh, 0, ~0, 0, 1, 1, GRUMAXINVAL - 1, 0);
+	get_unlock_tgh_handle(tgh);
+	preempt_enable();
+}
+
+/*
+ * Called from a mmuops callback to unmap a range of PTEs.
+ *
+ * Called holding the mmap_sem for write.
+ */
+static void gru_mmuops_invalidate_range_begin(struct mmu_notifier *mn,
+				       struct mm_struct *mm,
+				       unsigned long start, unsigned long end,
+				       int atomic)
+{
+	struct gru_mm_struct *gms;
+
+	STAT(mmuops_invalidate_range);
+	gms = container_of(mn, struct gru_mm_struct, ms_notifier);
+	gru_dbg(grudev, "gms %p, start 0x%lx, end 0x%lx, atomic %d\n", gms,
+		start, end, atomic);
+	atomic_inc(&gms->ms_range_active);
+	gru_flush_tlb_range(gms, start, end - start);
+}
+
+static void gru_mmuops_invalidate_range_end(struct mmu_notifier *mn,
+				     struct mm_struct *mm, unsigned long start,
+				     unsigned long end, int atomic)
+{
+	struct gru_mm_struct *gms;
+
+	gms = container_of(mn, struct gru_mm_struct, ms_notifier);
+	gru_dbg(grudev, "gms %p, start 0x%lx, end 0x%lx, atomic %d\n", gms,
+		start, end, atomic);
+	atomic_dec(&gms->ms_range_active);
+	wake_up_all(&gms->ms_wait_queue);
+}
+
+/*
+ * Called from a mmuops callback whenever a valid PTE is unloaded ex. when a
+ * page is paged out by the kernel.
+ *
+ * Called holding the mm->page_table_lock
+ */
+static void gru_mmuops_invalidate_page(struct mmu_notifier *mn,
+				       struct mm_struct *mm,
+				       unsigned long vaddr)
+{
+	struct gru_mm_struct *gms;
+
+	STAT(mmuops_invalidate_page);
+	gms = container_of(mn, struct gru_mm_struct, ms_notifier);
+	gru_dbg(grudev, "gms %p, vaddr 0x%lx\n", gms, vaddr);
+	gru_flush_tlb_range(gms, vaddr, 1);
+}
+
+/*
+ *  Called at start of address space teardown. GTS's still
+ *  hold a reference count on the GMS. Structure is not freed
+ *  until the reference count goes to zero.
+ */
+static void gru_mmuops_release(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+	struct gru_mm_struct *gms;
+
+	STAT(mmuops_release);
+	gms = container_of(mn, struct gru_mm_struct, ms_notifier);
+	gru_dbg(grudev, "gms %p\n", gms);
+	gms->ms_released = 1;
+}
+
+static const struct mmu_notifier_ops gru_mmuops = {
+	.release = gru_mmuops_release,
+	.invalidate_range_begin = gru_mmuops_invalidate_range_begin,
+	.invalidate_range_end = gru_mmuops_invalidate_range_end,
+	.invalidate_page = gru_mmuops_invalidate_page,
+};
+
+/* Move this to the basic mmuops file. But for now... */
+static struct mmu_notifier *mmuops_find_ops(struct mm_struct *mm)
+{
+	struct mmu_notifier *mn;
+	struct hlist_node *n, *t;
+	struct gru_mm_struct *gms;
+
+	hlist_for_each_entry_safe_rcu(mn, n, t, &mm->mmu_notifier.head, hlist)
+	    if (mn->ops == &gru_mmuops) {
+		gms = container_of(mn, struct gru_mm_struct, ms_notifier);
+		if (atomic_read(&gms->ms_refcnt) > 0)
+			return mn;
+	}
+	return NULL;
+}
+
+struct gru_mm_struct *gru_register_mmu_notifier(void)
+{
+	struct gru_mm_struct *gms;
+	struct mmu_notifier *mn;
+
+	mn = mmuops_find_ops(current->mm);
+	if (mn) {
+		gms = container_of(mn, struct gru_mm_struct, ms_notifier);
+		atomic_inc(&gms->ms_refcnt);
+	} else {
+		gms = kzalloc(sizeof(*gms), GFP_KERNEL);
+		if (gms) {
+			spin_lock_init(&gms->ms_asid_lock);
+			gms->ms_notifier.ops = &gru_mmuops;
+			atomic_set(&gms->ms_refcnt, 1);
+			init_waitqueue_head(&gms->ms_wait_queue);
+			INIT_HLIST_NODE(&gms->ms_notifier.hlist);
+			mmu_notifier_register(&gms->ms_notifier, current->mm);
+			synchronize_rcu();
+		}
+	}
+	return gms;
+}
+
+void gru_drop_mmu_notifier(struct gru_mm_struct *gms)
+{
+	if (atomic_dec_return(&gms->ms_refcnt) == 0) {
+		if (!gms->ms_released)
+			mmu_notifier_unregister(&gms->ms_notifier, current->mm);
+		synchronize_rcu();
+		kfree(gms);
+	}
+}
+
+/*
+ * Setup TGH parameters. There are:
+ * 	- 24 TGH handles per GRU chiplet
+ * 	- a portion (MAX_LOCAL_TGH) of the handles are reserved for
+ * 	  use by blade-local cpus
+ * 	- the rest are used by off-blade cpus. This usage is
+ * 	  less frequent than blade-local usage.
+ *
+ * For now, use 16 handles for local flushes, 8 for remote flushes. If the blade
+ * has less tan or equal to 16 cpus, each cpu has a unique handle that it can
+ * use.
+ */
+#define MAX_LOCAL_TGH	16
+
+void gru_tgh_flush_init(struct gru_state *gru)
+{
+	int cpus, shift = 0, n;
+
+	cpus = nr_cpus_blade(gru->gs_blade_id);
+
+	/* n = cpus rounded up to next power of 2 */
+	if (cpus) {
+		n = 1 << fls(cpus - 1);
+
+		/*
+		 * shift count for converting local cpu# to TGH index
+		 *      0 if cpus <= MAX_LOCAL_TGH,
+		 *      1 if cpus <= 2*MAX_LOCAL_TGH,
+		 *      etc
+		 */
+		shift = max(0, fls(n - 1) - fls(MAX_LOCAL_TGH - 1));
+	}
+	gru->gs_tgh_local_shift = shift;
+
+	/* first starting TGH index to use for remote purges */
+	gru->gs_tgh_first_remote = (cpus + (1 << shift) - 1) >> shift;
+
+}
Index: linux/drivers/gru/gruprocfs.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux/drivers/gru/gruprocfs.c	2008-02-15 13:56:46.200364209 -0600
@@ -0,0 +1,309 @@
+/*
+ * SN Platform GRU Driver
+ *
+ *              PROC INTERFACES
+ *
+ * This file supports the /proc interfaces for the GRU driver
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2005-2008 Silicon Graphics, Inc.  All Rights Reserved.
+ */
+
+#ifdef EMU
+#include "preemu.h"
+#endif
+#include <linux/proc_fs.h>
+#include <linux/device.h>
+#include <linux/seq_file.h>
+#include <asm/uaccess.h>
+#include "gru.h"
+#include "grulib.h"
+#include "grutables.h"
+#ifdef EMU
+#include "emu.h"
+#endif
+
+#define print_stat(s, f, id)						\
+	seq_printf(s, "%lu: " id, atomic_long_read(&gru_stats.f))
+
+static int statistics_show(struct seq_file *s, void *p)
+{
+	print_stat(s, fdata_alloc, "allocate fdata");
+	print_stat(s, fdata_free, "free fdata");
+	print_stat(s, vdata_alloc, "allocate vdata");
+	print_stat(s, vdata_free, "free vdata");
+	print_stat(s, gts_alloc, "thread state allocate");
+	print_stat(s, gts_free, "thread state free");
+	print_stat(s, gtd_alloc, "thread data allocate");
+	print_stat(s, gtd_free, "thread data free");
+	print_stat(s, vdata_double_alloc, "race in cow vdata alloc");
+	print_stat(s, gts_double_allocate, "race in cow gts alloc");
+
+	print_stat(s, assign_context, "allocate context");
+	print_stat(s, assign_context_failed, "allocate context failed");
+	print_stat(s, free_context, "free GRU context");
+	print_stat(s, load_context, "load GRU context");
+	print_stat(s, unload_context, "unload GRU context");
+	print_stat(s, steal_context, "steal context");
+	print_stat(s, steal_context_failed, "steal context failed");
+	print_stat(s, nopfn, "nopfn");
+	print_stat(s, break_cow, "break cow data fault");
+
+	print_stat(s, asid_new, "asid new");
+	print_stat(s, asid_next, "asid next");
+	print_stat(s, asid_wrap, "asid wrap");
+	print_stat(s, asid_reuse, "asid reuse");
+
+	print_stat(s, intr, "interrupt");
+	print_stat(s, call_os, "user call os");
+	print_stat(s, call_os_tfh_idle, "call_os_tfh_idle");
+	print_stat(s, call_os_check_for_bug, "call_os_check_for_bug");
+	print_stat(s, call_os_wait_queue, "call_os_wait_queue");
+	print_stat(s, user_flush_tlb, "user flush tlb");
+	print_stat(s, user_unload_context, "user unload context");
+	print_stat(s, user_exception, "user exception");
+	print_stat(s, set_task_slice, "set task slice");
+	print_stat(s, migrate_check, "migrate task check");
+	print_stat(s, migrated_retarget, "migrate retarget");
+	print_stat(s, migrated_unload, "migrate unload");
+	print_stat(s, migrated_unload_delay, "migrate unload delay");
+	print_stat(s, migrated_nopfn_retarget, "migrate nopfn retarget");
+	print_stat(s, migrated_nopfn_unload, "migrate nopfn unload");
+	print_stat(s, tlb_dropin, "tlb dropin");
+	print_stat(s, tlb_dropin_fail_no_asid, "tlb_dropin_fail_no_asid");
+	print_stat(s, tlb_dropin_fail_upm, "tlb_dropin_fail_upm");
+	print_stat(s, tlb_dropin_fail_invalid, "tlb_dropin_fail_invalid");
+	print_stat(s, tlb_dropin_fail_range_active, "tlb_dropin_fail_range_active");
+	print_stat(s, mmuops_invalidate_range, "mmuops invalidate range");
+	print_stat(s, mmuops_invalidate_page, "mmuops update page");
+	print_stat(s, mmuops_age_page, "mmuops age page");
+	print_stat(s, mmuops_release, "mmuops release");
+
+	print_stat(s, flush_tlb, "flush tlb");
+	print_stat(s, flush_tlb_gru, "flush tlb gru");
+	print_stat(s, flush_tlb_gru_tgh, "flush tlb tgh");
+	print_stat(s, flush_tlb_gru_zero_asid, "flush tlb zero asid");
+	return 0;
+}
+
+static ssize_t statistics_write(struct file *file, const char __user *userbuf,
+				size_t count, loff_t *data)
+{
+	memset(&gru_stats, 0, sizeof(gru_stats));
+	return count;
+}
+
+static int options_show(struct seq_file *s, void *p)
+{
+	seq_printf(s, "0x%lx\n", options);
+	return 0;
+}
+
+static ssize_t options_write(struct file *file, const char __user *userbuf,
+			     size_t count, loff_t *data)
+{
+	char buf[80];
+
+	if (copy_from_user
+	    (buf, userbuf, count < sizeof(buf) ? count : sizeof(buf)))
+		return -EFAULT;
+	options = simple_strtoul(buf, NULL, 0);
+
+	return count;
+}
+
+static int cch_seq_show(struct seq_file *file, void *data)
+{
+	long gid = *(long *)data;
+	int i;
+	struct gru_state *gru = GID_TO_GRU(gid);
+	struct gru_thread_state *ts;
+	const char *mode[] = { "??", "UPM", "INTR", "OS_POLL" };
+
+	if (gid == 0)
+		seq_printf(file, "#%5s%5s%6s%9s%6s%8s%8s\n", "gid", "bid",
+			   "ctx#", "pid", "cbrs", "dsbytes", "mode");
+	if (gru)
+		for (i = 0; i < GRU_NUM_CCH; i++) {
+			ts = gru->gs_gts[i];
+			if (!ts)
+				continue;
+			seq_printf(file, " %5d%5d%6d%9d%6d%8d%8s\n",
+				   gru->gs_gid, gru->gs_blade_id, i,
+				   ts->ts_tgid_owner,
+				   ts->ts_cbr_au_count * GRU_CBR_AU_SIZE,
+				   ts->ts_cbr_au_count * GRU_DSR_AU_BYTES,
+				   mode[ts->ts_user_options &
+					GRU_OPT_MISS_MASK]);
+		}
+
+	return 0;
+}
+
+static int gru_seq_show(struct seq_file *file, void *data)
+{
+	long gid = *(long *)data, ctxfree, cbrfree, dsrfree;
+	struct gru_state *gru = GID_TO_GRU(gid);
+
+	if (gid == 0) {
+		seq_printf(file, "#%5s%5s%7s%6s%6s%8s%6s%6s\n", "gid", "nid",
+			   "ctx", "cbr", "dsr", "ctx", "cbr", "dsr");
+		seq_printf(file, "#%5s%5s%7s%6s%6s%8s%6s%6s\n", "", "", "busy",
+			   "busy", "busy", "free", "free", "free");
+	}
+	if (gru) {
+		ctxfree = GRU_NUM_CCH - gru->gs_active_contexts;
+		cbrfree = hweight64(gru->gs_cbr_map) * GRU_CBR_AU_SIZE;
+		dsrfree = hweight64(gru->gs_dsr_map) * GRU_DSR_AU_BYTES;
+		seq_printf(file, " %5d%5d%7ld%6ld%6ld%8ld%6ld%6ld\n",
+			   gru->gs_gid, gru->gs_blade_id, GRU_NUM_CCH - ctxfree,
+			   GRU_NUM_CBE - cbrfree, GRU_NUM_DSR_BYTES - dsrfree,
+			   ctxfree, cbrfree, dsrfree);
+	}
+
+	return 0;
+}
+
+static void seq_stop(struct seq_file *file, void *data)
+{
+}
+
+static void *seq_start(struct seq_file *file, loff_t *gid)
+{
+	if (*gid < GRU_MAX_GRUS)
+		return gid;
+	return NULL;
+}
+
+static void *seq_next(struct seq_file *file, void *data, loff_t *gid)
+{
+	(*gid)++;
+	if (*gid < GRU_MAX_GRUS)
+		return gid;
+	return NULL;
+}
+
+static struct seq_operations cch_seq_ops = {
+	.start = seq_start,
+	.next = seq_next,
+	.stop = seq_stop,
+	.show = cch_seq_show
+};
+
+static struct seq_operations gru_seq_ops = {
+	.start = seq_start,
+	.next = seq_next,
+	.stop = seq_stop,
+	.show = gru_seq_show
+};
+
+static int statistics_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, statistics_show, NULL);
+}
+
+static int options_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, options_show, NULL);
+}
+
+static int cch_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &cch_seq_ops);
+}
+
+static int gru_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &gru_seq_ops);
+}
+
+/* *INDENT-OFF* */
+static struct file_operations statistics_fops = {
+	.open 		= statistics_open,
+	.read 		= seq_read,
+	.write 		= statistics_write,
+	.llseek 	= seq_lseek,
+	.release 	= seq_release,
+};
+
+static struct file_operations options_fops = {
+	.open 		= options_open,
+	.read 		= seq_read,
+	.write 		= options_write,
+	.llseek 	= seq_lseek,
+	.release 	= seq_release,
+};
+
+static struct file_operations cch_fops = {
+	.open 		= cch_open,
+	.read 		= seq_read,
+	.llseek 	= seq_lseek,
+	.release 	= seq_release,
+};
+static struct file_operations gru_fops = {
+	.open 		= gru_open,
+	.read 		= seq_read,
+	.llseek 	= seq_lseek,
+	.release 	= seq_release,
+};
+
+static struct proc_entry {
+	char *name;
+	int mode;
+	struct file_operations *fops;
+	struct proc_dir_entry *entry;
+} proc_files[] = {
+	{"statistics", 0644, &statistics_fops},
+	{"debug_options", 0644, &options_fops},
+	{"cch_status", 0444, &cch_fops},
+	{"gru_status", 0444, &gru_fops},
+	{NULL}
+};
+/* *INDENT-ON* */
+
+static struct proc_dir_entry *proc_gru;
+
+static int create_proc_file(struct proc_entry *p)
+{
+	p->entry = create_proc_entry(p->name, p->mode, proc_gru);
+	if (!p->entry)
+		return -1;
+	p->entry->proc_fops = p->fops;
+	return 0;
+}
+
+static void delete_proc_files(void)
+{
+	struct proc_entry *p;
+
+	if (proc_gru) {
+		for (p = proc_files; p->name; p++)
+			if (p->entry)
+				remove_proc_entry(p->name, proc_gru);
+		remove_proc_entry("gru", NULL);
+	}
+}
+
+int gru_proc_init(void)
+{
+	struct proc_entry *p;
+
+	proc_gru = proc_mkdir("gru", NULL);
+
+	for (p = proc_files; p->name; p++)
+		if (create_proc_file(p))
+			goto err;
+	return 0;
+
+err:
+	delete_proc_files();
+	return -1;
+}
+
+void gru_proc_exit(void)
+{
+	delete_proc_files();
+}
Index: linux/drivers/gru/grutables.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux/drivers/gru/grutables.h	2008-02-22 09:36:17.000000000 -0600
@@ -0,0 +1,517 @@
+/*
+ * SN Platform GRU Driver
+ *
+ *            GRU DRIVER TABLES, MACROS, externs, etc
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2005-2008 Silicon Graphics, Inc.  All Rights Reserved.
+ */
+
+#ifndef _ASM_IA64_SN_GRUTABLES_H
+#define _ASM_IA64_SN_GRUTABLES_H
+
+/*
+ * Tables:
+ *
+ * 	GFD - GRU File Data     - Holds GSEG options. Used to communicate with
+ * 				  user using ioctls.
+ * 	VDATA-VMA Data		- Holds a few parameters. Head of linked list of
+ * 				  GTS tables for threads using the GSEG
+ * 	GTS - Gru Thread State  - contains info for managing a GSEG context. A
+ * 				  GTS is allocated for each thread accessing a
+ * 				  GSEG.
+ *     	GTD - GRU Thread Data   - contains shadow copy of GRU data when GSEG is
+ *     				  not loaded into a GRU
+ *	GMS - GRU Memory Struct - Used to manage TLB shotdowns. Tracks GRUs
+ *				  where a GSEG has been loaded. Similar to
+ *				  an mm_struct but for GRU.
+ *
+ *	GS  - GRU State 	- Used to manage the state of a GRU chiplet
+ *	BS  - Blade State	- Used to manage state of all GRU chiplets
+ *				  on a blade
+ *
+ *
+ *  Normal task tables for task using GRU.
+ *  		- 2 threads in process
+ *  		- 2 GSEGs open in process
+ *  		- GSEG1 is being used by both tthreads
+ *  		- GSEG2 is used only by thread 2
+ *
+ *       task -->|
+ *       task ---+---> mm ->-- (mmuops) -------------+-> gms
+ *                     |                             |
+ *                     |--> vma -> vdata ---> gts--->|		GSEG1 (thread1)
+ *                     |                  |   gtd    |
+ *                     |                  |          |
+ *                     |                  +-> gts--->|		GSEG1 (thread2)
+ *                     |                      gtd    |
+ *                     |                             |
+ *                     |--> vma -> vdata ---> gts--->|		GSEG2 (thread2)
+ *                     |                      gtd
+ *                     .
+ *                     .
+ *
+ *  GSEGs are logically copy-on-write at fork time.
+ *
+ * At open
+ * 	file.private_data -> gfd
+ *
+ * At mmap,
+ * 	vma -> vdata -> gts -> gtd
+ *
+ * After fork
+ *   parent
+ * 	vma -> vdata -> gts -> gtd	# normal case
+ *   child                  /
+ * 	vma -> ------------/		# gtd shared with parent
+ *
+ *   Parent page fault for GSEG
+ *    before
+ *	vma -> vdata -> gts -> gtd
+ *    after
+ *	vma -> vdata -> gts -> gtd	# allocate a new gtd. Old gtd
+ *					  if left with child
+ *
+ *    Child page fault before
+ * 	 vma -> gtd
+ *     after 
+ *	 vma -> vdata -> gts -> gtd	# Allocate GTS. Move old gtd
+ *	 				  to new gts
+ *
+ */
+
+#include <linux/mmu_notifier.h>
+#include <linux/interrupt.h>
+#include <linux/wait.h>
+#include "gru.h"
+#include "gruhandles.h"
+
+
+ /* Some hacks for running on the hardware simulator */
+#ifdef EMU
+#undef local_irq_disable
+#undef local_irq_enable
+#define local_irq_disable() 	emu_local_irq_disable()
+#define local_irq_enable() 	emu_local_irq_enable()
+void emu_local_irq_disable(void);
+void emu_local_irq_enable(void);
+#define gru_stats		gruhdr->egru_stats
+#define gru_base		gruhdr->egru_base
+#define cpu_trinfo		gruhdr->cpu_trinfo
+#define gru_start_paddr		gruhdr->egru_start_paddr
+#define gru_end_paddr		gruhdr->egru_end_paddr
+#define STATIC
+#else
+extern struct gru_stats_s gru_stats;
+extern struct gru_blade_state *gru_base[];
+extern unsigned long gru_start_paddr, gru_end_paddr;
+#define STATIC static
+#endif
+
+#define GRU_MAX_BLADES		MAX_NUMNODES
+#define GRU_MAX_GRUS		(GRU_MAX_BLADES * GRU_CHIPLETS_PER_BLADE)
+
+#define GRU_DRIVER_ID_STR       "SGI GRU Device Driver"
+#define REVISION                "0.01"
+
+/*
+ * GRU statistics.
+ */
+struct gru_stats_s {
+	atomic_long_t fdata_alloc;
+	atomic_long_t fdata_free;
+	atomic_long_t vdata_alloc;
+	atomic_long_t vdata_free;
+	atomic_long_t gts_alloc;
+	atomic_long_t gts_free;
+	atomic_long_t gtd_alloc;
+	atomic_long_t gtd_free;
+	atomic_long_t vdata_double_alloc;
+	atomic_long_t gts_double_allocate;
+	atomic_long_t assign_context;
+	atomic_long_t assign_context_failed;
+	atomic_long_t free_context;
+	atomic_long_t load_context;
+	atomic_long_t unload_context;
+	atomic_long_t steal_context;
+	atomic_long_t steal_context_failed;
+	atomic_long_t nopfn;
+	atomic_long_t break_cow;
+	atomic_long_t asid_new;
+	atomic_long_t asid_next;
+	atomic_long_t asid_wrap;
+	atomic_long_t asid_reuse;
+	atomic_long_t intr;
+	atomic_long_t call_os;
+	atomic_long_t call_os_tfh_idle;
+	atomic_long_t call_os_check_for_bug;
+	atomic_long_t call_os_wait_queue;
+	atomic_long_t user_flush_tlb;
+	atomic_long_t user_unload_context;
+	atomic_long_t user_exception;
+	atomic_long_t set_task_slice;
+	atomic_long_t migrate_check;
+	atomic_long_t migrated_retarget;
+	atomic_long_t migrated_unload;
+	atomic_long_t migrated_unload_delay;
+	atomic_long_t migrated_nopfn_retarget;
+	atomic_long_t migrated_nopfn_unload;
+	atomic_long_t tlb_dropin;
+	atomic_long_t tlb_dropin_fail_no_asid;
+	atomic_long_t tlb_dropin_fail_upm;
+	atomic_long_t tlb_dropin_fail_invalid;
+	atomic_long_t tlb_dropin_fail_range_active;
+	atomic_long_t mmuops_invalidate_range;
+	atomic_long_t mmuops_invalidate_page;
+	atomic_long_t mmuops_age_page;
+	atomic_long_t mmuops_release;
+	atomic_long_t flush_tlb;
+	atomic_long_t flush_tlb_gru;
+	atomic_long_t flush_tlb_gru_tgh;
+	atomic_long_t flush_tlb_gru_zero_asid;
+};
+
+#define GRU_DEBUG 1
+
+#define OPT_DPRINT	1
+#define OPT_STATS	0x2
+
+#ifdef EMU
+# undef dev_printk
+# define dev_printk(level, dev, s, x...)				\
+			EMULOG(TR_GRU_DEBUG, "DRV", s, x)
+#endif
+
+#define IRQ_GRU			110	/* Starting IRQ number for interrupts */
+
+/* Delay in jiffies between attempts to assign a GRU context */
+#define GRU_ASSIGN_DELAY	((HZ * 20) / 1000)
+
+/* If a process has it's context stolen, min delay in jiffies before trying to
+ * steal a context from another process */
+#define GRU_STEAL_DELAY		((HZ * 200) / 1000)
+
+#ifdef GRU_DEBUG
+#define STAT(id)	do {						\
+				if (options & OPT_STATS)		\
+					atomic_long_inc(&gru_stats.id);	\
+			} while (0)
+
+#define gru_dbg(dev, fmt, x...) do {                                            \
+				if (options & OPT_DPRINT) dev_dbg(dev, "%s: " fmt, __FUNCTION__, x); \
+			   } while (0)
+#else
+#define STAT(id)
+#define gru_dbg(x...)
+#endif
+
+/*-----------------------------------------------------------------------------
+ * ASID management
+ */
+//#define MAX_ASID	0xfffff0
+#define MAX_ASID	0x1f0
+#define MIN_ASID	8
+#define ASID_INC	8	/* number of regions */
+
+/* Generate a GRU asid value from a GRU base asid & a virtual address. */
+#ifdef __ia64__
+#define VADDR_HI_BIT		64
+#elif __x86_64
+#define VADDR_HI_BIT		48
+#else
+#error "bad arch"
+#endif
+#define GRUREGION(addr)		((addr) >> (VADDR_HI_BIT - 3) & 3)
+#define GRUASID(asid, addr)	((asid) + GRUREGION(addr))
+
+/*------------------------------------------------------------------------------
+ *  File & VMS Tables
+ */
+
+struct gru_state;
+
+/*
+ * This is the file_private data structure
+ *   Note: values are used only when GRU is mmaped. At that
+ *   time the current values are copied to the GTS.
+ */
+struct gru_file_data {
+	long		fd_user_options;	/* misc user option flags */
+	int		fd_cbr_au_count;	/* number control blocks AU */
+	int		fd_dsr_au_count;	/* data segment size AU */
+	int		fd_thread_slices;	/* max threads that will access
+						   the context */
+};
+
+/*
+ * This structure is pointed to from the mmstruct via the mmuops pointer. There
+ * is one of these per address space.
+ */
+struct gru_mm_tracker {
+	unsigned int		mt_asid_gen;	/* ASID wrap count */
+	int			mt_asid;	/* current base ASID for gru */
+	unsigned short		mt_ctxbitmap;	/* bitmap of contexts using
+						   asid */
+};
+
+struct gru_mm_struct {
+	struct mmu_notifier	ms_notifier;
+	atomic_t		ms_refcnt;
+	char			ms_released;
+	spinlock_t		ms_asid_lock;
+	atomic_t		ms_range_active;	/* number of range_invals active */
+	wait_queue_head_t	ms_wait_queue;
+	DECLARE_BITMAP(ms_asidmap, GRU_MAX_GRUS);
+	struct gru_mm_tracker	ms_asids[GRU_MAX_GRUS];
+};
+
+/*
+ * One of these structures is allocated when a GSEG is mmaped. The
+ * structure is pointed to by the vma->vm_private_data field in the vma struct.
+ * Note: after a fork, the CHILD's vm_private_data field points to a
+ * "struct gru_thread_data" (the VM open callout can't allocate memory).
+ * The normal vdata/gts/gtd structures are allocated on first fault.
+ */
+struct gru_vma_data {
+	spinlock_t		vd_lock;	/* Serialize access to vma */
+	struct list_head	vd_head;	/* head of linked list of gts */
+	long			vd_user_options;/* misc user option flags */
+	int			vd_cbr_au_count;
+	int			vd_dsr_au_count;
+	int			vd_thread_slices;
+};
+
+/*
+ * One of these is allocated for each thread accessing a mmaped GRU. A linked
+ * list of these structure is hung off the struct gru_vma_data in the mm_struct.
+ */
+struct gru_thread_data {
+	long			td_magic;	/* magic ID for IS_THREAD_DATA */
+	atomic_t		td_refcnt;	/* number of GTS structs sharing data */
+	unsigned long		td_gdata[0];	/* save area for GRU data (CB, DS, CBE) */
+};
+#define TD_MAGIC		0xabcd1235
+#define IS_THREAD_DATA(p)	(*((long *)(p)) == TD_MAGIC)
+
+struct gru_thread_state {
+	struct list_head	ts_next;	/* list - head at vma-private */
+	struct semaphore	ts_ctxsem;	/* load/unload CTX lock */
+	struct mm_struct	*ts_mm;		/* mm currently mapped to context */
+	struct vm_area_struct	*ts_vma;	/* vma of GRU context */
+	struct gru_state	*ts_gru;	/* GRU where the context is loaded */
+	struct gru_mm_struct	*ts_ms;		/* asid & ioproc struct */
+	struct gru_thread_data	*ts_td;		/* gru thread data */
+	unsigned long		ts_steal_jiffies;/* jiffies when context last stolen */
+	pid_t			ts_tgid_owner;	/* task that is using the context - for migration */
+	int			ts_tsid;	/* thread that owns the structure */
+	int			ts_tlb_int_select;/* target cpu if interrupts enabled */
+	int			ts_ctxnum;	/* context number where the context is loaded */
+	atomic_t		ts_refcnt;	/* reference count GTS */
+	long			ts_user_options;/* misc user option flags */
+	unsigned long		ts_cbr_map;	/* map of allocated CBRs */
+	unsigned long		ts_dsr_map;	/* map of allocated DATA resources */
+	unsigned char		ts_dsr_au_count;/* Number of DSR resources requied for contest */
+	unsigned char		ts_cbr_au_count;/* Number of CBR resources requied for contest */
+	char			ts_force_unload;/* force context to be unloaded after migration */
+	char			ts_cbr_idx[GRU_CBR_AU];/* CBR numbers of each allocated CB */
+};
+
+/*
+ * Threaded programs actually allocate an array of GSEGs when a context is created. Each
+ * thread uses a separate GSEG. TSID is the index into the GSEG array.
+ */
+#define TSID(off)		((off) / GRU_GSEG_PAGESIZE)
+#define UGRUADDR(gts)		((gts)->ts_vma->vm_start + (gts)->ts_tsid * GRU_GSEG_PAGESIZE)
+
+#define NULLCTX			-1	/* if context not loaded into GRU */
+
+/*-----------------------------------------------------------------------------
+ *  GRU State Tables
+ */
+
+/*
+ * One of these exists for each GRU chiplet.
+ */
+struct gru_state {
+	struct gru_blade_state	*gs_blade;		/* GRU state for entire blade */
+	unsigned long		gs_gru_base_paddr;	/* Physical address of gru segments (64) */
+	void			*gs_gru_base_vaddr;	/* Virtual address of gru segments (64) */
+	char			gs_present;		/* 0=GRU not present */
+	unsigned char		gs_gid;			/* unique GRU number */
+	char			gs_tgh_local_shift;	/* used to pick TGH for local flush */
+	char			gs_tgh_first_remote;	/* starting TGH# for remote flush */
+	short			gs_blade_id;		/* blade of GRU */
+	spinlock_t		gs_asid_lock;		/* lock used for assigning asids */
+	spinlock_t		gs_lock;		/* lock used for assigning contexts */
+
+	/* ---- the following fields are protected by the gs_asid_lock spinlock ---- */
+	int			gs_asid;		/* Next available ASID */
+	int			gs_asid_limit;		/* Limit of available ASIDs */
+	unsigned int		gs_asid_gen;		/* asid generation. Inc on wrap */
+
+	/* ---- the following fields are protected by the gs_lock spinlock ---- */
+	short			gs_active_contexts;	/* number of contexts in use */
+	unsigned long		gs_context_map;		/* bitmap used to manage contexts in use */
+	unsigned long		gs_cbr_map;		/* bitmap used to manage CB resources */
+	unsigned long		gs_dsr_map;		/* bitmap used to manage DATA resources */
+	struct gru_thread_state	*gs_gts[GRU_NUM_CCH];	/* GTS currently using the context */
+};
+
+/*
+ * This structure contains the GRU state for all the GRUs on a blade.
+ */
+struct gru_blade_state {
+	/* ---- the following fields are protected by the blade bs_lock spinlock ---- */
+	spinlock_t		bs_lock;		/* lock used for stealing contexts */
+	int			bs_lru_ctxnum;		/* STEAL - last context stolen */
+	struct gru_state	*bs_lru_gru;		/* STEAL - last gru stolen */
+
+	struct gru_state	bs_grus[GRU_CHIPLETS_PER_BLADE];
+};
+
+/*-----------------------------------------------------------------------------
+ * Address Primitives
+ */
+#define get_tfm_for_cpu(g, c)	((struct gru_tlb_fault_map *)GRU_TFM((g)->gs_gru_base_vaddr, (c)))
+#define get_tfh_by_index(g, i)	((struct gru_tlb_fault_handle *)GRU_TFH((g)->gs_gru_base_vaddr, (i)))
+#define get_tgh_by_index(g, i)	((struct gru_tlb_global_handle *)GRU_TGH((g)->gs_gru_base_vaddr, (i)))
+#define get_cbe_by_index(g, i)	((struct gru_control_block_extended *)GRU_CBE((g)->gs_gru_base_vaddr, (i)))
+
+/*-----------------------------------------------------------------------------
+ * Useful Macros
+ */
+
+/* Number of bytes to save/restore when unloading/loading GRU contexts */
+#define DSR_BYTES(dsr)		((dsr) * GRU_DSR_AU_BYTES)
+#define CB_CBR_BYTES(cbr)	((cbr) * GRU_HANDLE_BYTES * GRU_CBR_AU_SIZE * 2)
+#define THREADDATABYTES(v) 	(sizeof(struct gru_thread_data) + 		\
+					DSR_BYTES((v)->vd_dsr_au_count)	+	\
+					CB_CBR_BYTES((v)->vd_cbr_au_count))
+
+/* Convert a user CB number to the actual CBRNUM */
+#define thread_cbr_number(gts, n) ((gts)->ts_cbr_idx[(n) / GRU_CBR_AU_SIZE] 	\
+				  * GRU_CBR_AU_SIZE + (n) % GRU_CBR_AU_SIZE)
+
+/* Test if a vaddr is a hugepage */
+#define is_hugepage(m, v)	is_hugepage_only_range(m, (v), PAGE_SIZE)
+
+/* Convert a gid to a pointer to the GRU */
+#define GID_TO_GRU(gid)		(gru_base[(gid) / GRU_CHIPLETS_PER_BLADE] ?	\
+				 (&gru_base[(gid) / GRU_CHIPLETS_PER_BLADE]->	\
+					bs_grus[(gid) % GRU_CHIPLETS_PER_BLADE]) : NULL)
+
+/* Scan all active GRUs in a GRU bitmap */
+#define for_each_gru_in_bitmap(gid, map)					\
+	for (gid = find_first_bit(map, GRU_MAX_GRUS); gid < GRU_MAX_GRUS;	\
+			 gid++, gid = find_next_bit(map, GRU_MAX_GRUS, gid))
+
+/* Scan all active GRUs on a specific blade */
+#define for_each_gru_on_blade(gru, nid, i)					\
+	for (gru = gru_base[nid]->bs_grus, i = 0; i < GRU_CHIPLETS_PER_BLADE; i++, gru++)	\
+		if (gru->gs_present)
+
+/* Scan all active GTSs on a gru. Note: must hold ss_lock to use thsi macro. */
+#define for_each_gts_on_gru(gts, gru, ctxnum)					\
+	if (gru->gs_present)							\
+		for (ctxnum = 0; ctxnum < GRU_NUM_CCH; ctxnum++)		\
+			if ((gts = gru->gs_gts[ctxnum]))
+
+/* Scan each CBR whose bit is set in a TFM (or copy of) */
+#define for_each_cbr_in_tfm(i, map)						\
+	for (i = find_first_bit(map, GRU_NUM_CBE); i < GRU_NUM_CBE;		\
+			 i++, i = find_next_bit(map, GRU_NUM_CBE, i))
+
+/* Scan each CBR in a CBR bitmap. Note: multiple CBRs in an allocation unit */
+#define for_each_cbr_in_allocation_map(i, map, k)				\
+	for (k = find_first_bit(map, GRU_CBR_AU); k < GRU_CBR_AU;		\
+			 k = find_next_bit(map, GRU_CBR_AU, k + 1)) 		\
+		for (i = k*GRU_CBR_AU_SIZE; i < (k + 1) * GRU_CBR_AU_SIZE; i++)
+
+/* Scan each DSR in a DSR bitmap. Note: multiple DSRs in an allocation unit */
+#define for_each_dsr_in_allocation_map(i, map, k)				\
+	for (k = find_first_bit((const unsigned long *)map, GRU_DSR_AU);	\
+			k < GRU_DSR_AU;						\
+			k = find_next_bit((const unsigned long *)map, GRU_DSR_AU, k + 1))\
+		for (i = k*GRU_DSR_AU_CL; i < (k + 1) * GRU_DSR_AU_CL; i++)
+
+#define gseg_physical_address(gru, ctxnum)					\
+		(gru->gs_gru_base_paddr + ctxnum * GRU_GSEG_STRIDE)
+#define gseg_virtual_address(gru, ctxnum)					\
+		(gru->gs_gru_base_vaddr + ctxnum * GRU_GSEG_STRIDE)
+
+/* ZZZ Hacks until we hook up to the rest of the UV infrastructure */
+#define NODESPERBLADE		1
+#define CPUSPERSOCKET		8
+#define SOCKETSPERBLADE		2
+#define CPUSPERBLADE		(CPUSPERSOCKET * SOCKETSPERBLADE)
+#define CPUSPERNODE		(CPUSPERBLADE / NODESPERBLADE)
+
+#define blade_processor_id() 	(smp_processor_id() % CPUSPERBLADE)
+#define numa_blade_id() 	(numa_node_id() / NODESPERBLADE)
+#define nid_to_blade(nid)	((nid) / NODESPERBLADE)
+#define nr_cpus_blade(nid)	(CPUSPERSOCKET * SOCKETSPERBLADE)
+#define cpu_to_blade(cpu)	((cpu) / CPUSPERBLADE)
+
+/*-----------------------------------------------------------------------------
+ * Lock / Unlock GRU handles
+ * 	Use the "delresp" bit in the handle as a "lock" bit.
+ */
+
+static inline void lock_handle(void *h)
+{
+	while (test_and_set_bit(1, h)) {
+		cpu_relax();
+#ifdef EMU
+		my_usleep(100);
+#endif
+	}
+}
+
+static inline void unlock_handle(void *h)
+{
+	clear_bit(1, h);
+}
+
+/*-----------------------------------------------------------------------------
+ * Function prototypes & externs
+ */
+extern struct vm_operations_struct gru_vm_ops;
+extern struct device *grudev;
+struct gru_unload_context_req;
+struct gru_vma_data *gru_alloc_vma_data(struct vm_area_struct *vma, int tsid,
+					void *gtd);
+struct gru_thread_state *gru_find_thread_state(struct vm_area_struct *vma,
+					       int tsid);
+void gru_unload_context(struct gru_thread_state *gts, int savestate);
+void gtd_drop(struct gru_thread_data *gtd);
+void gts_drop(struct gru_thread_state *gts);
+void gru_tgh_flush_init(struct gru_state *gru);
+int gru_kservices_init(struct gru_state *gru);
+irqreturn_t gru_intr(int irq, void *dev_id);
+int gru_handle_user_call_os(unsigned long address);
+int gru_user_flush_tlb(unsigned long arg);
+int gru_user_unload_context(unsigned long arg);
+int gru_get_exception_detail(unsigned long arg);
+int gru_set_task_slice(long address);
+int gru_cpu_fault_map_id(void);
+void gru_flush_all_tlb(struct gru_state *gru);
+void gru_migrate_task(int pcpu, int cpu);
+int gru_proc_init(void);
+void gru_proc_exit(void);
+unsigned long reserve_gru_cb_resources(struct gru_state *gru, int cbr_au_count,
+				       char *cbmap);
+unsigned long reserve_gru_ds_resources(struct gru_state *gru, int dsr_au_count,
+				       char *dsmap);
+extern unsigned long gru_nopfn(struct vm_area_struct *, unsigned long);
+extern struct gru_mm_struct *gru_register_mmu_notifier(void);
+extern void gru_drop_mmu_notifier(struct gru_mm_struct *gms);
+
+void gru_flush_tlb_range(struct gru_mm_struct *gms, unsigned long start,
+                           unsigned long len);
+
+extern unsigned long options;
+
+#endif /* _ASM_IA64_SN_GRUTABLES_H */

^ permalink raw reply	[flat|nested] 120+ messages in thread

* [PATCH] mmu notifiers #v7
  2008-02-21 16:10             ` Jack Steiner
@ 2008-02-27 19:26               ` Andrea Arcangeli
  2008-02-27 20:04                 ` Peter Zijlstra
                                   ` (4 more replies)
  0 siblings, 5 replies; 120+ messages in thread
From: Andrea Arcangeli @ 2008-02-27 19:26 UTC (permalink / raw)
  To: Jack Steiner
  Cc: Nick Piggin, akpm, Robin Holt, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	linux-kernel, linux-mm, daniel.blueman, Christoph Lameter

Hello,

I hope this will can be considered final for .25 and be merged. Risk
is zero, the only discussion here is to make an API that will last
forever, functionality-wise all these patches provides zero risk and
zero overhead when MMU_NOTIFIER=n. This last patch covers KVM and GRU
and hopefully all other non-blocking users optimally, and the below
API will hopefully last forever (but even if it lasts just for .25 and
.26 is changed that's fine with us, it's a kernel _internal_ API
anyway, there's absolutely nothing visible to userland).

What Christoph need to do when he's back from vacations to support
sleepable mmu notifiers is to add a CONFIG_XPMEM config option that
will switch the i_mmap_lock from a semaphore to a mutex (any other
change to this patch will be minor compared to that) so XPMEM hardware
will have kernels compiled that way. I don't see other sane ways to
remove the "atomic" parameter from the API (apparently required by
Andrew for merging something not restricted to the xpmem current usage
with only anonymous memory) and I don't want to have such a
locking-change intrusive dependency for all other non-blocking users
that are fine without having to alter how the VM works (for example
KVM and GRU). Very minor changes will be required to this patch to
make it work after the VM locking will be altered (for example the
CONFIG_XPMEM should also switch the mmu_register/unregister locking
from RCU to mutex as well). XPMEM then will only compile if
CONFIG_XPMEM=y and in turn the invalidate_range_* will support
scheduling inside.

I don't think pretending to merge all in one block (I mean including
xpmem support that requires blocking methods) is good idea anymore as
long as we agree the "atomic" parameter shouldn't be merged. But we
can quite easily agree on the below to be optimal for GRU/KVM and
trivially extendible once a CONFIG_XPMEM will be added. So this first
part can go in now I think.

Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -10,6 +10,7 @@
 #include <linux/rbtree.h>
 #include <linux/rwsem.h>
 #include <linux/completion.h>
+#include <linux/mmu_notifier.h>
 #include <asm/page.h>
 #include <asm/mmu.h>
 
@@ -228,6 +229,8 @@ struct mm_struct {
 #ifdef CONFIG_CGROUP_MEM_CONT
 	struct mem_cgroup *mem_cgroup;
 #endif
+
+	struct mmu_notifier_head mmu_notifier; /* MMU notifier list */
 };
 
 #endif /* _LINUX_MM_TYPES_H */
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
new file mode 100644
--- /dev/null
+++ b/include/linux/mmu_notifier.h
@@ -0,0 +1,159 @@
+#ifndef _LINUX_MMU_NOTIFIER_H
+#define _LINUX_MMU_NOTIFIER_H
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+struct mmu_notifier;
+
+struct mmu_notifier_ops {
+	/*
+	 * Called when nobody can register any more notifier in the mm
+	 * and after the "mn" notifier has been disarmed already.
+	 */
+	void (*release)(struct mmu_notifier *mn,
+			struct mm_struct *mm);
+
+	/*
+	 * invalidate_page is called in atomic context after any pte
+	 * has been updated and before dropping the PT lock required
+	 * to update any Linux pte.  Once the PT lock will be released
+	 * the pte will have its final value to export through the
+	 * secondary MMU.  Before this is invoked any secondary MMU is
+	 * still ok to read/write to the page previously pointed by
+	 * the Linux pte because the old page hasn't been freed yet.
+	 * If required set_page_dirty has to be called internally to
+	 * this method.
+	 */
+	void (*invalidate_page)(struct mmu_notifier *mn,
+				struct mm_struct *mm,
+				unsigned long address);
+
+	/*
+	 * Age page is called in atomic context inside the PT lock
+	 * right after the VM is test-and-clearing the young/accessed
+	 * bitflag in the pte. This way the VM will provide proper
+	 * aging to the accesses to the page through the secondary
+	 * MMUs and not only to the ones through the Linux pte.
+	 */
+	int (*age_page)(struct mmu_notifier *mn,
+			struct mm_struct *mm,
+			unsigned long address);
+
+	/*
+	 * invalidate_range_begin() and invalidate_range_end() must be
+	 * paired. Multiple invalidate_range_begin/ends may be nested
+	 * or called concurrently.
+	 */
+	void (*invalidate_range_begin)(struct mmu_notifier *mn,
+				       struct mm_struct *mm,
+				       unsigned long start, unsigned long end);       
+	void (*invalidate_range_end)(struct mmu_notifier *mn,
+				     struct mm_struct *mm,
+				     unsigned long start, unsigned long end);
+};
+
+struct mmu_notifier {
+	struct hlist_node hlist;
+	const struct mmu_notifier_ops *ops;
+};
+
+#ifdef CONFIG_MMU_NOTIFIER
+
+struct mmu_notifier_head {
+	struct hlist_head head;
+	spinlock_t lock;
+};
+
+#include <linux/mm_types.h>
+
+/*
+ * RCU is used to traverse the list. A quiescent period needs to pass
+ * before the notifier is guaranteed to be visible to all threads.
+ */
+extern void mmu_notifier_register(struct mmu_notifier *mn,
+				  struct mm_struct *mm);
+/*
+ * RCU is used to traverse the list. A quiescent period needs to pass
+ * before the "struct mmu_notifier" can be freed. Alternatively it
+ * can be synchronously freed inside ->release when the list can't
+ * change anymore and nobody could possibly walk it.
+ */
+extern void mmu_notifier_unregister(struct mmu_notifier *mn,
+				    struct mm_struct *mm);
+extern void mmu_notifier_release(struct mm_struct *mm);
+extern int mmu_notifier_age_page(struct mm_struct *mm,
+				 unsigned long address);
+
+static inline void mmu_notifier_head_init(struct mmu_notifier_head *mnh)
+{
+	INIT_HLIST_HEAD(&mnh->head);
+	spin_lock_init(&mnh->lock);
+}
+
+#define mmu_notifier(function, mm, args...)				\
+	do {								\
+		struct mmu_notifier *__mn;				\
+		struct hlist_node *__n;					\
+									\
+		if (unlikely(!hlist_empty(&(mm)->mmu_notifier.head))) { \
+			rcu_read_lock();				\
+			hlist_for_each_entry_rcu(__mn, __n,		\
+						 &(mm)->mmu_notifier.head, \
+						 hlist)			\
+				if (__mn->ops->function)		\
+					__mn->ops->function(__mn,	\
+							    mm,		\
+							    args);	\
+			rcu_read_unlock();				\
+		}							\
+	} while (0)
+
+#define ptep_clear_flush_notify(__vma, __address, __ptep)		\
+({									\
+	pte_t __pte;							\
+	__pte = ptep_clear_flush(__vma, __address, __ptep);		\
+	mmu_notifier(invalidate_page, (__vma)->vm_mm, __address);	\
+	__pte;								\
+})
+
+#define ptep_clear_flush_young_notify(__vma, __address, __ptep)		\
+({									\
+	int __young;							\
+	__young = ptep_clear_flush_young(__vma, __address, __ptep);	\
+	__young |= mmu_notifier_age_page((__vma)->vm_mm, __address);	\
+	__young;							\
+})
+
+#else /* CONFIG_MMU_NOTIFIER */
+
+struct mmu_notifier_head {};
+
+#define mmu_notifier_register(mn, mm) do {} while(0)
+#define mmu_notifier_unregister(mn, mm) do {} while (0)
+#define mmu_notifier_release(mm) do {} while (0)
+#define mmu_notifier_age_page(mm, address) ({ 0; })
+#define mmu_notifier_head_init(mmh) do {} while (0)
+
+/*
+ * Notifiers that use the parameters that they were passed so that the
+ * compiler does not complain about unused variables but does proper
+ * parameter checks even if !CONFIG_MMU_NOTIFIER.
+ * Macros generate no code.
+ */
+#define mmu_notifier(function, mm, args...)			       \
+	do {							       \
+		if (0) {					       \
+			struct mmu_notifier *__mn;		       \
+								       \
+			__mn = (struct mmu_notifier *)(0x00ff);	       \
+			__mn->ops->function(__mn, mm, args);	       \
+		};						       \
+	} while (0)
+
+#define ptep_clear_flush_young_notify ptep_clear_flush_young
+#define ptep_clear_flush_notify ptep_clear_flush
+
+#endif /* CONFIG_MMU_NOTIFIER */
+
+#endif /* _LINUX_MMU_NOTIFIER_H */
diff --git a/kernel/fork.c b/kernel/fork.c
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -362,6 +362,7 @@ static struct mm_struct * mm_init(struct
 
 	if (likely(!mm_alloc_pgd(mm))) {
 		mm->def_flags = 0;
+		mmu_notifier_head_init(&mm->mmu_notifier);
 		return mm;
 	}
 
diff --git a/mm/Kconfig b/mm/Kconfig
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -193,3 +193,7 @@ config VIRT_TO_BUS
 config VIRT_TO_BUS
 	def_bool y
 	depends on !ARCH_NO_VIRT_TO_BUS
+
+config MMU_NOTIFIER
+	def_bool y
+	bool "MMU notifier, for paging KVM/RDMA"
diff --git a/mm/Makefile b/mm/Makefile
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -33,4 +33,4 @@ obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_CGROUP_MEM_CONT) += memcontrol.o
-
+obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -194,7 +194,7 @@ __xip_unmap (struct address_space * mapp
 		if (pte) {
 			/* Nuke the page table entry. */
 			flush_cache_page(vma, address, pte_pfn(*pte));
-			pteval = ptep_clear_flush(vma, address, pte);
+			pteval = ptep_clear_flush_notify(vma, address, pte);
 			page_remove_rmap(page, vma);
 			dec_mm_counter(mm, file_rss);
 			BUG_ON(pte_dirty(pteval));
diff --git a/mm/fremap.c b/mm/fremap.c
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -214,7 +214,9 @@ asmlinkage long sys_remap_file_pages(uns
 		spin_unlock(&mapping->i_mmap_lock);
 	}
 
+	mmu_notifier(invalidate_range_begin, mm, start, start + size);
 	err = populate_range(mm, vma, start, size, pgoff);
+	mmu_notifier(invalidate_range_end, mm, start, start + size);
 	if (!err && !(flags & MAP_NONBLOCK)) {
 		if (unlikely(has_write_lock)) {
 			downgrade_write(&mm->mmap_sem);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -755,6 +755,7 @@ void __unmap_hugepage_range(struct vm_ar
 	BUG_ON(start & ~HPAGE_MASK);
 	BUG_ON(end & ~HPAGE_MASK);
 
+	mmu_notifier(invalidate_range_begin, mm, start, end);
 	spin_lock(&mm->page_table_lock);
 	for (address = start; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
@@ -775,6 +776,7 @@ void __unmap_hugepage_range(struct vm_ar
 	}
 	spin_unlock(&mm->page_table_lock);
 	flush_tlb_range(vma, start, end);
+	mmu_notifier(invalidate_range_end, mm, start, end);
 	list_for_each_entry_safe(page, tmp, &page_list, lru) {
 		list_del(&page->lru);
 		put_page(page);
diff --git a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -611,6 +611,9 @@ int copy_page_range(struct mm_struct *ds
 	if (is_vm_hugetlb_page(vma))
 		return copy_hugetlb_page_range(dst_mm, src_mm, vma);
 
+	if (is_cow_mapping(vma->vm_flags))
+		mmu_notifier(invalidate_range_begin, src_mm, addr, end);
+
 	dst_pgd = pgd_offset(dst_mm, addr);
 	src_pgd = pgd_offset(src_mm, addr);
 	do {
@@ -621,6 +624,11 @@ int copy_page_range(struct mm_struct *ds
 						vma, addr, next))
 			return -ENOMEM;
 	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
+
+	if (is_cow_mapping(vma->vm_flags))
+		mmu_notifier(invalidate_range_end, src_mm,
+						vma->vm_start, end);
+
 	return 0;
 }
 
@@ -897,7 +905,9 @@ unsigned long zap_page_range(struct vm_a
 	lru_add_drain();
 	tlb = tlb_gather_mmu(mm, 0);
 	update_hiwater_rss(mm);
+	mmu_notifier(invalidate_range_begin, mm, address, end);
 	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
+	mmu_notifier(invalidate_range_end, mm, address, end);
 	if (tlb)
 		tlb_finish_mmu(tlb, address, end);
 	return end;
@@ -1463,10 +1473,11 @@ int apply_to_page_range(struct mm_struct
 {
 	pgd_t *pgd;
 	unsigned long next;
-	unsigned long end = addr + size;
+	unsigned long start = addr, end = addr + size;
 	int err;
 
 	BUG_ON(addr >= end);
+	mmu_notifier(invalidate_range_begin, mm, start, end);
 	pgd = pgd_offset(mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
@@ -1474,6 +1485,7 @@ int apply_to_page_range(struct mm_struct
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
+	mmu_notifier(invalidate_range_end, mm, start, end);
 	return err;
 }
 EXPORT_SYMBOL_GPL(apply_to_page_range);
@@ -1675,7 +1687,7 @@ gotten:
 		 * seen in the presence of one thread doing SMC and another
 		 * thread doing COW.
 		 */
-		ptep_clear_flush(vma, address, page_table);
+		ptep_clear_flush_notify(vma, address, page_table);
 		set_pte_at(mm, address, page_table, entry);
 		update_mmu_cache(vma, address, entry);
 		lru_cache_add_active(new_page);
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1747,11 +1747,13 @@ static void unmap_region(struct mm_struc
 	lru_add_drain();
 	tlb = tlb_gather_mmu(mm, 0);
 	update_hiwater_rss(mm);
+	mmu_notifier(invalidate_range_begin, mm, start, end);
 	unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
 	free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
 				 next? next->vm_start: 0);
 	tlb_finish_mmu(tlb, start, end);
+	mmu_notifier(invalidate_range_end, mm, start, end);
 }
 
 /*
@@ -2048,6 +2050,7 @@ void exit_mmap(struct mm_struct *mm)
 	vm_unacct_memory(nr_accounted);
 	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
 	tlb_finish_mmu(tlb, 0, end);
+	mmu_notifier_release(mm);
 
 	/*
 	 * Walk the list again, actually closing and freeing it,
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
new file mode 100644
--- /dev/null
+++ b/mm/mmu_notifier.c
@@ -0,0 +1,73 @@
+/*
+ *  linux/mm/mmu_notifier.c
+ *
+ *  Copyright (C) 2008  Qumranet, Inc.
+ *  Copyright (C) 2008  SGI
+ *             Christoph Lameter <clameter@sgi.com>
+ *
+ *  This work is licensed under the terms of the GNU GPL, version 2. See
+ *  the COPYING file in the top-level directory.
+ */
+
+#include <linux/mmu_notifier.h>
+#include <linux/module.h>
+#include <linux/rcupdate.h>
+
+/*
+ * No synchronization. This function can only be called when only a single
+ * process remains that performs teardown.
+ */
+void mmu_notifier_release(struct mm_struct *mm)
+{
+	struct mmu_notifier *mn;
+	struct hlist_node *n, *tmp;
+
+	if (unlikely(!hlist_empty(&mm->mmu_notifier.head))) {
+		hlist_for_each_entry_safe(mn, n, tmp,
+					  &mm->mmu_notifier.head, hlist) {
+			hlist_del(&mn->hlist);
+			if (mn->ops->release)
+				mn->ops->release(mn, mm);
+		}
+	}
+}
+
+/*
+ * If no young bitflag is supported by the hardware, ->age_page can
+ * unmap the address and return 1 or 0 depending if the mapping previously
+ * existed or not.
+ */
+int mmu_notifier_age_page(struct mm_struct *mm, unsigned long address)
+{
+	struct mmu_notifier *mn;
+	struct hlist_node *n;
+	int young = 0;
+
+	if (unlikely(!hlist_empty(&mm->mmu_notifier.head))) {
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(mn, n,
+					 &mm->mmu_notifier.head, hlist) {
+			if (mn->ops->age_page)
+				young |= mn->ops->age_page(mn, mm, address);
+		}
+		rcu_read_unlock();
+	}
+
+	return young;
+}
+
+void mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+	spin_lock(&mm->mmu_notifier.lock);
+	hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier.head);
+	spin_unlock(&mm->mmu_notifier.lock);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_register);
+
+void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+	spin_lock(&mm->mmu_notifier.lock);
+	hlist_del_rcu(&mn->hlist);
+	spin_unlock(&mm->mmu_notifier.lock);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
diff --git a/mm/mprotect.c b/mm/mprotect.c
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -198,10 +198,12 @@ success:
 		dirty_accountable = 1;
 	}
 
+	mmu_notifier(invalidate_range_begin, mm, start, end);
 	if (is_vm_hugetlb_page(vma))
 		hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
 	else
 		change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
+	mmu_notifier(invalidate_range_end, mm, start, end);
 	vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
 	vm_stat_account(mm, newflags, vma->vm_file, nrpages);
 	return 0;
diff --git a/mm/mremap.c b/mm/mremap.c
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -74,6 +74,7 @@ static void move_ptes(struct vm_area_str
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *old_pte, *new_pte, pte;
 	spinlock_t *old_ptl, *new_ptl;
+	unsigned long old_start;
 
 	if (vma->vm_file) {
 		/*
@@ -100,6 +101,9 @@ static void move_ptes(struct vm_area_str
 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
 	arch_enter_lazy_mmu_mode();
 
+	old_start = old_addr;
+	mmu_notifier(invalidate_range_begin, vma->vm_mm,
+		     old_start, old_end);
 	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
 				   new_pte++, new_addr += PAGE_SIZE) {
 		if (pte_none(*old_pte))
@@ -108,6 +112,7 @@ static void move_ptes(struct vm_area_str
 		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
 		set_pte_at(mm, new_addr, new_pte, pte);
 	}
+	mmu_notifier(invalidate_range_end, vma->vm_mm, old_start, old_end);
 
 	arch_leave_lazy_mmu_mode();
 	if (new_ptl != old_ptl)
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -287,7 +287,7 @@ static int page_referenced_one(struct pa
 	if (vma->vm_flags & VM_LOCKED) {
 		referenced++;
 		*mapcount = 1;	/* break early from loop */
-	} else if (ptep_clear_flush_young(vma, address, pte))
+	} else if (ptep_clear_flush_young_notify(vma, address, pte))
 		referenced++;
 
 	/* Pretend the page is referenced if the task has the
@@ -454,7 +454,7 @@ static int page_mkclean_one(struct page 
 		pte_t entry;
 
 		flush_cache_page(vma, address, pte_pfn(*pte));
-		entry = ptep_clear_flush(vma, address, pte);
+		entry = ptep_clear_flush_notify(vma, address, pte);
 		entry = pte_wrprotect(entry);
 		entry = pte_mkclean(entry);
 		set_pte_at(mm, address, pte, entry);
@@ -712,14 +712,14 @@ static int try_to_unmap_one(struct page 
 	 * skipped over this mm) then we should reactivate it.
 	 */
 	if (!migration && ((vma->vm_flags & VM_LOCKED) ||
-			(ptep_clear_flush_young(vma, address, pte)))) {
+			(ptep_clear_flush_young_notify(vma, address, pte)))) {
 		ret = SWAP_FAIL;
 		goto out_unmap;
 	}
 
 	/* Nuke the page table entry. */
 	flush_cache_page(vma, address, page_to_pfn(page));
-	pteval = ptep_clear_flush(vma, address, pte);
+	pteval = ptep_clear_flush_notify(vma, address, pte);
 
 	/* Move the dirty bit to the physical page now the pte is gone. */
 	if (pte_dirty(pteval))
@@ -844,12 +844,12 @@ static void try_to_unmap_cluster(unsigne
 		page = vm_normal_page(vma, address, *pte);
 		BUG_ON(!page || PageAnon(page));
 
-		if (ptep_clear_flush_young(vma, address, pte))
+		if (ptep_clear_flush_young_notify(vma, address, pte))
 			continue;
 
 		/* Nuke the page table entry. */
 		flush_cache_page(vma, address, pte_pfn(*pte));
-		pteval = ptep_clear_flush(vma, address, pte);
+		pteval = ptep_clear_flush_notify(vma, address, pte);
 
 		/* If nonlinear, store the file page offset in the pte. */
 		if (page->index != linear_page_index(vma, address))


^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v7
  2008-02-27 19:26               ` [PATCH] mmu notifiers #v7 Andrea Arcangeli
@ 2008-02-27 20:04                 ` Peter Zijlstra
  2008-02-27 23:06                 ` Christoph Lameter
                                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 120+ messages in thread
From: Peter Zijlstra @ 2008-02-27 20:04 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jack Steiner, Nick Piggin, akpm, Robin Holt, Avi Kivity,
	Izik Eidus, kvm-devel, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter


On Wed, 2008-02-27 at 20:26 +0100, Andrea Arcangeli wrote:
> Hello,
> 
> I hope this will can be considered final for .25 and be merged. Risk
> is zero, the only discussion here is to make an API that will last
> forever, functionality-wise all these patches provides zero risk and
> zero overhead when MMU_NOTIFIER=n. This last patch covers KVM and GRU
> and hopefully all other non-blocking users optimally, and the below
> API will hopefully last forever (but even if it lasts just for .25 and
> ..26 is changed that's fine with us, it's a kernel _internal_ API
> anyway, there's absolutely nothing visible to userland).
> 
> What Christoph need to do when he's back from vacations to support
> sleepable mmu notifiers is to add a CONFIG_XPMEM config option that
> will switch the i_mmap_lock from a semaphore to a mutex (any other
> change to this patch will be minor compared to that) so XPMEM hardware
> will have kernels compiled that way. I don't see other sane ways to
> remove the "atomic" parameter from the API (apparently required by
> Andrew for merging something not restricted to the xpmem current usage
> with only anonymous memory) and I don't want to have such a
> locking-change intrusive dependency for all other non-blocking users
> that are fine without having to alter how the VM works (for example
> KVM and GRU). Very minor changes will be required to this patch to
> make it work after the VM locking will be altered (for example the
> CONFIG_XPMEM should also switch the mmu_register/unregister locking
> from RCU to mutex as well). XPMEM then will only compile if
> CONFIG_XPMEM=y and in turn the invalidate_range_* will support
> scheduling inside.
> 
> I don't think pretending to merge all in one block (I mean including
> xpmem support that requires blocking methods) is good idea anymore as
> long as we agree the "atomic" parameter shouldn't be merged. But we
> can quite easily agree on the below to be optimal for GRU/KVM and
> trivially extendible once a CONFIG_XPMEM will be added. So this first
> part can go in now I think.
> 
> Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
> Signed-off-by: Christoph Lameter <clameter@sgi.com>

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>

> 
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -10,6 +10,7 @@
>  #include <linux/rbtree.h>
>  #include <linux/rwsem.h>
>  #include <linux/completion.h>
> +#include <linux/mmu_notifier.h>
>  #include <asm/page.h>
>  #include <asm/mmu.h>
>  
> @@ -228,6 +229,8 @@ struct mm_struct {
>  #ifdef CONFIG_CGROUP_MEM_CONT
>  	struct mem_cgroup *mem_cgroup;
>  #endif
> +
> +	struct mmu_notifier_head mmu_notifier; /* MMU notifier list */
>  };
>  
>  #endif /* _LINUX_MM_TYPES_H */
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> new file mode 100644
> --- /dev/null
> +++ b/include/linux/mmu_notifier.h
> @@ -0,0 +1,159 @@
> +#ifndef _LINUX_MMU_NOTIFIER_H
> +#define _LINUX_MMU_NOTIFIER_H
> +
> +#include <linux/list.h>
> +#include <linux/spinlock.h>
> +
> +struct mmu_notifier;
> +
> +struct mmu_notifier_ops {
> +	/*
> +	 * Called when nobody can register any more notifier in the mm
> +	 * and after the "mn" notifier has been disarmed already.
> +	 */
> +	void (*release)(struct mmu_notifier *mn,
> +			struct mm_struct *mm);
> +
> +	/*
> +	 * invalidate_page is called in atomic context after any pte
> +	 * has been updated and before dropping the PT lock required
> +	 * to update any Linux pte.  Once the PT lock will be released
> +	 * the pte will have its final value to export through the
> +	 * secondary MMU.  Before this is invoked any secondary MMU is
> +	 * still ok to read/write to the page previously pointed by
> +	 * the Linux pte because the old page hasn't been freed yet.
> +	 * If required set_page_dirty has to be called internally to
> +	 * this method.
> +	 */
> +	void (*invalidate_page)(struct mmu_notifier *mn,
> +				struct mm_struct *mm,
> +				unsigned long address);
> +
> +	/*
> +	 * Age page is called in atomic context inside the PT lock
> +	 * right after the VM is test-and-clearing the young/accessed
> +	 * bitflag in the pte. This way the VM will provide proper
> +	 * aging to the accesses to the page through the secondary
> +	 * MMUs and not only to the ones through the Linux pte.
> +	 */
> +	int (*age_page)(struct mmu_notifier *mn,
> +			struct mm_struct *mm,
> +			unsigned long address);
> +
> +	/*
> +	 * invalidate_range_begin() and invalidate_range_end() must be
> +	 * paired. Multiple invalidate_range_begin/ends may be nested
> +	 * or called concurrently.
> +	 */
> +	void (*invalidate_range_begin)(struct mmu_notifier *mn,
> +				       struct mm_struct *mm,
> +				       unsigned long start, unsigned long end);       
> +	void (*invalidate_range_end)(struct mmu_notifier *mn,
> +				     struct mm_struct *mm,
> +				     unsigned long start, unsigned long end);
> +};
> +
> +struct mmu_notifier {
> +	struct hlist_node hlist;
> +	const struct mmu_notifier_ops *ops;
> +};
> +
> +#ifdef CONFIG_MMU_NOTIFIER
> +
> +struct mmu_notifier_head {
> +	struct hlist_head head;
> +	spinlock_t lock;
> +};
> +
> +#include <linux/mm_types.h>
> +
> +/*
> + * RCU is used to traverse the list. A quiescent period needs to pass
> + * before the notifier is guaranteed to be visible to all threads.
> + */
> +extern void mmu_notifier_register(struct mmu_notifier *mn,
> +				  struct mm_struct *mm);
> +/*
> + * RCU is used to traverse the list. A quiescent period needs to pass
> + * before the "struct mmu_notifier" can be freed. Alternatively it
> + * can be synchronously freed inside ->release when the list can't
> + * change anymore and nobody could possibly walk it.
> + */
> +extern void mmu_notifier_unregister(struct mmu_notifier *mn,
> +				    struct mm_struct *mm);
> +extern void mmu_notifier_release(struct mm_struct *mm);
> +extern int mmu_notifier_age_page(struct mm_struct *mm,
> +				 unsigned long address);
> +
> +static inline void mmu_notifier_head_init(struct mmu_notifier_head *mnh)
> +{
> +	INIT_HLIST_HEAD(&mnh->head);
> +	spin_lock_init(&mnh->lock);
> +}
> +
> +#define mmu_notifier(function, mm, args...)				\
> +	do {								\
> +		struct mmu_notifier *__mn;				\
> +		struct hlist_node *__n;					\
> +									\
> +		if (unlikely(!hlist_empty(&(mm)->mmu_notifier.head))) { \
> +			rcu_read_lock();				\
> +			hlist_for_each_entry_rcu(__mn, __n,		\
> +						 &(mm)->mmu_notifier.head, \
> +						 hlist)			\
> +				if (__mn->ops->function)		\
> +					__mn->ops->function(__mn,	\
> +							    mm,		\
> +							    args);	\
> +			rcu_read_unlock();				\
> +		}							\
> +	} while (0)
> +
> +#define ptep_clear_flush_notify(__vma, __address, __ptep)		\
> +({									\
> +	pte_t __pte;							\
> +	__pte = ptep_clear_flush(__vma, __address, __ptep);		\
> +	mmu_notifier(invalidate_page, (__vma)->vm_mm, __address);	\
> +	__pte;								\
> +})
> +
> +#define ptep_clear_flush_young_notify(__vma, __address, __ptep)		\
> +({									\
> +	int __young;							\
> +	__young = ptep_clear_flush_young(__vma, __address, __ptep);	\
> +	__young |= mmu_notifier_age_page((__vma)->vm_mm, __address);	\
> +	__young;							\
> +})
> +
> +#else /* CONFIG_MMU_NOTIFIER */
> +
> +struct mmu_notifier_head {};
> +
> +#define mmu_notifier_register(mn, mm) do {} while(0)
> +#define mmu_notifier_unregister(mn, mm) do {} while (0)
> +#define mmu_notifier_release(mm) do {} while (0)
> +#define mmu_notifier_age_page(mm, address) ({ 0; })
> +#define mmu_notifier_head_init(mmh) do {} while (0)
> +
> +/*
> + * Notifiers that use the parameters that they were passed so that the
> + * compiler does not complain about unused variables but does proper
> + * parameter checks even if !CONFIG_MMU_NOTIFIER.
> + * Macros generate no code.
> + */
> +#define mmu_notifier(function, mm, args...)			       \
> +	do {							       \
> +		if (0) {					       \
> +			struct mmu_notifier *__mn;		       \
> +								       \
> +			__mn = (struct mmu_notifier *)(0x00ff);	       \
> +			__mn->ops->function(__mn, mm, args);	       \
> +		};						       \
> +	} while (0)
> +
> +#define ptep_clear_flush_young_notify ptep_clear_flush_young
> +#define ptep_clear_flush_notify ptep_clear_flush
> +
> +#endif /* CONFIG_MMU_NOTIFIER */
> +
> +#endif /* _LINUX_MMU_NOTIFIER_H */
> diff --git a/kernel/fork.c b/kernel/fork.c
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -362,6 +362,7 @@ static struct mm_struct * mm_init(struct
>  
>  	if (likely(!mm_alloc_pgd(mm))) {
>  		mm->def_flags = 0;
> +		mmu_notifier_head_init(&mm->mmu_notifier);
>  		return mm;
>  	}
>  
> diff --git a/mm/Kconfig b/mm/Kconfig
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -193,3 +193,7 @@ config VIRT_TO_BUS
>  config VIRT_TO_BUS
>  	def_bool y
>  	depends on !ARCH_NO_VIRT_TO_BUS
> +
> +config MMU_NOTIFIER
> +	def_bool y
> +	bool "MMU notifier, for paging KVM/RDMA"
> diff --git a/mm/Makefile b/mm/Makefile
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -33,4 +33,4 @@ obj-$(CONFIG_SMP) += allocpercpu.o
>  obj-$(CONFIG_SMP) += allocpercpu.o
>  obj-$(CONFIG_QUICKLIST) += quicklist.o
>  obj-$(CONFIG_CGROUP_MEM_CONT) += memcontrol.o
> -
> +obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
> diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
> --- a/mm/filemap_xip.c
> +++ b/mm/filemap_xip.c
> @@ -194,7 +194,7 @@ __xip_unmap (struct address_space * mapp
>  		if (pte) {
>  			/* Nuke the page table entry. */
>  			flush_cache_page(vma, address, pte_pfn(*pte));
> -			pteval = ptep_clear_flush(vma, address, pte);
> +			pteval = ptep_clear_flush_notify(vma, address, pte);
>  			page_remove_rmap(page, vma);
>  			dec_mm_counter(mm, file_rss);
>  			BUG_ON(pte_dirty(pteval));
> diff --git a/mm/fremap.c b/mm/fremap.c
> --- a/mm/fremap.c
> +++ b/mm/fremap.c
> @@ -214,7 +214,9 @@ asmlinkage long sys_remap_file_pages(uns
>  		spin_unlock(&mapping->i_mmap_lock);
>  	}
>  
> +	mmu_notifier(invalidate_range_begin, mm, start, start + size);
>  	err = populate_range(mm, vma, start, size, pgoff);
> +	mmu_notifier(invalidate_range_end, mm, start, start + size);
>  	if (!err && !(flags & MAP_NONBLOCK)) {
>  		if (unlikely(has_write_lock)) {
>  			downgrade_write(&mm->mmap_sem);
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -755,6 +755,7 @@ void __unmap_hugepage_range(struct vm_ar
>  	BUG_ON(start & ~HPAGE_MASK);
>  	BUG_ON(end & ~HPAGE_MASK);
>  
> +	mmu_notifier(invalidate_range_begin, mm, start, end);
>  	spin_lock(&mm->page_table_lock);
>  	for (address = start; address < end; address += HPAGE_SIZE) {
>  		ptep = huge_pte_offset(mm, address);
> @@ -775,6 +776,7 @@ void __unmap_hugepage_range(struct vm_ar
>  	}
>  	spin_unlock(&mm->page_table_lock);
>  	flush_tlb_range(vma, start, end);
> +	mmu_notifier(invalidate_range_end, mm, start, end);
>  	list_for_each_entry_safe(page, tmp, &page_list, lru) {
>  		list_del(&page->lru);
>  		put_page(page);
> diff --git a/mm/memory.c b/mm/memory.c
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -611,6 +611,9 @@ int copy_page_range(struct mm_struct *ds
>  	if (is_vm_hugetlb_page(vma))
>  		return copy_hugetlb_page_range(dst_mm, src_mm, vma);
>  
> +	if (is_cow_mapping(vma->vm_flags))
> +		mmu_notifier(invalidate_range_begin, src_mm, addr, end);
> +
>  	dst_pgd = pgd_offset(dst_mm, addr);
>  	src_pgd = pgd_offset(src_mm, addr);
>  	do {
> @@ -621,6 +624,11 @@ int copy_page_range(struct mm_struct *ds
>  						vma, addr, next))
>  			return -ENOMEM;
>  	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
> +
> +	if (is_cow_mapping(vma->vm_flags))
> +		mmu_notifier(invalidate_range_end, src_mm,
> +						vma->vm_start, end);
> +
>  	return 0;
>  }
>  
> @@ -897,7 +905,9 @@ unsigned long zap_page_range(struct vm_a
>  	lru_add_drain();
>  	tlb = tlb_gather_mmu(mm, 0);
>  	update_hiwater_rss(mm);
> +	mmu_notifier(invalidate_range_begin, mm, address, end);
>  	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
> +	mmu_notifier(invalidate_range_end, mm, address, end);
>  	if (tlb)
>  		tlb_finish_mmu(tlb, address, end);
>  	return end;
> @@ -1463,10 +1473,11 @@ int apply_to_page_range(struct mm_struct
>  {
>  	pgd_t *pgd;
>  	unsigned long next;
> -	unsigned long end = addr + size;
> +	unsigned long start = addr, end = addr + size;
>  	int err;
>  
>  	BUG_ON(addr >= end);
> +	mmu_notifier(invalidate_range_begin, mm, start, end);
>  	pgd = pgd_offset(mm, addr);
>  	do {
>  		next = pgd_addr_end(addr, end);
> @@ -1474,6 +1485,7 @@ int apply_to_page_range(struct mm_struct
>  		if (err)
>  			break;
>  	} while (pgd++, addr = next, addr != end);
> +	mmu_notifier(invalidate_range_end, mm, start, end);
>  	return err;
>  }
>  EXPORT_SYMBOL_GPL(apply_to_page_range);
> @@ -1675,7 +1687,7 @@ gotten:
>  		 * seen in the presence of one thread doing SMC and another
>  		 * thread doing COW.
>  		 */
> -		ptep_clear_flush(vma, address, page_table);
> +		ptep_clear_flush_notify(vma, address, page_table);
>  		set_pte_at(mm, address, page_table, entry);
>  		update_mmu_cache(vma, address, entry);
>  		lru_cache_add_active(new_page);
> diff --git a/mm/mmap.c b/mm/mmap.c
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1747,11 +1747,13 @@ static void unmap_region(struct mm_struc
>  	lru_add_drain();
>  	tlb = tlb_gather_mmu(mm, 0);
>  	update_hiwater_rss(mm);
> +	mmu_notifier(invalidate_range_begin, mm, start, end);
>  	unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
>  	vm_unacct_memory(nr_accounted);
>  	free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
>  				 next? next->vm_start: 0);
>  	tlb_finish_mmu(tlb, start, end);
> +	mmu_notifier(invalidate_range_end, mm, start, end);
>  }
>  
>  /*
> @@ -2048,6 +2050,7 @@ void exit_mmap(struct mm_struct *mm)
>  	vm_unacct_memory(nr_accounted);
>  	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
>  	tlb_finish_mmu(tlb, 0, end);
> +	mmu_notifier_release(mm);
>  
>  	/*
>  	 * Walk the list again, actually closing and freeing it,
> diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
> new file mode 100644
> --- /dev/null
> +++ b/mm/mmu_notifier.c
> @@ -0,0 +1,73 @@
> +/*
> + *  linux/mm/mmu_notifier.c
> + *
> + *  Copyright (C) 2008  Qumranet, Inc.
> + *  Copyright (C) 2008  SGI
> + *             Christoph Lameter <clameter@sgi.com>
> + *
> + *  This work is licensed under the terms of the GNU GPL, version 2. See
> + *  the COPYING file in the top-level directory.
> + */
> +
> +#include <linux/mmu_notifier.h>
> +#include <linux/module.h>
> +#include <linux/rcupdate.h>
> +
> +/*
> + * No synchronization. This function can only be called when only a single
> + * process remains that performs teardown.
> + */
> +void mmu_notifier_release(struct mm_struct *mm)
> +{
> +	struct mmu_notifier *mn;
> +	struct hlist_node *n, *tmp;
> +
> +	if (unlikely(!hlist_empty(&mm->mmu_notifier.head))) {
> +		hlist_for_each_entry_safe(mn, n, tmp,
> +					  &mm->mmu_notifier.head, hlist) {
> +			hlist_del(&mn->hlist);
> +			if (mn->ops->release)
> +				mn->ops->release(mn, mm);
> +		}
> +	}
> +}
> +
> +/*
> + * If no young bitflag is supported by the hardware, ->age_page can
> + * unmap the address and return 1 or 0 depending if the mapping previously
> + * existed or not.
> + */
> +int mmu_notifier_age_page(struct mm_struct *mm, unsigned long address)
> +{
> +	struct mmu_notifier *mn;
> +	struct hlist_node *n;
> +	int young = 0;
> +
> +	if (unlikely(!hlist_empty(&mm->mmu_notifier.head))) {
> +		rcu_read_lock();
> +		hlist_for_each_entry_rcu(mn, n,
> +					 &mm->mmu_notifier.head, hlist) {
> +			if (mn->ops->age_page)
> +				young |= mn->ops->age_page(mn, mm, address);
> +		}
> +		rcu_read_unlock();
> +	}
> +
> +	return young;
> +}
> +
> +void mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
> +{
> +	spin_lock(&mm->mmu_notifier.lock);
> +	hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier.head);
> +	spin_unlock(&mm->mmu_notifier.lock);
> +}
> +EXPORT_SYMBOL_GPL(mmu_notifier_register);
> +
> +void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
> +{
> +	spin_lock(&mm->mmu_notifier.lock);
> +	hlist_del_rcu(&mn->hlist);
> +	spin_unlock(&mm->mmu_notifier.lock);
> +}
> +EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -198,10 +198,12 @@ success:
>  		dirty_accountable = 1;
>  	}
>  
> +	mmu_notifier(invalidate_range_begin, mm, start, end);
>  	if (is_vm_hugetlb_page(vma))
>  		hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
>  	else
>  		change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
> +	mmu_notifier(invalidate_range_end, mm, start, end);
>  	vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
>  	vm_stat_account(mm, newflags, vma->vm_file, nrpages);
>  	return 0;
> diff --git a/mm/mremap.c b/mm/mremap.c
> --- a/mm/mremap.c
> +++ b/mm/mremap.c
> @@ -74,6 +74,7 @@ static void move_ptes(struct vm_area_str
>  	struct mm_struct *mm = vma->vm_mm;
>  	pte_t *old_pte, *new_pte, pte;
>  	spinlock_t *old_ptl, *new_ptl;
> +	unsigned long old_start;
>  
>  	if (vma->vm_file) {
>  		/*
> @@ -100,6 +101,9 @@ static void move_ptes(struct vm_area_str
>  		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
>  	arch_enter_lazy_mmu_mode();
>  
> +	old_start = old_addr;
> +	mmu_notifier(invalidate_range_begin, vma->vm_mm,
> +		     old_start, old_end);
>  	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
>  				   new_pte++, new_addr += PAGE_SIZE) {
>  		if (pte_none(*old_pte))
> @@ -108,6 +112,7 @@ static void move_ptes(struct vm_area_str
>  		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
>  		set_pte_at(mm, new_addr, new_pte, pte);
>  	}
> +	mmu_notifier(invalidate_range_end, vma->vm_mm, old_start, old_end);
>  
>  	arch_leave_lazy_mmu_mode();
>  	if (new_ptl != old_ptl)
> diff --git a/mm/rmap.c b/mm/rmap.c
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -287,7 +287,7 @@ static int page_referenced_one(struct pa
>  	if (vma->vm_flags & VM_LOCKED) {
>  		referenced++;
>  		*mapcount = 1;	/* break early from loop */
> -	} else if (ptep_clear_flush_young(vma, address, pte))
> +	} else if (ptep_clear_flush_young_notify(vma, address, pte))
>  		referenced++;
>  
>  	/* Pretend the page is referenced if the task has the
> @@ -454,7 +454,7 @@ static int page_mkclean_one(struct page 
>  		pte_t entry;
>  
>  		flush_cache_page(vma, address, pte_pfn(*pte));
> -		entry = ptep_clear_flush(vma, address, pte);
> +		entry = ptep_clear_flush_notify(vma, address, pte);
>  		entry = pte_wrprotect(entry);
>  		entry = pte_mkclean(entry);
>  		set_pte_at(mm, address, pte, entry);
> @@ -712,14 +712,14 @@ static int try_to_unmap_one(struct page 
>  	 * skipped over this mm) then we should reactivate it.
>  	 */
>  	if (!migration && ((vma->vm_flags & VM_LOCKED) ||
> -			(ptep_clear_flush_young(vma, address, pte)))) {
> +			(ptep_clear_flush_young_notify(vma, address, pte)))) {
>  		ret = SWAP_FAIL;
>  		goto out_unmap;
>  	}
>  
>  	/* Nuke the page table entry. */
>  	flush_cache_page(vma, address, page_to_pfn(page));
> -	pteval = ptep_clear_flush(vma, address, pte);
> +	pteval = ptep_clear_flush_notify(vma, address, pte);
>  
>  	/* Move the dirty bit to the physical page now the pte is gone. */
>  	if (pte_dirty(pteval))
> @@ -844,12 +844,12 @@ static void try_to_unmap_cluster(unsigne
>  		page = vm_normal_page(vma, address, *pte);
>  		BUG_ON(!page || PageAnon(page));
>  
> -		if (ptep_clear_flush_young(vma, address, pte))
> +		if (ptep_clear_flush_young_notify(vma, address, pte))
>  			continue;
>  
>  		/* Nuke the page table entry. */
>  		flush_cache_page(vma, address, pte_pfn(*pte));
> -		pteval = ptep_clear_flush(vma, address, pte);
> +		pteval = ptep_clear_flush_notify(vma, address, pte);
>  
>  		/* If nonlinear, store the file page offset in the pte. */
>  		if (page->index != linear_page_index(vma, address))
> 


^ permalink raw reply	[flat|nested] 120+ messages in thread

* [PATCH] KVM swapping with mmu notifiers #v7
  2008-02-20 10:45         ` [PATCH] KVM swapping (+ seqlock fix) with " Andrea Arcangeli
@ 2008-02-27 22:06           ` Andrea Arcangeli
  2008-02-28  8:42             ` izik eidus
  0 siblings, 1 reply; 120+ messages in thread
From: Andrea Arcangeli @ 2008-02-27 22:06 UTC (permalink / raw)
  To: Nick Piggin
  Cc: akpm, Robin Holt, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	steiner, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

Same as before but one one hand ported to #v7 API and on the other
hand ported to latest kvm.git.

Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 41962e7..e1287ab 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -21,6 +21,7 @@ config KVM
 	tristate "Kernel-based Virtual Machine (KVM) support"
 	depends on HAVE_KVM && EXPERIMENTAL
 	select PREEMPT_NOTIFIERS
+	select MMU_NOTIFIER
 	select ANON_INODES
 	---help---
 	  Support hosting fully virtualized guest machines using hardware
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4583329..4067b0f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -642,6 +642,110 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
 	account_shadowed(kvm, gfn);
 }
 
+static void kvm_unmap_spte(struct kvm *kvm, u64 *spte)
+{
+	struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
+	get_page(page);
+	rmap_remove(kvm, spte);
+	set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+	kvm_flush_remote_tlbs(kvm);
+	__free_page(page);
+}
+
+static void kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+	u64 *spte, *curr_spte;
+
+	spte = rmap_next(kvm, rmapp, NULL);
+	while (spte) {
+		BUG_ON(!(*spte & PT_PRESENT_MASK));
+		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
+		curr_spte = spte;
+		spte = rmap_next(kvm, rmapp, spte);
+		kvm_unmap_spte(kvm, curr_spte);
+	}
+}
+
+void kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+	int i;
+
+	/*
+	 * If mmap_sem isn't taken, we can look the memslots with only
+	 * the mmu_lock by skipping over the slots with userspace_addr == 0.
+	 */
+	spin_lock(&kvm->mmu_lock);
+	for (i = 0; i < kvm->nmemslots; i++) {
+		struct kvm_memory_slot *memslot = &kvm->memslots[i];
+		unsigned long start = memslot->userspace_addr;
+		unsigned long end;
+
+		/* mmu_lock protects userspace_addr */
+		if (!start)
+			continue;
+
+		end = start + (memslot->npages << PAGE_SHIFT);
+		if (hva >= start && hva < end) {
+			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+			kvm_unmap_rmapp(kvm, &memslot->rmap[gfn_offset]);
+		}
+	}
+	spin_unlock(&kvm->mmu_lock);
+}
+
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+	u64 *spte;
+	int young = 0;
+
+	spte = rmap_next(kvm, rmapp, NULL);
+	while (spte) {
+		int _young;
+		u64 _spte = *spte;
+		BUG_ON(!(_spte & PT_PRESENT_MASK));
+		_young = _spte & PT_ACCESSED_MASK;
+		if (_young) {
+			young = !!_young;
+			set_shadow_pte(spte, _spte & ~PT_ACCESSED_MASK);
+		}
+		spte = rmap_next(kvm, rmapp, spte);
+	}
+	return young;
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+{
+	int i;
+	int young = 0;
+
+	/*
+	 * If mmap_sem isn't taken, we can look the memslots with only
+	 * the mmu_lock by skipping over the slots with userspace_addr == 0.
+	 */
+	spin_lock(&kvm->mmu_lock);
+	for (i = 0; i < kvm->nmemslots; i++) {
+		struct kvm_memory_slot *memslot = &kvm->memslots[i];
+		unsigned long start = memslot->userspace_addr;
+		unsigned long end;
+
+		/* mmu_lock protects userspace_addr */
+		if (!start)
+			continue;
+
+		end = start + (memslot->npages << PAGE_SHIFT);
+		if (hva >= start && hva < end) {
+			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+			young |= kvm_age_rmapp(kvm, &memslot->rmap[gfn_offset]);
+		}
+	}
+	spin_unlock(&kvm->mmu_lock);
+
+	if (young)
+		kvm_flush_remote_tlbs(kvm);
+
+	return young;
+}
+
 #ifdef MMU_DEBUG
 static int is_empty_shadow_page(u64 *spt)
 {
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 17f9d16..b014b19 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -380,6 +380,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	int r;
 	struct page *page;
 	int largepage = 0;
+	unsigned mmu_seq;
 
 	pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
 	kvm_mmu_audit(vcpu, "pre page fault");
@@ -415,6 +416,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 			largepage = 1;
 		}
 	}
+	mmu_seq = read_seqbegin(&vcpu->kvm->arch.mmu_notifier_invalidate_lock);
 	page = gfn_to_page(vcpu->kvm, walker.gfn);
 	up_read(&current->mm->mmap_sem);
 
@@ -440,6 +442,15 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	++vcpu->stat.pf_fixed;
 	kvm_mmu_audit(vcpu, "post page fault (fixed)");
 	spin_unlock(&vcpu->kvm->mmu_lock);
+
+	if (read_seqretry(&vcpu->kvm->arch.mmu_notifier_invalidate_lock, mmu_seq)) {
+		down_read(&current->mm->mmap_sem);
+		if (page != gfn_to_page(vcpu->kvm, walker.gfn))
+			BUG();
+		up_read(&current->mm->mmap_sem);
+		kvm_release_page_clean(page);
+	}
+
 	up_read(&vcpu->kvm->slots_lock);
 
 	return write_pt;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6f09840..6eafb74 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3319,6 +3319,47 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 	free_page((unsigned long)vcpu->arch.pio_data);
 }
 
+static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
+{
+	struct kvm_arch *kvm_arch;
+	kvm_arch = container_of(mn, struct kvm_arch, mmu_notifier);
+	return container_of(kvm_arch, struct kvm, arch);
+}
+
+void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
+				      struct mm_struct *mm,
+				      unsigned long address)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	BUG_ON(mm != kvm->mm);
+	write_seqlock(&kvm->arch.mmu_notifier_invalidate_lock);
+	kvm_unmap_hva(kvm, address);
+	write_sequnlock(&kvm->arch.mmu_notifier_invalidate_lock);
+}
+
+void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
+					   struct mm_struct *mm,
+					   unsigned long start, unsigned long end)
+{
+	for (; start < end; start += PAGE_SIZE)
+		kvm_mmu_notifier_invalidate_page(mn, mm, start);
+}
+
+int kvm_mmu_notifier_age_page(struct mmu_notifier *mn,
+			      struct mm_struct *mm,
+			      unsigned long address)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	BUG_ON(mm != kvm->mm);
+	return kvm_age_hva(kvm, address);
+}
+
+static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
+	.invalidate_page	= kvm_mmu_notifier_invalidate_page,
+	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
+	.age_page		= kvm_mmu_notifier_age_page,
+};
+
 struct  kvm *kvm_arch_create_vm(void)
 {
 	struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
@@ -3328,6 +3369,10 @@ struct  kvm *kvm_arch_create_vm(void)
 
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
 
+	kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops;
+	mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm);
+	seqlock_init(&kvm->arch.mmu_notifier_invalidate_lock);
+
 	return kvm;
 }
 
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 024b57c..305b7c3 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -13,6 +13,7 @@
 
 #include <linux/types.h>
 #include <linux/mm.h>
+#include <linux/mmu_notifier.h>
 
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>
@@ -303,6 +304,9 @@ struct kvm_arch{
 	struct page *apic_access_page;
 
 	gpa_t wall_clock;
+
+	struct mmu_notifier mmu_notifier;
+	seqlock_t mmu_notifier_invalidate_lock;
 };
 
 struct kvm_vm_stat {
@@ -422,6 +426,8 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu);
 int kvm_mmu_setup(struct kvm_vcpu *vcpu);
 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
 
+void kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+int kvm_age_hva(struct kvm *kvm, unsigned long hva);
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
 void kvm_mmu_zap_all(struct kvm *kvm);


As usual (for completeness) I append the change to the memslot
readonly locking through kvm->mmu_lock:

Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6f09840..a519fd8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3379,16 +3379,23 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 	 */
 	if (!user_alloc) {
 		if (npages && !old.rmap) {
+			unsigned long userspace_addr;
+
 			down_write(&current->mm->mmap_sem);
-			memslot->userspace_addr = do_mmap(NULL, 0,
-						     npages * PAGE_SIZE,
-						     PROT_READ | PROT_WRITE,
-						     MAP_SHARED | MAP_ANONYMOUS,
-						     0);
+			userspace_addr = do_mmap(NULL, 0,
+						 npages * PAGE_SIZE,
+						 PROT_READ | PROT_WRITE,
+						 MAP_SHARED | MAP_ANONYMOUS,
+						 0);
 			up_write(&current->mm->mmap_sem);
 
-			if (IS_ERR((void *)memslot->userspace_addr))
-				return PTR_ERR((void *)memslot->userspace_addr);
+			if (IS_ERR((void *)userspace_addr))
+				return PTR_ERR((void *)userspace_addr);
+
+			/* set userspace_addr atomically for kvm_hva_to_rmapp */
+			spin_lock(&kvm->mmu_lock);
+			memslot->userspace_addr = userspace_addr;
+			spin_unlock(&kvm->mmu_lock);
 		} else {
 			if (!old.user_alloc && old.rmap) {
 				int ret;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 30bf832..8f3b6d6 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -326,7 +326,15 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		memset(new.rmap, 0, npages * sizeof(*new.rmap));
 
 		new.user_alloc = user_alloc;
-		new.userspace_addr = mem->userspace_addr;
+		/*
+		 * hva_to_rmmap() serialzies with the mmu_lock and to be
+		 * safe it has to ignore memslots with !user_alloc &&
+		 * !userspace_addr.
+		 */
+		if (user_alloc)
+			new.userspace_addr = mem->userspace_addr;
+		else
+			new.userspace_addr = 0;
 	}
 	if (npages && !new.lpage_info) {
 		int largepages = npages / KVM_PAGES_PER_HPAGE;
@@ -355,14 +363,18 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		memset(new.dirty_bitmap, 0, dirty_bytes);
 	}
 
+	spin_lock(&kvm->mmu_lock);
 	if (mem->slot >= kvm->nmemslots)
 		kvm->nmemslots = mem->slot + 1;
 
 	*memslot = new;
+	spin_unlock(&kvm->mmu_lock);
 
 	r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc);
 	if (r) {
+		spin_lock(&kvm->mmu_lock);
 		*memslot = old;
+		spin_unlock(&kvm->mmu_lock);
 		goto out_free;
 	}
 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [patch] my mmu notifiers
  2008-02-19 14:27   ` Jack Steiner
  2008-02-19 23:04     ` Nick Piggin
@ 2008-02-27 22:50     ` Christoph Lameter
  1 sibling, 0 replies; 120+ messages in thread
From: Christoph Lameter @ 2008-02-27 22:50 UTC (permalink / raw)
  To: Jack Steiner
  Cc: Andrea Arcangeli, Nick Piggin, akpm, Robin Holt, Avi Kivity,
	Izik Eidus, kvm-devel, Peter Zijlstra, general, Steve Wise,
	Roland Dreier, Kanoj Sarcar, linux-kernel, linux-mm,
	daniel.blueman

On Tue, 19 Feb 2008, Jack Steiner wrote:

> In general, though, I agree. Most users of mmu_notifiers would likely
> required a mutex or something equivalent.

The skeletons shows how to do most of it using a spinlock and a 
counter.


^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [patch] my mmu notifiers
  2008-02-19 22:59   ` Nick Piggin
  2008-02-20  0:46     ` Andrea Arcangeli
@ 2008-02-27 22:55     ` Christoph Lameter
  1 sibling, 0 replies; 120+ messages in thread
From: Christoph Lameter @ 2008-02-27 22:55 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrea Arcangeli, akpm, Robin Holt, Avi Kivity, Izik Eidus,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, steiner, linux-kernel, linux-mm, daniel.blueman

On Tue, 19 Feb 2008, Nick Piggin wrote:

> I thought that could be used by a non-sleeping user (not intending
> to try supporting sleeping users). If it is useless then it should
> go away (BTW. I didn't see your recent patch, some of my confusion
> I think stems from Christoph's novel way of merging and splitting
> patches).

What is so novel about introducing functionality step by step?

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [patch] my mmu notifiers
  2008-02-19 23:11   ` Nick Piggin
                       ` (2 preceding siblings ...)
  2008-02-20  2:49     ` Robin Holt
@ 2008-02-27 22:56     ` Christoph Lameter
  3 siblings, 0 replies; 120+ messages in thread
From: Christoph Lameter @ 2008-02-27 22:56 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrea Arcangeli, akpm, Robin Holt, Avi Kivity, Izik Eidus,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, steiner, linux-kernel, linux-mm, daniel.blueman

On Wed, 20 Feb 2008, Nick Piggin wrote:

> But why does _anybody_ (why does Christoph's patches) need to invalidate
> when they are going to be more permissive? This should be done lazily by
> the driver, I would have thought.

Correct. If you find such places then we can avoid the invalidates there.
 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v7
  2008-02-27 19:26               ` [PATCH] mmu notifiers #v7 Andrea Arcangeli
  2008-02-27 20:04                 ` Peter Zijlstra
@ 2008-02-27 23:06                 ` Christoph Lameter
  2008-02-27 23:43                   ` [kvm-devel] " Andrea Arcangeli
  2008-02-28 19:48                 ` Christoph Lameter
                                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 120+ messages in thread
From: Christoph Lameter @ 2008-02-27 23:06 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jack Steiner, Nick Piggin, akpm, Robin Holt, Avi Kivity,
	Izik Eidus, kvm-devel, Peter Zijlstra, general, Steve Wise,
	Roland Dreier, Kanoj Sarcar, linux-kernel, linux-mm,
	daniel.blueman

On Wed, 27 Feb 2008, Andrea Arcangeli wrote:

> I hope this will can be considered final for .25 and be merged. Risk
> is zero, the only discussion here is to make an API that will last
> forever, functionality-wise all these patches provides zero risk and
> zero overhead when MMU_NOTIFIER=n. This last patch covers KVM and GRU
> and hopefully all other non-blocking users optimally, and the below

Ok so it somehow works slowly with GRU and you are happy with it. What 
about the RDMA folks etc etc?

> API will hopefully last forever (but even if it lasts just for .25 and
> .26 is changed that's fine with us, it's a kernel _internal_ API
> anyway, there's absolutely nothing visible to userland).

Would it not be better to have a solution that fits all instead of hacking 
something in now and then having to modify it later?

 > What Christoph need to do when he's back from vacations to support
> sleepable mmu notifiers is to add a CONFIG_XPMEM config option that
> will switch the i_mmap_lock from a semaphore to a mutex (any other
> change to this patch will be minor compared to that) so XPMEM hardware
> will have kernels compiled that way. I don't see other sane ways to
> remove the "atomic" parameter from the API (apparently required by
> Andrew for merging something not restricted to the xpmem current usage
> with only anonymous memory) and I don't want to have such a
> locking-change intrusive dependency for all other non-blocking users
> that are fine without having to alter how the VM works (for example
> KVM and GRU). Very minor changes will be required to this patch to
> make it work after the VM locking will be altered (for example the
> CONFIG_XPMEM should also switch the mmu_register/unregister locking
> from RCU to mutex as well). XPMEM then will only compile if
> CONFIG_XPMEM=y and in turn the invalidate_range_* will support
> scheduling inside.

Hmmm.. There were earlier discussions of changing the anon vma lock to a 
rw lock because of contention issues in large systems. Maybe we can just 
generally switch the locks taken while walking rmaps to semaphores? That 
would still require to put the invalidate outside of the pte lock.


^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [kvm-devel] [PATCH] mmu notifiers #v7
  2008-02-27 23:06                 ` Christoph Lameter
@ 2008-02-27 23:43                   ` Andrea Arcangeli
  2008-02-28  0:08                     ` Christoph Lameter
  0 siblings, 1 reply; 120+ messages in thread
From: Andrea Arcangeli @ 2008-02-27 23:43 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nick Piggin, Steve Wise, Peter Zijlstra, linux-mm, Kanoj Sarcar,
	Roland Dreier, Jack Steiner, linux-kernel, Avi Kivity, kvm-devel,
	daniel.blueman, Robin Holt, general, akpm

On Wed, Feb 27, 2008 at 03:06:10PM -0800, Christoph Lameter wrote:
> Ok so it somehow works slowly with GRU and you are happy with it. What 

As far as GRU is concerned, performance is the same as with your patch
(Jack can confirm).

> about the RDMA folks etc etc?

If RDMA/IB folks needed to block in invalidate_range, I guess they
need to do so on top of tmpfs too, and that never worked with your
patch anyway.

> Would it not be better to have a solution that fits all instead of hacking 
> something in now and then having to modify it later?

The whole point is that your solution fits only GRU and KVM too.

XPMEM in your patch works in a hacked mode limited to anonymous memory
only, Robin already received incoming mail asking to allow xpmem to
work on more than anonymous memory, so your solution-that-fits-all
doesn't actually fit some of Robin's customer needs. So if it doesn't
even entirely satisfy xpmem users, imagine the other potential
blocking-users of this code.

> Hmmm.. There were earlier discussions of changing the anon vma lock to a 
> rw lock because of contention issues in large systems. Maybe we can just 
> generally switch the locks taken while walking rmaps to semaphores? That 
> would still require to put the invalidate outside of the pte lock.

anon_vma lock can remain a spinlock unless you also want to schedule
inside try_to_unmap.

If converting the i_mmap_lock to a mutex is a big trouble, another way
that might work to allow invalidate_range to block, would be to try to
boost the mm_users to prevent the mmu_notifier_release to run in
another cpu the moment after i_mmap_lock spinlock is unlocked. But
even if that works, it'll run slower and the mmu notifiers RCU locking
should be switched to a mutex, so it'd be nice to have it as a
separate option.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [kvm-devel] [PATCH] mmu notifiers #v7
  2008-02-27 23:43                   ` [kvm-devel] " Andrea Arcangeli
@ 2008-02-28  0:08                     ` Christoph Lameter
  2008-02-28  0:21                       ` Andrea Arcangeli
  0 siblings, 1 reply; 120+ messages in thread
From: Christoph Lameter @ 2008-02-28  0:08 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Nick Piggin, Steve Wise, Peter Zijlstra, linux-mm, Kanoj Sarcar,
	Roland Dreier, Jack Steiner, linux-kernel, Avi Kivity, kvm-devel,
	daniel.blueman, Robin Holt, general, akpm

On Thu, 28 Feb 2008, Andrea Arcangeli wrote:

> If RDMA/IB folks needed to block in invalidate_range, I guess they
> need to do so on top of tmpfs too, and that never worked with your
> patch anyway.

How about blocking in invalidate_page()? It can be made to work...

> > Would it not be better to have a solution that fits all instead of hacking 
> > something in now and then having to modify it later?
> 
> The whole point is that your solution fits only GRU and KVM too.

Well so we do not address the issues?
 
> XPMEM in your patch works in a hacked mode limited to anonymous memory
> only, Robin already received incoming mail asking to allow xpmem to
> work on more than anonymous memory, so your solution-that-fits-all
> doesn't actually fit some of Robin's customer needs. So if it doesn't
> even entirely satisfy xpmem users, imagine the other potential
> blocking-users of this code.

The solutions have been mentioned...

> anon_vma lock can remain a spinlock unless you also want to schedule
> inside try_to_unmap.

Either that or a separate rmap as also mentioned before.



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [kvm-devel] [PATCH] mmu notifiers #v7
  2008-02-28  0:08                     ` Christoph Lameter
@ 2008-02-28  0:21                       ` Andrea Arcangeli
  2008-02-28  0:24                         ` Christoph Lameter
  0 siblings, 1 reply; 120+ messages in thread
From: Andrea Arcangeli @ 2008-02-28  0:21 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nick Piggin, Steve Wise, Peter Zijlstra, linux-mm, Kanoj Sarcar,
	Roland Dreier, Jack Steiner, linux-kernel, Avi Kivity, kvm-devel,
	daniel.blueman, Robin Holt, general, akpm

On Wed, Feb 27, 2008 at 04:08:07PM -0800, Christoph Lameter wrote:
> On Thu, 28 Feb 2008, Andrea Arcangeli wrote:
> 
> > If RDMA/IB folks needed to block in invalidate_range, I guess they
> > need to do so on top of tmpfs too, and that never worked with your
> > patch anyway.
> 
> How about blocking in invalidate_page()? It can be made to work...

Yes, it can be made to work with even more extended VM changes than to
only allow invalidate_range to schedule. Those core VM changes should
only be done "by default" (w/o CONFIG_XPMEM=y), if they're doing good
to the VM regardless of xpmem requirements. And I'm not really sure of
that. I think they don't do any good or they would be a mutex
already...

> Well so we do not address the issues?

I'm not suggesting not to address the issues, just that those issues
requires VM core changes, and likely those changes should be
switchable under a CONFIG_XPMEM, so I see no reason to delay the mmu
notifier until those changes are done and merged too. It's kind of a
separate problem.

> Either that or a separate rmap as also mentioned before.

DRI also wants invalidate_page by (mm,addr).

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [kvm-devel] [PATCH] mmu notifiers #v7
  2008-02-28  0:21                       ` Andrea Arcangeli
@ 2008-02-28  0:24                         ` Christoph Lameter
  0 siblings, 0 replies; 120+ messages in thread
From: Christoph Lameter @ 2008-02-28  0:24 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Nick Piggin, Steve Wise, Peter Zijlstra, linux-mm, Kanoj Sarcar,
	Roland Dreier, Jack Steiner, linux-kernel, Avi Kivity, kvm-devel,
	daniel.blueman, Robin Holt, general, akpm

On Thu, 28 Feb 2008, Andrea Arcangeli wrote:

> I'm not suggesting not to address the issues, just that those issues
> requires VM core changes, and likely those changes should be
> switchable under a CONFIG_XPMEM, so I see no reason to delay the mmu
> notifier until those changes are done and merged too. It's kind of a
> separate problem.

No its the core problem of the mmu notifier. It needs to be usable for a 
lot of scenarios.

 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] KVM swapping with mmu notifiers #v7
  2008-02-27 22:06           ` [PATCH] KVM swapping with mmu notifiers #v7 Andrea Arcangeli
@ 2008-02-28  8:42             ` izik eidus
  0 siblings, 0 replies; 120+ messages in thread
From: izik eidus @ 2008-02-28  8:42 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Nick Piggin, akpm, Robin Holt, Avi Kivity, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	steiner, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

ציטוט Andrea Arcangeli:
> Same as before but one one hand ported to #v7 API and on the other
> hand ported to latest kvm.git.
>
> Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
>
> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> index 41962e7..e1287ab 100644
> --- a/arch/x86/kvm/Kconfig
> +++ b/arch/x86/kvm/Kconfig
> @@ -21,6 +21,7 @@ config KVM
>  	tristate "Kernel-based Virtual Machine (KVM) support"
>  	depends on HAVE_KVM && EXPERIMENTAL
>  	select PREEMPT_NOTIFIERS
> +	select MMU_NOTIFIER
>  	select ANON_INODES
>  	---help---
>  	  Support hosting fully virtualized guest machines using hardware
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 4583329..4067b0f 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -642,6 +642,110 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
>  	account_shadowed(kvm, gfn);
>  }
>  
> +static void kvm_unmap_spte(struct kvm *kvm, u64 *spte)
> +{
> +	struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
> +	get_page(page);
> +	rmap_remove(kvm, spte);
> +	set_shadow_pte(spte, shadow_trap_nonpresent_pte);
> +	kvm_flush_remote_tlbs(kvm);
> +	__free_page(page);
>   

with large page support i think we need here put_page...

> +}
> +
> +static void kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
> +{
> +	u64 *spte, *curr_spte;
> +
> +	spte = rmap_next(kvm, rmapp, NULL);
> +	while (spte) {
> +		BUG_ON(!(*spte & PT_PRESENT_MASK));
> +		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
> +		curr_spte = spte;
> +		spte = rmap_next(kvm, rmapp, spte);
> +		kvm_unmap_spte(kvm, curr_spte);
> +	}
> +}
> +
> +void kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
> +{
> +	int i;
> +
> +	/*
> +	 * If mmap_sem isn't taken, we can look the memslots with only
> +	 * the mmu_lock by skipping over the slots with userspace_addr == 0.
> +	 */
> +	spin_lock(&kvm->mmu_lock);
> +	for (i = 0; i < kvm->nmemslots; i++) {
> +		struct kvm_memory_slot *memslot = &kvm->memslots[i];
> +		unsigned long start = memslot->userspace_addr;
> +		unsigned long end;
> +
> +		/* mmu_lock protects userspace_addr */
> +		if (!start)
> +			continue;
> +
> +		end = start + (memslot->npages << PAGE_SHIFT);
> +		if (hva >= start && hva < end) {
> +			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
> +			kvm_unmap_rmapp(kvm, &memslot->rmap[gfn_offset]);
> +		}
> +	}
> +	spin_unlock(&kvm->mmu_lock);
> +}
> +
> +static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
> +{
> +	u64 *spte;
> +	int young = 0;
> +
> +	spte = rmap_next(kvm, rmapp, NULL);
> +	while (spte) {
> +		int _young;
> +		u64 _spte = *spte;
> +		BUG_ON(!(_spte & PT_PRESENT_MASK));
> +		_young = _spte & PT_ACCESSED_MASK;
> +		if (_young) {
> +			young = !!_young;
> +			set_shadow_pte(spte, _spte & ~PT_ACCESSED_MASK);
> +		}
> +		spte = rmap_next(kvm, rmapp, spte);
> +	}
> +	return young;
> +}
> +
> +int kvm_age_hva(struct kvm *kvm, unsigned long hva)
> +{
> +	int i;
> +	int young = 0;
> +
> +	/*
> +	 * If mmap_sem isn't taken, we can look the memslots with only
> +	 * the mmu_lock by skipping over the slots with userspace_addr == 0.
> +	 */
> +	spin_lock(&kvm->mmu_lock);
> +	for (i = 0; i < kvm->nmemslots; i++) {
> +		struct kvm_memory_slot *memslot = &kvm->memslots[i];
> +		unsigned long start = memslot->userspace_addr;
> +		unsigned long end;
> +
> +		/* mmu_lock protects userspace_addr */
> +		if (!start)
> +			continue;
> +
> +		end = start + (memslot->npages << PAGE_SHIFT);
> +		if (hva >= start && hva < end) {
> +			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
> +			young |= kvm_age_rmapp(kvm, &memslot->rmap[gfn_offset]);
> +		}
> +	}
> +	spin_unlock(&kvm->mmu_lock);
> +
> +	if (young)
> +		kvm_flush_remote_tlbs(kvm);
> +
> +	return young;
> +}
> +
>  #ifdef MMU_DEBUG
>  static int is_empty_shadow_page(u64 *spt)
>  {
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index 17f9d16..b014b19 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -380,6 +380,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
>  	int r;
>  	struct page *page;
>  	int largepage = 0;
> +	unsigned mmu_seq;
>  
>  	pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
>  	kvm_mmu_audit(vcpu, "pre page fault");
> @@ -415,6 +416,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
>  			largepage = 1;
>  		}
>  	}
> +	mmu_seq = read_seqbegin(&vcpu->kvm->arch.mmu_notifier_invalidate_lock);
>  	page = gfn_to_page(vcpu->kvm, walker.gfn);
>  	up_read(&current->mm->mmap_sem);
>  
> @@ -440,6 +442,15 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
>  	++vcpu->stat.pf_fixed;
>  	kvm_mmu_audit(vcpu, "post page fault (fixed)");
>  	spin_unlock(&vcpu->kvm->mmu_lock);
> +
> +	if (read_seqretry(&vcpu->kvm->arch.mmu_notifier_invalidate_lock, mmu_seq)) {
> +		down_read(&current->mm->mmap_sem);
> +		if (page != gfn_to_page(vcpu->kvm, walker.gfn))
> +			BUG();
> +		up_read(&current->mm->mmap_sem);
> +		kvm_release_page_clean(page);
> +	}
> +
>  	up_read(&vcpu->kvm->slots_lock);
>  
>  	return write_pt;
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 6f09840..6eafb74 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3319,6 +3319,47 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
>  	free_page((unsigned long)vcpu->arch.pio_data);
>  }
>  
> +static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
> +{
> +	struct kvm_arch *kvm_arch;
> +	kvm_arch = container_of(mn, struct kvm_arch, mmu_notifier);
> +	return container_of(kvm_arch, struct kvm, arch);
> +}
> +
> +void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
> +				      struct mm_struct *mm,
> +				      unsigned long address)
> +{
> +	struct kvm *kvm = mmu_notifier_to_kvm(mn);
> +	BUG_ON(mm != kvm->mm);
> +	write_seqlock(&kvm->arch.mmu_notifier_invalidate_lock);
> +	kvm_unmap_hva(kvm, address);
> +	write_sequnlock(&kvm->arch.mmu_notifier_invalidate_lock);
> +}
> +
> +void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
> +					   struct mm_struct *mm,
> +					   unsigned long start, unsigned long end)
> +{
> +	for (; start < end; start += PAGE_SIZE)
> +		kvm_mmu_notifier_invalidate_page(mn, mm, start);
> +}
> +
> +int kvm_mmu_notifier_age_page(struct mmu_notifier *mn,
> +			      struct mm_struct *mm,
> +			      unsigned long address)
> +{
> +	struct kvm *kvm = mmu_notifier_to_kvm(mn);
> +	BUG_ON(mm != kvm->mm);
> +	return kvm_age_hva(kvm, address);
> +}
> +
> +static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
> +	.invalidate_page	= kvm_mmu_notifier_invalidate_page,
> +	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
> +	.age_page		= kvm_mmu_notifier_age_page,
> +};
> +
>  struct  kvm *kvm_arch_create_vm(void)
>  {
>  	struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
> @@ -3328,6 +3369,10 @@ struct  kvm *kvm_arch_create_vm(void)
>  
>  	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
>  
> +	kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops;
> +	mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm);
> +	seqlock_init(&kvm->arch.mmu_notifier_invalidate_lock);
> +
>  	return kvm;
>  }
>  
> diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
> index 024b57c..305b7c3 100644
> --- a/include/asm-x86/kvm_host.h
> +++ b/include/asm-x86/kvm_host.h
> @@ -13,6 +13,7 @@
>  
>  #include <linux/types.h>
>  #include <linux/mm.h>
> +#include <linux/mmu_notifier.h>
>  
>  #include <linux/kvm.h>
>  #include <linux/kvm_para.h>
> @@ -303,6 +304,9 @@ struct kvm_arch{
>  	struct page *apic_access_page;
>  
>  	gpa_t wall_clock;
> +
> +	struct mmu_notifier mmu_notifier;
> +	seqlock_t mmu_notifier_invalidate_lock;
>  };
>  
>  struct kvm_vm_stat {
> @@ -422,6 +426,8 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu);
>  int kvm_mmu_setup(struct kvm_vcpu *vcpu);
>  void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
>  
> +void kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
> +int kvm_age_hva(struct kvm *kvm, unsigned long hva);
>  int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
>  void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
>  void kvm_mmu_zap_all(struct kvm *kvm);
>
>
> As usual (for completeness) I append the change to the memslot
> readonly locking through kvm->mmu_lock:
>
> Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 6f09840..a519fd8 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3379,16 +3379,23 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
>  	 */
>  	if (!user_alloc) {
>  		if (npages && !old.rmap) {
> +			unsigned long userspace_addr;
> +
>  			down_write(&current->mm->mmap_sem);
> -			memslot->userspace_addr = do_mmap(NULL, 0,
> -						     npages * PAGE_SIZE,
> -						     PROT_READ | PROT_WRITE,
> -						     MAP_SHARED | MAP_ANONYMOUS,
> -						     0);
> +			userspace_addr = do_mmap(NULL, 0,
> +						 npages * PAGE_SIZE,
> +						 PROT_READ | PROT_WRITE,
> +						 MAP_SHARED | MAP_ANONYMOUS,
> +						 0);
>  			up_write(&current->mm->mmap_sem);
>  
> -			if (IS_ERR((void *)memslot->userspace_addr))
> -				return PTR_ERR((void *)memslot->userspace_addr);
> +			if (IS_ERR((void *)userspace_addr))
> +				return PTR_ERR((void *)userspace_addr);
> +
> +			/* set userspace_addr atomically for kvm_hva_to_rmapp */
> +			spin_lock(&kvm->mmu_lock);
> +			memslot->userspace_addr = userspace_addr;
> +			spin_unlock(&kvm->mmu_lock);
>  		} else {
>  			if (!old.user_alloc && old.rmap) {
>  				int ret;
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 30bf832..8f3b6d6 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -326,7 +326,15 @@ int __kvm_set_memory_region(struct kvm *kvm,
>  		memset(new.rmap, 0, npages * sizeof(*new.rmap));
>  
>  		new.user_alloc = user_alloc;
> -		new.userspace_addr = mem->userspace_addr;
> +		/*
> +		 * hva_to_rmmap() serialzies with the mmu_lock and to be
> +		 * safe it has to ignore memslots with !user_alloc &&
> +		 * !userspace_addr.
> +		 */
> +		if (user_alloc)
> +			new.userspace_addr = mem->userspace_addr;
> +		else
> +			new.userspace_addr = 0;
>  	}
>  	if (npages && !new.lpage_info) {
>  		int largepages = npages / KVM_PAGES_PER_HPAGE;
> @@ -355,14 +363,18 @@ int __kvm_set_memory_region(struct kvm *kvm,
>  		memset(new.dirty_bitmap, 0, dirty_bytes);
>  	}
>  
> +	spin_lock(&kvm->mmu_lock);
>  	if (mem->slot >= kvm->nmemslots)
>  		kvm->nmemslots = mem->slot + 1;
>  
>  	*memslot = new;
> +	spin_unlock(&kvm->mmu_lock);
>  
>  	r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc);
>  	if (r) {
> +		spin_lock(&kvm->mmu_lock);
>  		*memslot = old;
> +		spin_unlock(&kvm->mmu_lock);
>  		goto out_free;
>  	}
>  
>   


^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v7
  2008-02-27 19:26               ` [PATCH] mmu notifiers #v7 Andrea Arcangeli
  2008-02-27 20:04                 ` Peter Zijlstra
  2008-02-27 23:06                 ` Christoph Lameter
@ 2008-02-28 19:48                 ` Christoph Lameter
  2008-02-28 21:52                   ` Andrea Arcangeli
  2008-02-28 23:05                 ` Christoph Lameter
  2008-03-02 15:54                 ` [PATCH] mmu notifiers #v8 Andrea Arcangeli
  4 siblings, 1 reply; 120+ messages in thread
From: Christoph Lameter @ 2008-02-28 19:48 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jack Steiner, Nick Piggin, akpm, Robin Holt, Avi Kivity,
	Izik Eidus, kvm-devel, Peter Zijlstra, general, Steve Wise,
	Roland Dreier, Kanoj Sarcar, linux-kernel, linux-mm,
	daniel.blueman

On Wed, 27 Feb 2008, Andrea Arcangeli wrote:

> What Christoph need to do when he's back from vacations to support
> sleepable mmu notifiers is to add a CONFIG_XPMEM config option that
> will switch the i_mmap_lock from a semaphore to a mutex (any other
> change to this patch will be minor compared to that) so XPMEM hardware
> will have kernels compiled that way. I don't see other sane ways to
> remove the "atomic" parameter from the API (apparently required by
> Andrew for merging something not restricted to the xpmem current usage
> with only anonymous memory) and I don't want to have such a
> locking-change intrusive dependency for all other non-blocking users
> that are fine without having to alter how the VM works (for example
> KVM and GRU). Very minor changes will be required to this patch to
> make it work after the VM locking will be altered (for example the
> CONFIG_XPMEM should also switch the mmu_register/unregister locking
> from RCU to mutex as well). XPMEM then will only compile if
> CONFIG_XPMEM=y and in turn the invalidate_range_* will support
> scheduling inside.

This is not going to work even if the mutex would work as easily as you 
think since the patch here still does an rcu_lock/unlock around a callback.

> I don't think pretending to merge all in one block (I mean including
> xpmem support that requires blocking methods) is good idea anymore as
> long as we agree the "atomic" parameter shouldn't be merged. But we
> can quite easily agree on the below to be optimal for GRU/KVM and
> trivially extendible once a CONFIG_XPMEM will be added. So this first
> part can go in now I think.

Changing the locking for the callouts for users of the mmu notivier that 
f.e. require a response via the network (RDMA, XPMEM etc) is not trivial 
at all. RCU lock cannot be used. So we are looking at totally disjunct 
methods for those users who have to sleep.

> +struct mmu_notifier_ops {
> +	/*
> +	 * Called when nobody can register any more notifier in the mm
> +	 * and after the "mn" notifier has been disarmed already.
> +	 */
> +	void (*release)(struct mmu_notifier *mn,
> +			struct mm_struct *mm);

Who disarms the notifier? Why is the method not called to disarm the 
notifier on exit?

> +obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
> diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
> --- a/mm/filemap_xip.c
> +++ b/mm/filemap_xip.c
> @@ -194,7 +194,7 @@ __xip_unmap (struct address_space * mapp
>  		if (pte) {
>  			/* Nuke the page table entry. */
>  			flush_cache_page(vma, address, pte_pfn(*pte));
> -			pteval = ptep_clear_flush(vma, address, pte);
> +			pteval = ptep_clear_flush_notify(vma, address, pte);
>  			page_remove_rmap(page, vma);
>  			dec_mm_counter(mm, file_rss);
>  			BUG_ON(pte_dirty(pteval));

Well a bit better but now we have to modify both the macro and the code 
in teh VM. It would be easier to put the notify call in here.

> @@ -2048,6 +2050,7 @@ void exit_mmap(struct mm_struct *mm)
>  	vm_unacct_memory(nr_accounted);
>  	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
>  	tlb_finish_mmu(tlb, 0, end);
> +	mmu_notifier_release(mm);

The release should be called much earlier to allow the driver to release 
all resources in one go. This way each vma must be processed individually. 
For our gobs of memory this method may create a scaling problem on exit().


^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v7
  2008-02-28 19:48                 ` Christoph Lameter
@ 2008-02-28 21:52                   ` Andrea Arcangeli
  2008-02-28 22:00                     ` Christoph Lameter
  2008-02-28 23:17                     ` Jack Steiner
  0 siblings, 2 replies; 120+ messages in thread
From: Andrea Arcangeli @ 2008-02-28 21:52 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Jack Steiner, Nick Piggin, akpm, Robin Holt, Avi Kivity,
	Izik Eidus, kvm-devel, Peter Zijlstra, general, Steve Wise,
	Roland Dreier, Kanoj Sarcar, linux-kernel, linux-mm,
	daniel.blueman

On Thu, Feb 28, 2008 at 11:48:10AM -0800, Christoph Lameter wrote:
> > make it work after the VM locking will be altered (for example the
    	    	       	      	      	      	       ^^^^^^^^^^^^^^^
> > CONFIG_XPMEM should also switch the mmu_register/unregister locking
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> > from RCU to mutex as well). XPMEM then will only compile if
    ^^^^^^^^^^^^^^^^^^^^^^^^^
> > CONFIG_XPMEM=y and in turn the invalidate_range_* will support
> > scheduling inside.
> 
> This is not going to work even if the mutex would work as easily as you 
> think since the patch here still does an rcu_lock/unlock around a callback.

See underlined.

> > +struct mmu_notifier_ops {
> > +	/*
> > +	 * Called when nobody can register any more notifier in the mm
> > +	 * and after the "mn" notifier has been disarmed already.
> > +	 */
> > +	void (*release)(struct mmu_notifier *mn,
> > +			struct mm_struct *mm);
> 
> Who disarms the notifier? Why is the method not called to disarm the 
> notifier on exit?

The notifier is auto-disarmed by mmu_notifier_release, your patch
works the same way. ->release is further called just in case anybody
wants to know the notifier was disarmed.

> > @@ -2048,6 +2050,7 @@ void exit_mmap(struct mm_struct *mm)
> >  	vm_unacct_memory(nr_accounted);
> >  	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
> >  	tlb_finish_mmu(tlb, 0, end);
> > +	mmu_notifier_release(mm);
> 
> The release should be called much earlier to allow the driver to release 
> all resources in one go. This way each vma must be processed individually. 
> For our gobs of memory this method may create a scaling problem on exit().

Good point, it has to be called earlier for GRU, but it's not a
performance issue. GRU doesn't pin the pages so it should make the
global invalidate in ->release _before_ unmap_vmas. Linux can't fault
in the ptes anymore because mm_users is zero so there's no need of a
->release_begin/end, the _begin is enough.

In #v6 I was invalidating inside unmap_vmas so it was ok. The
performance issues you're talking about refers to #v6 I guess, for #v7
there's a single call.

Thanks!

diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2039,6 +2039,7 @@ void exit_mmap(struct mm_struct *mm)
 	unsigned long end;
 
 	/* mm's last user has gone, and its about to be pulled down */
+	mmu_notifier_release(mm);
 	arch_exit_mmap(mm);
 
 	lru_add_drain();
@@ -2050,7 +2051,6 @@ void exit_mmap(struct mm_struct *mm)
 	vm_unacct_memory(nr_accounted);
 	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
 	tlb_finish_mmu(tlb, 0, end);
-	mmu_notifier_release(mm);
 
 	/*
 	 * Walk the list again, actually closing and freeing it,

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v7
  2008-02-28 21:52                   ` Andrea Arcangeli
@ 2008-02-28 22:00                     ` Christoph Lameter
  2008-02-28 23:17                     ` Jack Steiner
  1 sibling, 0 replies; 120+ messages in thread
From: Christoph Lameter @ 2008-02-28 22:00 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jack Steiner, Nick Piggin, akpm, Robin Holt, Avi Kivity,
	Izik Eidus, kvm-devel, Peter Zijlstra, general, Steve Wise,
	Roland Dreier, Kanoj Sarcar, linux-kernel, linux-mm,
	daniel.blueman

On Thu, 28 Feb 2008, Andrea Arcangeli wrote:

> > This is not going to work even if the mutex would work as easily as you 
> > think since the patch here still does an rcu_lock/unlock around a callback.
> 
> See underlined.

Mutex is not acceptable for performance reasons. I think we can just drop 
the RCU lock if we simply unregister the mmu notifier in release and 
forbid the drivers from removing themselves from the notification 
chain. They can simply do nothing until release. At that time there is no 
concurrency and thus its safe to remove even without rcu locking.

> Good point, it has to be called earlier for GRU, but it's not a
> performance issue. GRU doesn't pin the pages so it should make the
> global invalidate in ->release _before_ unmap_vmas. Linux can't fault
> in the ptes anymore because mm_users is zero so there's no need of a
> ->release_begin/end, the _begin is enough.

I do not follow you about the _begin without end but the following fix 
seems okay.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v7
  2008-02-27 19:26               ` [PATCH] mmu notifiers #v7 Andrea Arcangeli
                                   ` (2 preceding siblings ...)
  2008-02-28 19:48                 ` Christoph Lameter
@ 2008-02-28 23:05                 ` Christoph Lameter
  2008-02-29  0:40                   ` Andrea Arcangeli
  2008-03-02 15:54                 ` [PATCH] mmu notifiers #v8 Andrea Arcangeli
  4 siblings, 1 reply; 120+ messages in thread
From: Christoph Lameter @ 2008-02-28 23:05 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jack Steiner, Nick Piggin, akpm, Robin Holt, Avi Kivity,
	Izik Eidus, kvm-devel, Peter Zijlstra, general, Steve Wise,
	Roland Dreier, Kanoj Sarcar, linux-kernel, linux-mm,
	daniel.blueman

On Wed, 27 Feb 2008, Andrea Arcangeli wrote:

> +struct mmu_notifier_head {
> +	struct hlist_head head;
> +	spinlock_t lock;
> +};

Still think that the lock here is not of too much use and can be easily 
replaced by mmap_sem.

> +#define mmu_notifier(function, mm, args...)				\
> +	do {								\
> +		struct mmu_notifier *__mn;				\
> +		struct hlist_node *__n;					\
> +									\
> +		if (unlikely(!hlist_empty(&(mm)->mmu_notifier.head))) { \
> +			rcu_read_lock();				\
> +			hlist_for_each_entry_rcu(__mn, __n,		\
> +						 &(mm)->mmu_notifier.head, \
> +						 hlist)			\
> +				if (__mn->ops->function)		\
> +					__mn->ops->function(__mn,	\
> +							    mm,		\
> +							    args);	\
> +			rcu_read_unlock();				\
> +		}							\
> +	} while (0)

Andrew recomended local variables for parameters used multile times. This 
means the mm parameter here.

> +/*
> + * Notifiers that use the parameters that they were passed so that the
> + * compiler does not complain about unused variables but does proper
> + * parameter checks even if !CONFIG_MMU_NOTIFIER.
> + * Macros generate no code.
> + */
> +#define mmu_notifier(function, mm, args...)			       \
> +	do {							       \
> +		if (0) {					       \
> +			struct mmu_notifier *__mn;		       \
> +								       \
> +			__mn = (struct mmu_notifier *)(0x00ff);	       \
> +			__mn->ops->function(__mn, mm, args);	       \
> +		};						       \
> +	} while (0)

Note also Andrew's comments on the use of 0x00ff...

> +/*
> + * No synchronization. This function can only be called when only a single
> + * process remains that performs teardown.
> + */
> +void mmu_notifier_release(struct mm_struct *mm)
> +{
> +	struct mmu_notifier *mn;
> +	struct hlist_node *n, *tmp;
> +
> +	if (unlikely(!hlist_empty(&mm->mmu_notifier.head))) {
> +		hlist_for_each_entry_safe(mn, n, tmp,
> +					  &mm->mmu_notifier.head, hlist) {
> +			hlist_del(&mn->hlist);
> +			if (mn->ops->release)
> +				mn->ops->release(mn, mm);
> +		}
> +	}
> +}

One could avoid a hlist_for_each_entry_safe here by simply always deleting 
the first object. 

Also re the _notify variants: The binding to pte_clear_flush_young etc 
will become a problem for notifiers that want to sleep because 
pte_clear_flush is usually called with the pte lock held. See f.e. 
try_to_unmap_one, page_mkclean_one etc.

It would be better if the notifier calls could be moved outside of the 
pte lock.




^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v7
  2008-02-28 21:52                   ` Andrea Arcangeli
  2008-02-28 22:00                     ` Christoph Lameter
@ 2008-02-28 23:17                     ` Jack Steiner
  2008-02-29  0:24                       ` Andrea Arcangeli
  1 sibling, 1 reply; 120+ messages in thread
From: Jack Steiner @ 2008-02-28 23:17 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Christoph Lameter, Nick Piggin, akpm, Robin Holt, Avi Kivity,
	Izik Eidus, kvm-devel, Peter Zijlstra, general, Steve Wise,
	Roland Dreier, Kanoj Sarcar, linux-kernel, linux-mm,
	daniel.blueman

> > The release should be called much earlier to allow the driver to release 
> > all resources in one go. This way each vma must be processed individually. 
> > For our gobs of memory this method may create a scaling problem on exit().
> 
> Good point, it has to be called earlier for GRU, but it's not a
> performance issue. GRU doesn't pin the pages so it should make the
> global invalidate in ->release _before_ unmap_vmas. Linux can't fault
> in the ptes anymore because mm_users is zero so there's no need of a
> ->release_begin/end, the _begin is enough.
> 

I disagree. The location of the callout IS a performance issue. In simple
comparisons of the 2 patches (Christoph's vs. Andrea's), Andrea's has a 7X
increase in the number of TLB purges being issued to the GRU. TLB flushing
is slow and can impact the performance of of tasks using the GRU.

--- jack

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v7
  2008-02-28 23:17                     ` Jack Steiner
@ 2008-02-29  0:24                       ` Andrea Arcangeli
  2008-02-29  1:13                         ` Christoph Lameter
  0 siblings, 1 reply; 120+ messages in thread
From: Andrea Arcangeli @ 2008-02-29  0:24 UTC (permalink / raw)
  To: Jack Steiner
  Cc: Christoph Lameter, Nick Piggin, akpm, Robin Holt, Avi Kivity,
	Izik Eidus, kvm-devel, Peter Zijlstra, general, Steve Wise,
	Roland Dreier, Kanoj Sarcar, linux-kernel, linux-mm,
	daniel.blueman

On Thu, Feb 28, 2008 at 05:17:33PM -0600, Jack Steiner wrote:
> I disagree. The location of the callout IS a performance issue. In simple
> comparisons of the 2 patches (Christoph's vs. Andrea's), Andrea's has a 7X
> increase in the number of TLB purges being issued to the GRU. TLB flushing

Are you sure that you're referring to #v7?

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v7
  2008-02-28 23:05                 ` Christoph Lameter
@ 2008-02-29  0:40                   ` Andrea Arcangeli
  2008-02-29  0:56                     ` Andrew Morton
  2008-02-29  1:03                     ` Christoph Lameter
  0 siblings, 2 replies; 120+ messages in thread
From: Andrea Arcangeli @ 2008-02-29  0:40 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Jack Steiner, Nick Piggin, akpm, Robin Holt, Avi Kivity,
	Izik Eidus, kvm-devel, Peter Zijlstra, general, Steve Wise,
	Roland Dreier, Kanoj Sarcar, linux-kernel, linux-mm,
	daniel.blueman

On Thu, Feb 28, 2008 at 03:05:30PM -0800, Christoph Lameter wrote:
> Still think that the lock here is not of too much use and can be easily 
> replaced by mmap_sem.

I can use the mmap_sem.

> > +#define mmu_notifier(function, mm, args...)				\
> > +	do {								\
> > +		struct mmu_notifier *__mn;				\
> > +		struct hlist_node *__n;					\
> > +									\
> > +		if (unlikely(!hlist_empty(&(mm)->mmu_notifier.head))) { \
> > +			rcu_read_lock();				\
> > +			hlist_for_each_entry_rcu(__mn, __n,		\
> > +						 &(mm)->mmu_notifier.head, \
> > +						 hlist)			\
> > +				if (__mn->ops->function)		\
> > +					__mn->ops->function(__mn,	\
> > +							    mm,		\
> > +							    args);	\
> > +			rcu_read_unlock();				\
> > +		}							\
> > +	} while (0)
> 
> Andrew recomended local variables for parameters used multile times. This 
> means the mm parameter here.

I don't exactly see what "buggy macro" meant? I already use
parenthesis as needed to avoid the need of local variables to be
safe. Not really sure what's buggy, sorry!

> Note also Andrew's comments on the use of 0x00ff...

I thought I tried the (void) but it didn't work and your solution
worked, but perhaps I did something wrong, I'll try again with (void)
nevertheless.

> > +/*
> > + * No synchronization. This function can only be called when only a single
> > + * process remains that performs teardown.
> > + */
> > +void mmu_notifier_release(struct mm_struct *mm)
> > +{
> > +	struct mmu_notifier *mn;
> > +	struct hlist_node *n, *tmp;
> > +
> > +	if (unlikely(!hlist_empty(&mm->mmu_notifier.head))) {
> > +		hlist_for_each_entry_safe(mn, n, tmp,
> > +					  &mm->mmu_notifier.head, hlist) {
> > +			hlist_del(&mn->hlist);
> > +			if (mn->ops->release)
> > +				mn->ops->release(mn, mm);
> > +		}
> > +	}
> > +}
> 
> One could avoid a hlist_for_each_entry_safe here by simply always deleting 
> the first object. 

Agreed, the current construct come from the fact we previously didn't
assume nobody could ever call mmu_notifier_unregister by the time
mm_users is 0.

> Also re the _notify variants: The binding to pte_clear_flush_young etc 
> will become a problem for notifiers that want to sleep because 
> pte_clear_flush is usually called with the pte lock held. See f.e. 
> try_to_unmap_one, page_mkclean_one etc.

Calling __free_page out of the PT lock is much bigger
change. do_wp_page will require changes anyway when the sleepable
notifiers are merged.

> It would be better if the notifier calls could be moved outside of the 
> pte lock.

The point is that it can't make a difference right now, and my
objective was to avoid unnecessary source code duplication (later it
will be necessary, right now it isn't). By the time you rework
do_wp_page, removing _notify will be a very minor detail compared to
the rest of the changes to do_wp_page IMHO. Expanding it now won't
provide a real advantage later.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v7
  2008-02-29  0:40                   ` Andrea Arcangeli
@ 2008-02-29  0:56                     ` Andrew Morton
  2008-02-29  1:03                     ` Christoph Lameter
  1 sibling, 0 replies; 120+ messages in thread
From: Andrew Morton @ 2008-02-29  0:56 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Christoph Lameter, Jack Steiner, Nick Piggin, Robin Holt,
	Avi Kivity, Izik Eidus, kvm-devel, Peter Zijlstra, general,
	Steve Wise, Roland Dreier, Kanoj Sarcar, linux-kernel, linux-mm,
	daniel.blueman

On Fri, 29 Feb 2008 01:40:01 +0100 Andrea Arcangeli <andrea@qumranet.com> wrote:

> > > +#define mmu_notifier(function, mm, args...)				\
> > > +	do {								\
> > > +		struct mmu_notifier *__mn;				\
> > > +		struct hlist_node *__n;					\
> > > +									\
> > > +		if (unlikely(!hlist_empty(&(mm)->mmu_notifier.head))) { \
> > > +			rcu_read_lock();				\
> > > +			hlist_for_each_entry_rcu(__mn, __n,		\
> > > +						 &(mm)->mmu_notifier.head, \
> > > +						 hlist)			\
> > > +				if (__mn->ops->function)		\
> > > +					__mn->ops->function(__mn,	\
> > > +							    mm,		\
> > > +							    args);	\
> > > +			rcu_read_unlock();				\
> > > +		}							\
> > > +	} while (0)
> > 
> > Andrew recomended local variables for parameters used multile times. This 
> > means the mm parameter here.
> 
> I don't exactly see what "buggy macro" meant?

multiple refernces to the argument, so

	mmu_notifier(foo, bar(), zot);

will call bar() either once or twice.

Unlikely in this case, but bad practice.  Easily fixable by using another
temporary.


^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v7
  2008-02-29  0:40                   ` Andrea Arcangeli
  2008-02-29  0:56                     ` Andrew Morton
@ 2008-02-29  1:03                     ` Christoph Lameter
  2008-02-29 13:09                       ` Andrea Arcangeli
  1 sibling, 1 reply; 120+ messages in thread
From: Christoph Lameter @ 2008-02-29  1:03 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jack Steiner, Nick Piggin, akpm, Robin Holt, Avi Kivity,
	Izik Eidus, kvm-devel, Peter Zijlstra, general, Steve Wise,
	Roland Dreier, Kanoj Sarcar, linux-kernel, linux-mm,
	daniel.blueman

On Fri, 29 Feb 2008, Andrea Arcangeli wrote:

> > Also re the _notify variants: The binding to pte_clear_flush_young etc 
> > will become a problem for notifiers that want to sleep because 
> > pte_clear_flush is usually called with the pte lock held. See f.e. 
> > try_to_unmap_one, page_mkclean_one etc.
> 
> Calling __free_page out of the PT lock is much bigger
> change. do_wp_page will require changes anyway when the sleepable
> notifiers are merged.

I thought you wanted to get rid of the sync via pte lock?
What changes to do_wp_page do you envision?

> > It would be better if the notifier calls could be moved outside of the 
> > pte lock.
> 
> The point is that it can't make a difference right now, and my
> objective was to avoid unnecessary source code duplication (later it
> will be necessary, right now it isn't). By the time you rework
> do_wp_page, removing _notify will be a very minor detail compared to
> the rest of the changes to do_wp_page IMHO. Expanding it now won't
> provide a real advantage later.

What is the trouble with the current do_wp_page modifications? There is 
no need for invalidate_page() there so far. invalidate_range() does the 
trick there.


^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v7
  2008-02-29  0:24                       ` Andrea Arcangeli
@ 2008-02-29  1:13                         ` Christoph Lameter
  0 siblings, 0 replies; 120+ messages in thread
From: Christoph Lameter @ 2008-02-29  1:13 UTC (permalink / raw)
  To: Jack Steiner
  Cc: Andrea Arcangeli, Nick Piggin, akpm, Robin Holt, Avi Kivity,
	Izik Eidus, kvm-devel, Peter Zijlstra, general, Steve Wise,
	Roland Dreier, Kanoj Sarcar, linux-kernel, linux-mm,
	daniel.blueman

On Fri, 29 Feb 2008, Andrea Arcangeli wrote:

> On Thu, Feb 28, 2008 at 05:17:33PM -0600, Jack Steiner wrote:
> > I disagree. The location of the callout IS a performance issue. In simple
> > comparisons of the 2 patches (Christoph's vs. Andrea's), Andrea's has a 7X
> > increase in the number of TLB purges being issued to the GRU. TLB flushing
> 
> Are you sure that you're referring to #v7?

Jack: AFAICT Andrea moved the release callout and things will be 
fine in the next release.

 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v7
  2008-02-29  1:03                     ` Christoph Lameter
@ 2008-02-29 13:09                       ` Andrea Arcangeli
  2008-02-29 19:46                         ` Christoph Lameter
  0 siblings, 1 reply; 120+ messages in thread
From: Andrea Arcangeli @ 2008-02-29 13:09 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Jack Steiner, Nick Piggin, akpm, Robin Holt, Avi Kivity,
	Izik Eidus, kvm-devel, Peter Zijlstra, general, Steve Wise,
	Roland Dreier, Kanoj Sarcar, linux-kernel, linux-mm,
	daniel.blueman

On Thu, Feb 28, 2008 at 05:03:01PM -0800, Christoph Lameter wrote:
> I thought you wanted to get rid of the sync via pte lock?

Sure. _notify is happening inside the pt lock by coincidence, to
reduce the changes to mm/* as long as the mmu notifiers aren't
sleep capable.

> What changes to do_wp_page do you envision?

Converting it to invalidate_range_begin/end.

> What is the trouble with the current do_wp_page modifications? There is 
> no need for invalidate_page() there so far. invalidate_range() does the 
> trick there.

No trouble, it's just that I didn't want to mangle over the logic of
do_wp_page unless it was strictly required, the patch has to be
obviously safe. You need to keep that bit of your patch to make the
mmu notifiers sleepable.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v7
  2008-02-29 13:09                       ` Andrea Arcangeli
@ 2008-02-29 19:46                         ` Christoph Lameter
  0 siblings, 0 replies; 120+ messages in thread
From: Christoph Lameter @ 2008-02-29 19:46 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jack Steiner, Nick Piggin, akpm, Robin Holt, Avi Kivity,
	Izik Eidus, kvm-devel, Peter Zijlstra, general, Steve Wise,
	Roland Dreier, Kanoj Sarcar, linux-kernel, linux-mm,
	daniel.blueman

On Fri, 29 Feb 2008, Andrea Arcangeli wrote:

> On Thu, Feb 28, 2008 at 05:03:01PM -0800, Christoph Lameter wrote:
> > I thought you wanted to get rid of the sync via pte lock?
> 
> Sure. _notify is happening inside the pt lock by coincidence, to
> reduce the changes to mm/* as long as the mmu notifiers aren't
> sleep capable.

Ok if this is a coincidence then it would be better to separate the 
notifier callouts from the pte macro calls.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* [PATCH] mmu notifiers #v8
  2008-02-27 19:26               ` [PATCH] mmu notifiers #v7 Andrea Arcangeli
                                   ` (3 preceding siblings ...)
  2008-02-28 23:05                 ` Christoph Lameter
@ 2008-03-02 15:54                 ` Andrea Arcangeli
  2008-03-02 16:03                   ` [PATCH] mmu notifiers #v8 + xpmem Andrea Arcangeli
                                     ` (5 more replies)
  4 siblings, 6 replies; 120+ messages in thread
From: Andrea Arcangeli @ 2008-03-02 15:54 UTC (permalink / raw)
  To: Jack Steiner
  Cc: Nick Piggin, akpm, Robin Holt, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	linux-kernel, linux-mm, daniel.blueman, Christoph Lameter

Difference between #v7 and #v8:

1) s/age_page/clear_flush_young/ (Nick's suggestion)
2) macro fix (Andrew)
3) move release before final unmap_vmas (for GRU, Jack/Christoph)
4) microoptimize mmu_notifier_unregister (Christoph)
5) use mmap_sem for registration serialization (Christoph)

The (void)xxx in macros doesn't work with "args". Christoph's solution
look best in avoiding warnings, even if it forces to make the mmu
notifier operation structure visible even if MMU_NOTIFIER=n (that's
the only downside).

I didn't drop invalidate_page, because invalidate_range_begin/end
would be slower for usages like KVM/GRU (we don't need a begin/end
there because where invalidate_page is called, the VM holds a
reference on the page). do_wp_page should also use invalidate_page
since it can free the page after dropping the PT lock without losing
any performance (that's not true for the places where invalidate_range
is called).

It'd be nice if everyone involved can agree to converge on this API
for .25. KVM/GRU (and perhaps Quadrics) and similar usages will be
fully covered in .25. This is a kernel internal API so there's no
problem if all the methods will become sleep capable only starting
only in .26. The brainer part of the VM work to do to make it sleep
capable is pretty much orthogonal with this patch.

Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -10,6 +10,7 @@
 #include <linux/rbtree.h>
 #include <linux/rwsem.h>
 #include <linux/completion.h>
+#include <linux/mmu_notifier.h>
 #include <asm/page.h>
 #include <asm/mmu.h>
 
@@ -228,6 +229,8 @@ struct mm_struct {
 #ifdef CONFIG_CGROUP_MEM_CONT
 	struct mem_cgroup *mem_cgroup;
 #endif
+
+	struct mmu_notifier_head mmu_notifier; /* MMU notifier list */
 };
 
 #endif /* _LINUX_MM_TYPES_H */
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
new file mode 100644
--- /dev/null
+++ b/include/linux/mmu_notifier.h
@@ -0,0 +1,161 @@
+#ifndef _LINUX_MMU_NOTIFIER_H
+#define _LINUX_MMU_NOTIFIER_H
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+struct mmu_notifier;
+
+struct mmu_notifier_ops {
+	/*
+	 * Called when nobody can register any more notifier in the mm
+	 * and after the "mn" notifier has been disarmed already.
+	 */
+	void (*release)(struct mmu_notifier *mn,
+			struct mm_struct *mm);
+
+	/*
+	 * clear_flush_young is called after the VM is
+	 * test-and-clearing the young/accessed bitflag in the
+	 * pte. This way the VM will provide proper aging to the
+	 * accesses to the page through the secondary MMUs and not
+	 * only to the ones through the Linux pte.
+	 */
+	int (*clear_flush_young)(struct mmu_notifier *mn,
+				 struct mm_struct *mm,
+				 unsigned long address);
+
+	/*
+	 * Before this is invoked any secondary MMU is still ok to
+	 * read/write to the page previously pointed by the Linux pte
+	 * because the old page hasn't been freed yet.  If required
+	 * set_page_dirty has to be called internally to this method.
+	 */
+	void (*invalidate_page)(struct mmu_notifier *mn,
+				struct mm_struct *mm,
+				unsigned long address);
+
+	/*
+	 * invalidate_range_begin() and invalidate_range_end() must be
+	 * paired. Multiple invalidate_range_begin/ends may be nested
+	 * or called concurrently.
+	 */
+	void (*invalidate_range_begin)(struct mmu_notifier *mn,
+				       struct mm_struct *mm,
+				       unsigned long start, unsigned long end);       
+	void (*invalidate_range_end)(struct mmu_notifier *mn,
+				     struct mm_struct *mm,
+				     unsigned long start, unsigned long end);
+};
+
+struct mmu_notifier {
+	struct hlist_node hlist;
+	const struct mmu_notifier_ops *ops;
+};
+
+#ifdef CONFIG_MMU_NOTIFIER
+
+struct mmu_notifier_head {
+	struct hlist_head head;
+};
+
+#include <linux/mm_types.h>
+
+/*
+ * Must hold the mmap_sem for write.
+ *
+ * RCU is used to traverse the list. A quiescent period needs to pass
+ * before the notifier is guaranteed to be visible to all threads.
+ */
+extern void mmu_notifier_register(struct mmu_notifier *mn,
+				  struct mm_struct *mm);
+/*
+ * Must hold the mmap_sem for write.
+ *
+ * RCU is used to traverse the list. A quiescent period needs to pass
+ * before the "struct mmu_notifier" can be freed. Alternatively it
+ * can be synchronously freed inside ->release when the list can't
+ * change anymore and nobody could possibly walk it.
+ */
+extern void mmu_notifier_unregister(struct mmu_notifier *mn,
+				    struct mm_struct *mm);
+extern void mmu_notifier_release(struct mm_struct *mm);
+extern int mmu_notifier_clear_flush_young(struct mm_struct *mm,
+					  unsigned long address);
+
+static inline void mmu_notifier_head_init(struct mmu_notifier_head *mnh)
+{
+	INIT_HLIST_HEAD(&mnh->head);
+}
+
+#define mmu_notifier(function, mm, args...)				\
+	do {								\
+		struct mmu_notifier *__mn;				\
+		struct hlist_node *__n;					\
+		struct mm_struct * __mm = mm;				\
+									\
+		if (unlikely(!hlist_empty(&__mm->mmu_notifier.head))) { \
+			rcu_read_lock();				\
+			hlist_for_each_entry_rcu(__mn, __n,		\
+						 &__mm->mmu_notifier.head, \
+						 hlist)			\
+				if (__mn->ops->function)		\
+					__mn->ops->function(__mn,	\
+							    __mm,	\
+							    args);	\
+			rcu_read_unlock();				\
+		}							\
+	} while (0)
+
+#define ptep_clear_flush_notify(__vma, __address, __ptep)		\
+({									\
+	pte_t __pte;							\
+	struct vm_area_struct * ___vma = __vma;				\
+	unsigned long ___address = __address;				\
+	__pte = ptep_clear_flush(___vma, ___address, __ptep);		\
+	mmu_notifier(invalidate_page, ___vma->vm_mm, ___address);	\
+	__pte;								\
+})
+
+#define ptep_clear_flush_young_notify(__vma, __address, __ptep)		\
+({									\
+	int __young;							\
+	struct vm_area_struct * ___vma = __vma;				\
+	unsigned long ___address = __address;				\
+	__young = ptep_clear_flush_young(___vma, ___address, __ptep);	\
+	__young |= mmu_notifier_clear_flush_young(___vma->vm_mm,	\
+						  ___address);		\
+	__young;							\
+})
+
+#else /* CONFIG_MMU_NOTIFIER */
+
+struct mmu_notifier_head {};
+
+#define mmu_notifier_register(mn, mm) do {} while(0)
+#define mmu_notifier_unregister(mn, mm) do {} while (0)
+#define mmu_notifier_release(mm) do {} while (0)
+#define mmu_notifier_head_init(mmh) do {} while (0)
+
+/*
+ * Notifiers that use the parameters that they were passed so that the
+ * compiler does not complain about unused variables but does proper
+ * parameter checks even if !CONFIG_MMU_NOTIFIER.
+ * Macros generate no code.
+ */
+#define mmu_notifier(function, mm, args...)			       \
+	do {							       \
+		if (0) {					       \
+			struct mmu_notifier *__mn;		       \
+								       \
+			__mn = (struct mmu_notifier *)(0x00ff);	       \
+			__mn->ops->function(__mn, mm, args);	       \
+		};						       \
+	} while (0)
+
+#define ptep_clear_flush_young_notify ptep_clear_flush_young
+#define ptep_clear_flush_notify ptep_clear_flush
+
+#endif /* CONFIG_MMU_NOTIFIER */
+
+#endif /* _LINUX_MMU_NOTIFIER_H */
diff --git a/kernel/fork.c b/kernel/fork.c
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -362,6 +362,7 @@ static struct mm_struct * mm_init(struct
 
 	if (likely(!mm_alloc_pgd(mm))) {
 		mm->def_flags = 0;
+		mmu_notifier_head_init(&mm->mmu_notifier);
 		return mm;
 	}
 
diff --git a/mm/Kconfig b/mm/Kconfig
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -193,3 +193,7 @@ config VIRT_TO_BUS
 config VIRT_TO_BUS
 	def_bool y
 	depends on !ARCH_NO_VIRT_TO_BUS
+
+config MMU_NOTIFIER
+	def_bool y
+	bool "MMU notifier, for paging KVM/RDMA"
diff --git a/mm/Makefile b/mm/Makefile
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -33,4 +33,4 @@ obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_CGROUP_MEM_CONT) += memcontrol.o
-
+obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -194,7 +194,7 @@ __xip_unmap (struct address_space * mapp
 		if (pte) {
 			/* Nuke the page table entry. */
 			flush_cache_page(vma, address, pte_pfn(*pte));
-			pteval = ptep_clear_flush(vma, address, pte);
+			pteval = ptep_clear_flush_notify(vma, address, pte);
 			page_remove_rmap(page, vma);
 			dec_mm_counter(mm, file_rss);
 			BUG_ON(pte_dirty(pteval));
diff --git a/mm/fremap.c b/mm/fremap.c
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -214,7 +214,9 @@ asmlinkage long sys_remap_file_pages(uns
 		spin_unlock(&mapping->i_mmap_lock);
 	}
 
+	mmu_notifier(invalidate_range_begin, mm, start, start + size);
 	err = populate_range(mm, vma, start, size, pgoff);
+	mmu_notifier(invalidate_range_end, mm, start, start + size);
 	if (!err && !(flags & MAP_NONBLOCK)) {
 		if (unlikely(has_write_lock)) {
 			downgrade_write(&mm->mmap_sem);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -755,6 +755,7 @@ void __unmap_hugepage_range(struct vm_ar
 	BUG_ON(start & ~HPAGE_MASK);
 	BUG_ON(end & ~HPAGE_MASK);
 
+	mmu_notifier(invalidate_range_begin, mm, start, end);
 	spin_lock(&mm->page_table_lock);
 	for (address = start; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
@@ -775,6 +776,7 @@ void __unmap_hugepage_range(struct vm_ar
 	}
 	spin_unlock(&mm->page_table_lock);
 	flush_tlb_range(vma, start, end);
+	mmu_notifier(invalidate_range_end, mm, start, end);
 	list_for_each_entry_safe(page, tmp, &page_list, lru) {
 		list_del(&page->lru);
 		put_page(page);
diff --git a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -611,6 +611,9 @@ int copy_page_range(struct mm_struct *ds
 	if (is_vm_hugetlb_page(vma))
 		return copy_hugetlb_page_range(dst_mm, src_mm, vma);
 
+	if (is_cow_mapping(vma->vm_flags))
+		mmu_notifier(invalidate_range_begin, src_mm, addr, end);
+
 	dst_pgd = pgd_offset(dst_mm, addr);
 	src_pgd = pgd_offset(src_mm, addr);
 	do {
@@ -621,6 +624,11 @@ int copy_page_range(struct mm_struct *ds
 						vma, addr, next))
 			return -ENOMEM;
 	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
+
+	if (is_cow_mapping(vma->vm_flags))
+		mmu_notifier(invalidate_range_end, src_mm,
+						vma->vm_start, end);
+
 	return 0;
 }
 
@@ -897,7 +905,9 @@ unsigned long zap_page_range(struct vm_a
 	lru_add_drain();
 	tlb = tlb_gather_mmu(mm, 0);
 	update_hiwater_rss(mm);
+	mmu_notifier(invalidate_range_begin, mm, address, end);
 	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
+	mmu_notifier(invalidate_range_end, mm, address, end);
 	if (tlb)
 		tlb_finish_mmu(tlb, address, end);
 	return end;
@@ -1463,10 +1473,11 @@ int apply_to_page_range(struct mm_struct
 {
 	pgd_t *pgd;
 	unsigned long next;
-	unsigned long end = addr + size;
+	unsigned long start = addr, end = addr + size;
 	int err;
 
 	BUG_ON(addr >= end);
+	mmu_notifier(invalidate_range_begin, mm, start, end);
 	pgd = pgd_offset(mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
@@ -1474,6 +1485,7 @@ int apply_to_page_range(struct mm_struct
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
+	mmu_notifier(invalidate_range_end, mm, start, end);
 	return err;
 }
 EXPORT_SYMBOL_GPL(apply_to_page_range);
@@ -1675,7 +1687,7 @@ gotten:
 		 * seen in the presence of one thread doing SMC and another
 		 * thread doing COW.
 		 */
-		ptep_clear_flush(vma, address, page_table);
+		ptep_clear_flush_notify(vma, address, page_table);
 		set_pte_at(mm, address, page_table, entry);
 		update_mmu_cache(vma, address, entry);
 		lru_cache_add_active(new_page);
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1747,11 +1747,13 @@ static void unmap_region(struct mm_struc
 	lru_add_drain();
 	tlb = tlb_gather_mmu(mm, 0);
 	update_hiwater_rss(mm);
+	mmu_notifier(invalidate_range_begin, mm, start, end);
 	unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
 	free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
 				 next? next->vm_start: 0);
 	tlb_finish_mmu(tlb, start, end);
+	mmu_notifier(invalidate_range_end, mm, start, end);
 }
 
 /*
@@ -2037,6 +2039,7 @@ void exit_mmap(struct mm_struct *mm)
 	unsigned long end;
 
 	/* mm's last user has gone, and its about to be pulled down */
+	mmu_notifier_release(mm);
 	arch_exit_mmap(mm);
 
 	lru_add_drain();
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
new file mode 100644
--- /dev/null
+++ b/mm/mmu_notifier.c
@@ -0,0 +1,75 @@
+/*
+ *  linux/mm/mmu_notifier.c
+ *
+ *  Copyright (C) 2008  Qumranet, Inc.
+ *  Copyright (C) 2008  SGI
+ *             Christoph Lameter <clameter@sgi.com>
+ *
+ *  This work is licensed under the terms of the GNU GPL, version 2. See
+ *  the COPYING file in the top-level directory.
+ */
+
+#include <linux/mmu_notifier.h>
+#include <linux/module.h>
+#include <linux/rcupdate.h>
+
+/*
+ * No synchronization. This function can only be called when only a single
+ * process remains that performs teardown.
+ */
+void mmu_notifier_release(struct mm_struct *mm)
+{
+	struct mmu_notifier *mn;
+
+	while (unlikely(!hlist_empty(&mm->mmu_notifier.head))) {
+		mn = hlist_entry(mm->mmu_notifier.head.first,
+				 struct mmu_notifier,
+				 hlist);
+		hlist_del(&mn->hlist);
+		if (mn->ops->release)
+			mn->ops->release(mn, mm);
+	}
+}
+
+/*
+ * If no young bitflag is supported by the hardware, ->age_page can
+ * unmap the address and return 1 or 0 depending if the mapping previously
+ * existed or not.
+ */
+int mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long address)
+{
+	struct mmu_notifier *mn;
+	struct hlist_node *n;
+	int young = 0;
+
+	if (unlikely(!hlist_empty(&mm->mmu_notifier.head))) {
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(mn, n,
+					 &mm->mmu_notifier.head, hlist) {
+			if (mn->ops->clear_flush_young)
+				young |= mn->ops->clear_flush_young(mn, mm,
+								    address);
+		}
+		rcu_read_unlock();
+	}
+
+	return young;
+}
+
+/*
+ * Note that all notifiers use RCU. The updates are only guaranteed to
+ * be visible to other processes after a RCU quiescent period!
+ *
+ * Must hold mmap_sem writably when calling registration functions.
+ */
+void mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+	hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier.head);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_register);
+
+void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+	hlist_del_rcu(&mn->hlist);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
diff --git a/mm/mprotect.c b/mm/mprotect.c
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -198,10 +198,12 @@ success:
 		dirty_accountable = 1;
 	}
 
+	mmu_notifier(invalidate_range_begin, mm, start, end);
 	if (is_vm_hugetlb_page(vma))
 		hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
 	else
 		change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
+	mmu_notifier(invalidate_range_end, mm, start, end);
 	vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
 	vm_stat_account(mm, newflags, vma->vm_file, nrpages);
 	return 0;
diff --git a/mm/mremap.c b/mm/mremap.c
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -74,6 +74,7 @@ static void move_ptes(struct vm_area_str
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *old_pte, *new_pte, pte;
 	spinlock_t *old_ptl, *new_ptl;
+	unsigned long old_start;
 
 	if (vma->vm_file) {
 		/*
@@ -100,6 +101,9 @@ static void move_ptes(struct vm_area_str
 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
 	arch_enter_lazy_mmu_mode();
 
+	old_start = old_addr;
+	mmu_notifier(invalidate_range_begin, vma->vm_mm,
+		     old_start, old_end);
 	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
 				   new_pte++, new_addr += PAGE_SIZE) {
 		if (pte_none(*old_pte))
@@ -108,6 +112,7 @@ static void move_ptes(struct vm_area_str
 		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
 		set_pte_at(mm, new_addr, new_pte, pte);
 	}
+	mmu_notifier(invalidate_range_end, vma->vm_mm, old_start, old_end);
 
 	arch_leave_lazy_mmu_mode();
 	if (new_ptl != old_ptl)
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -287,7 +287,7 @@ static int page_referenced_one(struct pa
 	if (vma->vm_flags & VM_LOCKED) {
 		referenced++;
 		*mapcount = 1;	/* break early from loop */
-	} else if (ptep_clear_flush_young(vma, address, pte))
+	} else if (ptep_clear_flush_young_notify(vma, address, pte))
 		referenced++;
 
 	/* Pretend the page is referenced if the task has the
@@ -454,7 +454,7 @@ static int page_mkclean_one(struct page 
 		pte_t entry;
 
 		flush_cache_page(vma, address, pte_pfn(*pte));
-		entry = ptep_clear_flush(vma, address, pte);
+		entry = ptep_clear_flush_notify(vma, address, pte);
 		entry = pte_wrprotect(entry);
 		entry = pte_mkclean(entry);
 		set_pte_at(mm, address, pte, entry);
@@ -712,14 +712,14 @@ static int try_to_unmap_one(struct page 
 	 * skipped over this mm) then we should reactivate it.
 	 */
 	if (!migration && ((vma->vm_flags & VM_LOCKED) ||
-			(ptep_clear_flush_young(vma, address, pte)))) {
+			(ptep_clear_flush_young_notify(vma, address, pte)))) {
 		ret = SWAP_FAIL;
 		goto out_unmap;
 	}
 
 	/* Nuke the page table entry. */
 	flush_cache_page(vma, address, page_to_pfn(page));
-	pteval = ptep_clear_flush(vma, address, pte);
+	pteval = ptep_clear_flush_notify(vma, address, pte);
 
 	/* Move the dirty bit to the physical page now the pte is gone. */
 	if (pte_dirty(pteval))
@@ -844,12 +844,12 @@ static void try_to_unmap_cluster(unsigne
 		page = vm_normal_page(vma, address, *pte);
 		BUG_ON(!page || PageAnon(page));
 
-		if (ptep_clear_flush_young(vma, address, pte))
+		if (ptep_clear_flush_young_notify(vma, address, pte))
 			continue;
 
 		/* Nuke the page table entry. */
 		flush_cache_page(vma, address, pte_pfn(*pte));
-		pteval = ptep_clear_flush(vma, address, pte);
+		pteval = ptep_clear_flush_notify(vma, address, pte);
 
 		/* If nonlinear, store the file page offset in the pte. */
 		if (page->index != linear_page_index(vma, address))

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v8 + xpmem
  2008-03-02 15:54                 ` [PATCH] mmu notifiers #v8 Andrea Arcangeli
@ 2008-03-02 16:03                   ` Andrea Arcangeli
  2008-03-02 16:23                     ` Peter Zijlstra
  2008-03-03  3:29                   ` [PATCH] mmu notifiers #v8 Nick Piggin
                                     ` (4 subsequent siblings)
  5 siblings, 1 reply; 120+ messages in thread
From: Andrea Arcangeli @ 2008-03-02 16:03 UTC (permalink / raw)
  To: Jack Steiner
  Cc: Nick Piggin, akpm, Robin Holt, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	linux-kernel, linux-mm, daniel.blueman, Christoph Lameter

Here an example of the futher orthogonal work to do on top of #v8
during .26-rc to make the whole mmu notifier API sleep capable.

1) Every single ptep_clear_flush_young_notify and
ptep_clear_flush_notify must be converted like the below. The below is
the conversion of a single one. do_wp_page has been converted by
Christoph already but with invalidate_range (should be changed to
invalidate_page by releasing the refcount on the page after calling
invalidate_page). Hope it's clear why I'd rather not depend on these
changes to be merged in .25 in order to have the mmu notifier included
in .25.

2) Then after all this conversion work is finished, it's trivial to
delete ptep_clear_flush_young_notify and ptep_clear_flush_notify from
mmu_notifier.h (they will be unused macros once the conversion is
complete).

3) After that the VM has to be changed to convert anon_vma lock and
i_mmap_lock spinlocks to mutex/rwsemaphore.

4) Then finally the mmu_notifier_unregister must be dropped to make the
mmu notifier sleep capable with RCU in the mmu_notifier() fast path.

It's unclear at this point if 3/4 should be switchable and happening
under a CONFIG_XPMEM or similar or if everyone will benefit from those
spinlock becoming mutex (the only one that is certain to appreciate
such a change is preempt-rt, the rest of the userbase I don't know for
sure and I'd be more confortable with a TPC number comparison before
doing such a chance by default, but I leave the commentary on such a
change to linux-mm in a separate thread).

Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>

diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -274,7 +274,7 @@ static int page_referenced_one(struct pa
 	unsigned long address;
 	pte_t *pte;
 	spinlock_t *ptl;
-	int referenced = 0;
+	int referenced = 0, clear_flush_young = 0;
 
 	address = vma_address(page, vma);
 	if (address == -EFAULT)
@@ -287,8 +287,11 @@ static int page_referenced_one(struct pa
 	if (vma->vm_flags & VM_LOCKED) {
 		referenced++;
 		*mapcount = 1;	/* break early from loop */
-	} else if (ptep_clear_flush_young_notify(vma, address, pte))
-		referenced++;
+	} else {
+		clear_flush_young = 1;
+		if (ptep_clear_flush_young(vma, address, pte))
+			referenced++;
+	}
 
 	/* Pretend the page is referenced if the task has the
 	   swap token and is in the middle of a page fault. */
@@ -298,6 +301,11 @@ static int page_referenced_one(struct pa
 
 	(*mapcount)--;
 	pte_unmap_unlock(pte, ptl);
+
+	if (clear_flush_young)
+		referenced += mmu_notifier_clear_flush_young(vma->vm_mm,
+							     address);
+
 out:
 	return referenced;
 }


^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v8 + xpmem
  2008-03-02 16:03                   ` [PATCH] mmu notifiers #v8 + xpmem Andrea Arcangeli
@ 2008-03-02 16:23                     ` Peter Zijlstra
  0 siblings, 0 replies; 120+ messages in thread
From: Peter Zijlstra @ 2008-03-02 16:23 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jack Steiner, Nick Piggin, akpm, Robin Holt, Avi Kivity,
	Izik Eidus, kvm-devel, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter


On Sun, 2008-03-02 at 17:03 +0100, Andrea Arcangeli wrote:

> 4) Then finally the mmu_notifier_unregister must be dropped to make the
> mmu notifier sleep capable with RCU in the mmu_notifier() fast path.

Or require PREEMPTIBLE_RCU, that can handle sleeps..


^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v8
  2008-03-02 15:54                 ` [PATCH] mmu notifiers #v8 Andrea Arcangeli
  2008-03-02 16:03                   ` [PATCH] mmu notifiers #v8 + xpmem Andrea Arcangeli
@ 2008-03-03  3:29                   ` Nick Piggin
  2008-03-03 12:51                     ` Andrea Arcangeli
  2008-03-03 19:01                     ` Christoph Lameter
  2008-03-03  3:33                   ` Nick Piggin
                                     ` (3 subsequent siblings)
  5 siblings, 2 replies; 120+ messages in thread
From: Nick Piggin @ 2008-03-03  3:29 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jack Steiner, akpm, Robin Holt, Avi Kivity, Izik Eidus,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Sun, Mar 02, 2008 at 04:54:57PM +0100, Andrea Arcangeli wrote:
> Difference between #v7 and #v8:
> 
> 1) s/age_page/clear_flush_young/ (Nick's suggestion)
> 2) macro fix (Andrew)
> 3) move release before final unmap_vmas (for GRU, Jack/Christoph)
> 4) microoptimize mmu_notifier_unregister (Christoph)
> 5) use mmap_sem for registration serialization (Christoph)
> 
> The (void)xxx in macros doesn't work with "args". Christoph's solution
> look best in avoiding warnings, even if it forces to make the mmu
> notifier operation structure visible even if MMU_NOTIFIER=n (that's
> the only downside).

I have a couple of "cleanup" patches that change the structure of this
to something I prefer. Others may not, but I'll post them for debate
anyway.

 
> I didn't drop invalidate_page, because invalidate_range_begin/end
> would be slower for usages like KVM/GRU (we don't need a begin/end
> there because where invalidate_page is called, the VM holds a
> reference on the page). do_wp_page should also use invalidate_page
> since it can free the page after dropping the PT lock without losing
> any performance (that's not true for the places where invalidate_range
> is called).

I'm still not completely happy with this. I had a very quick look
at the GRU driver, but I don't see why it can't be implemented
more like the regular TLB model, and have TLB insertions depend on
the linux pte, and do invalidates _after_ restricting permissions
to the pte.

Ie. I'd still like to get rid of invalidate_range_begin, and get
rid of invalidate calls from places where permissions are relaxed.


> It'd be nice if everyone involved can agree to converge on this API
> for .25. KVM/GRU (and perhaps Quadrics) and similar usages will be
> fully covered in .25.

If we can agree on the API, then I don't see any reason why it can't
go into 2.6.25, unless someome wants more time to review it (but
2.6.25 release should be quite far away still so there should be quite
a bit of time).

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v8
  2008-03-02 15:54                 ` [PATCH] mmu notifiers #v8 Andrea Arcangeli
  2008-03-02 16:03                   ` [PATCH] mmu notifiers #v8 + xpmem Andrea Arcangeli
  2008-03-03  3:29                   ` [PATCH] mmu notifiers #v8 Nick Piggin
@ 2008-03-03  3:33                   ` Nick Piggin
  2008-03-03 19:03                     ` Christoph Lameter
  2008-03-03  3:34                   ` Nick Piggin
                                     ` (2 subsequent siblings)
  5 siblings, 1 reply; 120+ messages in thread
From: Nick Piggin @ 2008-03-03  3:33 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jack Steiner, akpm, Robin Holt, Avi Kivity, Izik Eidus,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Sun, Mar 02, 2008 at 04:54:57PM +0100, Andrea Arcangeli wrote:
> Difference between #v7 and #v8:

[patch] mmu-v8: demacro


Remove the macros from mmu_notifier.h, in favour of functions.

This requires untangling the include order circular dependencies as well,
so just remove struct mmu_notifier_head in favour of just using the hlist
in mm_struct.

Signed-off-by: Nick Piggin <npiggin@suse.de>
---
Index: linux-2.6/include/linux/mmu_notifier.h
===================================================================
--- linux-2.6.orig/include/linux/mmu_notifier.h
+++ linux-2.6/include/linux/mmu_notifier.h
@@ -55,12 +55,13 @@ struct mmu_notifier {
 
 #ifdef CONFIG_MMU_NOTIFIER
 
-struct mmu_notifier_head {
-	struct hlist_head head;
-};
-
 #include <linux/mm_types.h>
 
+static inline int mm_has_notifiers(struct mm_struct *mm)
+{
+	return unlikely(!hlist_empty(&mm->mmu_notifier_list));
+}
+
 /*
  * Must hold the mmap_sem for write.
  *
@@ -79,33 +80,59 @@ extern void mmu_notifier_register(struct
  */
 extern void mmu_notifier_unregister(struct mmu_notifier *mn,
 				    struct mm_struct *mm);
-extern void mmu_notifier_release(struct mm_struct *mm);
-extern int mmu_notifier_clear_flush_young(struct mm_struct *mm,
+
+extern void __mmu_notifier_release(struct mm_struct *mm);
+extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
 					  unsigned long address);
+extern void __mmu_notifier_invalidate_page(struct mm_struct *mm,
+					  unsigned long address);
+extern void __mmu_notifier_invalidate_range_begin(struct mm_struct *mm,
+				  unsigned long start, unsigned long end);
+extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
+				  unsigned long start, unsigned long end);
+
+
+static inline void mmu_notifier_release(struct mm_struct *mm)
+{
+	if (mm_has_notifiers(mm))
+		__mmu_notifier_release(mm);
+}
+
+static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
+					  unsigned long address)
+{
+	if (mm_has_notifiers(mm))
+		return __mmu_notifier_clear_flush_young(mm, address);
+	return 0;
+}
+
+static inline void mmu_notifier_invalidate_page(struct mm_struct *mm,
+					  unsigned long address)
+{
+	if (mm_has_notifiers(mm))
+		__mmu_notifier_invalidate_page(mm, address);
+}
+
+static inline void mmu_notifier_invalidate_range_begin(struct mm_struct *mm,
+				  unsigned long start, unsigned long end)
+{
+	if (mm_has_notifiers(mm))
+		__mmu_notifier_invalidate_range_begin(mm, start, end);
+}
+
+static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
+				  unsigned long start, unsigned long end)
+{
+	if (mm_has_notifiers(mm))
+		__mmu_notifier_invalidate_range_end(mm, start, end);
+}
 
-static inline void mmu_notifier_head_init(struct mmu_notifier_head *mnh)
+static inline void mmu_notifier_mm_init(struct mm_struct *mm)
 {
-	INIT_HLIST_HEAD(&mnh->head);
+	INIT_HLIST_HEAD(&mm->mmu_notifier_list);
 }
 
-#define mmu_notifier(function, mm, args...)				\
-	do {								\
-		struct mmu_notifier *__mn;				\
-		struct hlist_node *__n;					\
-		struct mm_struct * __mm = mm;				\
-									\
-		if (unlikely(!hlist_empty(&__mm->mmu_notifier.head))) { \
-			rcu_read_lock();				\
-			hlist_for_each_entry_rcu(__mn, __n,		\
-						 &__mm->mmu_notifier.head, \
-						 hlist)			\
-				if (__mn->ops->function)		\
-					__mn->ops->function(__mn,	\
-							    __mm,	\
-							    args);	\
-			rcu_read_unlock();				\
-		}							\
-	} while (0)
+
 
 #define ptep_clear_flush_notify(__vma, __address, __ptep)		\
 ({									\
@@ -113,7 +140,7 @@ static inline void mmu_notifier_head_ini
 	struct vm_area_struct * ___vma = __vma;				\
 	unsigned long ___address = __address;				\
 	__pte = ptep_clear_flush(___vma, ___address, __ptep);		\
-	mmu_notifier(invalidate_page, ___vma->vm_mm, ___address);	\
+	mmu_notifier_invalidate_page(___vma->vm_mm, ___address);	\
 	__pte;								\
 })
 
@@ -130,28 +157,34 @@ static inline void mmu_notifier_head_ini
 
 #else /* CONFIG_MMU_NOTIFIER */
 
-struct mmu_notifier_head {};
+static inline void mmu_notifier_release(struct mm_struct *mm)
+{
+}
 
-#define mmu_notifier_register(mn, mm) do {} while(0)
-#define mmu_notifier_unregister(mn, mm) do {} while (0)
-#define mmu_notifier_release(mm) do {} while (0)
-#define mmu_notifier_head_init(mmh) do {} while (0)
+static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
+					  unsigned long address)
+{
+	return 0;
+}
 
-/*
- * Notifiers that use the parameters that they were passed so that the
- * compiler does not complain about unused variables but does proper
- * parameter checks even if !CONFIG_MMU_NOTIFIER.
- * Macros generate no code.
- */
-#define mmu_notifier(function, mm, args...)			       \
-	do {							       \
-		if (0) {					       \
-			struct mmu_notifier *__mn;		       \
-								       \
-			__mn = (struct mmu_notifier *)(0x00ff);	       \
-			__mn->ops->function(__mn, mm, args);	       \
-		};						       \
-	} while (0)
+static inline void mmu_notifier_invalidate_page(struct mm_struct *mm,
+					  unsigned long address)
+{
+}
+
+static inline void mmu_notifier_invalidate_range_begin(struct mm_struct *mm,
+				  unsigned long start, unsigned long end)
+{
+}
+
+static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
+				  unsigned long start, unsigned long end)
+{
+}
+
+static inline void mmu_notifier_mm_init(struct mm_struct *mm)
+{
+}
 
 #define ptep_clear_flush_young_notify ptep_clear_flush_young
 #define ptep_clear_flush_notify ptep_clear_flush
Index: linux-2.6/mm/mmu_notifier.c
===================================================================
--- linux-2.6.orig/mm/mmu_notifier.c
+++ linux-2.6/mm/mmu_notifier.c
@@ -17,12 +17,12 @@
  * No synchronization. This function can only be called when only a single
  * process remains that performs teardown.
  */
-void mmu_notifier_release(struct mm_struct *mm)
+void __mmu_notifier_release(struct mm_struct *mm)
 {
 	struct mmu_notifier *mn;
 
-	while (unlikely(!hlist_empty(&mm->mmu_notifier.head))) {
-		mn = hlist_entry(mm->mmu_notifier.head.first,
+	while (unlikely(!hlist_empty(&mm->mmu_notifier_list))) {
+		mn = hlist_entry(mm->mmu_notifier_list.first,
 				 struct mmu_notifier,
 				 hlist);
 		hlist_del(&mn->hlist);
@@ -32,30 +32,69 @@ void mmu_notifier_release(struct mm_stru
 }
 
 /*
- * If no young bitflag is supported by the hardware, ->age_page can
+ * If no young bitflag is supported by the hardware, ->clear_flush_young can
  * unmap the address and return 1 or 0 depending if the mapping previously
  * existed or not.
  */
-int mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long address)
+int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
+					unsigned long address)
 {
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
 	int young = 0;
 
-	if (unlikely(!hlist_empty(&mm->mmu_notifier.head))) {
-		rcu_read_lock();
-		hlist_for_each_entry_rcu(mn, n,
-					 &mm->mmu_notifier.head, hlist) {
-			if (mn->ops->clear_flush_young)
-				young |= mn->ops->clear_flush_young(mn, mm,
-								    address);
-		}
-		rcu_read_unlock();
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_list, hlist) {
+		if (mn->ops->clear_flush_young)
+			young |= mn->ops->clear_flush_young(mn, mm, address);
 	}
+	rcu_read_unlock();
 
 	return young;
 }
 
+void __mmu_notifier_invalidate_page(struct mm_struct *mm,
+					  unsigned long address)
+{
+	struct mmu_notifier *mn;
+	struct hlist_node *n;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_list, hlist) {
+		if (mn->ops->invalidate_page)
+			mn->ops->invalidate_page(mn, mm, address);
+	}
+	rcu_read_unlock();
+}
+
+void __mmu_notifier_invalidate_range_begin(struct mm_struct *mm,
+				  unsigned long start, unsigned long end)
+{
+	struct mmu_notifier *mn;
+	struct hlist_node *n;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_list, hlist) {
+		if (mn->ops->invalidate_range_begin)
+			mn->ops->invalidate_range_begin(mn, mm, start, end);
+	}
+	rcu_read_unlock();
+}
+
+void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
+				  unsigned long start, unsigned long end)
+{
+	struct mmu_notifier *mn;
+	struct hlist_node *n;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_list, hlist) {
+		if (mn->ops->invalidate_range_end)
+			mn->ops->invalidate_range_end(mn, mm, start, end);
+	}
+	rcu_read_unlock();
+}
+
 /*
  * Note that all notifiers use RCU. The updates are only guaranteed to
  * be visible to other processes after a RCU quiescent period!
@@ -64,7 +103,7 @@ int mmu_notifier_clear_flush_young(struc
  */
 void mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
 {
-	hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier.head);
+	hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_list);
 }
 EXPORT_SYMBOL_GPL(mmu_notifier_register);
 
Index: linux-2.6/mm/fremap.c
===================================================================
--- linux-2.6.orig/mm/fremap.c
+++ linux-2.6/mm/fremap.c
@@ -15,6 +15,7 @@
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/mmu_context.h>
 #include <asm/cacheflush.h>
@@ -214,9 +215,9 @@ asmlinkage long sys_remap_file_pages(uns
 		spin_unlock(&mapping->i_mmap_lock);
 	}
 
-	mmu_notifier(invalidate_range_begin, mm, start, start + size);
+	mmu_notifier_invalidate_range_begin(mm, start, start + size);
 	err = populate_range(mm, vma, start, size, pgoff);
-	mmu_notifier(invalidate_range_end, mm, start, start + size);
+	mmu_notifier_invalidate_range_end(mm, start, start + size);
 	if (!err && !(flags & MAP_NONBLOCK)) {
 		if (unlikely(has_write_lock)) {
 			downgrade_write(&mm->mmap_sem);
Index: linux-2.6/mm/hugetlb.c
===================================================================
--- linux-2.6.orig/mm/hugetlb.c
+++ linux-2.6/mm/hugetlb.c
@@ -14,6 +14,7 @@
 #include <linux/mempolicy.h>
 #include <linux/cpuset.h>
 #include <linux/mutex.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -755,7 +756,7 @@ void __unmap_hugepage_range(struct vm_ar
 	BUG_ON(start & ~HPAGE_MASK);
 	BUG_ON(end & ~HPAGE_MASK);
 
-	mmu_notifier(invalidate_range_begin, mm, start, end);
+	mmu_notifier_invalidate_range_begin(mm, start, end);
 	spin_lock(&mm->page_table_lock);
 	for (address = start; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
@@ -776,7 +777,7 @@ void __unmap_hugepage_range(struct vm_ar
 	}
 	spin_unlock(&mm->page_table_lock);
 	flush_tlb_range(vma, start, end);
-	mmu_notifier(invalidate_range_end, mm, start, end);
+	mmu_notifier_invalidate_range_end(mm, start, end);
 	list_for_each_entry_safe(page, tmp, &page_list, lru) {
 		list_del(&page->lru);
 		put_page(page);
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c
+++ linux-2.6/mm/memory.c
@@ -51,6 +51,7 @@
 #include <linux/init.h>
 #include <linux/writeback.h>
 #include <linux/memcontrol.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -612,7 +613,7 @@ int copy_page_range(struct mm_struct *ds
 		return copy_hugetlb_page_range(dst_mm, src_mm, vma);
 
 	if (is_cow_mapping(vma->vm_flags))
-		mmu_notifier(invalidate_range_begin, src_mm, addr, end);
+		mmu_notifier_invalidate_range_begin(src_mm, addr, end);
 
 	dst_pgd = pgd_offset(dst_mm, addr);
 	src_pgd = pgd_offset(src_mm, addr);
@@ -626,7 +627,7 @@ int copy_page_range(struct mm_struct *ds
 	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
 
 	if (is_cow_mapping(vma->vm_flags))
-		mmu_notifier(invalidate_range_end, src_mm,
+		mmu_notifier_invalidate_range_end(src_mm,
 						vma->vm_start, end);
 
 	return 0;
@@ -905,9 +906,9 @@ unsigned long zap_page_range(struct vm_a
 	lru_add_drain();
 	tlb = tlb_gather_mmu(mm, 0);
 	update_hiwater_rss(mm);
-	mmu_notifier(invalidate_range_begin, mm, address, end);
+	mmu_notifier_invalidate_range_begin(mm, address, end);
 	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
-	mmu_notifier(invalidate_range_end, mm, address, end);
+	mmu_notifier_invalidate_range_end(mm, address, end);
 	if (tlb)
 		tlb_finish_mmu(tlb, address, end);
 	return end;
@@ -1477,7 +1478,7 @@ int apply_to_page_range(struct mm_struct
 	int err;
 
 	BUG_ON(addr >= end);
-	mmu_notifier(invalidate_range_begin, mm, start, end);
+	mmu_notifier_invalidate_range_begin(mm, start, end);
 	pgd = pgd_offset(mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
@@ -1485,7 +1486,7 @@ int apply_to_page_range(struct mm_struct
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
-	mmu_notifier(invalidate_range_end, mm, start, end);
+	mmu_notifier_invalidate_range_end(mm, start, end);
 	return err;
 }
 EXPORT_SYMBOL_GPL(apply_to_page_range);
Index: linux-2.6/mm/mmap.c
===================================================================
--- linux-2.6.orig/mm/mmap.c
+++ linux-2.6/mm/mmap.c
@@ -26,6 +26,7 @@
 #include <linux/mount.h>
 #include <linux/mempolicy.h>
 #include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -1747,13 +1748,13 @@ static void unmap_region(struct mm_struc
 	lru_add_drain();
 	tlb = tlb_gather_mmu(mm, 0);
 	update_hiwater_rss(mm);
-	mmu_notifier(invalidate_range_begin, mm, start, end);
+	mmu_notifier_invalidate_range_begin(mm, start, end);
 	unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
 	free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
 				 next? next->vm_start: 0);
 	tlb_finish_mmu(tlb, start, end);
-	mmu_notifier(invalidate_range_end, mm, start, end);
+	mmu_notifier_invalidate_range_end(mm, start, end);
 }
 
 /*
Index: linux-2.6/mm/mprotect.c
===================================================================
--- linux-2.6.orig/mm/mprotect.c
+++ linux-2.6/mm/mprotect.c
@@ -21,6 +21,7 @@
 #include <linux/syscalls.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
+#include <linux/mmu_notifier.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
@@ -198,12 +199,12 @@ success:
 		dirty_accountable = 1;
 	}
 
-	mmu_notifier(invalidate_range_begin, mm, start, end);
+	mmu_notifier_invalidate_range_begin(mm, start, end);
 	if (is_vm_hugetlb_page(vma))
 		hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
 	else
 		change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
-	mmu_notifier(invalidate_range_end, mm, start, end);
+	mmu_notifier_invalidate_range_end(mm, start, end);
 	vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
 	vm_stat_account(mm, newflags, vma->vm_file, nrpages);
 	return 0;
Index: linux-2.6/mm/mremap.c
===================================================================
--- linux-2.6.orig/mm/mremap.c
+++ linux-2.6/mm/mremap.c
@@ -18,6 +18,7 @@
 #include <linux/highmem.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -102,7 +103,7 @@ static void move_ptes(struct vm_area_str
 	arch_enter_lazy_mmu_mode();
 
 	old_start = old_addr;
-	mmu_notifier(invalidate_range_begin, vma->vm_mm,
+	mmu_notifier_invalidate_range_begin(vma->vm_mm,
 		     old_start, old_end);
 	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
 				   new_pte++, new_addr += PAGE_SIZE) {
@@ -112,7 +113,7 @@ static void move_ptes(struct vm_area_str
 		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
 		set_pte_at(mm, new_addr, new_pte, pte);
 	}
-	mmu_notifier(invalidate_range_end, vma->vm_mm, old_start, old_end);
+	mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
 
 	arch_leave_lazy_mmu_mode();
 	if (new_ptl != old_ptl)
Index: linux-2.6/include/linux/mm_types.h
===================================================================
--- linux-2.6.orig/include/linux/mm_types.h
+++ linux-2.6/include/linux/mm_types.h
@@ -10,7 +10,6 @@
 #include <linux/rbtree.h>
 #include <linux/rwsem.h>
 #include <linux/completion.h>
-#include <linux/mmu_notifier.h>
 #include <asm/page.h>
 #include <asm/mmu.h>
 
@@ -229,8 +228,9 @@ struct mm_struct {
 #ifdef CONFIG_CGROUP_MEM_CONT
 	struct mem_cgroup *mem_cgroup;
 #endif
-
-	struct mmu_notifier_head mmu_notifier; /* MMU notifier list */
+#ifdef CONFIG_MMU_NOTIFIER
+	struct hlist_head mmu_notifier_list;
+#endif
 };
 
 #endif /* _LINUX_MM_TYPES_H */
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c
+++ linux-2.6/mm/rmap.c
@@ -49,6 +49,7 @@
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/memcontrol.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/tlbflush.h>
 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v8
  2008-03-02 15:54                 ` [PATCH] mmu notifiers #v8 Andrea Arcangeli
                                     ` (2 preceding siblings ...)
  2008-03-03  3:33                   ` Nick Piggin
@ 2008-03-03  3:34                   ` Nick Piggin
  2008-03-03 19:04                     ` Christoph Lameter
  2008-03-03  3:39                   ` Nick Piggin
  2008-03-03 21:37                   ` [PATCH] mmu notifiers #v9 Andrea Arcangeli
  5 siblings, 1 reply; 120+ messages in thread
From: Nick Piggin @ 2008-03-03  3:34 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jack Steiner, akpm, Robin Holt, Avi Kivity, Izik Eidus,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Sun, Mar 02, 2008 at 04:54:57PM +0100, Andrea Arcangeli wrote:
> Difference between #v7 and #v8:

This one on top of the previous patch

[patch] mmu-v8: typesafe

Move definition of struct mmu_notifier and struct mmu_notifier_ops under
CONFIG_MMU_NOTIFIER to ensure they doesn't get dereferenced when they
don't make sense.

Signed-off-by: Nick Piggin <npiggin@suse.de>
---
Index: linux-2.6/include/linux/mmu_notifier.h
===================================================================
--- linux-2.6.orig/include/linux/mmu_notifier.h
+++ linux-2.6/include/linux/mmu_notifier.h
@@ -3,8 +3,12 @@
 
 #include <linux/list.h>
 #include <linux/spinlock.h>
+#include <linux/mm_types.h>
 
 struct mmu_notifier;
+struct mmu_notifier_ops;
+
+#ifdef CONFIG_MMU_NOTIFIER
 
 struct mmu_notifier_ops {
 	/*
@@ -53,10 +57,6 @@ struct mmu_notifier {
 	const struct mmu_notifier_ops *ops;
 };
 
-#ifdef CONFIG_MMU_NOTIFIER
-
-#include <linux/mm_types.h>
-
 static inline int mm_has_notifiers(struct mm_struct *mm)
 {
 	return unlikely(!hlist_empty(&mm->mmu_notifier_list));

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v8
  2008-03-02 15:54                 ` [PATCH] mmu notifiers #v8 Andrea Arcangeli
                                     ` (3 preceding siblings ...)
  2008-03-03  3:34                   ` Nick Piggin
@ 2008-03-03  3:39                   ` Nick Piggin
  2008-03-03 21:37                   ` [PATCH] mmu notifiers #v9 Andrea Arcangeli
  5 siblings, 0 replies; 120+ messages in thread
From: Nick Piggin @ 2008-03-03  3:39 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jack Steiner, akpm, Robin Holt, Avi Kivity, Izik Eidus,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Sun, Mar 02, 2008 at 04:54:57PM +0100, Andrea Arcangeli wrote:
> Difference between #v7 and #v8:

Here is just a couple of checkpatch fixes on top of the last patches.

Index: linux-2.6/include/linux/mmu_notifier.h
===================================================================
--- linux-2.6.orig/include/linux/mmu_notifier.h
+++ linux-2.6/include/linux/mmu_notifier.h
@@ -46,7 +46,7 @@ struct mmu_notifier_ops {
 	 */
 	void (*invalidate_range_begin)(struct mmu_notifier *mn,
 				       struct mm_struct *mm,
-				       unsigned long start, unsigned long end);       
+				       unsigned long start, unsigned long end);
 	void (*invalidate_range_end)(struct mmu_notifier *mn,
 				     struct mm_struct *mm,
 				     unsigned long start, unsigned long end);
@@ -137,7 +137,7 @@ static inline void mmu_notifier_mm_init(
 #define ptep_clear_flush_notify(__vma, __address, __ptep)		\
 ({									\
 	pte_t __pte;							\
-	struct vm_area_struct * ___vma = __vma;				\
+	struct vm_area_struct *___vma = __vma;				\
 	unsigned long ___address = __address;				\
 	__pte = ptep_clear_flush(___vma, ___address, __ptep);		\
 	mmu_notifier_invalidate_page(___vma->vm_mm, ___address);	\
@@ -147,7 +147,7 @@ static inline void mmu_notifier_mm_init(
 #define ptep_clear_flush_young_notify(__vma, __address, __ptep)		\
 ({									\
 	int __young;							\
-	struct vm_area_struct * ___vma = __vma;				\
+	struct vm_area_struct *___vma = __vma;				\
 	unsigned long ___address = __address;				\
 	__young = ptep_clear_flush_young(___vma, ___address, __ptep);	\
 	__young |= mmu_notifier_clear_flush_young(___vma->vm_mm,	\

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v8
  2008-03-03  3:29                   ` [PATCH] mmu notifiers #v8 Nick Piggin
@ 2008-03-03 12:51                     ` Andrea Arcangeli
  2008-03-03 13:10                       ` Nick Piggin
  2008-03-03 19:01                     ` Christoph Lameter
  1 sibling, 1 reply; 120+ messages in thread
From: Andrea Arcangeli @ 2008-03-03 12:51 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Jack Steiner, akpm, Robin Holt, Avi Kivity, Izik Eidus,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Mon, Mar 03, 2008 at 04:29:34AM +0100, Nick Piggin wrote:
> to something I prefer. Others may not, but I'll post them for debate
> anyway.

Sure, thanks!

> > I didn't drop invalidate_page, because invalidate_range_begin/end
> > would be slower for usages like KVM/GRU (we don't need a begin/end
> > there because where invalidate_page is called, the VM holds a
> > reference on the page). do_wp_page should also use invalidate_page
> > since it can free the page after dropping the PT lock without losing
> > any performance (that's not true for the places where invalidate_range
> > is called).
> 
> I'm still not completely happy with this. I had a very quick look
> at the GRU driver, but I don't see why it can't be implemented
> more like the regular TLB model, and have TLB insertions depend on
> the linux pte, and do invalidates _after_ restricting permissions
> to the pte.
> 
> Ie. I'd still like to get rid of invalidate_range_begin, and get
> rid of invalidate calls from places where permissions are relaxed.

_begin exists because by the time _end is called, the VM already
dropped the reference on the page. This way we can do a single
invalidate no matter how large the range is. I don't see ways to
remove _begin while still invoking _end a single time for the whole
range.

> If we can agree on the API, then I don't see any reason why it can't
> go into 2.6.25, unless someome wants more time to review it (but
> 2.6.25 release should be quite far away still so there should be quite
> a bit of time).

Cool! ;)

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v8
  2008-03-03 12:51                     ` Andrea Arcangeli
@ 2008-03-03 13:10                       ` Nick Piggin
  2008-03-03 13:24                         ` Andrea Arcangeli
  2008-03-03 15:18                         ` Jack Steiner
  0 siblings, 2 replies; 120+ messages in thread
From: Nick Piggin @ 2008-03-03 13:10 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jack Steiner, akpm, Robin Holt, Avi Kivity, Izik Eidus,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Mon, Mar 03, 2008 at 01:51:53PM +0100, Andrea Arcangeli wrote:
> On Mon, Mar 03, 2008 at 04:29:34AM +0100, Nick Piggin wrote:
> > to something I prefer. Others may not, but I'll post them for debate
> > anyway.
> 
> Sure, thanks!
> 
> > > I didn't drop invalidate_page, because invalidate_range_begin/end
> > > would be slower for usages like KVM/GRU (we don't need a begin/end
> > > there because where invalidate_page is called, the VM holds a
> > > reference on the page). do_wp_page should also use invalidate_page
> > > since it can free the page after dropping the PT lock without losing
> > > any performance (that's not true for the places where invalidate_range
> > > is called).
> > 
> > I'm still not completely happy with this. I had a very quick look
> > at the GRU driver, but I don't see why it can't be implemented
> > more like the regular TLB model, and have TLB insertions depend on
> > the linux pte, and do invalidates _after_ restricting permissions
> > to the pte.
> > 
> > Ie. I'd still like to get rid of invalidate_range_begin, and get
> > rid of invalidate calls from places where permissions are relaxed.
> 
> _begin exists because by the time _end is called, the VM already
> dropped the reference on the page. This way we can do a single
> invalidate no matter how large the range is. I don't see ways to
> remove _begin while still invoking _end a single time for the whole
> range.

Is this just a GRU problem? Can't we just require them to take a ref
on the page (IIRC Jack said GRU could be changed to more like a TLB
model).

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v8
  2008-03-03 13:10                       ` Nick Piggin
@ 2008-03-03 13:24                         ` Andrea Arcangeli
  2008-03-03 15:18                         ` Jack Steiner
  1 sibling, 0 replies; 120+ messages in thread
From: Andrea Arcangeli @ 2008-03-03 13:24 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Jack Steiner, akpm, Robin Holt, Avi Kivity, Izik Eidus,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Mon, Mar 03, 2008 at 02:10:17PM +0100, Nick Piggin wrote:
> Is this just a GRU problem? Can't we just require them to take a ref
> on the page (IIRC Jack said GRU could be changed to more like a TLB
> model).

Yes, it's just a GRU problem, it tries to optimize performance by
calling follow_page only in the fast path, and fallbacks to
get_user_pages; put_page in the slow path. xpmem could also send the
message in _begin and wait the message in _end, to reduce the wait
time. But if you forge GRU to call get_user_pages only (like KVM
does), the _begin can be removed. In theory we could also optimize KVM
to use follow_page only if the pte is already established. I'm not
sure how much that is a worthwhile optimization though.

However note that Quadrics also had a callback before and one after,
so they may be using the callback before for similar
optimizations. But functionality-wise _end is the only required bit if
everyone takes refcounts like KVM and XPMEM do.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v8
  2008-03-03 13:10                       ` Nick Piggin
  2008-03-03 13:24                         ` Andrea Arcangeli
@ 2008-03-03 15:18                         ` Jack Steiner
  2008-03-03 16:59                           ` Nick Piggin
  1 sibling, 1 reply; 120+ messages in thread
From: Jack Steiner @ 2008-03-03 15:18 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrea Arcangeli, akpm, Robin Holt, Avi Kivity, Izik Eidus,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Mon, Mar 03, 2008 at 02:10:17PM +0100, Nick Piggin wrote:
> On Mon, Mar 03, 2008 at 01:51:53PM +0100, Andrea Arcangeli wrote:
> > On Mon, Mar 03, 2008 at 04:29:34AM +0100, Nick Piggin wrote:
> > > to something I prefer. Others may not, but I'll post them for debate
> > > anyway.
> > 
> > Sure, thanks!
> > 
> > > > I didn't drop invalidate_page, because invalidate_range_begin/end
> > > > would be slower for usages like KVM/GRU (we don't need a begin/end
> > > > there because where invalidate_page is called, the VM holds a
> > > > reference on the page). do_wp_page should also use invalidate_page
> > > > since it can free the page after dropping the PT lock without losing
> > > > any performance (that's not true for the places where invalidate_range
> > > > is called).
> > > 
> > > I'm still not completely happy with this. I had a very quick look
> > > at the GRU driver, but I don't see why it can't be implemented
> > > more like the regular TLB model, and have TLB insertions depend on
> > > the linux pte, and do invalidates _after_ restricting permissions
> > > to the pte.
> > > 
> > > Ie. I'd still like to get rid of invalidate_range_begin, and get
> > > rid of invalidate calls from places where permissions are relaxed.
> > 
> > _begin exists because by the time _end is called, the VM already
> > dropped the reference on the page. This way we can do a single
> > invalidate no matter how large the range is. I don't see ways to
> > remove _begin while still invoking _end a single time for the whole
> > range.
> 
> Is this just a GRU problem? Can't we just require them to take a ref
> on the page (IIRC Jack said GRU could be changed to more like a TLB
> model).

Maintaining a long-term reference on a page is a problem. The GRU does not
currently maintain tables to track the pages for which dropins have been done.

The GRU has a large internal TLB and is designed to reference up to 8PB of
memory. The size of the tables to track this many referenced pages would be
a problem (at best).

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v8
  2008-03-03 15:18                         ` Jack Steiner
@ 2008-03-03 16:59                           ` Nick Piggin
  2008-03-03 18:06                             ` Jack Steiner
  2008-03-03 19:02                             ` Christoph Lameter
  0 siblings, 2 replies; 120+ messages in thread
From: Nick Piggin @ 2008-03-03 16:59 UTC (permalink / raw)
  To: Jack Steiner
  Cc: Andrea Arcangeli, akpm, Robin Holt, Avi Kivity, Izik Eidus,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Mon, Mar 03, 2008 at 09:18:59AM -0600, Jack Steiner wrote:
> On Mon, Mar 03, 2008 at 02:10:17PM +0100, Nick Piggin wrote:
> > On Mon, Mar 03, 2008 at 01:51:53PM +0100, Andrea Arcangeli wrote:
> > > On Mon, Mar 03, 2008 at 04:29:34AM +0100, Nick Piggin wrote:
> > > > to something I prefer. Others may not, but I'll post them for debate
> > > > anyway.
> > > 
> > > Sure, thanks!
> > > 
> > > > > I didn't drop invalidate_page, because invalidate_range_begin/end
> > > > > would be slower for usages like KVM/GRU (we don't need a begin/end
> > > > > there because where invalidate_page is called, the VM holds a
> > > > > reference on the page). do_wp_page should also use invalidate_page
> > > > > since it can free the page after dropping the PT lock without losing
> > > > > any performance (that's not true for the places where invalidate_range
> > > > > is called).
> > > > 
> > > > I'm still not completely happy with this. I had a very quick look
> > > > at the GRU driver, but I don't see why it can't be implemented
> > > > more like the regular TLB model, and have TLB insertions depend on
> > > > the linux pte, and do invalidates _after_ restricting permissions
> > > > to the pte.
> > > > 
> > > > Ie. I'd still like to get rid of invalidate_range_begin, and get
> > > > rid of invalidate calls from places where permissions are relaxed.
> > > 
> > > _begin exists because by the time _end is called, the VM already
> > > dropped the reference on the page. This way we can do a single
> > > invalidate no matter how large the range is. I don't see ways to
> > > remove _begin while still invoking _end a single time for the whole
> > > range.
> > 
> > Is this just a GRU problem? Can't we just require them to take a ref
> > on the page (IIRC Jack said GRU could be changed to more like a TLB
> > model).
> 
> Maintaining a long-term reference on a page is a problem. The GRU does not
> currently maintain tables to track the pages for which dropins have been done.
> 
> The GRU has a large internal TLB and is designed to reference up to 8PB of
> memory. The size of the tables to track this many referenced pages would be
> a problem (at best).

Is it any worse a problem than the pagetables of the processes which have
their virtual memory exported to GRU? AFAIKS, no; it is on the same
magnitude of difficulty. So you could do it without introducing any
fundamental problem (memory usage might be increased by some constant
factor, but I think we can cope with that in order to make the core patch
really nice and simple).

It is going to be really easy to add more weird and wonderful notifiers
later that deviate from our standard TLB model. It would be much harder to
remove them. So I really want to see everyone conform to this model first.
Numbers and comparisons can be brought out afterwards if people want to
attempt to make such changes.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v8
  2008-03-03 16:59                           ` Nick Piggin
@ 2008-03-03 18:06                             ` Jack Steiner
  2008-03-03 18:09                               ` Avi Kivity
  2008-03-03 18:45                               ` Nick Piggin
  2008-03-03 19:02                             ` Christoph Lameter
  1 sibling, 2 replies; 120+ messages in thread
From: Jack Steiner @ 2008-03-03 18:06 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrea Arcangeli, akpm, Robin Holt, Avi Kivity, Izik Eidus,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Mon, Mar 03, 2008 at 05:59:10PM +0100, Nick Piggin wrote:
> On Mon, Mar 03, 2008 at 09:18:59AM -0600, Jack Steiner wrote:
> > On Mon, Mar 03, 2008 at 02:10:17PM +0100, Nick Piggin wrote:
> > > On Mon, Mar 03, 2008 at 01:51:53PM +0100, Andrea Arcangeli wrote:
> > > > On Mon, Mar 03, 2008 at 04:29:34AM +0100, Nick Piggin wrote:
> > > > > to something I prefer. Others may not, but I'll post them for debate
> > > > > anyway.
> > > > 
> > > > Sure, thanks!
> > > > 
> > > > > > I didn't drop invalidate_page, because invalidate_range_begin/end
> > > > > > would be slower for usages like KVM/GRU (we don't need a begin/end
> > > > > > there because where invalidate_page is called, the VM holds a
> > > > > > reference on the page). do_wp_page should also use invalidate_page
> > > > > > since it can free the page after dropping the PT lock without losing
> > > > > > any performance (that's not true for the places where invalidate_range
> > > > > > is called).
> > > > > 
> > > > > I'm still not completely happy with this. I had a very quick look
> > > > > at the GRU driver, but I don't see why it can't be implemented
> > > > > more like the regular TLB model, and have TLB insertions depend on
> > > > > the linux pte, and do invalidates _after_ restricting permissions
> > > > > to the pte.
> > > > > 
> > > > > Ie. I'd still like to get rid of invalidate_range_begin, and get
> > > > > rid of invalidate calls from places where permissions are relaxed.
> > > > 
> > > > _begin exists because by the time _end is called, the VM already
> > > > dropped the reference on the page. This way we can do a single
> > > > invalidate no matter how large the range is. I don't see ways to
> > > > remove _begin while still invoking _end a single time for the whole
> > > > range.

The range invalidates have a performance advantage for the GRU. TLB invalidates
on the GRU are relatively slow (usec) and interfere somewhat with the performance
of other active GRU instructions. Invalidating a large chunk of addresses with
a single GRU TLBINVAL operation is must faster than issuing a stream of single
page TLBINVALs.

I expect this performance advantage will also apply to other users of mmuops.

> > > 
> > > Is this just a GRU problem? Can't we just require them to take a ref
> > > on the page (IIRC Jack said GRU could be changed to more like a TLB
> > > model).
> > 
> > Maintaining a long-term reference on a page is a problem. The GRU does not
> > currently maintain tables to track the pages for which dropins have been done.
> > 
> > The GRU has a large internal TLB and is designed to reference up to 8PB of
> > memory. The size of the tables to track this many referenced pages would be
> > a problem (at best).
> 
> Is it any worse a problem than the pagetables of the processes which have
> their virtual memory exported to GRU? AFAIKS, no; it is on the same
> magnitude of difficulty. So you could do it without introducing any
> fundamental problem (memory usage might be increased by some constant
> factor, but I think we can cope with that in order to make the core patch
> really nice and simple).

Functionally, the GRU is very close to what I would consider to be the
"standard TLB" model. Dropins and flushs map closely to processor dropins
and flushes for cpus.  The internal structure of the GRU TLB is identical to
the TLB of existing cpus.  Requiring the GRU driver to track dropins with
long term page references seems to me a deviation from having the basic
mmuops support a "standard TLB" model. AFAIK, no other processor requires
this.

Tracking TLB dropins (and long term page references) could be done but it
adds significant complexity and scaling issues. The size of the tables to
track many TB (to PB) of memory can get large. If the memory is being
referenced by highly threaded applications, then the problem becomes even
more complex. Either tables must be replicated per-thread (and require even
more memory), or the table structure becomes even more complex to deal with
node locality, cacheline bouncing, etc.

Try to avoid a requirement to track dropins with long term page references.


> It is going to be really easy to add more weird and wonderful notifiers
> later that deviate from our standard TLB model. It would be much harder to
> remove them. So I really want to see everyone conform to this model first.

Agree.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v8
  2008-03-03 18:06                             ` Jack Steiner
@ 2008-03-03 18:09                               ` Avi Kivity
  2008-03-03 18:23                                 ` Jack Steiner
  2008-03-03 18:45                               ` Nick Piggin
  1 sibling, 1 reply; 120+ messages in thread
From: Avi Kivity @ 2008-03-03 18:09 UTC (permalink / raw)
  To: Jack Steiner
  Cc: Nick Piggin, Andrea Arcangeli, akpm, Robin Holt, Izik Eidus,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

Jack Steiner wrote:
> The range invalidates have a performance advantage for the GRU. TLB invalidates
> on the GRU are relatively slow (usec) and interfere somewhat with the performance
> of other active GRU instructions. Invalidating a large chunk of addresses with
> a single GRU TLBINVAL operation is must faster than issuing a stream of single
> page TLBINVALs.
>
> I expect this performance advantage will also apply to other users of mmuops.
>   

In theory this would apply to kvm as well (coalesce tlb flush IPIs, 
lookup shadow page table once), but is it really a fast path?  What 
triggers range operations for your use cases?

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v8
  2008-03-03 18:09                               ` Avi Kivity
@ 2008-03-03 18:23                                 ` Jack Steiner
  0 siblings, 0 replies; 120+ messages in thread
From: Jack Steiner @ 2008-03-03 18:23 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Nick Piggin, Andrea Arcangeli, akpm, Robin Holt, Izik Eidus,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Mon, Mar 03, 2008 at 08:09:49PM +0200, Avi Kivity wrote:
> Jack Steiner wrote:
> >The range invalidates have a performance advantage for the GRU. TLB 
> >invalidates
> >on the GRU are relatively slow (usec) and interfere somewhat with the 
> >performance
> >of other active GRU instructions. Invalidating a large chunk of addresses 
> >with
> >a single GRU TLBINVAL operation is must faster than issuing a stream of 
> >single
> >page TLBINVALs.
> >
> >I expect this performance advantage will also apply to other users of 
> >mmuops.
> >  
> 
> In theory this would apply to kvm as well (coalesce tlb flush IPIs, 
> lookup shadow page table once), but is it really a fast path?  What 
> triggers range operations for your use cases?
 

Although not frequent, an unmap of a multiple TB object could be quite painful
if each page was invalidated individually instead of 1 invalidate for the entire range.
This is even worse if the application is threaded and the object has been reference by
many GRUs (there are 16 GRU ports per node - each potentially has to be invalidated).

Forks (again, not frequent) would be another case.




^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v8
  2008-03-03 18:06                             ` Jack Steiner
  2008-03-03 18:09                               ` Avi Kivity
@ 2008-03-03 18:45                               ` Nick Piggin
  2008-03-03 19:15                                 ` Jack Steiner
  1 sibling, 1 reply; 120+ messages in thread
From: Nick Piggin @ 2008-03-03 18:45 UTC (permalink / raw)
  To: Jack Steiner
  Cc: Andrea Arcangeli, akpm, Robin Holt, Avi Kivity, Izik Eidus,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Mon, Mar 03, 2008 at 12:06:05PM -0600, Jack Steiner wrote:
> On Mon, Mar 03, 2008 at 05:59:10PM +0100, Nick Piggin wrote:
> > > Maintaining a long-term reference on a page is a problem. The GRU does not
> > > currently maintain tables to track the pages for which dropins have been done.
> > > 
> > > The GRU has a large internal TLB and is designed to reference up to 8PB of
> > > memory. The size of the tables to track this many referenced pages would be
> > > a problem (at best).
> > 
> > Is it any worse a problem than the pagetables of the processes which have
> > their virtual memory exported to GRU? AFAIKS, no; it is on the same
> > magnitude of difficulty. So you could do it without introducing any
> > fundamental problem (memory usage might be increased by some constant
> > factor, but I think we can cope with that in order to make the core patch
> > really nice and simple).
> 
> Functionally, the GRU is very close to what I would consider to be the
> "standard TLB" model. Dropins and flushs map closely to processor dropins
> and flushes for cpus.  The internal structure of the GRU TLB is identical to
> the TLB of existing cpus.  Requiring the GRU driver to track dropins with
> long term page references seems to me a deviation from having the basic
> mmuops support a "standard TLB" model. AFAIK, no other processor requires
> this.

That is because the CPU TLBs have the mmu_gather batching APIs which
avoid the problem. It would be possible to do something similar for
GRU which would involve taking a reference for each page-to-be-invalidated
in invalidate_page, and release them when you invalidate_range. Or else
do some other scheme which makes mmu notifiers work similarly to the
mmu gather API. But not just go an invent something completely different
in the form of this invalidate_begin,clear linux pte,invalidate_end API.


> Tracking TLB dropins (and long term page references) could be done but it
> adds significant complexity and scaling issues. The size of the tables to
> track many TB (to PB) of memory can get large. If the memory is being
> referenced by highly threaded applications, then the problem becomes even
> more complex. Either tables must be replicated per-thread (and require even
> more memory), or the table structure becomes even more complex to deal with
> node locality, cacheline bouncing, etc.

I don't think it would be that significant in terms of complexity or
scaling.

For a quick solution, you could stick a radix tree in each of your mmu
notifiers registered (ie. one per mm), which is indexed on virtual address
>> PAGE_SHIFT, and returns the struct page *. Size is no different than
page tables, and locking is pretty scalable.

After that, I would really like to see whether the numbers justify
larger changes.


^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v8
  2008-03-03  3:29                   ` [PATCH] mmu notifiers #v8 Nick Piggin
  2008-03-03 12:51                     ` Andrea Arcangeli
@ 2008-03-03 19:01                     ` Christoph Lameter
  2008-03-03 21:15                       ` Andrea Arcangeli
  2008-03-05  0:37                       ` Nick Piggin
  1 sibling, 2 replies; 120+ messages in thread
From: Christoph Lameter @ 2008-03-03 19:01 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrea Arcangeli, Jack Steiner, akpm, Robin Holt, Avi Kivity,
	Izik Eidus, kvm-devel, Peter Zijlstra, general, Steve Wise,
	Roland Dreier, Kanoj Sarcar, linux-kernel, linux-mm,
	daniel.blueman

On Mon, 3 Mar 2008, Nick Piggin wrote:

> I'm still not completely happy with this. I had a very quick look
> at the GRU driver, but I don't see why it can't be implemented
> more like the regular TLB model, and have TLB insertions depend on
> the linux pte, and do invalidates _after_ restricting permissions
> to the pte.
> 
> Ie. I'd still like to get rid of invalidate_range_begin, and get
> rid of invalidate calls from places where permissions are relaxed.

Isnt this more a job for paravirt ops if it is so tightly bound to page 
tables? Are we not adding another similar API?

> If we can agree on the API, then I don't see any reason why it can't
> go into 2.6.25, unless someome wants more time to review it (but
> 2.6.25 release should be quite far away still so there should be quite
> a bit of time).

API still has rcu issues and the example given for making things sleepable 
is only working for the aging callback. The most important callback is for 
try_to_unmao and page_mkclean. This means the API is still not generic 
enough and likely not extendable as needed in its present form.





^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v8
  2008-03-03 16:59                           ` Nick Piggin
  2008-03-03 18:06                             ` Jack Steiner
@ 2008-03-03 19:02                             ` Christoph Lameter
  1 sibling, 0 replies; 120+ messages in thread
From: Christoph Lameter @ 2008-03-03 19:02 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Jack Steiner, Andrea Arcangeli, akpm, Robin Holt, Avi Kivity,
	Izik Eidus, kvm-devel, Peter Zijlstra, general, Steve Wise,
	Roland Dreier, Kanoj Sarcar, linux-kernel, linux-mm,
	daniel.blueman

On Mon, 3 Mar 2008, Nick Piggin wrote:

> It is going to be really easy to add more weird and wonderful notifiers
> later that deviate from our standard TLB model. It would be much harder to
> remove them. So I really want to see everyone conform to this model first.
> Numbers and comparisons can be brought out afterwards if people want to
> attempt to make such changes.

Still do not see how that could be done. The model here is tightly bound 
to ptes. AFAICT this could be implemented in arch code like the paravirt 
ops.

 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v8
  2008-03-03  3:33                   ` Nick Piggin
@ 2008-03-03 19:03                     ` Christoph Lameter
  0 siblings, 0 replies; 120+ messages in thread
From: Christoph Lameter @ 2008-03-03 19:03 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrea Arcangeli, Jack Steiner, akpm, Robin Holt, Avi Kivity,
	Izik Eidus, kvm-devel, Peter Zijlstra, general, Steve Wise,
	Roland Dreier, Kanoj Sarcar, linux-kernel, linux-mm,
	daniel.blueman

I like the patch.



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v8
  2008-03-03  3:34                   ` Nick Piggin
@ 2008-03-03 19:04                     ` Christoph Lameter
  0 siblings, 0 replies; 120+ messages in thread
From: Christoph Lameter @ 2008-03-03 19:04 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrea Arcangeli, Jack Steiner, akpm, Robin Holt, Avi Kivity,
	Izik Eidus, kvm-devel, Peter Zijlstra, general, Steve Wise,
	Roland Dreier, Kanoj Sarcar, linux-kernel, linux-mm,
	daniel.blueman

On Mon, 3 Mar 2008, Nick Piggin wrote:

> Move definition of struct mmu_notifier and struct mmu_notifier_ops under
> CONFIG_MMU_NOTIFIER to ensure they doesn't get dereferenced when they
> don't make sense.

The callbacks take a mmu_notifier parameter. So how does this compile for 
!MMU_NOTIFIER?


^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v8
  2008-03-03 18:45                               ` Nick Piggin
@ 2008-03-03 19:15                                 ` Jack Steiner
  2008-03-04 10:35                                   ` Peter Zijlstra
  0 siblings, 1 reply; 120+ messages in thread
From: Jack Steiner @ 2008-03-03 19:15 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrea Arcangeli, akpm, Robin Holt, Avi Kivity, Izik Eidus,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Mon, Mar 03, 2008 at 07:45:17PM +0100, Nick Piggin wrote:
> On Mon, Mar 03, 2008 at 12:06:05PM -0600, Jack Steiner wrote:
> > On Mon, Mar 03, 2008 at 05:59:10PM +0100, Nick Piggin wrote:
> > > > Maintaining a long-term reference on a page is a problem. The GRU does not
> > > > currently maintain tables to track the pages for which dropins have been done.
> > > > 
> > > > The GRU has a large internal TLB and is designed to reference up to 8PB of
> > > > memory. The size of the tables to track this many referenced pages would be
> > > > a problem (at best).
> > > 
> > > Is it any worse a problem than the pagetables of the processes which have
> > > their virtual memory exported to GRU? AFAIKS, no; it is on the same
> > > magnitude of difficulty. So you could do it without introducing any
> > > fundamental problem (memory usage might be increased by some constant
> > > factor, but I think we can cope with that in order to make the core patch
> > > really nice and simple).
> > 
> > Functionally, the GRU is very close to what I would consider to be the
> > "standard TLB" model. Dropins and flushs map closely to processor dropins
> > and flushes for cpus.  The internal structure of the GRU TLB is identical to
> > the TLB of existing cpus.  Requiring the GRU driver to track dropins with
> > long term page references seems to me a deviation from having the basic
> > mmuops support a "standard TLB" model. AFAIK, no other processor requires
> > this.
> 
> That is because the CPU TLBs have the mmu_gather batching APIs which
> avoid the problem. It would be possible to do something similar for
> GRU which would involve taking a reference for each page-to-be-invalidated
> in invalidate_page, and release them when you invalidate_range. Or else
> do some other scheme which makes mmu notifiers work similarly to the
> mmu gather API. But not just go an invent something completely different
> in the form of this invalidate_begin,clear linux pte,invalidate_end API.

Correct. If the mmu_gather were passed on the mmuops callout and the callout were
done at the same point as the tlb_finish_mmu(), the GRU could
efficiently work w/o the range invalidates. A range invalidate might still
be slightly more efficient but not measureable so. The net difference is
not worth the extra complexity of range callouts.


> 
> 
> > Tracking TLB dropins (and long term page references) could be done but it
> > adds significant complexity and scaling issues. The size of the tables to
> > track many TB (to PB) of memory can get large. If the memory is being
> > referenced by highly threaded applications, then the problem becomes even
> > more complex. Either tables must be replicated per-thread (and require even
> > more memory), or the table structure becomes even more complex to deal with
> > node locality, cacheline bouncing, etc.
> 
> I don't think it would be that significant in terms of complexity or
> scaling.
> 
> For a quick solution, you could stick a radix tree in each of your mmu
> notifiers registered (ie. one per mm), which is indexed on virtual address
> >> PAGE_SHIFT, and returns the struct page *. Size is no different than
> page tables, and locking is pretty scalable.
> 
> After that, I would really like to see whether the numbers justify
> larger changes.

I'm still concerned about performance. Each dropin would first have to access
an additional data structure that would most likely be non-node-local and
non-cache-resident. The net effect would be measurable but not a killer.

I haven't thought about locking requirements for the radix tree. Most accesses
would be read-only & updates infrequent. Any chance of an RCU-based radix
implementation?  Otherwise, don't we add the potential for hot locks/cachelines
for threaded applications ???

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v8
  2008-03-03 19:01                     ` Christoph Lameter
@ 2008-03-03 21:15                       ` Andrea Arcangeli
  2008-03-05  0:37                       ` Nick Piggin
  1 sibling, 0 replies; 120+ messages in thread
From: Andrea Arcangeli @ 2008-03-03 21:15 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nick Piggin, Jack Steiner, akpm, Robin Holt, Avi Kivity,
	Izik Eidus, kvm-devel, Peter Zijlstra, general, Steve Wise,
	Roland Dreier, Kanoj Sarcar, linux-kernel, linux-mm,
	daniel.blueman

On Mon, Mar 03, 2008 at 11:01:22AM -0800, Christoph Lameter wrote:
> API still has rcu issues and the example given for making things sleepable 
> is only working for the aging callback. The most important callback is for 
> try_to_unmao and page_mkclean. This means the API is still not generic 
> enough and likely not extendable as needed in its present form.

I converted only one of those _notify as an example of how it should
be done, because I assumed you volunteer to convert the other ones
yourself during .26. It's useless to convert all of them right now,
because the i_mmap_lock and anon_vma locks are still going to be
spinlocks in .25.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* [PATCH] mmu notifiers #v9
  2008-03-02 15:54                 ` [PATCH] mmu notifiers #v8 Andrea Arcangeli
                                     ` (4 preceding siblings ...)
  2008-03-03  3:39                   ` Nick Piggin
@ 2008-03-03 21:37                   ` Andrea Arcangeli
  2008-03-03 22:05                     ` [PATCH] KVM swapping with " Andrea Arcangeli
  5 siblings, 1 reply; 120+ messages in thread
From: Andrea Arcangeli @ 2008-03-03 21:37 UTC (permalink / raw)
  To: Jack Steiner
  Cc: Nick Piggin, akpm, Robin Holt, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	linux-kernel, linux-mm, daniel.blueman, Christoph Lameter

The only difference are Nick's changes (thanks Nick, nice work!) plus
a fix to make it compile.

About the removal of _begin I'm not strongly opposed to it, but I
personally think that it's unnecessary if _begin avoids to build new
data structures with a fixed ram (and cpu) cost per_page_ and at the
same time deferring _end after the whole tlb_gather page freeing is
reducing the number of invalidates.

.26 will allow all the methods to sleep by following the roadmap
described in the #v8 patch.

KVM so far is swapping fine on top of this.

Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -228,6 +228,9 @@ struct mm_struct {
 #ifdef CONFIG_CGROUP_MEM_CONT
 	struct mem_cgroup *mem_cgroup;
 #endif
+#ifdef CONFIG_MMU_NOTIFIER
+	struct hlist_head mmu_notifier_list;
+#endif
 };
 
 #endif /* _LINUX_MM_TYPES_H */
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
new file mode 100644
--- /dev/null
+++ b/include/linux/mmu_notifier.h
@@ -0,0 +1,194 @@
+#ifndef _LINUX_MMU_NOTIFIER_H
+#define _LINUX_MMU_NOTIFIER_H
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/mm_types.h>
+
+struct mmu_notifier;
+struct mmu_notifier_ops;
+
+#ifdef CONFIG_MMU_NOTIFIER
+
+struct mmu_notifier_ops {
+	/*
+	 * Called when nobody can register any more notifier in the mm
+	 * and after the "mn" notifier has been disarmed already.
+	 */
+	void (*release)(struct mmu_notifier *mn,
+			struct mm_struct *mm);
+
+	/*
+	 * clear_flush_young is called after the VM is
+	 * test-and-clearing the young/accessed bitflag in the
+	 * pte. This way the VM will provide proper aging to the
+	 * accesses to the page through the secondary MMUs and not
+	 * only to the ones through the Linux pte.
+	 */
+	int (*clear_flush_young)(struct mmu_notifier *mn,
+				 struct mm_struct *mm,
+				 unsigned long address);
+
+	/*
+	 * Before this is invoked any secondary MMU is still ok to
+	 * read/write to the page previously pointed by the Linux pte
+	 * because the old page hasn't been freed yet.  If required
+	 * set_page_dirty has to be called internally to this method.
+	 */
+	void (*invalidate_page)(struct mmu_notifier *mn,
+				struct mm_struct *mm,
+				unsigned long address);
+
+	/*
+	 * invalidate_range_begin() and invalidate_range_end() must be
+	 * paired. Multiple invalidate_range_begin/ends may be nested
+	 * or called concurrently.
+	 */
+	void (*invalidate_range_begin)(struct mmu_notifier *mn,
+				       struct mm_struct *mm,
+				       unsigned long start, unsigned long end);
+	void (*invalidate_range_end)(struct mmu_notifier *mn,
+				     struct mm_struct *mm,
+				     unsigned long start, unsigned long end);
+};
+
+struct mmu_notifier {
+	struct hlist_node hlist;
+	const struct mmu_notifier_ops *ops;
+};
+
+static inline int mm_has_notifiers(struct mm_struct *mm)
+{
+	return unlikely(!hlist_empty(&mm->mmu_notifier_list));
+}
+
+/*
+ * Must hold the mmap_sem for write.
+ *
+ * RCU is used to traverse the list. A quiescent period needs to pass
+ * before the notifier is guaranteed to be visible to all threads.
+ */
+extern void mmu_notifier_register(struct mmu_notifier *mn,
+				  struct mm_struct *mm);
+/*
+ * Must hold the mmap_sem for write.
+ *
+ * RCU is used to traverse the list. A quiescent period needs to pass
+ * before the "struct mmu_notifier" can be freed. Alternatively it
+ * can be synchronously freed inside ->release when the list can't
+ * change anymore and nobody could possibly walk it.
+ */
+extern void mmu_notifier_unregister(struct mmu_notifier *mn,
+				    struct mm_struct *mm);
+
+extern void __mmu_notifier_release(struct mm_struct *mm);
+extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
+					  unsigned long address);
+extern void __mmu_notifier_invalidate_page(struct mm_struct *mm,
+					  unsigned long address);
+extern void __mmu_notifier_invalidate_range_begin(struct mm_struct *mm,
+				  unsigned long start, unsigned long end);
+extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
+				  unsigned long start, unsigned long end);
+
+
+static inline void mmu_notifier_release(struct mm_struct *mm)
+{
+	if (mm_has_notifiers(mm))
+		__mmu_notifier_release(mm);
+}
+
+static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
+					  unsigned long address)
+{
+	if (mm_has_notifiers(mm))
+		return __mmu_notifier_clear_flush_young(mm, address);
+	return 0;
+}
+
+static inline void mmu_notifier_invalidate_page(struct mm_struct *mm,
+					  unsigned long address)
+{
+	if (mm_has_notifiers(mm))
+		__mmu_notifier_invalidate_page(mm, address);
+}
+
+static inline void mmu_notifier_invalidate_range_begin(struct mm_struct *mm,
+				  unsigned long start, unsigned long end)
+{
+	if (mm_has_notifiers(mm))
+		__mmu_notifier_invalidate_range_begin(mm, start, end);
+}
+
+static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
+				  unsigned long start, unsigned long end)
+{
+	if (mm_has_notifiers(mm))
+		__mmu_notifier_invalidate_range_end(mm, start, end);
+}
+
+static inline void mmu_notifier_mm_init(struct mm_struct *mm)
+{
+	INIT_HLIST_HEAD(&mm->mmu_notifier_list);
+}
+
+
+
+#define ptep_clear_flush_notify(__vma, __address, __ptep)		\
+({									\
+	pte_t __pte;							\
+	struct vm_area_struct *___vma = __vma;				\
+	unsigned long ___address = __address;				\
+	__pte = ptep_clear_flush(___vma, ___address, __ptep);		\
+	mmu_notifier_invalidate_page(___vma->vm_mm, ___address);	\
+	__pte;								\
+})
+
+#define ptep_clear_flush_young_notify(__vma, __address, __ptep)		\
+({									\
+	int __young;							\
+	struct vm_area_struct *___vma = __vma;				\
+	unsigned long ___address = __address;				\
+	__young = ptep_clear_flush_young(___vma, ___address, __ptep);	\
+	__young |= mmu_notifier_clear_flush_young(___vma->vm_mm,	\
+						  ___address);		\
+	__young;							\
+})
+
+#else /* CONFIG_MMU_NOTIFIER */
+
+static inline void mmu_notifier_release(struct mm_struct *mm)
+{
+}
+
+static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
+					  unsigned long address)
+{
+	return 0;
+}
+
+static inline void mmu_notifier_invalidate_page(struct mm_struct *mm,
+					  unsigned long address)
+{
+}
+
+static inline void mmu_notifier_invalidate_range_begin(struct mm_struct *mm,
+				  unsigned long start, unsigned long end)
+{
+}
+
+static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
+				  unsigned long start, unsigned long end)
+{
+}
+
+static inline void mmu_notifier_mm_init(struct mm_struct *mm)
+{
+}
+
+#define ptep_clear_flush_young_notify ptep_clear_flush_young
+#define ptep_clear_flush_notify ptep_clear_flush
+
+#endif /* CONFIG_MMU_NOTIFIER */
+
+#endif /* _LINUX_MMU_NOTIFIER_H */
diff --git a/kernel/fork.c b/kernel/fork.c
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -53,6 +53,7 @@
 #include <linux/tty.h>
 #include <linux/proc_fs.h>
 #include <linux/blkdev.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -362,6 +363,7 @@ static struct mm_struct * mm_init(struct
 
 	if (likely(!mm_alloc_pgd(mm))) {
 		mm->def_flags = 0;
+		mmu_notifier_mm_init(mm);
 		return mm;
 	}
 
diff --git a/mm/Kconfig b/mm/Kconfig
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -193,3 +193,7 @@ config VIRT_TO_BUS
 config VIRT_TO_BUS
 	def_bool y
 	depends on !ARCH_NO_VIRT_TO_BUS
+
+config MMU_NOTIFIER
+	def_bool y
+	bool "MMU notifier, for paging KVM/RDMA"
diff --git a/mm/Makefile b/mm/Makefile
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -33,4 +33,4 @@ obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_CGROUP_MEM_CONT) += memcontrol.o
-
+obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -194,7 +194,7 @@ __xip_unmap (struct address_space * mapp
 		if (pte) {
 			/* Nuke the page table entry. */
 			flush_cache_page(vma, address, pte_pfn(*pte));
-			pteval = ptep_clear_flush(vma, address, pte);
+			pteval = ptep_clear_flush_notify(vma, address, pte);
 			page_remove_rmap(page, vma);
 			dec_mm_counter(mm, file_rss);
 			BUG_ON(pte_dirty(pteval));
diff --git a/mm/fremap.c b/mm/fremap.c
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -15,6 +15,7 @@
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/mmu_context.h>
 #include <asm/cacheflush.h>
@@ -214,7 +215,9 @@ asmlinkage long sys_remap_file_pages(uns
 		spin_unlock(&mapping->i_mmap_lock);
 	}
 
+	mmu_notifier_invalidate_range_begin(mm, start, start + size);
 	err = populate_range(mm, vma, start, size, pgoff);
+	mmu_notifier_invalidate_range_end(mm, start, start + size);
 	if (!err && !(flags & MAP_NONBLOCK)) {
 		if (unlikely(has_write_lock)) {
 			downgrade_write(&mm->mmap_sem);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -14,6 +14,7 @@
 #include <linux/mempolicy.h>
 #include <linux/cpuset.h>
 #include <linux/mutex.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -755,6 +756,7 @@ void __unmap_hugepage_range(struct vm_ar
 	BUG_ON(start & ~HPAGE_MASK);
 	BUG_ON(end & ~HPAGE_MASK);
 
+	mmu_notifier_invalidate_range_begin(mm, start, end);
 	spin_lock(&mm->page_table_lock);
 	for (address = start; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
@@ -775,6 +777,7 @@ void __unmap_hugepage_range(struct vm_ar
 	}
 	spin_unlock(&mm->page_table_lock);
 	flush_tlb_range(vma, start, end);
+	mmu_notifier_invalidate_range_end(mm, start, end);
 	list_for_each_entry_safe(page, tmp, &page_list, lru) {
 		list_del(&page->lru);
 		put_page(page);
diff --git a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -51,6 +51,7 @@
 #include <linux/init.h>
 #include <linux/writeback.h>
 #include <linux/memcontrol.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -611,6 +612,9 @@ int copy_page_range(struct mm_struct *ds
 	if (is_vm_hugetlb_page(vma))
 		return copy_hugetlb_page_range(dst_mm, src_mm, vma);
 
+	if (is_cow_mapping(vma->vm_flags))
+		mmu_notifier_invalidate_range_begin(src_mm, addr, end);
+
 	dst_pgd = pgd_offset(dst_mm, addr);
 	src_pgd = pgd_offset(src_mm, addr);
 	do {
@@ -621,6 +625,11 @@ int copy_page_range(struct mm_struct *ds
 						vma, addr, next))
 			return -ENOMEM;
 	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
+
+	if (is_cow_mapping(vma->vm_flags))
+		mmu_notifier_invalidate_range_end(src_mm,
+						vma->vm_start, end);
+
 	return 0;
 }
 
@@ -897,7 +906,9 @@ unsigned long zap_page_range(struct vm_a
 	lru_add_drain();
 	tlb = tlb_gather_mmu(mm, 0);
 	update_hiwater_rss(mm);
+	mmu_notifier_invalidate_range_begin(mm, address, end);
 	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
+	mmu_notifier_invalidate_range_end(mm, address, end);
 	if (tlb)
 		tlb_finish_mmu(tlb, address, end);
 	return end;
@@ -1463,10 +1474,11 @@ int apply_to_page_range(struct mm_struct
 {
 	pgd_t *pgd;
 	unsigned long next;
-	unsigned long end = addr + size;
+	unsigned long start = addr, end = addr + size;
 	int err;
 
 	BUG_ON(addr >= end);
+	mmu_notifier_invalidate_range_begin(mm, start, end);
 	pgd = pgd_offset(mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
@@ -1474,6 +1486,7 @@ int apply_to_page_range(struct mm_struct
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
+	mmu_notifier_invalidate_range_end(mm, start, end);
 	return err;
 }
 EXPORT_SYMBOL_GPL(apply_to_page_range);
@@ -1675,7 +1688,7 @@ gotten:
 		 * seen in the presence of one thread doing SMC and another
 		 * thread doing COW.
 		 */
-		ptep_clear_flush(vma, address, page_table);
+		ptep_clear_flush_notify(vma, address, page_table);
 		set_pte_at(mm, address, page_table, entry);
 		update_mmu_cache(vma, address, entry);
 		lru_cache_add_active(new_page);
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -26,6 +26,7 @@
 #include <linux/mount.h>
 #include <linux/mempolicy.h>
 #include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -1747,11 +1748,13 @@ static void unmap_region(struct mm_struc
 	lru_add_drain();
 	tlb = tlb_gather_mmu(mm, 0);
 	update_hiwater_rss(mm);
+	mmu_notifier_invalidate_range_begin(mm, start, end);
 	unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
 	free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
 				 next? next->vm_start: 0);
 	tlb_finish_mmu(tlb, start, end);
+	mmu_notifier_invalidate_range_end(mm, start, end);
 }
 
 /*
@@ -2037,6 +2040,7 @@ void exit_mmap(struct mm_struct *mm)
 	unsigned long end;
 
 	/* mm's last user has gone, and its about to be pulled down */
+	mmu_notifier_release(mm);
 	arch_exit_mmap(mm);
 
 	lru_add_drain();
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
new file mode 100644
--- /dev/null
+++ b/mm/mmu_notifier.c
@@ -0,0 +1,114 @@
+/*
+ *  linux/mm/mmu_notifier.c
+ *
+ *  Copyright (C) 2008  Qumranet, Inc.
+ *  Copyright (C) 2008  SGI
+ *             Christoph Lameter <clameter@sgi.com>
+ *
+ *  This work is licensed under the terms of the GNU GPL, version 2. See
+ *  the COPYING file in the top-level directory.
+ */
+
+#include <linux/mmu_notifier.h>
+#include <linux/module.h>
+#include <linux/rcupdate.h>
+
+/*
+ * No synchronization. This function can only be called when only a single
+ * process remains that performs teardown.
+ */
+void __mmu_notifier_release(struct mm_struct *mm)
+{
+	struct mmu_notifier *mn;
+
+	while (unlikely(!hlist_empty(&mm->mmu_notifier_list))) {
+		mn = hlist_entry(mm->mmu_notifier_list.first,
+				 struct mmu_notifier,
+				 hlist);
+		hlist_del(&mn->hlist);
+		if (mn->ops->release)
+			mn->ops->release(mn, mm);
+	}
+}
+
+/*
+ * If no young bitflag is supported by the hardware, ->clear_flush_young can
+ * unmap the address and return 1 or 0 depending if the mapping previously
+ * existed or not.
+ */
+int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
+					unsigned long address)
+{
+	struct mmu_notifier *mn;
+	struct hlist_node *n;
+	int young = 0;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_list, hlist) {
+		if (mn->ops->clear_flush_young)
+			young |= mn->ops->clear_flush_young(mn, mm, address);
+	}
+	rcu_read_unlock();
+
+	return young;
+}
+
+void __mmu_notifier_invalidate_page(struct mm_struct *mm,
+					  unsigned long address)
+{
+	struct mmu_notifier *mn;
+	struct hlist_node *n;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_list, hlist) {
+		if (mn->ops->invalidate_page)
+			mn->ops->invalidate_page(mn, mm, address);
+	}
+	rcu_read_unlock();
+}
+
+void __mmu_notifier_invalidate_range_begin(struct mm_struct *mm,
+				  unsigned long start, unsigned long end)
+{
+	struct mmu_notifier *mn;
+	struct hlist_node *n;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_list, hlist) {
+		if (mn->ops->invalidate_range_begin)
+			mn->ops->invalidate_range_begin(mn, mm, start, end);
+	}
+	rcu_read_unlock();
+}
+
+void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
+				  unsigned long start, unsigned long end)
+{
+	struct mmu_notifier *mn;
+	struct hlist_node *n;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_list, hlist) {
+		if (mn->ops->invalidate_range_end)
+			mn->ops->invalidate_range_end(mn, mm, start, end);
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * Note that all notifiers use RCU. The updates are only guaranteed to
+ * be visible to other processes after a RCU quiescent period!
+ *
+ * Must hold mmap_sem writably when calling registration functions.
+ */
+void mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+	hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_list);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_register);
+
+void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+	hlist_del_rcu(&mn->hlist);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
diff --git a/mm/mprotect.c b/mm/mprotect.c
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -21,6 +21,7 @@
 #include <linux/syscalls.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
+#include <linux/mmu_notifier.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
@@ -198,10 +199,12 @@ success:
 		dirty_accountable = 1;
 	}
 
+	mmu_notifier_invalidate_range_begin(mm, start, end);
 	if (is_vm_hugetlb_page(vma))
 		hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
 	else
 		change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
+	mmu_notifier_invalidate_range_end(mm, start, end);
 	vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
 	vm_stat_account(mm, newflags, vma->vm_file, nrpages);
 	return 0;
diff --git a/mm/mremap.c b/mm/mremap.c
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -18,6 +18,7 @@
 #include <linux/highmem.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -74,6 +75,7 @@ static void move_ptes(struct vm_area_str
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *old_pte, *new_pte, pte;
 	spinlock_t *old_ptl, *new_ptl;
+	unsigned long old_start;
 
 	if (vma->vm_file) {
 		/*
@@ -100,6 +102,9 @@ static void move_ptes(struct vm_area_str
 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
 	arch_enter_lazy_mmu_mode();
 
+	old_start = old_addr;
+	mmu_notifier_invalidate_range_begin(vma->vm_mm,
+		     old_start, old_end);
 	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
 				   new_pte++, new_addr += PAGE_SIZE) {
 		if (pte_none(*old_pte))
@@ -108,6 +113,7 @@ static void move_ptes(struct vm_area_str
 		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
 		set_pte_at(mm, new_addr, new_pte, pte);
 	}
+	mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
 
 	arch_leave_lazy_mmu_mode();
 	if (new_ptl != old_ptl)
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -49,6 +49,7 @@
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/memcontrol.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/tlbflush.h>
 
@@ -287,7 +288,7 @@ static int page_referenced_one(struct pa
 	if (vma->vm_flags & VM_LOCKED) {
 		referenced++;
 		*mapcount = 1;	/* break early from loop */
-	} else if (ptep_clear_flush_young(vma, address, pte))
+	} else if (ptep_clear_flush_young_notify(vma, address, pte))
 		referenced++;
 
 	/* Pretend the page is referenced if the task has the
@@ -454,7 +455,7 @@ static int page_mkclean_one(struct page 
 		pte_t entry;
 
 		flush_cache_page(vma, address, pte_pfn(*pte));
-		entry = ptep_clear_flush(vma, address, pte);
+		entry = ptep_clear_flush_notify(vma, address, pte);
 		entry = pte_wrprotect(entry);
 		entry = pte_mkclean(entry);
 		set_pte_at(mm, address, pte, entry);
@@ -712,14 +713,14 @@ static int try_to_unmap_one(struct page 
 	 * skipped over this mm) then we should reactivate it.
 	 */
 	if (!migration && ((vma->vm_flags & VM_LOCKED) ||
-			(ptep_clear_flush_young(vma, address, pte)))) {
+			(ptep_clear_flush_young_notify(vma, address, pte)))) {
 		ret = SWAP_FAIL;
 		goto out_unmap;
 	}
 
 	/* Nuke the page table entry. */
 	flush_cache_page(vma, address, page_to_pfn(page));
-	pteval = ptep_clear_flush(vma, address, pte);
+	pteval = ptep_clear_flush_notify(vma, address, pte);
 
 	/* Move the dirty bit to the physical page now the pte is gone. */
 	if (pte_dirty(pteval))
@@ -844,12 +845,12 @@ static void try_to_unmap_cluster(unsigne
 		page = vm_normal_page(vma, address, *pte);
 		BUG_ON(!page || PageAnon(page));
 
-		if (ptep_clear_flush_young(vma, address, pte))
+		if (ptep_clear_flush_young_notify(vma, address, pte))
 			continue;
 
 		/* Nuke the page table entry. */
 		flush_cache_page(vma, address, pte_pfn(*pte));
-		pteval = ptep_clear_flush(vma, address, pte);
+		pteval = ptep_clear_flush_notify(vma, address, pte);
 
 		/* If nonlinear, store the file page offset in the pte. */
 		if (page->index != linear_page_index(vma, address))


^ permalink raw reply	[flat|nested] 120+ messages in thread

* [PATCH] KVM swapping with mmu notifiers #v9
  2008-03-03 21:37                   ` [PATCH] mmu notifiers #v9 Andrea Arcangeli
@ 2008-03-03 22:05                     ` Andrea Arcangeli
  2008-03-04  0:44                       ` izik eidus
  0 siblings, 1 reply; 120+ messages in thread
From: Andrea Arcangeli @ 2008-03-03 22:05 UTC (permalink / raw)
  To: Jack Steiner
  Cc: Nick Piggin, akpm, Robin Holt, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	linux-kernel, linux-mm, daniel.blueman, Christoph Lameter

Notably the registration now requires the mmap_sem in write mode.

Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 41962e7..e1287ab 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -21,6 +21,7 @@ config KVM
 	tristate "Kernel-based Virtual Machine (KVM) support"
 	depends on HAVE_KVM && EXPERIMENTAL
 	select PREEMPT_NOTIFIERS
+	select MMU_NOTIFIER
 	select ANON_INODES
 	---help---
 	  Support hosting fully virtualized guest machines using hardware
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4583329..4067b0f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -642,6 +642,110 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
 	account_shadowed(kvm, gfn);
 }
 
+static void kvm_unmap_spte(struct kvm *kvm, u64 *spte)
+{
+	struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
+	get_page(page);
+	rmap_remove(kvm, spte);
+	set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+	kvm_flush_remote_tlbs(kvm);
+	__free_page(page);
+}
+
+static void kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+	u64 *spte, *curr_spte;
+
+	spte = rmap_next(kvm, rmapp, NULL);
+	while (spte) {
+		BUG_ON(!(*spte & PT_PRESENT_MASK));
+		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
+		curr_spte = spte;
+		spte = rmap_next(kvm, rmapp, spte);
+		kvm_unmap_spte(kvm, curr_spte);
+	}
+}
+
+void kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+	int i;
+
+	/*
+	 * If mmap_sem isn't taken, we can look the memslots with only
+	 * the mmu_lock by skipping over the slots with userspace_addr == 0.
+	 */
+	spin_lock(&kvm->mmu_lock);
+	for (i = 0; i < kvm->nmemslots; i++) {
+		struct kvm_memory_slot *memslot = &kvm->memslots[i];
+		unsigned long start = memslot->userspace_addr;
+		unsigned long end;
+
+		/* mmu_lock protects userspace_addr */
+		if (!start)
+			continue;
+
+		end = start + (memslot->npages << PAGE_SHIFT);
+		if (hva >= start && hva < end) {
+			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+			kvm_unmap_rmapp(kvm, &memslot->rmap[gfn_offset]);
+		}
+	}
+	spin_unlock(&kvm->mmu_lock);
+}
+
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+	u64 *spte;
+	int young = 0;
+
+	spte = rmap_next(kvm, rmapp, NULL);
+	while (spte) {
+		int _young;
+		u64 _spte = *spte;
+		BUG_ON(!(_spte & PT_PRESENT_MASK));
+		_young = _spte & PT_ACCESSED_MASK;
+		if (_young) {
+			young = !!_young;
+			set_shadow_pte(spte, _spte & ~PT_ACCESSED_MASK);
+		}
+		spte = rmap_next(kvm, rmapp, spte);
+	}
+	return young;
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+{
+	int i;
+	int young = 0;
+
+	/*
+	 * If mmap_sem isn't taken, we can look the memslots with only
+	 * the mmu_lock by skipping over the slots with userspace_addr == 0.
+	 */
+	spin_lock(&kvm->mmu_lock);
+	for (i = 0; i < kvm->nmemslots; i++) {
+		struct kvm_memory_slot *memslot = &kvm->memslots[i];
+		unsigned long start = memslot->userspace_addr;
+		unsigned long end;
+
+		/* mmu_lock protects userspace_addr */
+		if (!start)
+			continue;
+
+		end = start + (memslot->npages << PAGE_SHIFT);
+		if (hva >= start && hva < end) {
+			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+			young |= kvm_age_rmapp(kvm, &memslot->rmap[gfn_offset]);
+		}
+	}
+	spin_unlock(&kvm->mmu_lock);
+
+	if (young)
+		kvm_flush_remote_tlbs(kvm);
+
+	return young;
+}
+
 #ifdef MMU_DEBUG
 static int is_empty_shadow_page(u64 *spt)
 {
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 17f9d16..b014b19 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -380,6 +380,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	int r;
 	struct page *page;
 	int largepage = 0;
+	unsigned mmu_seq;
 
 	pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
 	kvm_mmu_audit(vcpu, "pre page fault");
@@ -415,6 +416,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 			largepage = 1;
 		}
 	}
+	mmu_seq = read_seqbegin(&vcpu->kvm->arch.mmu_notifier_invalidate_lock);
 	page = gfn_to_page(vcpu->kvm, walker.gfn);
 	up_read(&current->mm->mmap_sem);
 
@@ -440,6 +442,15 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	++vcpu->stat.pf_fixed;
 	kvm_mmu_audit(vcpu, "post page fault (fixed)");
 	spin_unlock(&vcpu->kvm->mmu_lock);
+
+	if (read_seqretry(&vcpu->kvm->arch.mmu_notifier_invalidate_lock, mmu_seq)) {
+		down_read(&current->mm->mmap_sem);
+		if (page != gfn_to_page(vcpu->kvm, walker.gfn))
+			BUG();
+		up_read(&current->mm->mmap_sem);
+		kvm_release_page_clean(page);
+	}
+
 	up_read(&vcpu->kvm->slots_lock);
 
 	return write_pt;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6f09840..1dfb1c9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -25,6 +25,7 @@
 #include <linux/module.h>
 #include <linux/mman.h>
 #include <linux/highmem.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/uaccess.h>
 #include <asm/msr.h>
@@ -3319,6 +3320,48 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 	free_page((unsigned long)vcpu->arch.pio_data);
 }
 
+static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
+{
+	struct kvm_arch *kvm_arch;
+	kvm_arch = container_of(mn, struct kvm_arch, mmu_notifier);
+	return container_of(kvm_arch, struct kvm, arch);
+}
+
+void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
+				      struct mm_struct *mm,
+				      unsigned long address)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	BUG_ON(mm != kvm->mm);
+	write_seqlock(&kvm->arch.mmu_notifier_invalidate_lock);
+	kvm_unmap_hva(kvm, address);
+	write_sequnlock(&kvm->arch.mmu_notifier_invalidate_lock);
+}
+
+void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
+					   struct mm_struct *mm,
+					   unsigned long start,
+					   unsigned long end)
+{
+	for (; start < end; start += PAGE_SIZE)
+		kvm_mmu_notifier_invalidate_page(mn, mm, start);
+}
+
+int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
+				       struct mm_struct *mm,
+				       unsigned long address)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	BUG_ON(mm != kvm->mm);
+	return kvm_age_hva(kvm, address);
+}
+
+static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
+	.invalidate_page	= kvm_mmu_notifier_invalidate_page,
+	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
+	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
+};
+
 struct  kvm *kvm_arch_create_vm(void)
 {
 	struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
@@ -3328,6 +3371,12 @@ struct  kvm *kvm_arch_create_vm(void)
 
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
 
+	kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops;
+	down_write(&current->mm->mmap_sem);
+	mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm);
+	up_write(&current->mm->mmap_sem);
+	seqlock_init(&kvm->arch.mmu_notifier_invalidate_lock);
+
 	return kvm;
 }
 
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 024b57c..305b7c3 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -13,6 +13,7 @@
 
 #include <linux/types.h>
 #include <linux/mm.h>
+#include <linux/mmu_notifier.h>
 
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>
@@ -303,6 +304,9 @@ struct kvm_arch{
 	struct page *apic_access_page;
 
 	gpa_t wall_clock;
+
+	struct mmu_notifier mmu_notifier;
+	seqlock_t mmu_notifier_invalidate_lock;
 };
 
 struct kvm_vm_stat {
@@ -422,6 +426,8 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu);
 int kvm_mmu_setup(struct kvm_vcpu *vcpu);
 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
 
+void kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+int kvm_age_hva(struct kvm *kvm, unsigned long hva);
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
 void kvm_mmu_zap_all(struct kvm *kvm);


As usual memslot browsing with mmu_lock.

Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6f09840..a519fd8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3379,16 +3379,23 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 	 */
 	if (!user_alloc) {
 		if (npages && !old.rmap) {
+			unsigned long userspace_addr;
+
 			down_write(&current->mm->mmap_sem);
-			memslot->userspace_addr = do_mmap(NULL, 0,
-						     npages * PAGE_SIZE,
-						     PROT_READ | PROT_WRITE,
-						     MAP_SHARED | MAP_ANONYMOUS,
-						     0);
+			userspace_addr = do_mmap(NULL, 0,
+						 npages * PAGE_SIZE,
+						 PROT_READ | PROT_WRITE,
+						 MAP_SHARED | MAP_ANONYMOUS,
+						 0);
 			up_write(&current->mm->mmap_sem);
 
-			if (IS_ERR((void *)memslot->userspace_addr))
-				return PTR_ERR((void *)memslot->userspace_addr);
+			if (IS_ERR((void *)userspace_addr))
+				return PTR_ERR((void *)userspace_addr);
+
+			/* set userspace_addr atomically for kvm_hva_to_rmapp */
+			spin_lock(&kvm->mmu_lock);
+			memslot->userspace_addr = userspace_addr;
+			spin_unlock(&kvm->mmu_lock);
 		} else {
 			if (!old.user_alloc && old.rmap) {
 				int ret;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 30bf832..8f3b6d6 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -326,7 +326,15 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		memset(new.rmap, 0, npages * sizeof(*new.rmap));
 
 		new.user_alloc = user_alloc;
-		new.userspace_addr = mem->userspace_addr;
+		/*
+		 * hva_to_rmmap() serialzies with the mmu_lock and to be
+		 * safe it has to ignore memslots with !user_alloc &&
+		 * !userspace_addr.
+		 */
+		if (user_alloc)
+			new.userspace_addr = mem->userspace_addr;
+		else
+			new.userspace_addr = 0;
 	}
 	if (npages && !new.lpage_info) {
 		int largepages = npages / KVM_PAGES_PER_HPAGE;
@@ -355,14 +363,18 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		memset(new.dirty_bitmap, 0, dirty_bytes);
 	}
 
+	spin_lock(&kvm->mmu_lock);
 	if (mem->slot >= kvm->nmemslots)
 		kvm->nmemslots = mem->slot + 1;
 
 	*memslot = new;
+	spin_unlock(&kvm->mmu_lock);
 
 	r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc);
 	if (r) {
+		spin_lock(&kvm->mmu_lock);
 		*memslot = old;
+		spin_unlock(&kvm->mmu_lock);
 		goto out_free;
 	}
 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] KVM swapping with mmu notifiers #v9
  2008-03-03 22:05                     ` [PATCH] KVM swapping with " Andrea Arcangeli
@ 2008-03-04  0:44                       ` izik eidus
  2008-03-04  7:31                         ` [RFC] Notifier for Externally Mapped Memory (EMM) Christoph Lameter
  2008-03-04 13:21                         ` [PATCH] KVM swapping with mmu notifiers #v9 Andrea Arcangeli
  0 siblings, 2 replies; 120+ messages in thread
From: izik eidus @ 2008-03-04  0:44 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jack Steiner, Nick Piggin, akpm, Robin Holt, Avi Kivity,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

ציטוט Andrea Arcangeli:
> Notably the registration now requires the mmap_sem in write mode.
>
> Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
>
> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> index 41962e7..e1287ab 100644
> --- a/arch/x86/kvm/Kconfig
> +++ b/arch/x86/kvm/Kconfig
> @@ -21,6 +21,7 @@ config KVM
>  	tristate "Kernel-based Virtual Machine (KVM) support"
>  	depends on HAVE_KVM && EXPERIMENTAL
>  	select PREEMPT_NOTIFIERS
> +	select MMU_NOTIFIER
>  	select ANON_INODES
>  	---help---
>  	  Support hosting fully virtualized guest machines using hardware
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 4583329..4067b0f 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -642,6 +642,110 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
>  	account_shadowed(kvm, gfn);
>  }
>  
> +static void kvm_unmap_spte(struct kvm *kvm, u64 *spte)
> +{
> +	struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
> +	get_page(page);
> +	rmap_remove(kvm, spte);
> +	set_shadow_pte(spte, shadow_trap_nonpresent_pte);
> +	kvm_flush_remote_tlbs(kvm);
> +	__free_page(page);
>   

i wrote to you about this before (i didnt get answer for this so i write 
again)
with large pages support i think we need to use here put_page

> +}
> +
> +static void kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
> +{
> +	u64 *spte, *curr_spte;
> +
> +	spte = rmap_next(kvm, rmapp, NULL);
> +	while (spte) {
> +		BUG_ON(!(*spte & PT_PRESENT_MASK));
> +		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
> +		curr_spte = spte;
> +		spte = rmap_next(kvm, rmapp, spte);
> +		kvm_unmap_spte(kvm, curr_spte);
> +	}
> +}
> +
> +void kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
> +{
> +	int i;
> +
> +	/*
> +	 * If mmap_sem isn't taken, we can look the memslots with only
> +	 * the mmu_lock by skipping over the slots with userspace_addr == 0.
> +	 */
> +	spin_lock(&kvm->mmu_lock);
> +	for (i = 0; i < kvm->nmemslots; i++) {
> +		struct kvm_memory_slot *memslot = &kvm->memslots[i];
> +		unsigned long start = memslot->userspace_addr;
> +		unsigned long end;
> +
> +		/* mmu_lock protects userspace_addr */
> +		if (!start)
> +			continue;
> +
> +		end = start + (memslot->npages << PAGE_SHIFT);
> +		if (hva >= start && hva < end) {
> +			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
> +			kvm_unmap_rmapp(kvm, &memslot->rmap[gfn_offset]);
> +		}
> +	}
> +	spin_unlock(&kvm->mmu_lock);
> +}
> +
> +static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
> +{
> +	u64 *spte;
> +	int young = 0;
> +
> +	spte = rmap_next(kvm, rmapp, NULL);
> +	while (spte) {
> +		int _young;
> +		u64 _spte = *spte;
> +		BUG_ON(!(_spte & PT_PRESENT_MASK));
> +		_young = _spte & PT_ACCESSED_MASK;
> +		if (_young) {
> +			young = !!_young;
> +			set_shadow_pte(spte, _spte & ~PT_ACCESSED_MASK);
> +		}
> +		spte = rmap_next(kvm, rmapp, spte);
> +	}
> +	return young;
> +}
> +
> +int kvm_age_hva(struct kvm *kvm, unsigned long hva)
> +{
> +	int i;
> +	int young = 0;
> +
> +	/*
> +	 * If mmap_sem isn't taken, we can look the memslots with only
> +	 * the mmu_lock by skipping over the slots with userspace_addr == 0.
> +	 */
> +	spin_lock(&kvm->mmu_lock);
> +	for (i = 0; i < kvm->nmemslots; i++) {
> +		struct kvm_memory_slot *memslot = &kvm->memslots[i];
> +		unsigned long start = memslot->userspace_addr;
> +		unsigned long end;
> +
> +		/* mmu_lock protects userspace_addr */
> +		if (!start)
> +			continue;
> +
> +		end = start + (memslot->npages << PAGE_SHIFT);
> +		if (hva >= start && hva < end) {
> +			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
> +			young |= kvm_age_rmapp(kvm, &memslot->rmap[gfn_offset]);
> +		}
> +	}
> +	spin_unlock(&kvm->mmu_lock);
> +
> +	if (young)
> +		kvm_flush_remote_tlbs(kvm);
> +
> +	return young;
> +}
> +
>  #ifdef MMU_DEBUG
>  static int is_empty_shadow_page(u64 *spt)
>  {
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index 17f9d16..b014b19 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -380,6 +380,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
>  	int r;
>  	struct page *page;
>  	int largepage = 0;
> +	unsigned mmu_seq;
>  
>  	pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
>  	kvm_mmu_audit(vcpu, "pre page fault");
> @@ -415,6 +416,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
>  			largepage = 1;
>  		}
>  	}
> +	mmu_seq = read_seqbegin(&vcpu->kvm->arch.mmu_notifier_invalidate_lock);
>  	page = gfn_to_page(vcpu->kvm, walker.gfn);
>  	up_read(&current->mm->mmap_sem);
>  
> @@ -440,6 +442,15 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
>  	++vcpu->stat.pf_fixed;
>  	kvm_mmu_audit(vcpu, "post page fault (fixed)");
>  	spin_unlock(&vcpu->kvm->mmu_lock);
> +
> +	if (read_seqretry(&vcpu->kvm->arch.mmu_notifier_invalidate_lock, mmu_seq)) {
> +		down_read(&current->mm->mmap_sem);
> +		if (page != gfn_to_page(vcpu->kvm, walker.gfn))
> +			BUG();
> +		up_read(&current->mm->mmap_sem);
> +		kvm_release_page_clean(page);
> +	}
> +
>  	up_read(&vcpu->kvm->slots_lock);
>  
>  	return write_pt;
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 6f09840..1dfb1c9 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -25,6 +25,7 @@
>  #include <linux/module.h>
>  #include <linux/mman.h>
>  #include <linux/highmem.h>
> +#include <linux/mmu_notifier.h>
>  
>  #include <asm/uaccess.h>
>  #include <asm/msr.h>
> @@ -3319,6 +3320,48 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
>  	free_page((unsigned long)vcpu->arch.pio_data);
>  }
>  
> +static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
> +{
> +	struct kvm_arch *kvm_arch;
> +	kvm_arch = container_of(mn, struct kvm_arch, mmu_notifier);
> +	return container_of(kvm_arch, struct kvm, arch);
> +}
> +
> +void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
> +				      struct mm_struct *mm,
> +				      unsigned long address)
> +{
> +	struct kvm *kvm = mmu_notifier_to_kvm(mn);
> +	BUG_ON(mm != kvm->mm);
> +	write_seqlock(&kvm->arch.mmu_notifier_invalidate_lock);
> +	kvm_unmap_hva(kvm, address);
> +	write_sequnlock(&kvm->arch.mmu_notifier_invalidate_lock);
> +}
> +
> +void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
> +					   struct mm_struct *mm,
> +					   unsigned long start,
> +					   unsigned long end)
> +{
> +	for (; start < end; start += PAGE_SIZE)
> +		kvm_mmu_notifier_invalidate_page(mn, mm, start);
> +}
> +
> +int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
> +				       struct mm_struct *mm,
> +				       unsigned long address)
> +{
> +	struct kvm *kvm = mmu_notifier_to_kvm(mn);
> +	BUG_ON(mm != kvm->mm);
> +	return kvm_age_hva(kvm, address);
> +}
> +
> +static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
> +	.invalidate_page	= kvm_mmu_notifier_invalidate_page,
> +	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
> +	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
> +};
> +
>  struct  kvm *kvm_arch_create_vm(void)
>  {
>  	struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
> @@ -3328,6 +3371,12 @@ struct  kvm *kvm_arch_create_vm(void)
>  
>  	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
>  
> +	kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops;
> +	down_write(&current->mm->mmap_sem);
> +	mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm);
> +	up_write(&current->mm->mmap_sem);
> +	seqlock_init(&kvm->arch.mmu_notifier_invalidate_lock);
> +
>  	return kvm;
>  }
>  
> diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
> index 024b57c..305b7c3 100644
> --- a/include/asm-x86/kvm_host.h
> +++ b/include/asm-x86/kvm_host.h
> @@ -13,6 +13,7 @@
>  
>  #include <linux/types.h>
>  #include <linux/mm.h>
> +#include <linux/mmu_notifier.h>
>  
>  #include <linux/kvm.h>
>  #include <linux/kvm_para.h>
> @@ -303,6 +304,9 @@ struct kvm_arch{
>  	struct page *apic_access_page;
>  
>  	gpa_t wall_clock;
> +
> +	struct mmu_notifier mmu_notifier;
> +	seqlock_t mmu_notifier_invalidate_lock;
>  };
>  
>  struct kvm_vm_stat {
> @@ -422,6 +426,8 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu);
>  int kvm_mmu_setup(struct kvm_vcpu *vcpu);
>  void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
>  
> +void kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
> +int kvm_age_hva(struct kvm *kvm, unsigned long hva);
>  int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
>  void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
>  void kvm_mmu_zap_all(struct kvm *kvm);
>
>
> As usual memslot browsing with mmu_lock.
>
> Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 6f09840..a519fd8 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3379,16 +3379,23 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
>  	 */
>  	if (!user_alloc) {
>  		if (npages && !old.rmap) {
> +			unsigned long userspace_addr;
> +
>  			down_write(&current->mm->mmap_sem);
> -			memslot->userspace_addr = do_mmap(NULL, 0,
> -						     npages * PAGE_SIZE,
> -						     PROT_READ | PROT_WRITE,
> -						     MAP_SHARED | MAP_ANONYMOUS,
> -						     0);
> +			userspace_addr = do_mmap(NULL, 0,
> +						 npages * PAGE_SIZE,
> +						 PROT_READ | PROT_WRITE,
> +						 MAP_SHARED | MAP_ANONYMOUS,
> +						 0);
>  			up_write(&current->mm->mmap_sem);
>  
> -			if (IS_ERR((void *)memslot->userspace_addr))
> -				return PTR_ERR((void *)memslot->userspace_addr);
> +			if (IS_ERR((void *)userspace_addr))
> +				return PTR_ERR((void *)userspace_addr);
> +
> +			/* set userspace_addr atomically for kvm_hva_to_rmapp */
> +			spin_lock(&kvm->mmu_lock);
> +			memslot->userspace_addr = userspace_addr;
> +			spin_unlock(&kvm->mmu_lock);
>  		} else {
>  			if (!old.user_alloc && old.rmap) {
>  				int ret;
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 30bf832..8f3b6d6 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -326,7 +326,15 @@ int __kvm_set_memory_region(struct kvm *kvm,
>  		memset(new.rmap, 0, npages * sizeof(*new.rmap));
>  
>  		new.user_alloc = user_alloc;
> -		new.userspace_addr = mem->userspace_addr;
> +		/*
> +		 * hva_to_rmmap() serialzies with the mmu_lock and to be
> +		 * safe it has to ignore memslots with !user_alloc &&
> +		 * !userspace_addr.
> +		 */
> +		if (user_alloc)
> +			new.userspace_addr = mem->userspace_addr;
> +		else
> +			new.userspace_addr = 0;
>  	}
>  	if (npages && !new.lpage_info) {
>  		int largepages = npages / KVM_PAGES_PER_HPAGE;
> @@ -355,14 +363,18 @@ int __kvm_set_memory_region(struct kvm *kvm,
>  		memset(new.dirty_bitmap, 0, dirty_bytes);
>  	}
>  
> +	spin_lock(&kvm->mmu_lock);
>  	if (mem->slot >= kvm->nmemslots)
>  		kvm->nmemslots = mem->slot + 1;
>  
>  	*memslot = new;
> +	spin_unlock(&kvm->mmu_lock);
>  
>  	r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc);
>  	if (r) {
> +		spin_lock(&kvm->mmu_lock);
>  		*memslot = old;
> +		spin_unlock(&kvm->mmu_lock);
>  		goto out_free;
>  	}
>  
>   


^ permalink raw reply	[flat|nested] 120+ messages in thread

* [RFC] Notifier for Externally Mapped Memory (EMM)
  2008-03-04  0:44                       ` izik eidus
@ 2008-03-04  7:31                         ` Christoph Lameter
  2008-03-04  7:34                           ` [Early draft] Conversion of i_mmap_lock to semaphore Christoph Lameter
  2008-03-04 13:30                           ` [RFC] Notifier for Externally Mapped Memory (EMM) Andrea Arcangeli
  2008-03-04 13:21                         ` [PATCH] KVM swapping with mmu notifiers #v9 Andrea Arcangeli
  1 sibling, 2 replies; 120+ messages in thread
From: Christoph Lameter @ 2008-03-04  7:31 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jack Steiner, Nick Piggin, akpm, Robin Holt, Avi Kivity,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman

Stripped things down and did what Andrea and I talked about last Friday.
No invalidate_page callbacks. No ops anymore. Simple linked list for 
notifier. No RCU. Added the code to rmap.h and rmap.c (after all it is 
concerned with handling mappings).



This patch implements a simple callback for device drivers that establish
their own references to pages (KVM, GRU, XPmem, RDMA/Infiniband, DMA engines
etc). These references are unknown to the VM (therefore external).

With these callbacks it is possible for the device driver to release external
references when the VM requests it. This enables swapping, page migration and
allows support of remapping, permission changes etc etc for externally
mapped memory.

With this functionality it becomes possible to avoid pinning or mlocking
pages (commonly done to stop the VM from unmapping pages).

A device driver must subscribe to a process using

	emm_register_notifier

The VM will then perform callbacks for operations that unmap or change
permissions of pages in that address space. When the process terminates
the callback function is called with emm_release.

Callbacks are performed before and after the unmapping action of the VM.

	emm_invalidate_start	before
	emm_invalidate_end	after

Callbacks are mostly performed in a non atomic context. However, in
various places spinlocks are held to traverse rmaps. So this patch here
is only useful for those devices that can remove mappings in an atomic
context (f.e. KVM/GRU).

If the rmap traversal spinlocks are converted to semaphores then all 
callbacks willbe performed in a nonatomic context. Callouts can stay 
where they are.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

---
 include/linux/mm_types.h |    3 +
 include/linux/rmap.h     |   51 +++++++++++++++++++++++++++++++++
 kernel/fork.c            |    3 +
 mm/Kconfig               |    5 +++
 mm/filemap_xip.c         |    5 +++
 mm/fremap.c              |    2 +
 mm/hugetlb.c             |    4 ++
 mm/memory.c              |   32 ++++++++++++++++++--
 mm/mmap.c                |    3 +
 mm/mprotect.c            |    3 +
 mm/mremap.c              |    5 +++
 mm/rmap.c                |   72 ++++++++++++++++++++++++++++++++++++++++++++++-
 12 files changed, 183 insertions(+), 5 deletions(-)

Index: linux-2.6/include/linux/mm_types.h
===================================================================
--- linux-2.6.orig/include/linux/mm_types.h	2008-03-03 22:54:11.961264684 -0800
+++ linux-2.6/include/linux/mm_types.h	2008-03-03 22:55:13.333569600 -0800
@@ -225,6 +225,9 @@ struct mm_struct {
 	/* aio bits */
 	rwlock_t		ioctx_list_lock;
 	struct kioctx		*ioctx_list;
+#ifdef CONFIG_EMM_NOTIFIER
+	struct emm_notifier	*emm_notifier;
+#endif
 #ifdef CONFIG_CGROUP_MEM_CONT
 	struct mem_cgroup *mem_cgroup;
 #endif
Index: linux-2.6/mm/Kconfig
===================================================================
--- linux-2.6.orig/mm/Kconfig	2008-03-03 22:54:11.993264520 -0800
+++ linux-2.6/mm/Kconfig	2008-03-03 22:55:13.337569625 -0800
@@ -193,3 +193,8 @@ config NR_QUICK
 config VIRT_TO_BUS
 	def_bool y
 	depends on !ARCH_NO_VIRT_TO_BUS
+
+config EMM_NOTIFIER
+	def_bool n
+	bool "External Mapped Memory Notifier for drivers directly mapping memory"
+
Index: linux-2.6/mm/mmap.c
===================================================================
--- linux-2.6.orig/mm/mmap.c	2008-03-03 22:54:12.053265354 -0800
+++ linux-2.6/mm/mmap.c	2008-03-03 22:59:25.522848812 -0800
@@ -1747,11 +1747,13 @@ static void unmap_region(struct mm_struc
 	lru_add_drain();
 	tlb = tlb_gather_mmu(mm, 0);
 	update_hiwater_rss(mm);
+	emm_notify(mm, emm_invalidate_start, start, end);
 	unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
 	free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
 				 next? next->vm_start: 0);
 	tlb_finish_mmu(tlb, start, end);
+	emm_notify(mm, emm_invalidate_end, start, end);
 }
 
 /*
@@ -2038,6 +2040,7 @@ void exit_mmap(struct mm_struct *mm)
 
 	/* mm's last user has gone, and its about to be pulled down */
 	arch_exit_mmap(mm);
+	emm_notify(mm, emm_release, 0, TASK_SIZE);
 
 	lru_add_drain();
 	flush_cache_mm(mm);
Index: linux-2.6/mm/mprotect.c
===================================================================
--- linux-2.6.orig/mm/mprotect.c	2008-03-03 22:54:12.069264942 -0800
+++ linux-2.6/mm/mprotect.c	2008-03-03 22:55:13.337569625 -0800
@@ -21,6 +21,7 @@
 #include <linux/syscalls.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
+#include <linux/rmap.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
@@ -198,10 +199,12 @@ success:
 		dirty_accountable = 1;
 	}
 
+	emm_notify(mm, emm_invalidate_start, start, end);
 	if (is_vm_hugetlb_page(vma))
 		hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
 	else
 		change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
+	emm_notify(mm, emm_invalidate_end, start, end);
 	vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
 	vm_stat_account(mm, newflags, vma->vm_file, nrpages);
 	return 0;
Index: linux-2.6/mm/mremap.c
===================================================================
--- linux-2.6.orig/mm/mremap.c	2008-03-03 22:54:12.077265005 -0800
+++ linux-2.6/mm/mremap.c	2008-03-03 22:59:25.530848880 -0800
@@ -18,6 +18,7 @@
 #include <linux/highmem.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
+#include <linux/rmap.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -74,7 +75,9 @@ static void move_ptes(struct vm_area_str
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *old_pte, *new_pte, pte;
 	spinlock_t *old_ptl, *new_ptl;
+	unsigned long old_start = old_addr;
 
+	emm_notify(mm, emm_invalidate_start, old_start, old_end);
 	if (vma->vm_file) {
 		/*
 		 * Subtle point from Rajesh Venkatasubramanian: before
@@ -98,6 +101,7 @@ static void move_ptes(struct vm_area_str
 	new_ptl = pte_lockptr(mm, new_pmd);
 	if (new_ptl != old_ptl)
 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+
 	arch_enter_lazy_mmu_mode();
 
 	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
@@ -116,6 +120,7 @@ static void move_ptes(struct vm_area_str
 	pte_unmap_unlock(old_pte - 1, old_ptl);
 	if (mapping)
 		spin_unlock(&mapping->i_mmap_lock);
+	emm_notify(mm, emm_invalidate_end, old_start, old_end);
 }
 
 #define LATENCY_LIMIT	(64 * PAGE_SIZE)
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c	2008-03-03 22:54:12.089265604 -0800
+++ linux-2.6/mm/rmap.c	2008-03-03 22:59:25.542848702 -0800
@@ -298,6 +298,10 @@ static int page_referenced_one(struct pa
 
 	(*mapcount)--;
 	pte_unmap_unlock(pte, ptl);
+	if (!referenced)
+		/* rmap lock held */
+		referenced = emm_notify(mm, emm_referenced,
+					address, address + PAGE_SIZE);
 out:
 	return referenced;
 }
@@ -446,6 +450,8 @@ static int page_mkclean_one(struct page 
 	if (address == -EFAULT)
 		goto out;
 
+	/* rmap lock held */
+	emm_notify(mm, emm_invalidate_start, address, address + PAGE_SIZE);
 	pte = page_check_address(page, mm, address, &ptl);
 	if (!pte)
 		goto out;
@@ -462,6 +468,7 @@ static int page_mkclean_one(struct page 
 	}
 
 	pte_unmap_unlock(pte, ptl);
+	emm_notify(mm, emm_invalidate_end, address, address + PAGE_SIZE);
 out:
 	return ret;
 }
@@ -702,9 +709,11 @@ static int try_to_unmap_one(struct page 
 	if (address == -EFAULT)
 		goto out;
 
+	/* rmap lock held */
+	emm_notify(mm, emm_invalidate_start, address, address + PAGE_SIZE);
 	pte = page_check_address(page, mm, address, &ptl);
 	if (!pte)
-		goto out;
+		goto out_notify;
 
 	/*
 	 * If the page is mlock()d, we cannot swap it out.
@@ -774,6 +783,8 @@ static int try_to_unmap_one(struct page 
 
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
+out_notify:
+	emm_notify(mm, emm_invalidate_end, address, address + PAGE_SIZE);
 out:
 	return ret;
 }
@@ -812,6 +823,7 @@ static void try_to_unmap_cluster(unsigne
 	spinlock_t *ptl;
 	struct page *page;
 	unsigned long address;
+	unsigned long start;
 	unsigned long end;
 
 	address = (vma->vm_start + cursor) & CLUSTER_MASK;
@@ -833,6 +845,8 @@ static void try_to_unmap_cluster(unsigne
 	if (!pmd_present(*pmd))
 		return;
 
+	start = address;
+	emm_notify(mm, emm_invalidate_start, start, end);
 	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 
 	/* Update high watermark before we lower rss */
@@ -865,6 +879,7 @@ static void try_to_unmap_cluster(unsigne
 		(*mapcount)--;
 	}
 	pte_unmap_unlock(pte - 1, ptl);
+	emm_notify(mm, emm_invalidate_end, start, end);
 }
 
 static int try_to_unmap_anon(struct page *page, int migration)
@@ -1011,3 +1026,58 @@ int try_to_unmap(struct page *page, int 
 	return ret;
 }
 
+/*
+ * Notifier for devices establishing their own references to Linux
+ * kernel pages in addition to the regular mapping via page
+ * table and rmap. The notifier allows the device to drop the mapping
+ * when the VM removes references to pages.
+ *
+ *  Copyright (C) 2008  SGI
+ *             Christoph Lameter <clameter@sgi.com>
+ */
+
+#ifdef CONFIG_EMM_NOTIFIER
+/*
+ * No synchronization. This function can only be called when only a single
+ * process remains that performs teardown.
+ */
+void emm_notifier_release(struct mm_struct *mm)
+{
+	struct emm_notifier *e;
+
+	while (mm->emm_notifier) {
+		e = mm->emm_notifier;
+		mm->emm_notifier = e->next;
+		e->func(e, mm, emm_release, 0, 0);
+	}
+}
+EXPORT_SYMBOL_GPL(emm_notifier_release);
+
+/* Register a notifier */
+void emm_notifier_register(struct emm_notifier *e, struct mm_struct *mm)
+{
+	e->next = mm->emm_notifier;
+	mm->emm_notifier = e;
+}
+EXPORT_SYMBOL_GPL(emm_notifier_register);
+
+/* Perform a callback */
+int __emm_notify(struct mm_struct *mm, enum emm_operations op,
+		unsigned long start, unsigned long end)
+{
+	struct emm_notifier *e = mm->emm_notifier;
+	int x;
+
+	while (e) {
+		if (e->func) {
+			x = e->func(e, mm, op, start, end);
+			if (x)
+				return x;
+		}
+		e = e->next;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(__emm_notify);
+#endif
+
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c	2008-03-03 22:54:12.041265025 -0800
+++ linux-2.6/mm/memory.c	2008-03-03 22:59:25.502849006 -0800
@@ -611,6 +611,9 @@ int copy_page_range(struct mm_struct *ds
 	if (is_vm_hugetlb_page(vma))
 		return copy_hugetlb_page_range(dst_mm, src_mm, vma);
 
+	if (is_cow_mapping(vma->vm_flags))
+		emm_notify(src_mm, emm_invalidate_start, addr, end);
+
 	dst_pgd = pgd_offset(dst_mm, addr);
 	src_pgd = pgd_offset(src_mm, addr);
 	do {
@@ -621,6 +624,10 @@ int copy_page_range(struct mm_struct *ds
 						vma, addr, next))
 			return -ENOMEM;
 	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
+
+	if (is_cow_mapping(vma->vm_flags))
+		emm_notify(src_mm, emm_invalidate_end, addr, end);
+
 	return 0;
 }
 
@@ -897,7 +904,11 @@ unsigned long zap_page_range(struct vm_a
 	lru_add_drain();
 	tlb = tlb_gather_mmu(mm, 0);
 	update_hiwater_rss(mm);
+
+	/* i_mmap_lock may be held */
+	emm_notify(mm, emm_invalidate_start, address, end);
 	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
+	emm_notify(mm, emm_invalidate_end, address, end);
 	if (tlb)
 		tlb_finish_mmu(tlb, address, end);
 	return end;
@@ -1340,6 +1351,7 @@ int remap_pfn_range(struct vm_area_struc
 	pgd_t *pgd;
 	unsigned long next;
 	unsigned long end = addr + PAGE_ALIGN(size);
+	unsigned long start = addr;
 	struct mm_struct *mm = vma->vm_mm;
 	int err;
 
@@ -1372,6 +1384,7 @@ int remap_pfn_range(struct vm_area_struc
 	BUG_ON(addr >= end);
 	pfn -= addr >> PAGE_SHIFT;
 	pgd = pgd_offset(mm, addr);
+	emm_notify(mm, emm_invalidate_start, start, end);
 	flush_cache_range(vma, addr, end);
 	do {
 		next = pgd_addr_end(addr, end);
@@ -1380,6 +1393,7 @@ int remap_pfn_range(struct vm_area_struc
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
+	emm_notify(mm, emm_invalidate_end, start, end);
 	return err;
 }
 EXPORT_SYMBOL(remap_pfn_range);
@@ -1463,10 +1477,12 @@ int apply_to_page_range(struct mm_struct
 {
 	pgd_t *pgd;
 	unsigned long next;
+	unsigned long start = addr;
 	unsigned long end = addr + size;
 	int err;
 
 	BUG_ON(addr >= end);
+	emm_notify(mm, emm_invalidate_start, start, end);
 	pgd = pgd_offset(mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
@@ -1474,6 +1490,7 @@ int apply_to_page_range(struct mm_struct
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
+	emm_notify(mm, emm_invalidate_end, start, end);
 	return err;
 }
 EXPORT_SYMBOL_GPL(apply_to_page_range);
@@ -1614,8 +1631,10 @@ static int do_wp_page(struct mm_struct *
 			page_table = pte_offset_map_lock(mm, pmd, address,
 							 &ptl);
 			page_cache_release(old_page);
-			if (!pte_same(*page_table, orig_pte))
-				goto unlock;
+			if (!pte_same(*page_table, orig_pte)) {
+				pte_unmap_unlock(page_table, ptl);
+				goto check_dirty;
+			}
 
 			page_mkwrite = 1;
 		}
@@ -1631,7 +1650,8 @@ static int do_wp_page(struct mm_struct *
 		if (ptep_set_access_flags(vma, address, page_table, entry,1))
 			update_mmu_cache(vma, address, entry);
 		ret |= VM_FAULT_WRITE;
-		goto unlock;
+		pte_unmap_unlock(page_table, ptl);
+		goto check_dirty;
 	}
 
 	/*
@@ -1653,6 +1673,7 @@ gotten:
 	if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
 		goto oom_free_new;
 
+	emm_notify(mm, emm_invalidate_start, address, address + PAGE_SIZE);
 	/*
 	 * Re-check the pte - we dropped the lock
 	 */
@@ -1691,8 +1712,11 @@ gotten:
 		page_cache_release(new_page);
 	if (old_page)
 		page_cache_release(old_page);
-unlock:
+
 	pte_unmap_unlock(page_table, ptl);
+	emm_notify(mm, emm_invalidate_end, address, address + PAGE_SIZE);
+
+check_dirty:
 	if (dirty_page) {
 		if (vma->vm_file)
 			file_update_time(vma->vm_file);
Index: linux-2.6/include/linux/rmap.h
===================================================================
--- linux-2.6.orig/include/linux/rmap.h	2008-02-14 15:20:13.185930864 -0800
+++ linux-2.6/include/linux/rmap.h	2008-03-03 22:55:13.341569687 -0800
@@ -133,4 +133,55 @@ static inline int page_mkclean(struct pa
 #define SWAP_AGAIN	1
 #define SWAP_FAIL	2
 
+/*
+ * Notifier for devices establishing their own references to Linux
+ * kernel pages in addition to the regular mapping via page
+ * table and rmap. The notifier allows the device to drop the mapping
+ * when the VM removes references to pages.
+ */
+enum emm_operations {
+	emm_release,		/* Process existing, */
+	emm_invalidate_start,	/* Before the VM unmaps pages */
+	emm_invalidate_end,	/* After the VM unmapped pages */
+	emm_referenced		/* Check if a range was referenced */
+};
+
+struct emm_notifier {
+	int (*func)(struct emm_notifier *e, struct mm_struct *mm,
+		enum emm_operations op,
+		unsigned long start, unsigned long end);
+	struct emm_notifier *next;
+};
+
+extern int __emm_notify(struct mm_struct *mm, enum emm_operations op,
+		unsigned long start, unsigned long end);
+
+static inline int mm_has_emm_notifier(struct mm_struct *mm)
+{
+#ifdef CONFIG_EMM_NOTIFIER
+	return unlikely(mm->emm_notifier);
+#else
+	return 0;
+#endif
+}
+
+static inline int emm_notify(struct mm_struct *mm, enum emm_operations op,
+	unsigned long start, unsigned long end)
+{
+#ifdef CONFIG_EMM_NOTIFIER
+	if (mm_has_emm_notifier(mm))
+		return __emm_notify(mm, op, start, end);
+#endif
+	return 0;
+}
+
+/*
+ * Register a notifier with an mm struct. Release occurs when the process
+ * terminates by calling the notifier function with emm_release.
+ *
+ * Must hold the mmap_sem for write.
+ */
+extern void emm_notifier_register(struct emm_notifier *e,
+					struct mm_struct *mm);
+
 #endif	/* _LINUX_RMAP_H */
Index: linux-2.6/kernel/fork.c
===================================================================
--- linux-2.6.orig/kernel/fork.c	2008-03-03 22:54:11.985264714 -0800
+++ linux-2.6/kernel/fork.c	2008-03-03 22:59:27.230858013 -0800
@@ -362,6 +362,9 @@ static struct mm_struct * mm_init(struct
 
 	if (likely(!mm_alloc_pgd(mm))) {
 		mm->def_flags = 0;
+#ifdef CONFIG_EMM_NOTIFIER
+		mm->emm_notifier = NULL;
+#endif
 		return mm;
 	}
 
Index: linux-2.6/mm/filemap_xip.c
===================================================================
--- linux-2.6.orig/mm/filemap_xip.c	2008-03-03 22:54:12.013264644 -0800
+++ linux-2.6/mm/filemap_xip.c	2008-03-03 22:59:25.474848348 -0800
@@ -190,6 +190,9 @@ __xip_unmap (struct address_space * mapp
 		address = vma->vm_start +
 			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 		BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+		/* i_mmap_lock held */
+		emm_notify(mm, emm_invalidate_start,
+					address, address + PAGE_SIZE);
 		pte = page_check_address(page, mm, address, &ptl);
 		if (pte) {
 			/* Nuke the page table entry. */
@@ -201,6 +204,8 @@ __xip_unmap (struct address_space * mapp
 			pte_unmap_unlock(pte, ptl);
 			page_cache_release(page);
 		}
+		emm_notify(mm, emm_invalidate_end,
+					address, address + PAGE_SIZE);
 	}
 	spin_unlock(&mapping->i_mmap_lock);
 }
Index: linux-2.6/mm/fremap.c
===================================================================
--- linux-2.6.orig/mm/fremap.c	2008-03-03 22:54:12.021264688 -0800
+++ linux-2.6/mm/fremap.c	2008-03-03 22:59:25.482848555 -0800
@@ -214,7 +214,9 @@ asmlinkage long sys_remap_file_pages(uns
 		spin_unlock(&mapping->i_mmap_lock);
 	}
 
+	emm_notify(mm, emm_invalidate_start, start, end);
 	err = populate_range(mm, vma, start, size, pgoff);
+	emm_notify(mm, emm_invalidate_end, start, end);
 	if (!err && !(flags & MAP_NONBLOCK)) {
 		if (unlikely(has_write_lock)) {
 			downgrade_write(&mm->mmap_sem);
Index: linux-2.6/mm/hugetlb.c
===================================================================
--- linux-2.6.orig/mm/hugetlb.c	2008-03-03 22:54:12.033264769 -0800
+++ linux-2.6/mm/hugetlb.c	2008-03-03 22:59:27.230858013 -0800
@@ -14,6 +14,7 @@
 #include <linux/mempolicy.h>
 #include <linux/cpuset.h>
 #include <linux/mutex.h>
+#include <linux/rmap.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -755,6 +756,8 @@ void __unmap_hugepage_range(struct vm_ar
 	BUG_ON(start & ~HPAGE_MASK);
 	BUG_ON(end & ~HPAGE_MASK);
 
+	/* i_mmap_lock held */
+	emm_notify(mm, emm_invalidate_start, start, end);
 	spin_lock(&mm->page_table_lock);
 	for (address = start; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
@@ -775,6 +778,7 @@ void __unmap_hugepage_range(struct vm_ar
 	}
 	spin_unlock(&mm->page_table_lock);
 	flush_tlb_range(vma, start, end);
+	emm_notify(mm, emm_invalidate_end, start, end);
 	list_for_each_entry_safe(page, tmp, &page_list, lru) {
 		list_del(&page->lru);
 		put_page(page);


^ permalink raw reply	[flat|nested] 120+ messages in thread

* [Early draft] Conversion of i_mmap_lock to semaphore
  2008-03-04  7:31                         ` [RFC] Notifier for Externally Mapped Memory (EMM) Christoph Lameter
@ 2008-03-04  7:34                           ` Christoph Lameter
  2008-03-04 13:30                           ` [RFC] Notifier for Externally Mapped Memory (EMM) Andrea Arcangeli
  1 sibling, 0 replies; 120+ messages in thread
From: Christoph Lameter @ 2008-03-04  7:34 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jack Steiner, Nick Piggin, akpm, Robin Holt, Avi Kivity,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm

Not there but the system boots and is usable. Complains about atomic 
contexts because the tlb functions use a get_cpu() and thus disable preempt.

Not sure yet what to do about the cond_resched_lock stuff etc.


Convert i_mmap_lock to i_mmap_sem

The conversion to a rwsemaphore allows callbacks during rmap traversal
for files in a non atomic context. A rw style lock allows concurrent
walking of the reverse map.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

---
 arch/x86/mm/hugetlbpage.c |    4 ++--
 fs/hugetlbfs/inode.c      |    4 ++--
 fs/inode.c                |    2 +-
 include/linux/fs.h        |    2 +-
 include/linux/mm.h        |    2 +-
 kernel/fork.c             |    4 ++--
 mm/filemap.c              |    8 ++++----
 mm/filemap_xip.c          |    4 ++--
 mm/fremap.c               |    4 ++--
 mm/hugetlb.c              |   11 +++++------
 mm/memory.c               |   28 ++++++++--------------------
 mm/migrate.c              |    4 ++--
 mm/mmap.c                 |   16 ++++++++--------
 mm/mremap.c               |    4 ++--
 mm/rmap.c                 |   20 +++++++++-----------
 15 files changed, 51 insertions(+), 66 deletions(-)

Index: linux-2.6/arch/x86/mm/hugetlbpage.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/hugetlbpage.c	2008-03-03 22:59:25.386848427 -0800
+++ linux-2.6/arch/x86/mm/hugetlbpage.c	2008-03-03 22:59:31.174878038 -0800
@@ -69,7 +69,7 @@ static void huge_pmd_share(struct mm_str
 	if (!vma_shareable(vma, addr))
 		return;
 
-	spin_lock(&mapping->i_mmap_lock);
+	down_read(&mapping->i_mmap_sem);
 	vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
 		if (svma == vma)
 			continue;
@@ -94,7 +94,7 @@ static void huge_pmd_share(struct mm_str
 		put_page(virt_to_page(spte));
 	spin_unlock(&mm->page_table_lock);
 out:
-	spin_unlock(&mapping->i_mmap_lock);
+	up_read(&mapping->i_mmap_sem);
 }
 
 /*
Index: linux-2.6/fs/hugetlbfs/inode.c
===================================================================
--- linux-2.6.orig/fs/hugetlbfs/inode.c	2008-03-03 22:59:25.410848010 -0800
+++ linux-2.6/fs/hugetlbfs/inode.c	2008-03-03 22:59:31.174878038 -0800
@@ -454,10 +454,10 @@ static int hugetlb_vmtruncate(struct ino
 	pgoff = offset >> PAGE_SHIFT;
 
 	i_size_write(inode, offset);
-	spin_lock(&mapping->i_mmap_lock);
+	down_read(&mapping->i_mmap_sem);
 	if (!prio_tree_empty(&mapping->i_mmap))
 		hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
-	spin_unlock(&mapping->i_mmap_lock);
+	up_read(&mapping->i_mmap_sem);
 	truncate_hugepages(inode, offset);
 	return 0;
 }
Index: linux-2.6/fs/inode.c
===================================================================
--- linux-2.6.orig/fs/inode.c	2008-03-03 22:59:25.418848099 -0800
+++ linux-2.6/fs/inode.c	2008-03-03 22:59:31.202878206 -0800
@@ -210,7 +210,7 @@ void inode_init_once(struct inode *inode
 	INIT_LIST_HEAD(&inode->i_devices);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
 	rwlock_init(&inode->i_data.tree_lock);
-	spin_lock_init(&inode->i_data.i_mmap_lock);
+	init_rwsem(&inode->i_data.i_mmap_sem);
 	INIT_LIST_HEAD(&inode->i_data.private_list);
 	spin_lock_init(&inode->i_data.private_lock);
 	INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h	2008-03-03 22:59:25.430848089 -0800
+++ linux-2.6/include/linux/fs.h	2008-03-03 22:59:31.202878206 -0800
@@ -503,7 +503,7 @@ struct address_space {
 	unsigned int		i_mmap_writable;/* count VM_SHARED mappings */
 	struct prio_tree_root	i_mmap;		/* tree of private and shared mappings */
 	struct list_head	i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
-	spinlock_t		i_mmap_lock;	/* protect tree, count, list */
+	struct rw_semaphore	i_mmap_sem;	/* protect tree, count, list */
 	unsigned int		truncate_count;	/* Cover race condition with truncate */
 	unsigned long		nrpages;	/* number of total pages */
 	pgoff_t			writeback_index;/* writeback starts here */
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h	2008-03-03 22:59:25.442848167 -0800
+++ linux-2.6/include/linux/mm.h	2008-03-03 22:59:31.202878206 -0800
@@ -709,7 +709,7 @@ struct zap_details {
 	struct address_space *check_mapping;	/* Check page->mapping if set */
 	pgoff_t	first_index;			/* Lowest page->index to unmap */
 	pgoff_t last_index;			/* Highest page->index to unmap */
-	spinlock_t *i_mmap_lock;		/* For unmap_mapping_range: */
+	struct rw_semaphore *i_mmap_sem;	/* For unmap_mapping_range: */
 	unsigned long truncate_count;		/* Compare vm_truncate_count */
 };
 
Index: linux-2.6/kernel/fork.c
===================================================================
--- linux-2.6.orig/kernel/fork.c	2008-03-03 22:59:27.230858013 -0800
+++ linux-2.6/kernel/fork.c	2008-03-03 22:59:31.202878206 -0800
@@ -273,12 +273,12 @@ static int dup_mmap(struct mm_struct *mm
 				atomic_dec(&inode->i_writecount);
 
 			/* insert tmp into the share list, just after mpnt */
-			spin_lock(&file->f_mapping->i_mmap_lock);
+			down_write(&file->f_mapping->i_mmap_sem);
 			tmp->vm_truncate_count = mpnt->vm_truncate_count;
 			flush_dcache_mmap_lock(file->f_mapping);
 			vma_prio_tree_add(tmp, mpnt);
 			flush_dcache_mmap_unlock(file->f_mapping);
-			spin_unlock(&file->f_mapping->i_mmap_lock);
+			up_write(&file->f_mapping->i_mmap_sem);
 		}
 
 		/*
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c	2008-03-03 22:59:25.462848256 -0800
+++ linux-2.6/mm/filemap.c	2008-03-03 22:59:31.206878010 -0800
@@ -62,16 +62,16 @@ generic_file_direct_IO(int rw, struct ki
 /*
  * Lock ordering:
  *
- *  ->i_mmap_lock		(vmtruncate)
+ *  ->i_mmap_sem		(vmtruncate)
  *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
  *      ->swap_lock		(exclusive_swap_page, others)
  *        ->mapping->tree_lock
  *
  *  ->i_mutex
- *    ->i_mmap_lock		(truncate->unmap_mapping_range)
+ *    ->i_mmap_sem		(truncate->unmap_mapping_range)
  *
  *  ->mmap_sem
- *    ->i_mmap_lock
+ *    ->i_mmap_sem
  *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
  *        ->mapping->tree_lock	(arch-dependent flush_dcache_mmap_lock)
  *
@@ -88,7 +88,7 @@ generic_file_direct_IO(int rw, struct ki
  *    ->sb_lock			(fs/fs-writeback.c)
  *    ->mapping->tree_lock	(__sync_single_inode)
  *
- *  ->i_mmap_lock
+ *  ->i_mmap_sem
  *    ->anon_vma.lock		(vma_adjust)
  *
  *  ->anon_vma.lock
Index: linux-2.6/mm/filemap_xip.c
===================================================================
--- linux-2.6.orig/mm/filemap_xip.c	2008-03-03 22:59:25.474848348 -0800
+++ linux-2.6/mm/filemap_xip.c	2008-03-03 22:59:31.206878010 -0800
@@ -184,7 +184,7 @@ __xip_unmap (struct address_space * mapp
 	if (!page)
 		return;
 
-	spin_lock(&mapping->i_mmap_lock);
+	down_read(&mapping->i_mmap_sem);
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 		mm = vma->vm_mm;
 		address = vma->vm_start +
@@ -207,7 +207,7 @@ __xip_unmap (struct address_space * mapp
 		emm_notify(mm, emm_invalidate_end,
 					address, address + PAGE_SIZE);
 	}
-	spin_unlock(&mapping->i_mmap_lock);
+	up_read(&mapping->i_mmap_sem);
 }
 
 /*
Index: linux-2.6/mm/fremap.c
===================================================================
--- linux-2.6.orig/mm/fremap.c	2008-03-03 22:59:25.482848555 -0800
+++ linux-2.6/mm/fremap.c	2008-03-03 22:59:31.206878010 -0800
@@ -205,13 +205,13 @@ asmlinkage long sys_remap_file_pages(uns
 			}
 			goto out;
 		}
-		spin_lock(&mapping->i_mmap_lock);
+		down_write(&mapping->i_mmap_sem);
 		flush_dcache_mmap_lock(mapping);
 		vma->vm_flags |= VM_NONLINEAR;
 		vma_prio_tree_remove(vma, &mapping->i_mmap);
 		vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
 		flush_dcache_mmap_unlock(mapping);
-		spin_unlock(&mapping->i_mmap_lock);
+		up_write(&mapping->i_mmap_sem);
 	}
 
 	emm_notify(mm, emm_invalidate_start, start, end);
Index: linux-2.6/mm/hugetlb.c
===================================================================
--- linux-2.6.orig/mm/hugetlb.c	2008-03-03 22:59:27.230858013 -0800
+++ linux-2.6/mm/hugetlb.c	2008-03-03 22:59:31.206878010 -0800
@@ -746,7 +746,7 @@ void __unmap_hugepage_range(struct vm_ar
 	struct page *page;
 	struct page *tmp;
 	/*
-	 * A page gathering list, protected by per file i_mmap_lock. The
+	 * A page gathering list, protected by per file i_mmap_sem. The
 	 * lock is used to avoid list corruption from multiple unmapping
 	 * of the same page since we are using page->lru.
 	 */
@@ -756,7 +756,6 @@ void __unmap_hugepage_range(struct vm_ar
 	BUG_ON(start & ~HPAGE_MASK);
 	BUG_ON(end & ~HPAGE_MASK);
 
-	/* i_mmap_lock held */
 	emm_notify(mm, emm_invalidate_start, start, end);
 	spin_lock(&mm->page_table_lock);
 	for (address = start; address < end; address += HPAGE_SIZE) {
@@ -797,9 +796,9 @@ void unmap_hugepage_range(struct vm_area
 	 * do nothing in this case.
 	 */
 	if (vma->vm_file) {
-		spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
+		down_write(&vma->vm_file->f_mapping->i_mmap_sem);
 		__unmap_hugepage_range(vma, start, end);
-		spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
+		up_write(&vma->vm_file->f_mapping->i_mmap_sem);
 	}
 }
 
@@ -1042,7 +1041,7 @@ void hugetlb_change_protection(struct vm
 	BUG_ON(address >= end);
 	flush_cache_range(vma, address, end);
 
-	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
+	down_read(&vma->vm_file->f_mapping->i_mmap_sem);
 	spin_lock(&mm->page_table_lock);
 	for (; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
@@ -1057,7 +1056,7 @@ void hugetlb_change_protection(struct vm
 		}
 	}
 	spin_unlock(&mm->page_table_lock);
-	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
+	up_read(&vma->vm_file->f_mapping->i_mmap_sem);
 
 	flush_tlb_range(vma, start, end);
 }
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c	2008-03-03 22:59:25.502849006 -0800
+++ linux-2.6/mm/memory.c	2008-03-03 22:59:31.206878010 -0800
@@ -830,7 +830,6 @@ unsigned long unmap_vmas(struct mmu_gath
 	unsigned long tlb_start = 0;	/* For tlb_finish_mmu */
 	int tlb_start_valid = 0;
 	unsigned long start = start_addr;
-	spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
 	int fullmm = (*tlbp)->fullmm;
 
 	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
@@ -868,21 +867,11 @@ unsigned long unmap_vmas(struct mmu_gath
 
 			tlb_finish_mmu(*tlbp, tlb_start, start);
 
-			if (need_resched() ||
-				(i_mmap_lock && spin_needbreak(i_mmap_lock))) {
-				if (i_mmap_lock) {
-					*tlbp = NULL;
-					goto out;
-				}
-				cond_resched();
-			}
-
 			*tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
 			tlb_start_valid = 0;
 			zap_work = ZAP_BLOCK_SIZE;
 		}
 	}
-out:
 	return start;	/* which is now the end (or restart) address */
 }
 
@@ -905,7 +894,6 @@ unsigned long zap_page_range(struct vm_a
 	tlb = tlb_gather_mmu(mm, 0);
 	update_hiwater_rss(mm);
 
-	/* i_mmap_lock may be held */
 	emm_notify(mm, emm_invalidate_start, address, end);
 	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
 	emm_notify(mm, emm_invalidate_end, address, end);
@@ -1749,7 +1737,7 @@ unwritable_page:
 /*
  * Helper functions for unmap_mapping_range().
  *
- * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
+ * __ Notes on dropping i_mmap_sem to reduce latency while unmapping __
  *
  * We have to restart searching the prio_tree whenever we drop the lock,
  * since the iterator is only valid while the lock is held, and anyway
@@ -1768,7 +1756,7 @@ unwritable_page:
  * can't efficiently keep all vmas in step with mapping->truncate_count:
  * so instead reset them all whenever it wraps back to 0 (then go to 1).
  * mapping->truncate_count and vma->vm_truncate_count are protected by
- * i_mmap_lock.
+ * i_mmap_sem.
  *
  * In order to make forward progress despite repeatedly restarting some
  * large vma, note the restart_addr from unmap_vmas when it breaks out:
@@ -1818,7 +1806,7 @@ again:
 
 	restart_addr = zap_page_range(vma, start_addr,
 					end_addr - start_addr, details);
-	need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
+	need_break = need_resched();
 
 	if (restart_addr >= end_addr) {
 		/* We have now completed this vma: mark it so */
@@ -1832,9 +1820,9 @@ again:
 			goto again;
 	}
 
-	spin_unlock(details->i_mmap_lock);
+	up_write(details->i_mmap_sem);
 	cond_resched();
-	spin_lock(details->i_mmap_lock);
+	down_write(details->i_mmap_sem);
 	return -EINTR;
 }
 
@@ -1928,9 +1916,9 @@ void unmap_mapping_range(struct address_
 	details.last_index = hba + hlen - 1;
 	if (details.last_index < details.first_index)
 		details.last_index = ULONG_MAX;
-	details.i_mmap_lock = &mapping->i_mmap_lock;
+	details.i_mmap_sem = &mapping->i_mmap_sem;
 
-	spin_lock(&mapping->i_mmap_lock);
+	down_write(&mapping->i_mmap_sem);
 
 	/* Protect against endless unmapping loops */
 	mapping->truncate_count++;
@@ -1945,7 +1933,7 @@ void unmap_mapping_range(struct address_
 		unmap_mapping_range_tree(&mapping->i_mmap, &details);
 	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
 		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
-	spin_unlock(&mapping->i_mmap_lock);
+	up_write(&mapping->i_mmap_sem);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
Index: linux-2.6/mm/migrate.c
===================================================================
--- linux-2.6.orig/mm/migrate.c	2008-03-03 22:59:25.510849324 -0800
+++ linux-2.6/mm/migrate.c	2008-03-03 22:59:31.206878010 -0800
@@ -202,12 +202,12 @@ static void remove_file_migration_ptes(s
 	if (!mapping)
 		return;
 
-	spin_lock(&mapping->i_mmap_lock);
+	down_read(&mapping->i_mmap_sem);
 
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
 		remove_migration_pte(vma, old, new);
 
-	spin_unlock(&mapping->i_mmap_lock);
+	up_read(&mapping->i_mmap_sem);
 }
 
 /*
Index: linux-2.6/mm/mmap.c
===================================================================
--- linux-2.6.orig/mm/mmap.c	2008-03-03 22:59:25.522848812 -0800
+++ linux-2.6/mm/mmap.c	2008-03-03 22:59:31.210878368 -0800
@@ -186,7 +186,7 @@ error:
 }
 
 /*
- * Requires inode->i_mapping->i_mmap_lock
+ * Requires inode->i_mapping->i_mmap_sem
  */
 static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 		struct file *file, struct address_space *mapping)
@@ -214,9 +214,9 @@ void unlink_file_vma(struct vm_area_stru
 
 	if (file) {
 		struct address_space *mapping = file->f_mapping;
-		spin_lock(&mapping->i_mmap_lock);
+		down_write(&mapping->i_mmap_sem);
 		__remove_shared_vm_struct(vma, file, mapping);
-		spin_unlock(&mapping->i_mmap_lock);
+		up_write(&mapping->i_mmap_sem);
 	}
 }
 
@@ -439,7 +439,7 @@ static void vma_link(struct mm_struct *m
 		mapping = vma->vm_file->f_mapping;
 
 	if (mapping) {
-		spin_lock(&mapping->i_mmap_lock);
+		down_write(&mapping->i_mmap_sem);
 		vma->vm_truncate_count = mapping->truncate_count;
 	}
 	anon_vma_lock(vma);
@@ -449,7 +449,7 @@ static void vma_link(struct mm_struct *m
 
 	anon_vma_unlock(vma);
 	if (mapping)
-		spin_unlock(&mapping->i_mmap_lock);
+		up_write(&mapping->i_mmap_sem);
 
 	mm->map_count++;
 	validate_mm(mm);
@@ -536,7 +536,7 @@ again:			remove_next = 1 + (end > next->
 		mapping = file->f_mapping;
 		if (!(vma->vm_flags & VM_NONLINEAR))
 			root = &mapping->i_mmap;
-		spin_lock(&mapping->i_mmap_lock);
+		down_write(&mapping->i_mmap_sem);
 		if (importer &&
 		    vma->vm_truncate_count != next->vm_truncate_count) {
 			/*
@@ -620,7 +620,7 @@ again:			remove_next = 1 + (end > next->
 	if (anon_vma)
 		spin_unlock(&anon_vma->lock);
 	if (mapping)
-		spin_unlock(&mapping->i_mmap_lock);
+		up_write(&mapping->i_mmap_sem);
 
 	if (remove_next) {
 		if (file)
@@ -2064,7 +2064,7 @@ void exit_mmap(struct mm_struct *mm)
 
 /* Insert vm structure into process list sorted by address
  * and into the inode's i_mmap tree.  If vm_file is non-NULL
- * then i_mmap_lock is taken here.
+ * then i_mmap_sem is taken here.
  */
 int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 {
Index: linux-2.6/mm/mremap.c
===================================================================
--- linux-2.6.orig/mm/mremap.c	2008-03-03 22:59:25.530848880 -0800
+++ linux-2.6/mm/mremap.c	2008-03-03 22:59:31.210878368 -0800
@@ -86,7 +86,7 @@ static void move_ptes(struct vm_area_str
 		 * and we propagate stale pages into the dst afterward.
 		 */
 		mapping = vma->vm_file->f_mapping;
-		spin_lock(&mapping->i_mmap_lock);
+		down_write(&mapping->i_mmap_sem);
 		if (new_vma->vm_truncate_count &&
 		    new_vma->vm_truncate_count != vma->vm_truncate_count)
 			new_vma->vm_truncate_count = 0;
@@ -119,7 +119,7 @@ static void move_ptes(struct vm_area_str
 	pte_unmap_nested(new_pte - 1);
 	pte_unmap_unlock(old_pte - 1, old_ptl);
 	if (mapping)
-		spin_unlock(&mapping->i_mmap_lock);
+		up_write(&mapping->i_mmap_sem);
 	emm_notify(mm, emm_invalidate_end, old_start, old_end);
 }
 
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c	2008-03-03 22:59:25.542848702 -0800
+++ linux-2.6/mm/rmap.c	2008-03-03 22:59:31.210878368 -0800
@@ -24,7 +24,7 @@
  *   inode->i_alloc_sem (vmtruncate_range)
  *   mm->mmap_sem
  *     page->flags PG_locked (lock_page)
- *       mapping->i_mmap_lock
+ *       mapping->i_mmap_sem
  *         anon_vma->lock
  *           mm->page_table_lock or pte_lock
  *             zone->lru_lock (in mark_page_accessed, isolate_lru_page)
@@ -368,14 +368,14 @@ static int page_referenced_file(struct p
 	 * The page lock not only makes sure that page->mapping cannot
 	 * suddenly be NULLified by truncation, it makes sure that the
 	 * structure at mapping cannot be freed and reused yet,
-	 * so we can safely take mapping->i_mmap_lock.
+	 * so we can safely take mapping->i_mmap_sem.
 	 */
 	BUG_ON(!PageLocked(page));
 
-	spin_lock(&mapping->i_mmap_lock);
+	down_read(&mapping->i_mmap_sem);
 
 	/*
-	 * i_mmap_lock does not stabilize mapcount at all, but mapcount
+	 * i_mmap_sem does not stabilize mapcount at all, but mapcount
 	 * is more likely to be accurate if we note it after spinning.
 	 */
 	mapcount = page_mapcount(page);
@@ -398,7 +398,7 @@ static int page_referenced_file(struct p
 			break;
 	}
 
-	spin_unlock(&mapping->i_mmap_lock);
+	up_read(&mapping->i_mmap_sem);
 	return referenced;
 }
 
@@ -482,12 +482,12 @@ static int page_mkclean_file(struct addr
 
 	BUG_ON(PageAnon(page));
 
-	spin_lock(&mapping->i_mmap_lock);
+	down_read(&mapping->i_mmap_sem);
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 		if (vma->vm_flags & VM_SHARED)
 			ret += page_mkclean_one(page, vma);
 	}
-	spin_unlock(&mapping->i_mmap_lock);
+	up_read(&mapping->i_mmap_sem);
 	return ret;
 }
 
@@ -923,7 +923,7 @@ static int try_to_unmap_file(struct page
 	unsigned long max_nl_size = 0;
 	unsigned int mapcount;
 
-	spin_lock(&mapping->i_mmap_lock);
+	down_read(&mapping->i_mmap_sem);
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 		ret = try_to_unmap_one(page, vma, migration);
 		if (ret == SWAP_FAIL || !page_mapped(page))
@@ -960,7 +960,6 @@ static int try_to_unmap_file(struct page
 	mapcount = page_mapcount(page);
 	if (!mapcount)
 		goto out;
-	cond_resched_lock(&mapping->i_mmap_lock);
 
 	max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
 	if (max_nl_cursor == 0)
@@ -982,7 +981,6 @@ static int try_to_unmap_file(struct page
 			}
 			vma->vm_private_data = (void *) max_nl_cursor;
 		}
-		cond_resched_lock(&mapping->i_mmap_lock);
 		max_nl_cursor += CLUSTER_SIZE;
 	} while (max_nl_cursor <= max_nl_size);
 
@@ -994,7 +992,7 @@ static int try_to_unmap_file(struct page
 	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
 		vma->vm_private_data = NULL;
 out:
-	spin_unlock(&mapping->i_mmap_lock);
+	up_write(&mapping->i_mmap_sem);
 	return ret;
 }
 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v8
  2008-03-03 19:15                                 ` Jack Steiner
@ 2008-03-04 10:35                                   ` Peter Zijlstra
  2008-03-04 14:44                                     ` Jack Steiner
  0 siblings, 1 reply; 120+ messages in thread
From: Peter Zijlstra @ 2008-03-04 10:35 UTC (permalink / raw)
  To: Jack Steiner
  Cc: Nick Piggin, Andrea Arcangeli, akpm, Robin Holt, Avi Kivity,
	Izik Eidus, kvm-devel, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter


On Mon, 2008-03-03 at 13:15 -0600, Jack Steiner wrote:

> I haven't thought about locking requirements for the radix tree. Most accesses
> would be read-only & updates infrequent. Any chance of an RCU-based radix
> implementation?  Otherwise, don't we add the potential for hot locks/cachelines
> for threaded applications ???

The current radix tree implementation in the kernel is RCU capable. We
just don't have many RCU users yet.


^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] KVM swapping with mmu notifiers #v9
  2008-03-04  0:44                       ` izik eidus
  2008-03-04  7:31                         ` [RFC] Notifier for Externally Mapped Memory (EMM) Christoph Lameter
@ 2008-03-04 13:21                         ` Andrea Arcangeli
  1 sibling, 0 replies; 120+ messages in thread
From: Andrea Arcangeli @ 2008-03-04 13:21 UTC (permalink / raw)
  To: izik eidus
  Cc: Jack Steiner, Nick Piggin, akpm, Robin Holt, Avi Kivity,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

Hello Izik,

On Tue, Mar 04, 2008 at 02:44:07AM +0200, Izik Eidus wrote:
> i wrote to you about this before (i didnt get answer for this so i write 

Ouch I must have lost your previous comment with a too-fast pgdown in
the full quoting of the patch sorry.

> again)
> with large pages support i think we need to use here put_page

Right, thanks!!

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [RFC] Notifier for Externally Mapped Memory (EMM)
  2008-03-04  7:31                         ` [RFC] Notifier for Externally Mapped Memory (EMM) Christoph Lameter
  2008-03-04  7:34                           ` [Early draft] Conversion of i_mmap_lock to semaphore Christoph Lameter
@ 2008-03-04 13:30                           ` Andrea Arcangeli
  2008-03-04 19:00                             ` Christoph Lameter
  1 sibling, 1 reply; 120+ messages in thread
From: Andrea Arcangeli @ 2008-03-04 13:30 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Jack Steiner, Nick Piggin, akpm, Robin Holt, Avi Kivity,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman

On Mon, Mar 03, 2008 at 11:31:15PM -0800, Christoph Lameter wrote:
> @@ -446,6 +450,8 @@ static int page_mkclean_one(struct page 
>  	if (address == -EFAULT)
>  		goto out;
>  
> +	/* rmap lock held */
> +	emm_notify(mm, emm_invalidate_start, address, address + PAGE_SIZE);
>  	pte = page_check_address(page, mm, address, &ptl);
>  	if (!pte)
>  		goto out;
> @@ -462,6 +468,7 @@ static int page_mkclean_one(struct page 
>  	}
>  
>  	pte_unmap_unlock(pte, ptl);
> +	emm_notify(mm, emm_invalidate_end, address, address + PAGE_SIZE);
>  out:
>  	return ret;
>  }

I could have ripped invalidate_page from my patch too, except I didn't
want to slow down those paths for the known-common-users when not even
GRU would get any benefit from two hooks when only one is needed.

When working with single pages it's more efficient and preferable to
call invalidate_page and only later release the VM reference on the
page.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v8
  2008-03-04 10:35                                   ` Peter Zijlstra
@ 2008-03-04 14:44                                     ` Jack Steiner
  0 siblings, 0 replies; 120+ messages in thread
From: Jack Steiner @ 2008-03-04 14:44 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Nick Piggin, Andrea Arcangeli, akpm, Robin Holt, Avi Kivity,
	Izik Eidus, kvm-devel, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman,
	Christoph Lameter

On Tue, Mar 04, 2008 at 11:35:32AM +0100, Peter Zijlstra wrote:
> 
> On Mon, 2008-03-03 at 13:15 -0600, Jack Steiner wrote:
> 
> > I haven't thought about locking requirements for the radix tree. Most accesses
> > would be read-only & updates infrequent. Any chance of an RCU-based radix
> > implementation?  Otherwise, don't we add the potential for hot locks/cachelines
> > for threaded applications ???
> 
> The current radix tree implementation in the kernel is RCU capable. We
> just don't have many RCU users yet.

Ahhh. You are right. I thought I looked but obviously missed it.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [RFC] Notifier for Externally Mapped Memory (EMM)
  2008-03-04 13:30                           ` [RFC] Notifier for Externally Mapped Memory (EMM) Andrea Arcangeli
@ 2008-03-04 19:00                             ` Christoph Lameter
  2008-03-04 22:20                               ` Andrea Arcangeli
  0 siblings, 1 reply; 120+ messages in thread
From: Christoph Lameter @ 2008-03-04 19:00 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jack Steiner, Nick Piggin, akpm, Robin Holt, Avi Kivity,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman

On Tue, 4 Mar 2008, Andrea Arcangeli wrote:

> When working with single pages it's more efficient and preferable to
> call invalidate_page and only later release the VM reference on the
> page.

But as you pointed out before that path is a slow path anyways. Its rarely 
taken. Having a single eviction callback simplifies design.

Plus the device driver can still check if the mapping was of PAGE_SIZE and 
then implement its own optimization.
 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [RFC] Notifier for Externally Mapped Memory (EMM)
  2008-03-04 19:00                             ` Christoph Lameter
@ 2008-03-04 22:20                               ` Andrea Arcangeli
  2008-03-04 22:35                                 ` Christoph Lameter
  0 siblings, 1 reply; 120+ messages in thread
From: Andrea Arcangeli @ 2008-03-04 22:20 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Jack Steiner, Nick Piggin, akpm, Robin Holt, Avi Kivity,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman

On Tue, Mar 04, 2008 at 11:00:31AM -0800, Christoph Lameter wrote:
> But as you pointed out before that path is a slow path anyways. Its rarely 

It's a slow path but I don't see why you think two hooks are better
than one, when only one is necessary.

I once ripped invalidate_page while working on #v8 but then I
reintroduced it because I thought reducing the total number of hooks
was beneficial to the core linux VM (even if only a
microoptimization, I sure agree about that, but it's trivial to add
one hook instead of two hooks there, so a microoptimization was worth
it IMHO).

Your API is also too restrictive, if we'll happen to need one more
method that doesn't take just (start,end) we'll have to cause all
drivers to have significant changes instead of one-liners to use
whatever new feature.

> taken. Having a single eviction callback simplifies design.

IMHO the design is actually same and I don't understand why you
rewrote it once more time in a less flexibile way (on a style side
you're not even using hlist), dropping RCU (not sure how you replace
it with), etc....

Your implementation has the same bug you had in your first V1, see how
you're not clearing the spte young bits if the pte young bit is
set. Once you fix that, your change in the ptep_clear_flush_young path
will look remarkably similar to the patch I posted incremental with
#v8 to make ->clear_flush_young sleep capable...

Converging in a single design is great, but it'd be nice if we could
converge into a single implementation, and my last patch doesn't have
any bug and I think it's quite nicer too (also including Nick cleanup
work) but then I may be biased ;).

But as usual I'm entirely satisfied by your brand new EMM Notifier to
be merged and all perfecting work done on my MMU notifier patch over
the weeks by multiple developers (including you) to be dropped for
good, as long as we can enable the new advanced KVM features in
2.6.25.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [RFC] Notifier for Externally Mapped Memory (EMM)
  2008-03-04 22:20                               ` Andrea Arcangeli
@ 2008-03-04 22:35                                 ` Christoph Lameter
  2008-03-04 22:42                                   ` Peter Zijlstra
  2008-03-07 15:17                                   ` [PATCH] 2/4 move all invalidate_page outside of PT lock (#v9 was 1/4) Andrea Arcangeli
  0 siblings, 2 replies; 120+ messages in thread
From: Christoph Lameter @ 2008-03-04 22:35 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jack Steiner, Nick Piggin, akpm, Robin Holt, Avi Kivity,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman

On Tue, 4 Mar 2008, Andrea Arcangeli wrote:

> I once ripped invalidate_page while working on #v8 but then I
> reintroduced it because I thought reducing the total number of hooks
> was beneficial to the core linux VM (even if only a
> microoptimization, I sure agree about that, but it's trivial to add
> one hook instead of two hooks there, so a microoptimization was worth
> it IMHO).

Well the problem is if one does not have the begin/end hooks then 
reliable clearing of the mapping may not be possible. begin/end allow
holding off new references and that avoids the issue that would come
with an single callback that could race with something else.
 
> Your API is also too restrictive, if we'll happen to need one more
> method that doesn't take just (start,end) we'll have to cause all
> drivers to have significant changes instead of one-liners to use
> whatever new feature.

What would that be? I think the API need to stay as simple as possible. 
And this set is pretty minimal and easy to understand. Not having the 
invalidate_page() removes a troublespot from the API.
 
> IMHO the design is actually same and I don't understand why you
> rewrote it once more time in a less flexibile way (on a style side
> you're not even using hlist), dropping RCU (not sure how you replace
> it with), etc....

All of that is needed in order to allow sleeping in the future. Your 
version locks us into atomic callbacks. It also makes the API needlessly 
complex.

RCU means that the callbacks occur in an atomic context.

> Converging in a single design is great, but it'd be nice if we could
> converge into a single implementation, and my last patch doesn't have
> any bug and I think it's quite nicer too (also including Nick cleanup
> work) but then I may be biased ;).

It is the atomic dead end that we want to avoid. And your patch is exactly 
that. Both the invalidate_page and the RCU locks us into this.

> But as usual I'm entirely satisfied by your brand new EMM Notifier to
> be merged and all perfecting work done on my MMU notifier patch over
> the weeks by multiple developers (including you) to be dropped for
> good, as long as we can enable the new advanced KVM features in
> 2.6.25.

Well I really want us to have one API that is suitable for multiple 
purposes and that allows a generic use by device drivers for multiple 
purposes. The discussion in the last month have made that possible. I am 
glad that you do not see any major issues with the patch. I sure wish I 
would not have to post a competing patchset because I want things to be 
merged ASAP and get this over with. But we need to have at minimum clear 
way to support sleeping with the existing API in the future.


^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [RFC] Notifier for Externally Mapped Memory (EMM)
  2008-03-04 22:35                                 ` Christoph Lameter
@ 2008-03-04 22:42                                   ` Peter Zijlstra
  2008-03-04 23:14                                     ` Christoph Lameter
  2008-03-05  5:09                                     ` Avi Kivity
  2008-03-07 15:17                                   ` [PATCH] 2/4 move all invalidate_page outside of PT lock (#v9 was 1/4) Andrea Arcangeli
  1 sibling, 2 replies; 120+ messages in thread
From: Peter Zijlstra @ 2008-03-04 22:42 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andrea Arcangeli, Jack Steiner, Nick Piggin, akpm, Robin Holt,
	Avi Kivity, kvm-devel, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman


On Tue, 2008-03-04 at 14:35 -0800, Christoph Lameter wrote:

> RCU means that the callbacks occur in an atomic context.

Not really, if it requires moving the VM locks to sleepable locks under
a .config option, I think its also fair to require PREEMPT_RCU.

OTOH, if you want to unconditionally move the VM locks to sleepable
locks you have a point.


^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [RFC] Notifier for Externally Mapped Memory (EMM)
  2008-03-04 22:42                                   ` Peter Zijlstra
@ 2008-03-04 23:14                                     ` Christoph Lameter
  2008-03-04 23:25                                       ` Peter Zijlstra
  2008-03-05  5:09                                     ` Avi Kivity
  1 sibling, 1 reply; 120+ messages in thread
From: Christoph Lameter @ 2008-03-04 23:14 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andrea Arcangeli, Jack Steiner, Nick Piggin, akpm, Robin Holt,
	Avi Kivity, kvm-devel, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman

On Tue, 4 Mar 2008, Peter Zijlstra wrote:

> 
> On Tue, 2008-03-04 at 14:35 -0800, Christoph Lameter wrote:
> 
> > RCU means that the callbacks occur in an atomic context.
> 
> Not really, if it requires moving the VM locks to sleepable locks under
> a .config option, I think its also fair to require PREEMPT_RCU.

Which would make the patchset pretty complex. RCU is not needed with a 
single linked list. Linked list operations can exploit atomic pointer 
updates and we only tear down the list when a single execution thread 
remains.


Having said that: Here a couple of updates to address Andrea's complaint 
that we not check the referenced bit from the external mapper when the 
rerferences bit is set on an OS pte.

Plus two barriers to ensure that a new emm notifier object becomes
visible before the base pointer is updated.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

---
 mm/rmap.c |   10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c	2008-03-04 14:36:36.321922321 -0800
+++ linux-2.6/mm/rmap.c	2008-03-04 15:10:46.159429369 -0800
@@ -298,10 +298,10 @@ static int page_referenced_one(struct pa
 
 	(*mapcount)--;
 	pte_unmap_unlock(pte, ptl);
-	if (!referenced)
-		/* rmap lock held */
-		referenced = emm_notify(mm, emm_referenced,
-					address, address + PAGE_SIZE);
+
+	/* rmap lock held */
+	if (emm_notify(mm, emm_referenced, address, address + PAGE_SIZE))
+			referenced = 1;
 out:
 	return referenced;
 }
@@ -1057,6 +1057,7 @@ EXPORT_SYMBOL_GPL(emm_notifier_release);
 void emm_notifier_register(struct emm_notifier *e, struct mm_struct *mm)
 {
 	e->next = mm->emm_notifier;
+	smp_wmb();
 	mm->emm_notifier = e;
 }
 EXPORT_SYMBOL_GPL(emm_notifier_register);
@@ -1069,6 +1070,7 @@ int __emm_notify(struct mm_struct *mm, e
 	int x;
 
 	while (e) {
+		smp_rmb();
 		if (e->func) {
 			x = e->func(e, mm, op, start, end);
 			if (x)

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [RFC] Notifier for Externally Mapped Memory (EMM)
  2008-03-04 23:14                                     ` Christoph Lameter
@ 2008-03-04 23:25                                       ` Peter Zijlstra
  2008-03-04 23:30                                         ` Peter Zijlstra
  0 siblings, 1 reply; 120+ messages in thread
From: Peter Zijlstra @ 2008-03-04 23:25 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andrea Arcangeli, Jack Steiner, Nick Piggin, akpm, Robin Holt,
	Avi Kivity, kvm-devel, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman


On Tue, 2008-03-04 at 15:14 -0800, Christoph Lameter wrote:
> On Tue, 4 Mar 2008, Peter Zijlstra wrote:
> 
> > 
> > On Tue, 2008-03-04 at 14:35 -0800, Christoph Lameter wrote:
> > 
> > > RCU means that the callbacks occur in an atomic context.
> > 
> > Not really, if it requires moving the VM locks to sleepable locks under
> > a .config option, I think its also fair to require PREEMPT_RCU.
> 
> Which would make the patchset pretty complex. RCU is not needed with a 
> single linked list. Linked list operations can exploit atomic pointer 
> updates and we only tear down the list when a single execution thread 
> remains.

OK, that constraint on removal makes it work.

> Having said that: Here a couple of updates to address Andrea's complaint 
> that we not check the referenced bit from the external mapper when the 
> rerferences bit is set on an OS pte.
> 
> Plus two barriers to ensure that a new emm notifier object becomes
> visible before the base pointer is updated.
> 
> Signed-off-by: Christoph Lameter <clameter@sgi.com>
> 
> ---
>  mm/rmap.c |   10 ++++++----
>  1 file changed, 6 insertions(+), 4 deletions(-)
> 
> Index: linux-2.6/mm/rmap.c
> ===================================================================
> --- linux-2.6.orig/mm/rmap.c	2008-03-04 14:36:36.321922321 -0800
> +++ linux-2.6/mm/rmap.c	2008-03-04 15:10:46.159429369 -0800
> @@ -298,10 +298,10 @@ static int page_referenced_one(struct pa
>  
>  	(*mapcount)--;
>  	pte_unmap_unlock(pte, ptl);
> -	if (!referenced)
> -		/* rmap lock held */
> -		referenced = emm_notify(mm, emm_referenced,
> -					address, address + PAGE_SIZE);
> +
> +	/* rmap lock held */
> +	if (emm_notify(mm, emm_referenced, address, address + PAGE_SIZE))
> +			referenced = 1;

referenced++; seems more in-style with the rest of that code..

>  out:
>  	return referenced;
>  }
> @@ -1057,6 +1057,7 @@ EXPORT_SYMBOL_GPL(emm_notifier_release);
>  void emm_notifier_register(struct emm_notifier *e, struct mm_struct *mm)
>  {
>  	e->next = mm->emm_notifier;
> +	smp_wmb();
>  	mm->emm_notifier = e;
>  }
>  EXPORT_SYMBOL_GPL(emm_notifier_register);
> @@ -1069,6 +1070,7 @@ int __emm_notify(struct mm_struct *mm, e
>  	int x;
>  
>  	while (e) {
> +		smp_rmb();
>  		if (e->func) {
>  			x = e->func(e, mm, op, start, end);
>  			if (x)

We generally require comments around barriers..


^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [RFC] Notifier for Externally Mapped Memory (EMM)
  2008-03-04 23:25                                       ` Peter Zijlstra
@ 2008-03-04 23:30                                         ` Peter Zijlstra
  0 siblings, 0 replies; 120+ messages in thread
From: Peter Zijlstra @ 2008-03-04 23:30 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andrea Arcangeli, Jack Steiner, Nick Piggin, akpm, Robin Holt,
	Avi Kivity, kvm-devel, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman


FWIW, I'll cut the kvm and openfabrics lists from any future posts.
I'm getting tired of the bounces.


^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v8
  2008-03-03 19:01                     ` Christoph Lameter
  2008-03-03 21:15                       ` Andrea Arcangeli
@ 2008-03-05  0:37                       ` Nick Piggin
  2008-03-05 18:48                         ` Christoph Lameter
  1 sibling, 1 reply; 120+ messages in thread
From: Nick Piggin @ 2008-03-05  0:37 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andrea Arcangeli, Jack Steiner, akpm, Robin Holt, Avi Kivity,
	Izik Eidus, kvm-devel, Peter Zijlstra, general, Steve Wise,
	Roland Dreier, Kanoj Sarcar, linux-kernel, linux-mm,
	daniel.blueman

On Mon, Mar 03, 2008 at 11:01:22AM -0800, Christoph Lameter wrote:
> On Mon, 3 Mar 2008, Nick Piggin wrote:
> 
> > I'm still not completely happy with this. I had a very quick look
> > at the GRU driver, but I don't see why it can't be implemented
> > more like the regular TLB model, and have TLB insertions depend on
> > the linux pte, and do invalidates _after_ restricting permissions
> > to the pte.
> > 
> > Ie. I'd still like to get rid of invalidate_range_begin, and get
> > rid of invalidate calls from places where permissions are relaxed.
> 
> Isnt this more a job for paravirt ops if it is so tightly bound to page 
> tables? Are we not adding another similar API?

Um, it's bound to the *Linux page tables*, yes. And I have no idea why
you would use the paravirt ops for this.

 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [RFC] Notifier for Externally Mapped Memory (EMM)
  2008-03-04 22:42                                   ` Peter Zijlstra
  2008-03-04 23:14                                     ` Christoph Lameter
@ 2008-03-05  5:09                                     ` Avi Kivity
  2008-03-05  9:47                                       ` Robin Holt
  1 sibling, 1 reply; 120+ messages in thread
From: Avi Kivity @ 2008-03-05  5:09 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Christoph Lameter, Andrea Arcangeli, Jack Steiner, Nick Piggin,
	akpm, Robin Holt, kvm-devel, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman

Peter Zijlstra wrote:
> On Tue, 2008-03-04 at 14:35 -0800, Christoph Lameter wrote:
>
>   
>> RCU means that the callbacks occur in an atomic context.
>>     
>
> Not really, if it requires moving the VM locks to sleepable locks under
> a .config option, I think its also fair to require PREEMPT_RCU.
>
> OTOH, if you want to unconditionally move the VM locks to sleepable
> locks you have a point.
>   

Isn't that out of the question for .25?

I really wish we can get the atomic variant in now, and add on 
sleepability in .26, updating users if necessary.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [RFC] Notifier for Externally Mapped Memory (EMM)
  2008-03-05  5:09                                     ` Avi Kivity
@ 2008-03-05  9:47                                       ` Robin Holt
  2008-03-05  9:53                                         ` Avi Kivity
  2008-03-05 10:02                                         ` [kvm-devel] " Dor Laor
  0 siblings, 2 replies; 120+ messages in thread
From: Robin Holt @ 2008-03-05  9:47 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Peter Zijlstra, Christoph Lameter, Andrea Arcangeli,
	Jack Steiner, Nick Piggin, akpm, Robin Holt, kvm-devel, general,
	Steve Wise, Roland Dreier, Kanoj Sarcar, linux-kernel, linux-mm,
	daniel.blueman

On Wed, Mar 05, 2008 at 07:09:55AM +0200, Avi Kivity wrote:
> Isn't that out of the question for .25?

I keep hearing this mantra.  What is so compelling about the .25
release?  When seems to be more important than what.  While I understand
product release cycles, etc. and can certainly agree with them. I would
like to know with what I am being asked to agree.

That said, I agree we should probably finish getting the comments on
Andrea's most recent patch, if any, cleared up and put that one in.

Robin

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [RFC] Notifier for Externally Mapped Memory (EMM)
  2008-03-05  9:47                                       ` Robin Holt
@ 2008-03-05  9:53                                         ` Avi Kivity
  2008-03-05 10:02                                         ` [kvm-devel] " Dor Laor
  1 sibling, 0 replies; 120+ messages in thread
From: Avi Kivity @ 2008-03-05  9:53 UTC (permalink / raw)
  To: Robin Holt
  Cc: Peter Zijlstra, Christoph Lameter, Andrea Arcangeli,
	Jack Steiner, Nick Piggin, akpm, kvm-devel, general, Steve Wise,
	Roland Dreier, Kanoj Sarcar, linux-kernel, linux-mm,
	daniel.blueman

Robin Holt wrote:
> On Wed, Mar 05, 2008 at 07:09:55AM +0200, Avi Kivity wrote:
>   
>> Isn't that out of the question for .25?
>>     
>
> I keep hearing this mantra.  What is so compelling about the .25
> release?  When seems to be more important than what.  While I understand
> product release cycles, etc. and can certainly agree with them. I would
> like to know with what I am being asked to agree.
>
>   

kvm gained the ability to swap in 2.6.25.  Without mmu notifiers, 
though, the guest can still easily pin all of its memory.

> That said, I agree we should probably finish getting the comments on
> Andrea's most recent patch, if any, cleared up and put that one in.
>   

Great.  Thanks.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [kvm-devel] [RFC] Notifier for Externally Mapped Memory (EMM)
  2008-03-05  9:47                                       ` Robin Holt
  2008-03-05  9:53                                         ` Avi Kivity
@ 2008-03-05 10:02                                         ` Dor Laor
  1 sibling, 0 replies; 120+ messages in thread
From: Dor Laor @ 2008-03-05 10:02 UTC (permalink / raw)
  To: Robin Holt
  Cc: Avi Kivity, Nick Piggin, Steve Wise, Andrea Arcangeli,
	Peter Zijlstra, kvm-devel, Kanoj Sarcar, Roland Dreier,
	Jack Steiner, linux-kernel, linux-mm, daniel.blueman, general,
	akpm, Christoph Lameter


On Wed, 2008-03-05 at 03:47 -0600, Robin Holt wrote:
> On Wed, Mar 05, 2008 at 07:09:55AM +0200, Avi Kivity wrote:
> > Isn't that out of the question for .25?
> 
> I keep hearing this mantra.  What is so compelling about the .25
> release?  When seems to be more important than what.  While I understand
> product release cycles, etc. and can certainly agree with them. I would
> like to know with what I am being asked to agree.
> 

The main reason is that several kvm exciting features are dependent on
mmu notifiers:
- It enables full guest swapping (as opposed to partial today)
- It enables memory ballooning
- It enables running Izik Eidus's Kernel Shared Pages module that unify
  guest pages together.

The patchset is kernel-internal, stable and reviewed. Even if the
interface will be changed in .26 it won't have noticeable effect.

So since its stable, internal, reviewed, needed to enable important kvm
features we like to see it in for .25.

Regards,
Dor

> That said, I agree we should probably finish getting the comments on
> Andrea's most recent patch, if any, cleared up and put that one in.
> 
> Robin
> 
> -------------------------------------------------------------------------
> This SF.net email is sponsored by: Microsoft
> Defy all challenges. Microsoft(R) Visual Studio 2008.
> http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
> _______________________________________________
> kvm-devel mailing list
> kvm-devel@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/kvm-devel


^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v8
  2008-03-05  0:37                       ` Nick Piggin
@ 2008-03-05 18:48                         ` Christoph Lameter
  2008-03-06  2:59                           ` Nick Piggin
  0 siblings, 1 reply; 120+ messages in thread
From: Christoph Lameter @ 2008-03-05 18:48 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrea Arcangeli, Jack Steiner, akpm, Robin Holt, Avi Kivity,
	Izik Eidus, kvm-devel, Peter Zijlstra, general, Steve Wise,
	Roland Dreier, Kanoj Sarcar, linux-kernel, linux-mm,
	daniel.blueman

On Wed, 5 Mar 2008, Nick Piggin wrote:

> Um, it's bound to the *Linux page tables*, yes. And I have no idea why
> you would use the paravirt ops for this.

paravirt ops allows interception of page table operations?


^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] mmu notifiers #v8
  2008-03-05 18:48                         ` Christoph Lameter
@ 2008-03-06  2:59                           ` Nick Piggin
  0 siblings, 0 replies; 120+ messages in thread
From: Nick Piggin @ 2008-03-06  2:59 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andrea Arcangeli, Jack Steiner, akpm, Robin Holt, Avi Kivity,
	Izik Eidus, kvm-devel, Peter Zijlstra, general, Steve Wise,
	Roland Dreier, Kanoj Sarcar, linux-kernel, linux-mm,
	daniel.blueman

On Wed, Mar 05, 2008 at 10:48:24AM -0800, Christoph Lameter wrote:
> On Wed, 5 Mar 2008, Nick Piggin wrote:
> 
> > Um, it's bound to the *Linux page tables*, yes. And I have no idea why
> > you would use the paravirt ops for this.
> 
> paravirt ops allows interception of page table operations?

Maybe possible but it's totally the wrong API for it.


^ permalink raw reply	[flat|nested] 120+ messages in thread

* [PATCH] 2/4 move all invalidate_page outside of PT lock (#v9 was 1/4)
  2008-03-04 22:35                                 ` Christoph Lameter
  2008-03-04 22:42                                   ` Peter Zijlstra
@ 2008-03-07 15:17                                   ` Andrea Arcangeli
  2008-03-07 15:23                                     ` [PATCH] 3/4 combine RCU with seqlock to allow mmu notifier methods to sleep " Andrea Arcangeli
  2008-03-07 19:54                                     ` [PATCH] 2/4 move all invalidate_page outside of PT lock " Christoph Lameter
  1 sibling, 2 replies; 120+ messages in thread
From: Andrea Arcangeli @ 2008-03-07 15:17 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Jack Steiner, Nick Piggin, akpm, Robin Holt, Avi Kivity,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman

On Tue, Mar 04, 2008 at 02:35:21PM -0800, Christoph Lameter wrote:
> It is the atomic dead end that we want to avoid. And your patch is exactly 
> that. Both the invalidate_page and the RCU locks us into this.

I preferred to answer with code to avoid any possible misunderstanding
(I through already tried to explain with words and I obviously failed
miserably if you ended up writing such an erratic weird claim like
above ;).

This below simple patch invalidates the "invalidate_page" part, the
next patch will invalidate the RCU part, and btw in a way that doesn't
forbid unregistering the mmu notifiers at runtime (like your brand new
EMM does).

This is incremental with my #v9. I still ask Andrew/Linus to merge the
#v9 patch I posted a few days ago in .25 so KVM/GRU will be 100%
covered in a optimal way on all respects and with maximum flexibility
for future changes of API (to allow for future methods that may take
more than start,end, this was pointed out once by both me and Avi). My
#v9 is zero risk for .25 and it sure worth merging now.

Then in .26 we'll modify the semantics of the API to be blocking
starting with the below patchx. This is a kernel _internal_ API, and
we aren't distributions that have to respect kabi here, but even if we
were, making methods sleepable is a 100% backwards compatible
semantical change, so there's no possible reason to defer the #v9
merging. The changes in .26 will be transparent to any user (even if
they don't need to! even if we turn out to be totally wrong about .26
requiring a minor change of API everything will be perfectly
fine). Nothing of this is visible to userland so we can change it at
any time as we wish.

The reason I keep this incremental (unlike your EMM that does
everything all at the same time mixed in a single patch) is to
decrease the non obviously safe mangling over mm/* during .25. The
below patch is simple, but not as obviously safe as
s/ptep_clear_flush/ptep_clear_flush_notify/.

Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -134,27 +134,6 @@ static inline void mmu_notifier_mm_init(
 
 
 
-#define ptep_clear_flush_notify(__vma, __address, __ptep)		\
-({									\
-	pte_t __pte;							\
-	struct vm_area_struct *___vma = __vma;				\
-	unsigned long ___address = __address;				\
-	__pte = ptep_clear_flush(___vma, ___address, __ptep);		\
-	mmu_notifier_invalidate_page(___vma->vm_mm, ___address);	\
-	__pte;								\
-})
-
-#define ptep_clear_flush_young_notify(__vma, __address, __ptep)		\
-({									\
-	int __young;							\
-	struct vm_area_struct *___vma = __vma;				\
-	unsigned long ___address = __address;				\
-	__young = ptep_clear_flush_young(___vma, ___address, __ptep);	\
-	__young |= mmu_notifier_clear_flush_young(___vma->vm_mm,	\
-						  ___address);		\
-	__young;							\
-})
-
 #else /* CONFIG_MMU_NOTIFIER */
 
 static inline void mmu_notifier_release(struct mm_struct *mm)
@@ -186,9 +165,6 @@ static inline void mmu_notifier_mm_init(
 {
 }
 
-#define ptep_clear_flush_young_notify ptep_clear_flush_young
-#define ptep_clear_flush_notify ptep_clear_flush
-
 #endif /* CONFIG_MMU_NOTIFIER */
 
 #endif /* _LINUX_MMU_NOTIFIER_H */
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -194,11 +194,13 @@ __xip_unmap (struct address_space * mapp
 		if (pte) {
 			/* Nuke the page table entry. */
 			flush_cache_page(vma, address, pte_pfn(*pte));
-			pteval = ptep_clear_flush_notify(vma, address, pte);
+			pteval = ptep_clear_flush(vma, address, pte);
 			page_remove_rmap(page, vma);
 			dec_mm_counter(mm, file_rss);
 			BUG_ON(pte_dirty(pteval));
 			pte_unmap_unlock(pte, ptl);
+			/* must invalidate_page _before_ freeing the page */
+			mmu_notifier_invalidate_page(mm, address);
 			page_cache_release(page);
 		}
 	}
diff --git a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1626,9 +1626,10 @@ static int do_wp_page(struct mm_struct *
 			 */
 			page_table = pte_offset_map_lock(mm, pmd, address,
 							 &ptl);
-			page_cache_release(old_page);
+			new_page = NULL;
 			if (!pte_same(*page_table, orig_pte))
 				goto unlock;
+			page_cache_release(old_page);
 
 			page_mkwrite = 1;
 		}
@@ -1644,6 +1645,7 @@ static int do_wp_page(struct mm_struct *
 		if (ptep_set_access_flags(vma, address, page_table, entry,1))
 			update_mmu_cache(vma, address, entry);
 		ret |= VM_FAULT_WRITE;
+		old_page = new_page = NULL;
 		goto unlock;
 	}
 
@@ -1688,7 +1690,7 @@ gotten:
 		 * seen in the presence of one thread doing SMC and another
 		 * thread doing COW.
 		 */
-		ptep_clear_flush_notify(vma, address, page_table);
+		ptep_clear_flush(vma, address, page_table);
 		set_pte_at(mm, address, page_table, entry);
 		update_mmu_cache(vma, address, entry);
 		lru_cache_add_active(new_page);
@@ -1700,12 +1702,18 @@ gotten:
 	} else
 		mem_cgroup_uncharge_page(new_page);
 
-	if (new_page)
+unlock:
+	pte_unmap_unlock(page_table, ptl);
+
+	if (new_page) {
+		if (new_page == old_page)
+			/* cow happened, notify before releasing old_page */
+			mmu_notifier_invalidate_page(mm, address);
 		page_cache_release(new_page);
+	}
 	if (old_page)
 		page_cache_release(old_page);
-unlock:
-	pte_unmap_unlock(page_table, ptl);
+
 	if (dirty_page) {
 		if (vma->vm_file)
 			file_update_time(vma->vm_file);
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -275,7 +275,7 @@ static int page_referenced_one(struct pa
 	unsigned long address;
 	pte_t *pte;
 	spinlock_t *ptl;
-	int referenced = 0;
+	int referenced = 0, clear_flush_young = 0;
 
 	address = vma_address(page, vma);
 	if (address == -EFAULT)
@@ -288,8 +288,11 @@ static int page_referenced_one(struct pa
 	if (vma->vm_flags & VM_LOCKED) {
 		referenced++;
 		*mapcount = 1;	/* break early from loop */
-	} else if (ptep_clear_flush_young_notify(vma, address, pte))
-		referenced++;
+	} else {
+		clear_flush_young = 1;
+		if (ptep_clear_flush_young(vma, address, pte))
+			referenced++;
+	}
 
 	/* Pretend the page is referenced if the task has the
 	   swap token and is in the middle of a page fault. */
@@ -299,6 +302,10 @@ static int page_referenced_one(struct pa
 
 	(*mapcount)--;
 	pte_unmap_unlock(pte, ptl);
+
+	if (clear_flush_young)
+		referenced += mmu_notifier_clear_flush_young(mm, address);
+
 out:
 	return referenced;
 }
@@ -455,7 +462,7 @@ static int page_mkclean_one(struct page 
 		pte_t entry;
 
 		flush_cache_page(vma, address, pte_pfn(*pte));
-		entry = ptep_clear_flush_notify(vma, address, pte);
+		entry = ptep_clear_flush(vma, address, pte);
 		entry = pte_wrprotect(entry);
 		entry = pte_mkclean(entry);
 		set_pte_at(mm, address, pte, entry);
@@ -463,6 +470,10 @@ static int page_mkclean_one(struct page 
 	}
 
 	pte_unmap_unlock(pte, ptl);
+
+	if (ret)
+		mmu_notifier_invalidate_page(mm, address);
+
 out:
 	return ret;
 }
@@ -712,15 +723,14 @@ static int try_to_unmap_one(struct page 
 	 * If it's recently referenced (perhaps page_referenced
 	 * skipped over this mm) then we should reactivate it.
 	 */
-	if (!migration && ((vma->vm_flags & VM_LOCKED) ||
-			(ptep_clear_flush_young_notify(vma, address, pte)))) {
+	if (!migration && (vma->vm_flags & VM_LOCKED)) {
 		ret = SWAP_FAIL;
 		goto out_unmap;
 	}
 
 	/* Nuke the page table entry. */
 	flush_cache_page(vma, address, page_to_pfn(page));
-	pteval = ptep_clear_flush_notify(vma, address, pte);
+	pteval = ptep_clear_flush(vma, address, pte);
 
 	/* Move the dirty bit to the physical page now the pte is gone. */
 	if (pte_dirty(pteval))
@@ -775,6 +785,8 @@ static int try_to_unmap_one(struct page 
 
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
+	if (ret != SWAP_FAIL)
+		mmu_notifier_invalidate_page(mm, address);
 out:
 	return ret;
 }
@@ -813,7 +825,7 @@ static void try_to_unmap_cluster(unsigne
 	spinlock_t *ptl;
 	struct page *page;
 	unsigned long address;
-	unsigned long end;
+	unsigned long start, end;
 
 	address = (vma->vm_start + cursor) & CLUSTER_MASK;
 	end = address + CLUSTER_SIZE;
@@ -834,6 +846,8 @@ static void try_to_unmap_cluster(unsigne
 	if (!pmd_present(*pmd))
 		return;
 
+	start = address;
+	mmu_notifier_invalidate_range_begin(mm, start, end);
 	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 
 	/* Update high watermark before we lower rss */
@@ -845,12 +859,12 @@ static void try_to_unmap_cluster(unsigne
 		page = vm_normal_page(vma, address, *pte);
 		BUG_ON(!page || PageAnon(page));
 
-		if (ptep_clear_flush_young_notify(vma, address, pte))
+		if (ptep_clear_flush_young(vma, address, pte))
 			continue;
 
 		/* Nuke the page table entry. */
 		flush_cache_page(vma, address, pte_pfn(*pte));
-		pteval = ptep_clear_flush_notify(vma, address, pte);
+		pteval = ptep_clear_flush(vma, address, pte);
 
 		/* If nonlinear, store the file page offset in the pte. */
 		if (page->index != linear_page_index(vma, address))
@@ -866,6 +880,7 @@ static void try_to_unmap_cluster(unsigne
 		(*mapcount)--;
 	}
 	pte_unmap_unlock(pte - 1, ptl);
+	mmu_notifier_invalidate_range_end(mm, start, end);
 }
 
 static int try_to_unmap_anon(struct page *page, int migration)

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] 3/4 combine RCU with seqlock to allow mmu notifier methods to sleep (#v9 was 1/4)
  2008-03-07 15:17                                   ` [PATCH] 2/4 move all invalidate_page outside of PT lock (#v9 was 1/4) Andrea Arcangeli
@ 2008-03-07 15:23                                     ` Andrea Arcangeli
  2008-03-07 15:52                                       ` [PATCH] 4/4 i_mmap_lock spinlock2rwsem " Andrea Arcangeli
                                                         ` (2 more replies)
  2008-03-07 19:54                                     ` [PATCH] 2/4 move all invalidate_page outside of PT lock " Christoph Lameter
  1 sibling, 3 replies; 120+ messages in thread
From: Andrea Arcangeli @ 2008-03-07 15:23 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Jack Steiner, Nick Piggin, akpm, Robin Holt, Avi Kivity,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman

This combines the non-sleep-capable RCU locking of #v9 with a seqlock
so the mmu notifier fast path will require zero cacheline
writes/bouncing while still providing mmu_notifier_unregister and
allowing to schedule inside the mmu notifier methods. If we drop
mmu_notifier_unregister we can as well drop all seqlock and
rcu_read_lock()s. But this locking scheme combination is sexy enough
and 100% scalable (the mmu_notifier_list cacheline will be preloaded
anyway and that will most certainly include the sequence number value
in l1 for free even in Christoph's NUMA systems) so IMHO it worth to
keep mmu_notifier_unregister.

Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -10,6 +10,7 @@
 #include <linux/rbtree.h>
 #include <linux/rwsem.h>
 #include <linux/completion.h>
+#include <linux/seqlock.h>
 #include <asm/page.h>
 #include <asm/mmu.h>
 
@@ -230,6 +231,7 @@ struct mm_struct {
 #endif
 #ifdef CONFIG_MMU_NOTIFIER
 	struct hlist_head mmu_notifier_list;
+	seqlock_t mmu_notifier_lock;
 #endif
 };
 
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -130,6 +130,7 @@ static inline void mmu_notifier_mm_init(
 static inline void mmu_notifier_mm_init(struct mm_struct *mm)
 {
 	INIT_HLIST_HEAD(&mm->mmu_notifier_list);
+	seqlock_init(&mm->mmu_notifier_lock);
 }
 
 
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -20,7 +20,9 @@ void __mmu_notifier_release(struct mm_st
 void __mmu_notifier_release(struct mm_struct *mm)
 {
 	struct mmu_notifier *mn;
+	unsigned seq;
 
+	seq = read_seqbegin(&mm->mmu_notifier_lock);
 	while (unlikely(!hlist_empty(&mm->mmu_notifier_list))) {
 		mn = hlist_entry(mm->mmu_notifier_list.first,
 				 struct mmu_notifier,
@@ -28,6 +30,7 @@ void __mmu_notifier_release(struct mm_st
 		hlist_del(&mn->hlist);
 		if (mn->ops->release)
 			mn->ops->release(mn, mm);
+		BUG_ON(read_seqretry(&mm->mmu_notifier_lock, seq));
 	}
 }
 
@@ -42,11 +45,19 @@ int __mmu_notifier_clear_flush_young(str
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
 	int young = 0;
+	unsigned seq;
 
 	rcu_read_lock();
+restart:
+	seq = read_seqbegin(&mm->mmu_notifier_lock);
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_list, hlist) {
-		if (mn->ops->clear_flush_young)
+		if (mn->ops->clear_flush_young) {
+			rcu_read_unlock();
 			young |= mn->ops->clear_flush_young(mn, mm, address);
+			rcu_read_lock();
+		}
+		if (read_seqretry(&mm->mmu_notifier_lock, seq))
+			goto restart;
 	}
 	rcu_read_unlock();
 
@@ -58,11 +69,19 @@ void __mmu_notifier_invalidate_page(stru
 {
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
+	unsigned seq;
 
 	rcu_read_lock();
+restart:
+	seq = read_seqbegin(&mm->mmu_notifier_lock);
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_list, hlist) {
-		if (mn->ops->invalidate_page)
+		if (mn->ops->invalidate_page) {
+			rcu_read_unlock();
 			mn->ops->invalidate_page(mn, mm, address);
+			rcu_read_lock();
+		}
+		if (read_seqretry(&mm->mmu_notifier_lock, seq))
+			goto restart;
 	}
 	rcu_read_unlock();
 }
@@ -72,11 +91,19 @@ void __mmu_notifier_invalidate_range_beg
 {
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
+	unsigned seq;
 
 	rcu_read_lock();
+restart:
+	seq = read_seqbegin(&mm->mmu_notifier_lock);
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_list, hlist) {
-		if (mn->ops->invalidate_range_begin)
+		if (mn->ops->invalidate_range_begin) {
+			rcu_read_unlock();
 			mn->ops->invalidate_range_begin(mn, mm, start, end);
+			rcu_read_lock();
+		}
+		if (read_seqretry(&mm->mmu_notifier_lock, seq))
+			goto restart;
 	}
 	rcu_read_unlock();
 }
@@ -86,11 +113,19 @@ void __mmu_notifier_invalidate_range_end
 {
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
+	unsigned seq;
 
 	rcu_read_lock();
+restart:
+	seq = read_seqbegin(&mm->mmu_notifier_lock);
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_list, hlist) {
-		if (mn->ops->invalidate_range_end)
+		if (mn->ops->invalidate_range_end) {
+			rcu_read_unlock();
 			mn->ops->invalidate_range_end(mn, mm, start, end);
+			rcu_read_lock();
+		}
+		if (read_seqretry(&mm->mmu_notifier_lock, seq))
+			goto restart;
 	}
 	rcu_read_unlock();
 }
@@ -103,12 +138,20 @@ void __mmu_notifier_invalidate_range_end
  */
 void mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
 {
+	/* no need of seqlock for hlist_add_head_rcu */
 	hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_list);
 }
 EXPORT_SYMBOL_GPL(mmu_notifier_register);
 
 void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
 {
+	/*
+	 * The seqlock tracks if a hlist_del_rcu happens while a
+	 * notifier method is scheduling and in such a case the "mn"
+	 * memory may have been freed by the time the method returns.
+	 */
+	write_seqlock(&mm->mmu_notifier_lock);
 	hlist_del_rcu(&mn->hlist);
+	write_sequnlock(&mm->mmu_notifier_lock);
 }
 EXPORT_SYMBOL_GPL(mmu_notifier_unregister);

^ permalink raw reply	[flat|nested] 120+ messages in thread

* [PATCH] 4/4 i_mmap_lock spinlock2rwsem (#v9 was 1/4)
  2008-03-07 15:23                                     ` [PATCH] 3/4 combine RCU with seqlock to allow mmu notifier methods to sleep " Andrea Arcangeli
@ 2008-03-07 15:52                                       ` Andrea Arcangeli
  2008-03-07 20:03                                         ` Christoph Lameter
  2008-03-19 21:27                                         ` Christoph Lameter
  2008-03-07 16:52                                       ` [PATCH] 3/4 combine RCU with seqlock to allow mmu notifier methods to sleep " Peter Zijlstra
  2008-03-07 20:00                                       ` Christoph Lameter
  2 siblings, 2 replies; 120+ messages in thread
From: Andrea Arcangeli @ 2008-03-07 15:52 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Jack Steiner, Nick Piggin, akpm, Robin Holt, Avi Kivity,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman

This is a rediff of Christoph's plain i_mmap_lock2rwsem patch on top
of #v9 1/4 + 2/4 + 3/4 (hence this is called 4/4). This is mostly to
show that after 3/4, any patch that plugs on the EMM patchset will
plug nicely on top of my MMU notifer patchset too.

The patch trigger bug checks here in modprobe:

    BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);

kjournald starting.  Commit interval 5 seconds
EXT3-fs: mounted filesystem with ordered data mode.
VFS: Mounted root (ext3 filesystem) readonly.
Freeing unused kernel memory: 252k freed
------------[ cut here ]------------
kernel BUG at mm/mmap.c:2063!
invalid opcode: 0000 [1] SMP
CPU 0
Modules linked in:
Pid: 1123, comm: modprobe.sh Not tainted 2.6.25-rc3 #22
RIP: 0010:[<ffffffff80269368>]  [<ffffffff80269368>] exit_mmap+0xef/0xfa
RSP: 0000:ffff81003c79bed8  EFLAGS: 00010206
RAX: 0000000000000000 RBX: ffff810001004840 RCX: ffff81003c79bee0
RDX: 0000000000000000 RSI: ffff81003c5e8918 RDI: ffff81003d8048c0
RBP: 0000000000000000 R08: 0000000000000008 R09: ffff810002c00040
R10: 0000000000000002 R11: ffff810001009180 R12: ffff81003c57b800
R13: 0000000000000000 R14: 00000000005f0db0 R15: 00007fff3f2af234
FS:  00007f283714b6f0(0000) GS:ffffffff80694000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000458f40 CR3: 0000000000201000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process modprobe.sh (pid: 1123, threadinfo ffff81003c79a000, task ffff81003cf9ca50)
Stack:  0000000000000091 ffff810001004840 ffff81003c57b800 ffff81003c57b880
 0000000000000000 ffffffff8022f7bf 0000000000000001 0000000000000001
 ffff81003cf9ca50 ffffffff802349b6 0000000000000292 ffffffff80354c63
Call Trace:
 [<ffffffff8022f7bf>] mmput+0x30/0x9d
 [<ffffffff802349b6>] do_exit+0x223/0x66c
 [<ffffffff80354c63>] __up_read+0x13/0x8a
 [<ffffffff80234e6e>] do_group_exit+0x6f/0x8a
 [<ffffffff8020bd3b>] system_call_after_swapgs+0x7b/0x80


Code: 7b 18 e8 4a 5c 00 00 c7 43 08 00 00 00 00 eb 0b 48 89 ef e8 d1 fe ff ff 48 89 c5 48 85 ed 75 f0 49 modprobe.sh[1114]: segfault at 0 ip 7f998d2e972b sp 7fff959d8ed0 error 4 in libc-2.6.1.so[7f998d27b000+136000]


I didn't look into this but it shows how it would be risky to make
this change in .25. It's a bit strange that the bugcheck triggers
given I've preempt disabled (I mean CONFIG_PREEMPT_VOLUNTARY=y, nobody
should turn off that config option) and so even if code depended on
the implicit preempt_disable in spin_lock, no race should happen. The
down_read sections at first glance didn't seem capable of altering
nr_ptes, but I didn't look seriously into the above. I rediffed it
just to be 100% on par with EMM sleep-capabilities (but while
retaining more features and cleaner code I hope).

------------------
From: Christoph Lameter <clameter@sgi.com>
Subject: Conversion of i_mmap_lock to semaphore

Not there but the system boots and is usable. Complains about atomic
contexts because the tlb functions use a get_cpu() and thus disable preempt.

Not sure yet what to do about the cond_resched_lock stuff etc.


Convert i_mmap_lock to i_mmap_sem

The conversion to a rwsemaphore allows callbacks during rmap traversal
for files in a non atomic context. A rw style lock allows concurrent
walking of the reverse map.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

---
 arch/x86/mm/hugetlbpage.c |    4 ++--
 fs/hugetlbfs/inode.c      |    4 ++--
 fs/inode.c                |    2 +-
 include/linux/fs.h        |    2 +-
 include/linux/mm.h        |    2 +-
 kernel/fork.c             |    4 ++--
 mm/filemap.c              |    8 ++++----
 mm/filemap_xip.c          |    4 ++--
 mm/fremap.c               |    4 ++--
 mm/hugetlb.c              |   11 +++++------
 mm/memory.c               |   28 ++++++++--------------------
 mm/migrate.c              |    4 ++--
 mm/mmap.c                 |   16 ++++++++--------
 mm/mremap.c               |    4 ++--
 mm/rmap.c                 |   20 +++++++++-----------
 15 files changed, 51 insertions(+), 66 deletions(-)

diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -69,7 +69,7 @@ static void huge_pmd_share(struct mm_str
 	if (!vma_shareable(vma, addr))
 		return;
 
-	spin_lock(&mapping->i_mmap_lock);
+	down_read(&mapping->i_mmap_sem);
 	vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
 		if (svma == vma)
 			continue;
@@ -94,7 +94,7 @@ static void huge_pmd_share(struct mm_str
 		put_page(virt_to_page(spte));
 	spin_unlock(&mm->page_table_lock);
 out:
-	spin_unlock(&mapping->i_mmap_lock);
+	up_read(&mapping->i_mmap_sem);
 }
 
 /*
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -454,10 +454,10 @@ static int hugetlb_vmtruncate(struct ino
 	pgoff = offset >> PAGE_SHIFT;
 
 	i_size_write(inode, offset);
-	spin_lock(&mapping->i_mmap_lock);
+	down_read(&mapping->i_mmap_sem);
 	if (!prio_tree_empty(&mapping->i_mmap))
 		hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
-	spin_unlock(&mapping->i_mmap_lock);
+	up_read(&mapping->i_mmap_sem);
 	truncate_hugepages(inode, offset);
 	return 0;
 }
diff --git a/fs/inode.c b/fs/inode.c
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -210,7 +210,7 @@ void inode_init_once(struct inode *inode
 	INIT_LIST_HEAD(&inode->i_devices);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
 	rwlock_init(&inode->i_data.tree_lock);
-	spin_lock_init(&inode->i_data.i_mmap_lock);
+	init_rwsem(&inode->i_data.i_mmap_sem);
 	INIT_LIST_HEAD(&inode->i_data.private_list);
 	spin_lock_init(&inode->i_data.private_lock);
 	INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
diff --git a/include/linux/fs.h b/include/linux/fs.h
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -503,7 +503,7 @@ struct address_space {
 	unsigned int		i_mmap_writable;/* count VM_SHARED mappings */
 	struct prio_tree_root	i_mmap;		/* tree of private and shared mappings */
 	struct list_head	i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
-	spinlock_t		i_mmap_lock;	/* protect tree, count, list */
+	struct rw_semaphore	i_mmap_sem;	/* protect tree, count, list */
 	unsigned int		truncate_count;	/* Cover race condition with truncate */
 	unsigned long		nrpages;	/* number of total pages */
 	pgoff_t			writeback_index;/* writeback starts here */
diff --git a/include/linux/mm.h b/include/linux/mm.h
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -709,7 +709,7 @@ struct zap_details {
 	struct address_space *check_mapping;	/* Check page->mapping if set */
 	pgoff_t	first_index;			/* Lowest page->index to unmap */
 	pgoff_t last_index;			/* Highest page->index to unmap */
-	spinlock_t *i_mmap_lock;		/* For unmap_mapping_range: */
+	struct rw_semaphore *i_mmap_sem;	/* For unmap_mapping_range: */
 	unsigned long truncate_count;		/* Compare vm_truncate_count */
 };
 
diff --git a/kernel/fork.c b/kernel/fork.c
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -274,12 +274,12 @@ static int dup_mmap(struct mm_struct *mm
 				atomic_dec(&inode->i_writecount);
 
 			/* insert tmp into the share list, just after mpnt */
-			spin_lock(&file->f_mapping->i_mmap_lock);
+			down_write(&file->f_mapping->i_mmap_sem);
 			tmp->vm_truncate_count = mpnt->vm_truncate_count;
 			flush_dcache_mmap_lock(file->f_mapping);
 			vma_prio_tree_add(tmp, mpnt);
 			flush_dcache_mmap_unlock(file->f_mapping);
-			spin_unlock(&file->f_mapping->i_mmap_lock);
+			up_write(&file->f_mapping->i_mmap_sem);
 		}
 
 		/*
diff --git a/mm/filemap.c b/mm/filemap.c
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -62,16 +62,16 @@ generic_file_direct_IO(int rw, struct ki
 /*
  * Lock ordering:
  *
- *  ->i_mmap_lock		(vmtruncate)
+ *  ->i_mmap_sem		(vmtruncate)
  *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
  *      ->swap_lock		(exclusive_swap_page, others)
  *        ->mapping->tree_lock
  *
  *  ->i_mutex
- *    ->i_mmap_lock		(truncate->unmap_mapping_range)
+ *    ->i_mmap_sem		(truncate->unmap_mapping_range)
  *
  *  ->mmap_sem
- *    ->i_mmap_lock
+ *    ->i_mmap_sem
  *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
  *        ->mapping->tree_lock	(arch-dependent flush_dcache_mmap_lock)
  *
@@ -88,7 +88,7 @@ generic_file_direct_IO(int rw, struct ki
  *    ->sb_lock			(fs/fs-writeback.c)
  *    ->mapping->tree_lock	(__sync_single_inode)
  *
- *  ->i_mmap_lock
+ *  ->i_mmap_sem
  *    ->anon_vma.lock		(vma_adjust)
  *
  *  ->anon_vma.lock
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -184,7 +184,7 @@ __xip_unmap (struct address_space * mapp
 	if (!page)
 		return;
 
-	spin_lock(&mapping->i_mmap_lock);
+	down_read(&mapping->i_mmap_sem);
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 		mm = vma->vm_mm;
 		address = vma->vm_start +
@@ -204,7 +204,7 @@ __xip_unmap (struct address_space * mapp
 			page_cache_release(page);
 		}
 	}
-	spin_unlock(&mapping->i_mmap_lock);
+	up_read(&mapping->i_mmap_sem);
 }
 
 /*
diff --git a/mm/fremap.c b/mm/fremap.c
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -206,13 +206,13 @@ asmlinkage long sys_remap_file_pages(uns
 			}
 			goto out;
 		}
-		spin_lock(&mapping->i_mmap_lock);
+		down_write(&mapping->i_mmap_sem);
 		flush_dcache_mmap_lock(mapping);
 		vma->vm_flags |= VM_NONLINEAR;
 		vma_prio_tree_remove(vma, &mapping->i_mmap);
 		vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
 		flush_dcache_mmap_unlock(mapping);
-		spin_unlock(&mapping->i_mmap_lock);
+		up_write(&mapping->i_mmap_sem);
 	}
 
 	mmu_notifier_invalidate_range_begin(mm, start, start + size);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -746,7 +746,7 @@ void __unmap_hugepage_range(struct vm_ar
 	struct page *page;
 	struct page *tmp;
 	/*
-	 * A page gathering list, protected by per file i_mmap_lock. The
+	 * A page gathering list, protected by per file i_mmap_sem. The
 	 * lock is used to avoid list corruption from multiple unmapping
 	 * of the same page since we are using page->lru.
 	 */
@@ -796,9 +796,9 @@ void unmap_hugepage_range(struct vm_area
 	 * do nothing in this case.
 	 */
 	if (vma->vm_file) {
-		spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
+		down_write(&vma->vm_file->f_mapping->i_mmap_sem);
 		__unmap_hugepage_range(vma, start, end);
-		spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
+		up_write(&vma->vm_file->f_mapping->i_mmap_sem);
 	}
 }
 
@@ -1041,7 +1041,7 @@ void hugetlb_change_protection(struct vm
 	BUG_ON(address >= end);
 	flush_cache_range(vma, address, end);
 
-	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
+	down_read(&vma->vm_file->f_mapping->i_mmap_sem);
 	spin_lock(&mm->page_table_lock);
 	for (; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
@@ -1056,7 +1056,7 @@ void hugetlb_change_protection(struct vm
 		}
 	}
 	spin_unlock(&mm->page_table_lock);
-	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
+	up_read(&vma->vm_file->f_mapping->i_mmap_sem);
 
 	flush_tlb_range(vma, start, end);
 }
diff --git a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -832,7 +832,6 @@ unsigned long unmap_vmas(struct mmu_gath
 	unsigned long tlb_start = 0;	/* For tlb_finish_mmu */
 	int tlb_start_valid = 0;
 	unsigned long start = start_addr;
-	spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
 	int fullmm = (*tlbp)->fullmm;
 
 	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
@@ -870,21 +869,11 @@ unsigned long unmap_vmas(struct mmu_gath
 
 			tlb_finish_mmu(*tlbp, tlb_start, start);
 
-			if (need_resched() ||
-				(i_mmap_lock && spin_needbreak(i_mmap_lock))) {
-				if (i_mmap_lock) {
-					*tlbp = NULL;
-					goto out;
-				}
-				cond_resched();
-			}
-
 			*tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
 			tlb_start_valid = 0;
 			zap_work = ZAP_BLOCK_SIZE;
 		}
 	}
-out:
 	return start;	/* which is now the end (or restart) address */
 }
 
@@ -1746,7 +1735,7 @@ unwritable_page:
 /*
  * Helper functions for unmap_mapping_range().
  *
- * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
+ * __ Notes on dropping i_mmap_sem to reduce latency while unmapping __
  *
  * We have to restart searching the prio_tree whenever we drop the lock,
  * since the iterator is only valid while the lock is held, and anyway
@@ -1765,7 +1754,7 @@ unwritable_page:
  * can't efficiently keep all vmas in step with mapping->truncate_count:
  * so instead reset them all whenever it wraps back to 0 (then go to 1).
  * mapping->truncate_count and vma->vm_truncate_count are protected by
- * i_mmap_lock.
+ * i_mmap_sem.
  *
  * In order to make forward progress despite repeatedly restarting some
  * large vma, note the restart_addr from unmap_vmas when it breaks out:
@@ -1815,7 +1804,7 @@ again:
 
 	restart_addr = zap_page_range(vma, start_addr,
 					end_addr - start_addr, details);
-	need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
+	need_break = need_resched();
 
 	if (restart_addr >= end_addr) {
 		/* We have now completed this vma: mark it so */
@@ -1829,9 +1818,9 @@ again:
 			goto again;
 	}
 
-	spin_unlock(details->i_mmap_lock);
+	up_write(details->i_mmap_sem);
 	cond_resched();
-	spin_lock(details->i_mmap_lock);
+	down_write(details->i_mmap_sem);
 	return -EINTR;
 }
 
@@ -1925,9 +1914,9 @@ void unmap_mapping_range(struct address_
 	details.last_index = hba + hlen - 1;
 	if (details.last_index < details.first_index)
 		details.last_index = ULONG_MAX;
-	details.i_mmap_lock = &mapping->i_mmap_lock;
+	details.i_mmap_sem = &mapping->i_mmap_sem;
 
-	spin_lock(&mapping->i_mmap_lock);
+	down_write(&mapping->i_mmap_sem);
 
 	/* Protect against endless unmapping loops */
 	mapping->truncate_count++;
@@ -1942,7 +1931,7 @@ void unmap_mapping_range(struct address_
 		unmap_mapping_range_tree(&mapping->i_mmap, &details);
 	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
 		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
-	spin_unlock(&mapping->i_mmap_lock);
+	up_write(&mapping->i_mmap_sem);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
diff --git a/mm/migrate.c b/mm/migrate.c
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -202,12 +202,12 @@ static void remove_file_migration_ptes(s
 	if (!mapping)
 		return;
 
-	spin_lock(&mapping->i_mmap_lock);
+	down_read(&mapping->i_mmap_sem);
 
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
 		remove_migration_pte(vma, old, new);
 
-	spin_unlock(&mapping->i_mmap_lock);
+	up_read(&mapping->i_mmap_sem);
 }
 
 /*
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -187,7 +187,7 @@ error:
 }
 
 /*
- * Requires inode->i_mapping->i_mmap_lock
+ * Requires inode->i_mapping->i_mmap_sem
  */
 static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 		struct file *file, struct address_space *mapping)
@@ -215,9 +215,9 @@ void unlink_file_vma(struct vm_area_stru
 
 	if (file) {
 		struct address_space *mapping = file->f_mapping;
-		spin_lock(&mapping->i_mmap_lock);
+		down_write(&mapping->i_mmap_sem);
 		__remove_shared_vm_struct(vma, file, mapping);
-		spin_unlock(&mapping->i_mmap_lock);
+		up_write(&mapping->i_mmap_sem);
 	}
 }
 
@@ -440,7 +440,7 @@ static void vma_link(struct mm_struct *m
 		mapping = vma->vm_file->f_mapping;
 
 	if (mapping) {
-		spin_lock(&mapping->i_mmap_lock);
+		down_write(&mapping->i_mmap_sem);
 		vma->vm_truncate_count = mapping->truncate_count;
 	}
 	anon_vma_lock(vma);
@@ -450,7 +450,7 @@ static void vma_link(struct mm_struct *m
 
 	anon_vma_unlock(vma);
 	if (mapping)
-		spin_unlock(&mapping->i_mmap_lock);
+		up_write(&mapping->i_mmap_sem);
 
 	mm->map_count++;
 	validate_mm(mm);
@@ -537,7 +537,7 @@ again:			remove_next = 1 + (end > next->
 		mapping = file->f_mapping;
 		if (!(vma->vm_flags & VM_NONLINEAR))
 			root = &mapping->i_mmap;
-		spin_lock(&mapping->i_mmap_lock);
+		down_write(&mapping->i_mmap_sem);
 		if (importer &&
 		    vma->vm_truncate_count != next->vm_truncate_count) {
 			/*
@@ -621,7 +621,7 @@ again:			remove_next = 1 + (end > next->
 	if (anon_vma)
 		spin_unlock(&anon_vma->lock);
 	if (mapping)
-		spin_unlock(&mapping->i_mmap_lock);
+		up_write(&mapping->i_mmap_sem);
 
 	if (remove_next) {
 		if (file)
@@ -2065,7 +2065,7 @@ void exit_mmap(struct mm_struct *mm)
 
 /* Insert vm structure into process list sorted by address
  * and into the inode's i_mmap tree.  If vm_file is non-NULL
- * then i_mmap_lock is taken here.
+ * then i_mmap_sem is taken here.
  */
 int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 {
diff --git a/mm/mremap.c b/mm/mremap.c
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -85,7 +85,7 @@ static void move_ptes(struct vm_area_str
 		 * and we propagate stale pages into the dst afterward.
 		 */
 		mapping = vma->vm_file->f_mapping;
-		spin_lock(&mapping->i_mmap_lock);
+		down_write(&mapping->i_mmap_sem);
 		if (new_vma->vm_truncate_count &&
 		    new_vma->vm_truncate_count != vma->vm_truncate_count)
 			new_vma->vm_truncate_count = 0;
@@ -121,7 +121,7 @@ static void move_ptes(struct vm_area_str
 	pte_unmap_nested(new_pte - 1);
 	pte_unmap_unlock(old_pte - 1, old_ptl);
 	if (mapping)
-		spin_unlock(&mapping->i_mmap_lock);
+		up_write(&mapping->i_mmap_sem);
 }
 
 #define LATENCY_LIMIT	(64 * PAGE_SIZE)
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -24,7 +24,7 @@
  *   inode->i_alloc_sem (vmtruncate_range)
  *   mm->mmap_sem
  *     page->flags PG_locked (lock_page)
- *       mapping->i_mmap_lock
+ *       mapping->i_mmap_sem
  *         anon_vma->lock
  *           mm->page_table_lock or pte_lock
  *             zone->lru_lock (in mark_page_accessed, isolate_lru_page)
@@ -372,14 +372,14 @@ static int page_referenced_file(struct p
 	 * The page lock not only makes sure that page->mapping cannot
 	 * suddenly be NULLified by truncation, it makes sure that the
 	 * structure at mapping cannot be freed and reused yet,
-	 * so we can safely take mapping->i_mmap_lock.
+	 * so we can safely take mapping->i_mmap_sem.
 	 */
 	BUG_ON(!PageLocked(page));
 
-	spin_lock(&mapping->i_mmap_lock);
+	down_read(&mapping->i_mmap_sem);
 
 	/*
-	 * i_mmap_lock does not stabilize mapcount at all, but mapcount
+	 * i_mmap_sem does not stabilize mapcount at all, but mapcount
 	 * is more likely to be accurate if we note it after spinning.
 	 */
 	mapcount = page_mapcount(page);
@@ -402,7 +402,7 @@ static int page_referenced_file(struct p
 			break;
 	}
 
-	spin_unlock(&mapping->i_mmap_lock);
+	up_read(&mapping->i_mmap_sem);
 	return referenced;
 }
 
@@ -487,12 +487,12 @@ static int page_mkclean_file(struct addr
 
 	BUG_ON(PageAnon(page));
 
-	spin_lock(&mapping->i_mmap_lock);
+	down_read(&mapping->i_mmap_sem);
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 		if (vma->vm_flags & VM_SHARED)
 			ret += page_mkclean_one(page, vma);
 	}
-	spin_unlock(&mapping->i_mmap_lock);
+	up_read(&mapping->i_mmap_sem);
 	return ret;
 }
 
@@ -924,7 +924,7 @@ static int try_to_unmap_file(struct page
 	unsigned long max_nl_size = 0;
 	unsigned int mapcount;
 
-	spin_lock(&mapping->i_mmap_lock);
+	down_read(&mapping->i_mmap_sem);
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 		ret = try_to_unmap_one(page, vma, migration);
 		if (ret == SWAP_FAIL || !page_mapped(page))
@@ -961,7 +961,6 @@ static int try_to_unmap_file(struct page
 	mapcount = page_mapcount(page);
 	if (!mapcount)
 		goto out;
-	cond_resched_lock(&mapping->i_mmap_lock);
 
 	max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
 	if (max_nl_cursor == 0)
@@ -983,7 +982,6 @@ static int try_to_unmap_file(struct page
 			}
 			vma->vm_private_data = (void *) max_nl_cursor;
 		}
-		cond_resched_lock(&mapping->i_mmap_lock);
 		max_nl_cursor += CLUSTER_SIZE;
 	} while (max_nl_cursor <= max_nl_size);
 
@@ -995,7 +993,7 @@ static int try_to_unmap_file(struct page
 	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
 		vma->vm_private_data = NULL;
 out:
-	spin_unlock(&mapping->i_mmap_lock);
+	up_write(&mapping->i_mmap_sem);
 	return ret;
 }
 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] 3/4 combine RCU with seqlock to allow mmu notifier methods to sleep (#v9 was 1/4)
  2008-03-07 15:23                                     ` [PATCH] 3/4 combine RCU with seqlock to allow mmu notifier methods to sleep " Andrea Arcangeli
  2008-03-07 15:52                                       ` [PATCH] 4/4 i_mmap_lock spinlock2rwsem " Andrea Arcangeli
@ 2008-03-07 16:52                                       ` Peter Zijlstra
  2008-03-07 17:50                                         ` Andrea Arcangeli
  2008-03-07 20:00                                       ` Christoph Lameter
  2 siblings, 1 reply; 120+ messages in thread
From: Peter Zijlstra @ 2008-03-07 16:52 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Christoph Lameter, Jack Steiner, Nick Piggin, akpm, Robin Holt,
	Avi Kivity, kvm-devel, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman

On Fri, 2008-03-07 at 16:23 +0100, Andrea Arcangeli wrote:


> @@ -42,11 +45,19 @@ int __mmu_notifier_clear_flush_young(str
>  	struct mmu_notifier *mn;
>  	struct hlist_node *n;
>  	int young = 0;
> +	unsigned seq;
>  
>  	rcu_read_lock();
> +restart:
> +	seq = read_seqbegin(&mm->mmu_notifier_lock);
>  	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_list, hlist) {
> -		if (mn->ops->clear_flush_young)
> +		if (mn->ops->clear_flush_young) {

hlist_del_rcu(&mn->hlist)

> +			rcu_read_unlock();

kfree(mn);

>  			young |= mn->ops->clear_flush_young(mn, mm, address);

*BANG*

> +			rcu_read_lock();
> +		}
> +		if (read_seqretry(&mm->mmu_notifier_lock, seq))
> +			goto restart;
>  	}
>  	rcu_read_unlock();




^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] 3/4 combine RCU with seqlock to allow mmu notifier methods to sleep (#v9 was 1/4)
  2008-03-07 16:52                                       ` [PATCH] 3/4 combine RCU with seqlock to allow mmu notifier methods to sleep " Peter Zijlstra
@ 2008-03-07 17:50                                         ` Andrea Arcangeli
  2008-03-07 18:01                                           ` Peter Zijlstra
  2008-03-07 20:10                                           ` Christoph Lameter
  0 siblings, 2 replies; 120+ messages in thread
From: Andrea Arcangeli @ 2008-03-07 17:50 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Christoph Lameter, Jack Steiner, Nick Piggin, akpm, Robin Holt,
	Avi Kivity, kvm-devel, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman

On Fri, Mar 07, 2008 at 05:52:42PM +0100, Peter Zijlstra wrote:
> hlist_del_rcu(&mn->hlist)
> 
> > +			rcu_read_unlock();
> 
> kfree(mn);
> 
> >  			young |= mn->ops->clear_flush_young(mn, mm, address);
> 
> *BANG*

My objective was to allow mmu_notifier_register/unregister to be
called with the same mmu notifier object, I didn't mean the object
could have been freed until ->release is called. However you reminded
me that after unregistering ->release won't be called so unregister
isn't very useful and I doubt we can keep it ;).

In the meantime I've also been thinking that we could need the
write_seqlock in mmu_notifier_register, to know when to restart the
loop if somebody does a mmu_notifier_register;
synchronize_rcu(). Otherwise there's no way to be sure the mmu
notifier will start firing immediately after synchronize_rcu. I'm
unsure if it's acceptable that in-progress mmu notifier invocations,
don't need to notice the fact that somebody did mmu_notifier_register;
synchronize_rcu. If they don't need to notice, then we can just drop
unregister and all rcu_read_lock()s instead of adding write_seqlock to
the register operation.

Overall my effort is to try to avoid expand the list walk with
explicit memory barriers like in EMM while trying to be equally
efficient.

Another issue is that the _begin/_end logic doesn't provide any
guarantee that the _begin will start firing before _end, if a kernel
module is loaded while another cpu is already running inside some
munmap operation etc.. The KVM usage of mmu notifier has no problem
with that detail, but KVM doesn't use _begin at all, I wonder if
others would have problems. This is a kind of a separate problem, but
quite related to the question if the notifiers must be guaranteed to
start firing immediately after mmu_notifier_unregister;synchronize_rcu
or not, that's why I mentioned it here.

Once I get comments on the suggested direction for these details, I'll
quickly repost a replacement patch for 3/4.

Thanks Peter!

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] 3/4 combine RCU with seqlock to allow mmu notifier methods to sleep (#v9 was 1/4)
  2008-03-07 17:50                                         ` Andrea Arcangeli
@ 2008-03-07 18:01                                           ` Peter Zijlstra
  2008-03-07 18:45                                             ` Andrea Arcangeli
  2008-03-07 20:10                                           ` Christoph Lameter
  1 sibling, 1 reply; 120+ messages in thread
From: Peter Zijlstra @ 2008-03-07 18:01 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Christoph Lameter, Jack Steiner, Nick Piggin, akpm, Robin Holt,
	Avi Kivity, kvm-devel, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman

On Fri, 2008-03-07 at 18:50 +0100, Andrea Arcangeli wrote:

> Overall my effort is to try to avoid expand the list walk with
> explicit memory barriers like in EMM while trying to be equally
> efficient.

I think we can do with a smb_wmb(); like Christoph (and like
hlist_add_rcu()), but replace the smb_rmb() Christoph has with a
smp_read_barrier_depends().

That should give much the same results.

The reason Christoph can do without RCU is because he doesn't allow
unregister, and as soon as you drop that you'll end up with something
similar.

> Another issue is that the _begin/_end logic doesn't provide any
> guarantee that the _begin will start firing before _end, if a kernel
> module is loaded while another cpu is already running inside some
> munmap operation etc.. The KVM usage of mmu notifier has no problem
> with that detail, but KVM doesn't use _begin at all, I wonder if
> others would have problems. This is a kind of a separate problem, but
> quite related to the question if the notifiers must be guaranteed to
> start firing immediately after mmu_notifier_unregister;synchronize_rcu
> or not, that's why I mentioned it here.

Curious problem indeed. Would it make sense to require registering these
MMU notifiers when the process is still single threaded along with the
requirement that they can never be removed again from a running process?

For KVM this should be quite doable, but I must admit I haven't been
paying enough attention to know if its possible for these other users.



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] 3/4 combine RCU with seqlock to allow mmu notifier methods to sleep (#v9 was 1/4)
  2008-03-07 18:01                                           ` Peter Zijlstra
@ 2008-03-07 18:45                                             ` Andrea Arcangeli
  2008-03-07 19:47                                               ` Andrea Arcangeli
  2008-03-07 20:12                                               ` Christoph Lameter
  0 siblings, 2 replies; 120+ messages in thread
From: Andrea Arcangeli @ 2008-03-07 18:45 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Christoph Lameter, Jack Steiner, Nick Piggin, akpm, Robin Holt,
	Avi Kivity, kvm-devel, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman

On Fri, Mar 07, 2008 at 07:01:35PM +0100, Peter Zijlstra wrote:
> The reason Christoph can do without RCU is because he doesn't allow
> unregister, and as soon as you drop that you'll end up with something

Not sure to follow, what do you mean "he doesn't allow"? We'll also
have to rip unregister regardless after you pointed out the ->release
won't be called after calling my mmu_notifier_unregister in 3/4. If
you figured out how to retain mmu_notifier_unregister I'm not seeing
it anymore.

> Curious problem indeed. Would it make sense to require registering these
> MMU notifiers when the process is still single threaded along with the
> requirement that they can never be removed again from a running process?

I'm afraid that won't help much (even if the mmu notifiers users could
cope with that restriction like KVM can) because the VM will run
concurrently in another CPU despite the task is single threaded. See
2/4 in try_to_unmap_cluster: _start/end are not only invoked in the
context of the current task.

PS. this problem I pointed out of _end possibly called before _begin
is the same for #v9 and EMM V1 as far as I can tell.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] 3/4 combine RCU with seqlock to allow mmu notifier methods to sleep (#v9 was 1/4)
  2008-03-07 18:45                                             ` Andrea Arcangeli
@ 2008-03-07 19:47                                               ` Andrea Arcangeli
  2008-03-07 20:15                                                 ` Christoph Lameter
  2008-03-07 20:12                                               ` Christoph Lameter
  1 sibling, 1 reply; 120+ messages in thread
From: Andrea Arcangeli @ 2008-03-07 19:47 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Christoph Lameter, Jack Steiner, Nick Piggin, akpm, Robin Holt,
	Avi Kivity, kvm-devel, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman

On Fri, Mar 07, 2008 at 07:45:52PM +0100, Andrea Arcangeli wrote:
> On Fri, Mar 07, 2008 at 07:01:35PM +0100, Peter Zijlstra wrote:
> > The reason Christoph can do without RCU is because he doesn't allow
> > unregister, and as soon as you drop that you'll end up with something
> 
> Not sure to follow, what do you mean "he doesn't allow"? We'll also
> have to rip unregister regardless after you pointed out the ->release
> won't be called after calling my mmu_notifier_unregister in 3/4. If
> you figured out how to retain mmu_notifier_unregister I'm not seeing
> it anymore.

Given I don't see other (buggy ;) ways anymore to retain
mmu_notifier_unregister, I did like in EMM and I dropped the
unregister function.

To me it looks like this will be enough and equally efficient as the
expanded version in EMM that is not using the highlevel hlist_rcu
macros. If you can see any pitfall let me know! Thanks a lot for the
help.

------
This is a replacement for the previously posted 3/4, one of the pieces
to allow the mmu notifier methods to sleep.

Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -70,17 +70,6 @@ static inline int mm_has_notifiers(struc
  */
 extern void mmu_notifier_register(struct mmu_notifier *mn,
 				  struct mm_struct *mm);
-/*
- * Must hold the mmap_sem for write.
- *
- * RCU is used to traverse the list. A quiescent period needs to pass
- * before the "struct mmu_notifier" can be freed. Alternatively it
- * can be synchronously freed inside ->release when the list can't
- * change anymore and nobody could possibly walk it.
- */
-extern void mmu_notifier_unregister(struct mmu_notifier *mn,
-				    struct mm_struct *mm);
-
 extern void __mmu_notifier_release(struct mm_struct *mm);
 extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
 					  unsigned long address);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -43,12 +43,10 @@ int __mmu_notifier_clear_flush_young(str
 	struct hlist_node *n;
 	int young = 0;
 
-	rcu_read_lock();
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_list, hlist) {
 		if (mn->ops->clear_flush_young)
 			young |= mn->ops->clear_flush_young(mn, mm, address);
 	}
-	rcu_read_unlock();
 
 	return young;
 }
@@ -59,12 +57,10 @@ void __mmu_notifier_invalidate_page(stru
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
 
-	rcu_read_lock();
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_list, hlist) {
 		if (mn->ops->invalidate_page)
 			mn->ops->invalidate_page(mn, mm, address);
 	}
-	rcu_read_unlock();
 }
 
 void __mmu_notifier_invalidate_range_begin(struct mm_struct *mm,
@@ -73,12 +69,10 @@ void __mmu_notifier_invalidate_range_beg
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
 
-	rcu_read_lock();
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_list, hlist) {
 		if (mn->ops->invalidate_range_begin)
 			mn->ops->invalidate_range_begin(mn, mm, start, end);
 	}
-	rcu_read_unlock();
 }
 
 void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
@@ -87,12 +81,10 @@ void __mmu_notifier_invalidate_range_end
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
 
-	rcu_read_lock();
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_list, hlist) {
 		if (mn->ops->invalidate_range_end)
 			mn->ops->invalidate_range_end(mn, mm, start, end);
 	}
-	rcu_read_unlock();
 }
 
 /*
@@ -106,9 +98,3 @@ void mmu_notifier_register(struct mmu_no
 	hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_list);
 }
 EXPORT_SYMBOL_GPL(mmu_notifier_register);
-
-void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
-{
-	hlist_del_rcu(&mn->hlist);
-}
-EXPORT_SYMBOL_GPL(mmu_notifier_unregister);

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] 2/4 move all invalidate_page outside of PT lock (#v9 was 1/4)
  2008-03-07 15:17                                   ` [PATCH] 2/4 move all invalidate_page outside of PT lock (#v9 was 1/4) Andrea Arcangeli
  2008-03-07 15:23                                     ` [PATCH] 3/4 combine RCU with seqlock to allow mmu notifier methods to sleep " Andrea Arcangeli
@ 2008-03-07 19:54                                     ` Christoph Lameter
  1 sibling, 0 replies; 120+ messages in thread
From: Christoph Lameter @ 2008-03-07 19:54 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jack Steiner, Nick Piggin, akpm, Robin Holt, Avi Kivity,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman

On Fri, 7 Mar 2008, Andrea Arcangeli wrote:

> This below simple patch invalidates the "invalidate_page" part, the
> next patch will invalidate the RCU part, and btw in a way that doesn't
> forbid unregistering the mmu notifiers at runtime (like your brand new
> EMM does).

Sounds good.

> The reason I keep this incremental (unlike your EMM that does
> everything all at the same time mixed in a single patch) is to
> decrease the non obviously safe mangling over mm/* during .25. The
> below patch is simple, but not as obviously safe as
> s/ptep_clear_flush/ptep_clear_flush_notify/.

There was never a chance to merge for .25. Lets drop that and focus on 
a solution that is good for all.

>  #endif /* _LINUX_MMU_NOTIFIER_H */
> diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
> --- a/mm/filemap_xip.c
> +++ b/mm/filemap_xip.c
> @@ -194,11 +194,13 @@ __xip_unmap (struct address_space * mapp
>  		if (pte) {
>  			/* Nuke the page table entry. */
>  			flush_cache_page(vma, address, pte_pfn(*pte));
> -			pteval = ptep_clear_flush_notify(vma, address, pte);
> +			pteval = ptep_clear_flush(vma, address, pte);
>  			page_remove_rmap(page, vma);
>  			dec_mm_counter(mm, file_rss);
>  			BUG_ON(pte_dirty(pteval));
>  			pte_unmap_unlock(pte, ptl);
> +			/* must invalidate_page _before_ freeing the page */
> +			mmu_notifier_invalidate_page(mm, address);
>  			page_cache_release(page);
>  		}
>  	}

Ok but we still hold the i_mmap_lock here.


> @@ -834,6 +846,8 @@ static void try_to_unmap_cluster(unsigne
>  	if (!pmd_present(*pmd))
>  		return;
>  
> +	start = address;
> +	mmu_notifier_invalidate_range_begin(mm, start, end);

Hmmmm.. Okay you going for range invalidate here like EMM but there are 
still some invalidate_pages() left.


^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] 3/4 combine RCU with seqlock to allow mmu notifier methods to sleep (#v9 was 1/4)
  2008-03-07 15:23                                     ` [PATCH] 3/4 combine RCU with seqlock to allow mmu notifier methods to sleep " Andrea Arcangeli
  2008-03-07 15:52                                       ` [PATCH] 4/4 i_mmap_lock spinlock2rwsem " Andrea Arcangeli
  2008-03-07 16:52                                       ` [PATCH] 3/4 combine RCU with seqlock to allow mmu notifier methods to sleep " Peter Zijlstra
@ 2008-03-07 20:00                                       ` Christoph Lameter
  2 siblings, 0 replies; 120+ messages in thread
From: Christoph Lameter @ 2008-03-07 20:00 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jack Steiner, Nick Piggin, akpm, Robin Holt, Avi Kivity,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman

On Fri, 7 Mar 2008, Andrea Arcangeli wrote:

> This combines the non-sleep-capable RCU locking of #v9 with a seqlock
> so the mmu notifier fast path will require zero cacheline
> writes/bouncing while still providing mmu_notifier_unregister and
> allowing to schedule inside the mmu notifier methods. If we drop
> mmu_notifier_unregister we can as well drop all seqlock and
> rcu_read_lock()s. But this locking scheme combination is sexy enough
> and 100% scalable (the mmu_notifier_list cacheline will be preloaded
> anyway and that will most certainly include the sequence number value
> in l1 for free even in Christoph's NUMA systems) so IMHO it worth to
> keep mmu_notifier_unregister.

Well its adds lots of processing. Not sure if its really worth it. Seems 
that this scheme cannot work since the existence of the structure passed 
to the callbacks is not guaranteed since the RCU locks are not held. You 
need some kind of a refcount to give the existence guarantee.

> diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
> --- a/mm/mmu_notifier.c
> +++ b/mm/mmu_notifier.c
> @@ -20,7 +20,9 @@ void __mmu_notifier_release(struct mm_st
>  void __mmu_notifier_release(struct mm_struct *mm)
>  {
>  	struct mmu_notifier *mn;
> +	unsigned seq;
>  
> +	seq = read_seqbegin(&mm->mmu_notifier_lock);
>  	while (unlikely(!hlist_empty(&mm->mmu_notifier_list))) {
>  		mn = hlist_entry(mm->mmu_notifier_list.first,
>  				 struct mmu_notifier,
> @@ -28,6 +30,7 @@ void __mmu_notifier_release(struct mm_st
>  		hlist_del(&mn->hlist);
>  		if (mn->ops->release)
>  			mn->ops->release(mn, mm);
> +		BUG_ON(read_seqretry(&mm->mmu_notifier_lock, seq));
>  	}
>  }

So this is only for sanity checking? The BUG_ON detects concurrent 
operations that should not happen? Need a comment here.


> @@ -42,11 +45,19 @@ int __mmu_notifier_clear_flush_young(str
>  	struct mmu_notifier *mn;
>  	struct hlist_node *n;
>  	int young = 0;
> +	unsigned seq;
>  
>  	rcu_read_lock();
> +restart:
> +	seq = read_seqbegin(&mm->mmu_notifier_lock);
>  	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_list, hlist) {
> -		if (mn->ops->clear_flush_young)
> +		if (mn->ops->clear_flush_young) {
> +			rcu_read_unlock();
>  			young |= mn->ops->clear_flush_young(mn, mm, address);
> +			rcu_read_lock();
> +		}
> +		if (read_seqretry(&mm->mmu_notifier_lock, seq))
> +			goto restart;

Great innovative idea of the seqlock for versioning checks.

>  	}
>  	rcu_read_unlock();
>  

Well that gets pretty sophisticated here. If you drop the rcu lock then 
the entity pointed to by mn can go away right? So how can you pass that 
structure to clear_flush_young? What is guaranteeing the existence of the 
structure?


> @@ -58,11 +69,19 @@ void __mmu_notifier_invalidate_page(stru
>  {
>  	struct mmu_notifier *mn;
>  	struct hlist_node *n;
> +	unsigned seq;
>  
>  	rcu_read_lock();
> +restart:
> +	seq = read_seqbegin(&mm->mmu_notifier_lock);
>  	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_list, hlist) {
> -		if (mn->ops->invalidate_page)
> +		if (mn->ops->invalidate_page) {
> +			rcu_read_unlock();
>  			mn->ops->invalidate_page(mn, mm, address);

Ditto structure can vanish since no existence guarantee exists.



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] 4/4 i_mmap_lock spinlock2rwsem (#v9 was 1/4)
  2008-03-07 15:52                                       ` [PATCH] 4/4 i_mmap_lock spinlock2rwsem " Andrea Arcangeli
@ 2008-03-07 20:03                                         ` Christoph Lameter
  2008-03-19 21:27                                         ` Christoph Lameter
  1 sibling, 0 replies; 120+ messages in thread
From: Christoph Lameter @ 2008-03-07 20:03 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jack Steiner, Nick Piggin, akpm, Robin Holt, Avi Kivity,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman

On Fri, 7 Mar 2008, Andrea Arcangeli wrote:

> I didn't look into this but it shows how it would be risky to make
> this change in .25. It's a bit strange that the bugcheck triggers

Yes this was never intended for .25. I think we need to split this into a 
copule of patches. One needs to get rid of the spinlock dropping, then one 
that deals with the read concurrency issues and finally one that converts 
the spinlock. Thanks for looking at it.


^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] 3/4 combine RCU with seqlock to allow mmu notifier methods to sleep (#v9 was 1/4)
  2008-03-07 17:50                                         ` Andrea Arcangeli
  2008-03-07 18:01                                           ` Peter Zijlstra
@ 2008-03-07 20:10                                           ` Christoph Lameter
  1 sibling, 0 replies; 120+ messages in thread
From: Christoph Lameter @ 2008-03-07 20:10 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Peter Zijlstra, Jack Steiner, Nick Piggin, akpm, Robin Holt,
	Avi Kivity, kvm-devel, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman

On Fri, 7 Mar 2008, Andrea Arcangeli wrote:

> In the meantime I've also been thinking that we could need the
> write_seqlock in mmu_notifier_register, to know when to restart the
> loop if somebody does a mmu_notifier_register;
> synchronize_rcu(). Otherwise there's no way to be sure the mmu
> notifier will start firing immediately after synchronize_rcu. I'm
> unsure if it's acceptable that in-progress mmu notifier invocations,
> don't need to notice the fact that somebody did mmu_notifier_register;
> synchronize_rcu. If they don't need to notice, then we can just drop
> unregister and all rcu_read_lock()s instead of adding write_seqlock to
> the register operation.

This is all getting into some very complicated issues.....

> Overall my effort is to try to avoid expand the list walk with
> explicit memory barriers like in EMM while trying to be equally
> efficient.

The smp_rmb is such a big problem? You have seqlock, rcu etc all in there 
as well. I doubt that this is more efficient.

> Another issue is that the _begin/_end logic doesn't provide any
> guarantee that the _begin will start firing before _end, if a kernel
> module is loaded while another cpu is already running inside some
> munmap operation etc.. The KVM usage of mmu notifier has no problem
> with that detail, but KVM doesn't use _begin at all, I wonder if
> others would have problems. This is a kind of a separate problem, but
> quite related to the question if the notifiers must be guaranteed to
> start firing immediately after mmu_notifier_unregister;synchronize_rcu
> or not, that's why I mentioned it here.

Ahh. Yes that is an interesting issue. If a device driver cannot handle 
this then _begin must prohibit module loading. That means not allowing 
stop_machine_run I guess which should not be that difficult.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] 3/4 combine RCU with seqlock to allow mmu notifier methods to sleep (#v9 was 1/4)
  2008-03-07 18:45                                             ` Andrea Arcangeli
  2008-03-07 19:47                                               ` Andrea Arcangeli
@ 2008-03-07 20:12                                               ` Christoph Lameter
  1 sibling, 0 replies; 120+ messages in thread
From: Christoph Lameter @ 2008-03-07 20:12 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Peter Zijlstra, Jack Steiner, Nick Piggin, akpm, Robin Holt,
	Avi Kivity, kvm-devel, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman

On Fri, 7 Mar 2008, Andrea Arcangeli wrote:

> PS. this problem I pointed out of _end possibly called before _begin
> is the same for #v9 and EMM V1 as far as I can tell.

Hmmm.. We could just push that on the driver saying that is has to 
tolerate it. Otherwise how can we solve this?


^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] 3/4 combine RCU with seqlock to allow mmu notifier methods to sleep (#v9 was 1/4)
  2008-03-07 19:47                                               ` Andrea Arcangeli
@ 2008-03-07 20:15                                                 ` Christoph Lameter
  0 siblings, 0 replies; 120+ messages in thread
From: Christoph Lameter @ 2008-03-07 20:15 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Peter Zijlstra, Jack Steiner, Nick Piggin, akpm, Robin Holt,
	Avi Kivity, kvm-devel, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman

On Fri, 7 Mar 2008, Andrea Arcangeli wrote:

> This is a replacement for the previously posted 3/4, one of the pieces
> to allow the mmu notifier methods to sleep.

Looks good. That is what we talked about last week. What guarantees now 
that we see the cacheline referenced after the cacheline that 
contains the pointer that was changed? hlist_for_reach does a 
rcu_dereference with implied memory barrier? So its like EMM?

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] 4/4 i_mmap_lock spinlock2rwsem (#v9 was 1/4)
  2008-03-07 15:52                                       ` [PATCH] 4/4 i_mmap_lock spinlock2rwsem " Andrea Arcangeli
  2008-03-07 20:03                                         ` Christoph Lameter
@ 2008-03-19 21:27                                         ` Christoph Lameter
  1 sibling, 0 replies; 120+ messages in thread
From: Christoph Lameter @ 2008-03-19 21:27 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jack Steiner, Nick Piggin, akpm, Robin Holt, Avi Kivity,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, linux-kernel, linux-mm, daniel.blueman

You need this patch to address the issues (that I already mentioned when I 
sent the patch to you). New EMM notifier patch with sleeping coming soon.

From: Christoph Lameter <clameter@sgi.com>
Subject: Move tlb flushing into free_pgtables

Move the tlb flushing into free_pgtables. The conversion of the locks
taken for reverse map scanning would require taking sleeping locks
in free_pgtables. Moving the tlb flushing into free_pgtables allows
sleeping in part of free_pgtables().

Signed-off-by: Christoph Lameter <clameter@sgi.com>

---
 include/linux/mm.h |    4 ++--
 mm/memory.c        |   14 ++++++++++----
 mm/mmap.c          |    6 +++---
 3 files changed, 15 insertions(+), 9 deletions(-)

Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h	2008-03-19 13:30:51.460856986 -0700
+++ linux-2.6/include/linux/mm.h	2008-03-19 13:31:20.809377398 -0700
@@ -751,8 +751,8 @@ int walk_page_range(const struct mm_stru
 		    void *private);
 void free_pgd_range(struct mmu_gather **tlb, unsigned long addr,
 		unsigned long end, unsigned long floor, unsigned long ceiling);
-void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma,
-		unsigned long floor, unsigned long ceiling);
+void free_pgtables(struct vm_area_struct *start_vma, unsigned long floor,
+						unsigned long ceiling);
 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
 			struct vm_area_struct *vma);
 void unmap_mapping_range(struct address_space *mapping,
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c	2008-03-19 13:29:06.007351495 -0700
+++ linux-2.6/mm/memory.c	2008-03-19 13:46:31.352774359 -0700
@@ -271,9 +271,11 @@ void free_pgd_range(struct mmu_gather **
 	} while (pgd++, addr = next, addr != end);
 }
 
-void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
-		unsigned long floor, unsigned long ceiling)
+void free_pgtables(struct vm_area_struct *vma, unsigned long floor,
+							unsigned long ceiling)
 {
+	struct mmu_gather *tlb;
+
 	while (vma) {
 		struct vm_area_struct *next = vma->vm_next;
 		unsigned long addr = vma->vm_start;
@@ -285,8 +287,10 @@ void free_pgtables(struct mmu_gather **t
 		unlink_file_vma(vma);
 
 		if (is_vm_hugetlb_page(vma)) {
-			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
+			tlb = tlb_gather_mmu(vma->vm_mm, 0);
+			hugetlb_free_pgd_range(&tlb, addr, vma->vm_end,
 				floor, next? next->vm_start: ceiling);
+			tlb_finish_mmu(tlb, addr, vma->vm_end);
 		} else {
 			/*
 			 * Optimization: gather nearby vmas into one call down
@@ -298,8 +302,10 @@ void free_pgtables(struct mmu_gather **t
 				anon_vma_unlink(vma);
 				unlink_file_vma(vma);
 			}
-			free_pgd_range(tlb, addr, vma->vm_end,
+			tlb = tlb_gather_mmu(vma->vm_mm, 0);
+			free_pgd_range(&tlb, addr, vma->vm_end,
 				floor, next? next->vm_start: ceiling);
+			tlb_finish_mmu(tlb, addr, vma->vm_end);
 		}
 		vma = next;
 	}
Index: linux-2.6/mm/mmap.c
===================================================================
--- linux-2.6.orig/mm/mmap.c	2008-03-19 13:29:48.659889667 -0700
+++ linux-2.6/mm/mmap.c	2008-03-19 13:30:36.296604891 -0700
@@ -1750,9 +1750,9 @@ static void unmap_region(struct mm_struc
 	update_hiwater_rss(mm);
 	unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
-	free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
-				 next? next->vm_start: 0);
 	tlb_finish_mmu(tlb, start, end);
+	free_pgtables(vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
+				 next? next->vm_start: 0);
 	emm_notify(mm, emm_invalidate_end, start, end);
 }
 
@@ -2049,8 +2049,8 @@ void exit_mmap(struct mm_struct *mm)
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
 	end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
-	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
 	tlb_finish_mmu(tlb, 0, end);
+	free_pgtables(vma, FIRST_USER_ADDRESS, 0);
 
 	/*
 	 * Walk the list again, actually closing and freeing it,


^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] KVM swapping with MMU Notifiers V7
  2008-02-16 11:51   ` Robin Holt
@ 2008-02-18 12:35     ` Andrea Arcangeli
  0 siblings, 0 replies; 120+ messages in thread
From: Andrea Arcangeli @ 2008-02-18 12:35 UTC (permalink / raw)
  To: Robin Holt
  Cc: Christoph Lameter, akpm, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	steiner, linux-kernel, linux-mm, daniel.blueman

On Sat, Feb 16, 2008 at 05:51:38AM -0600, Robin Holt wrote:
> I am doing this in xpmem with a stack-based structure in the function
> calling get_user_pages.  That structure describes the start and
> end address of the range we are doing the get_user_pages on.  If an
> invalidate_range_begin comes in while we are off to the kernel doing
> the get_user_pages, the invalidate_range_begin marks that structure
> indicating an invalidate came in.  When the get_user_pages gets the
> structures relocked, it checks that flag (really a generation counter)
> and if it is set, retries the get_user_pages.  After 3 retries, it
> returns -EAGAIN and the fault is started over from the remote side.

A seqlock sounds a good optimization for the non-swapping fast path, a
per-VM-guest seqlock number can allow us to know when we need to worry
to call get_user_pages a second time, but won't be really a retry like
in 99% of seqlock usages for the reader side, but just a second
get_user_pages to trigger a minor fault. Then if the page is different
in the second run, we'll really retry (so not in function of the
seqlock but in function of the get_user_pages page array), and there's
no risk of livelocks because get_user_pages returning a different page
won't be the common case. The seqlock should be increased first before
the invalidate and a second time once the invalidate is over.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] KVM swapping with MMU Notifiers V7
  2008-02-16 11:08   ` Andrew Morton
@ 2008-02-18 12:17     ` Andrea Arcangeli
  0 siblings, 0 replies; 120+ messages in thread
From: Andrea Arcangeli @ 2008-02-18 12:17 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Christoph Lameter, Robin Holt, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	steiner, linux-kernel, linux-mm, daniel.blueman

On Sat, Feb 16, 2008 at 03:08:17AM -0800, Andrew Morton wrote:
> On Sat, 16 Feb 2008 11:48:27 +0100 Andrea Arcangeli <andrea@qumranet.com> wrote:
> 
> > +void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
> > +					   struct mm_struct *mm,
> > +					   unsigned long start, unsigned long end,
> > +					   int lock)
> > +{
> > +	for (; start < end; start += PAGE_SIZE)
> > +		kvm_mmu_notifier_invalidate_page(mn, mm, start);
> > +}
> > +
> > +static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
> > +	.invalidate_page	= kvm_mmu_notifier_invalidate_page,
> > +	.age_page		= kvm_mmu_notifier_age_page,
> > +	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
> > +};
> 
> So this doesn't implement ->invalidate_range_start().

Correct. range_start is needed by subsystems that don't pin the pages
(so they've to drop the secondary mmu mappings on the physical page
before the page is released by the linux VM).

> By what means does it prevent new mappings from being established in the
> range after core mm has tried to call ->invalidate_rande_start()?
> mmap_sem, I assume?

No, populate range only takes the mmap_sem in read mode and the kvm page
fault also is of course taking it only in read mode.

What makes it safe, is that invalidate_range_end is called _after_ the
linux pte is clear. The kvm page fault, if it triggers, it will call
into get_user_pages again to re-establish the linux pte _before_
establishing the spte.

It's the same reason why it's safe to flush the tlb after clearing the
linux pte. sptes are like a secondary tlb.

> > +			/* set userspace_addr atomically for kvm_hva_to_rmapp */
> > +			spin_lock(&kvm->mmu_lock);
> > +			memslot->userspace_addr = userspace_addr;
> > +			spin_unlock(&kvm->mmu_lock);
> 
> are you sure?  kvm_unmap_hva() and kvm_age_hva() read ->userspace_addr a
> single time and it doesn't immediately look like there's a need to take the
> lock here?

gcc will always write it with a movq but this is to be
C-specs-compliant and because this is by far not a performance
critical path I thought it was simpler than some other atomic move in
a single insn.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] KVM swapping with MMU Notifiers V7
  2008-02-16 10:48 ` [PATCH] KVM swapping with " Andrea Arcangeli
  2008-02-16 11:08   ` Andrew Morton
@ 2008-02-16 11:51   ` Robin Holt
  2008-02-18 12:35     ` Andrea Arcangeli
  1 sibling, 1 reply; 120+ messages in thread
From: Robin Holt @ 2008-02-16 11:51 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Christoph Lameter, akpm, Robin Holt, Avi Kivity, Izik Eidus,
	kvm-devel, Peter Zijlstra, general, Steve Wise, Roland Dreier,
	Kanoj Sarcar, steiner, linux-kernel, linux-mm, daniel.blueman

On Sat, Feb 16, 2008 at 11:48:27AM +0100, Andrea Arcangeli wrote:
> Those below two patches enable KVM to swap the guest physical memory
> through Christoph's V7.
> 
> There's one last _purely_theoretical_ race condition I figured out and
> that I'm wondering how to best fix. The race condition worst case is
> that a few guest physical pages could remain pinned by sptes. The race
> can materialize if the linux pte is zapped after get_user_pages
> returns but before the page is mapped by the spte and tracked by
> rmap. The invalidate_ calls can also likely be optimized further but
> it's not a fast path so it's not urgent.

I am doing this in xpmem with a stack-based structure in the function
calling get_user_pages.  That structure describes the start and
end address of the range we are doing the get_user_pages on.  If an
invalidate_range_begin comes in while we are off to the kernel doing
the get_user_pages, the invalidate_range_begin marks that structure
indicating an invalidate came in.  When the get_user_pages gets the
structures relocked, it checks that flag (really a generation counter)
and if it is set, retries the get_user_pages.  After 3 retries, it
returns -EAGAIN and the fault is started over from the remote side.

Thanks,
Robin

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH] KVM swapping with MMU Notifiers V7
  2008-02-16 10:48 ` [PATCH] KVM swapping with " Andrea Arcangeli
@ 2008-02-16 11:08   ` Andrew Morton
  2008-02-18 12:17     ` Andrea Arcangeli
  2008-02-16 11:51   ` Robin Holt
  1 sibling, 1 reply; 120+ messages in thread
From: Andrew Morton @ 2008-02-16 11:08 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Christoph Lameter, Robin Holt, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	steiner, linux-kernel, linux-mm, daniel.blueman

On Sat, 16 Feb 2008 11:48:27 +0100 Andrea Arcangeli <andrea@qumranet.com> wrote:

> +void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
> +					   struct mm_struct *mm,
> +					   unsigned long start, unsigned long end,
> +					   int lock)
> +{
> +	for (; start < end; start += PAGE_SIZE)
> +		kvm_mmu_notifier_invalidate_page(mn, mm, start);
> +}
> +
> +static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
> +	.invalidate_page	= kvm_mmu_notifier_invalidate_page,
> +	.age_page		= kvm_mmu_notifier_age_page,
> +	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
> +};

So this doesn't implement ->invalidate_range_start().

By what means does it prevent new mappings from being established in the
range after core mm has tried to call ->invalidate_rande_start()?
mmap_sem, I assume?


> +			/* set userspace_addr atomically for kvm_hva_to_rmapp */
> +			spin_lock(&kvm->mmu_lock);
> +			memslot->userspace_addr = userspace_addr;
> +			spin_unlock(&kvm->mmu_lock);

are you sure?  kvm_unmap_hva() and kvm_age_hva() read ->userspace_addr a
single time and it doesn't immediately look like there's a need to take the
lock here?



^ permalink raw reply	[flat|nested] 120+ messages in thread

* [PATCH] KVM swapping with MMU Notifiers V7
  2008-02-15  6:48 [patch 0/6] MMU Notifiers V7 Christoph Lameter
@ 2008-02-16 10:48 ` Andrea Arcangeli
  2008-02-16 11:08   ` Andrew Morton
  2008-02-16 11:51   ` Robin Holt
  0 siblings, 2 replies; 120+ messages in thread
From: Andrea Arcangeli @ 2008-02-16 10:48 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Robin Holt, Avi Kivity, Izik Eidus, kvm-devel,
	Peter Zijlstra, general, Steve Wise, Roland Dreier, Kanoj Sarcar,
	steiner, linux-kernel, linux-mm, daniel.blueman

Those below two patches enable KVM to swap the guest physical memory
through Christoph's V7.

There's one last _purely_theoretical_ race condition I figured out and
that I'm wondering how to best fix. The race condition worst case is
that a few guest physical pages could remain pinned by sptes. The race
can materialize if the linux pte is zapped after get_user_pages
returns but before the page is mapped by the spte and tracked by
rmap. The invalidate_ calls can also likely be optimized further but
it's not a fast path so it's not urgent.

Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 41962e7..e1287ab 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -21,6 +21,7 @@ config KVM
 	tristate "Kernel-based Virtual Machine (KVM) support"
 	depends on HAVE_KVM && EXPERIMENTAL
 	select PREEMPT_NOTIFIERS
+	select MMU_NOTIFIER
 	select ANON_INODES
 	---help---
 	  Support hosting fully virtualized guest machines using hardware
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index fd39cd1..b56e388 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -533,6 +533,110 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
 		kvm_flush_remote_tlbs(kvm);
 }
 
+static void kvm_unmap_spte(struct kvm *kvm, u64 *spte)
+{
+	struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
+	get_page(page);
+	rmap_remove(kvm, spte);
+	set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+	kvm_flush_remote_tlbs(kvm);
+	__free_page(page);
+}
+
+static void kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+	u64 *spte, *curr_spte;
+
+	spte = rmap_next(kvm, rmapp, NULL);
+	while (spte) {
+		BUG_ON(!(*spte & PT_PRESENT_MASK));
+		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
+		curr_spte = spte;
+		spte = rmap_next(kvm, rmapp, spte);
+		kvm_unmap_spte(kvm, curr_spte);
+	}
+}
+
+void kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+	int i;
+
+	/*
+	 * If mmap_sem isn't taken, we can look the memslots with only
+	 * the mmu_lock by skipping over the slots with userspace_addr == 0.
+	 */
+	spin_lock(&kvm->mmu_lock);
+	for (i = 0; i < kvm->nmemslots; i++) {
+		struct kvm_memory_slot *memslot = &kvm->memslots[i];
+		unsigned long start = memslot->userspace_addr;
+		unsigned long end;
+
+		/* mmu_lock protects userspace_addr */
+		if (!start)
+			continue;
+
+		end = start + (memslot->npages << PAGE_SHIFT);
+		if (hva >= start && hva < end) {
+			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+			kvm_unmap_rmapp(kvm, &memslot->rmap[gfn_offset]);
+		}
+	}
+	spin_unlock(&kvm->mmu_lock);
+}
+
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+	u64 *spte;
+	int young = 0;
+
+	spte = rmap_next(kvm, rmapp, NULL);
+	while (spte) {
+		int _young;
+		u64 _spte = *spte;
+		BUG_ON(!(_spte & PT_PRESENT_MASK));
+		_young = _spte & PT_ACCESSED_MASK;
+		if (_young) {
+			young = !!_young;
+			set_shadow_pte(spte, _spte & ~PT_ACCESSED_MASK);
+		}
+		spte = rmap_next(kvm, rmapp, spte);
+	}
+	return young;
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+{
+	int i;
+	int young = 0;
+
+	/*
+	 * If mmap_sem isn't taken, we can look the memslots with only
+	 * the mmu_lock by skipping over the slots with userspace_addr == 0.
+	 */
+	spin_lock(&kvm->mmu_lock);
+	for (i = 0; i < kvm->nmemslots; i++) {
+		struct kvm_memory_slot *memslot = &kvm->memslots[i];
+		unsigned long start = memslot->userspace_addr;
+		unsigned long end;
+
+		/* mmu_lock protects userspace_addr */
+		if (!start)
+			continue;
+
+		end = start + (memslot->npages << PAGE_SHIFT);
+		if (hva >= start && hva < end) {
+			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+			young |= kvm_age_rmapp(kvm, &memslot->rmap[gfn_offset]);
+		}
+	}
+	spin_unlock(&kvm->mmu_lock);
+
+	if (young)
+		kvm_flush_remote_tlbs(kvm);
+
+	return young;
+}
+
 #ifdef MMU_DEBUG
 static int is_empty_shadow_page(u64 *spt)
 {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0c910c7..2b2398f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3185,6 +3185,46 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 	free_page((unsigned long)vcpu->arch.pio_data);
 }
 
+static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
+{
+	struct kvm_arch *kvm_arch;
+	kvm_arch = container_of(mn, struct kvm_arch, mmu_notifier);
+	return container_of(kvm_arch, struct kvm, arch);
+}
+
+void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
+				      struct mm_struct *mm,
+				      unsigned long address)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	BUG_ON(mm != kvm->mm);
+	kvm_unmap_hva(kvm, address);
+}
+
+int kvm_mmu_notifier_age_page(struct mmu_notifier *mn,
+			      struct mm_struct *mm,
+			      unsigned long address)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	BUG_ON(mm != kvm->mm);
+	return kvm_age_hva(kvm, address);
+}
+
+void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
+					   struct mm_struct *mm,
+					   unsigned long start, unsigned long end,
+					   int lock)
+{
+	for (; start < end; start += PAGE_SIZE)
+		kvm_mmu_notifier_invalidate_page(mn, mm, start);
+}
+
+static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
+	.invalidate_page	= kvm_mmu_notifier_invalidate_page,
+	.age_page		= kvm_mmu_notifier_age_page,
+	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
+};
+
 struct  kvm *kvm_arch_create_vm(void)
 {
 	struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
@@ -3194,6 +3234,9 @@ struct  kvm *kvm_arch_create_vm(void)
 
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
 
+	kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops;
+	mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm);
+
 	return kvm;
 }
 
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index da61255..11976c8 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -13,6 +13,7 @@
 
 #include <linux/types.h>
 #include <linux/mm.h>
+#include <linux/mmu_notifier.h>
 
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>
@@ -287,6 +288,8 @@ struct kvm_arch{
 	int round_robin_prev_vcpu;
 	unsigned int tss_addr;
 	struct page *apic_access_page;
+
+	struct mmu_notifier mmu_notifier;
 };
 
 struct kvm_vm_stat {
@@ -404,6 +407,8 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu);
 int kvm_mmu_setup(struct kvm_vcpu *vcpu);
 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
 
+void kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+int kvm_age_hva(struct kvm *kvm, unsigned long hva);
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
 void kvm_mmu_zap_all(struct kvm *kvm);


This allows to browse the memslots with only the mmu_lock hold and
it should be applied along the above patch:

Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0c910c7..80b719d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3245,16 +3245,23 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 	 */
 	if (!user_alloc) {
 		if (npages && !old.rmap) {
+			unsigned long userspace_addr;
+
 			down_write(&current->mm->mmap_sem);
-			memslot->userspace_addr = do_mmap(NULL, 0,
-						     npages * PAGE_SIZE,
-						     PROT_READ | PROT_WRITE,
-						     MAP_SHARED | MAP_ANONYMOUS,
-						     0);
+			userspace_addr = do_mmap(NULL, 0,
+						 npages * PAGE_SIZE,
+						 PROT_READ | PROT_WRITE,
+						 MAP_SHARED | MAP_ANONYMOUS,
+						 0);
 			up_write(&current->mm->mmap_sem);
 
-			if (IS_ERR((void *)memslot->userspace_addr))
-				return PTR_ERR((void *)memslot->userspace_addr);
+			if (IS_ERR((void *)userspace_addr))
+				return PTR_ERR((void *)userspace_addr);
+
+			/* set userspace_addr atomically for kvm_hva_to_rmapp */
+			spin_lock(&kvm->mmu_lock);
+			memslot->userspace_addr = userspace_addr;
+			spin_unlock(&kvm->mmu_lock);
 		} else {
 			if (!old.user_alloc && old.rmap) {
 				int ret;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index cf6df51..743c5c5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -299,7 +299,15 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		memset(new.rmap, 0, npages * sizeof(*new.rmap));
 
 		new.user_alloc = user_alloc;
-		new.userspace_addr = mem->userspace_addr;
+		/*
+		 * hva_to_rmmap() serialzies with the mmu_lock and to be
+		 * safe it has to ignore memslots with !user_alloc &&
+		 * !userspace_addr.
+		 */
+		if (user_alloc)
+			new.userspace_addr = mem->userspace_addr;
+		else
+			new.userspace_addr = 0;
 	}
 
 	/* Allocate page dirty bitmap if needed */
@@ -312,14 +320,18 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		memset(new.dirty_bitmap, 0, dirty_bytes);
 	}
 
+	spin_lock(&kvm->mmu_lock);
 	if (mem->slot >= kvm->nmemslots)
 		kvm->nmemslots = mem->slot + 1;
 
 	*memslot = new;
+	spin_unlock(&kvm->mmu_lock);
 
 	r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc);
 	if (r) {
+		spin_lock(&kvm->mmu_lock);
 		*memslot = old;
+		spin_unlock(&kvm->mmu_lock);
 		goto out_free;
 	}
 

^ permalink raw reply	[flat|nested] 120+ messages in thread

end of thread, other threads:[~2008-03-19 23:02 UTC | newest]

Thread overview: 120+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-02-19  8:43 [patch] my mmu notifiers Nick Piggin
2008-02-19  8:44 ` [patch] my mmu notifier sample driver Nick Piggin
2008-02-19 11:59 ` [patch] my mmu notifiers Robin Holt
2008-02-19 13:58 ` Andrea Arcangeli
2008-02-19 14:27   ` Jack Steiner
2008-02-19 23:04     ` Nick Piggin
2008-02-20  0:52       ` Andrea Arcangeli
2008-02-20  2:46         ` Robin Holt
2008-02-27 22:50     ` Christoph Lameter
2008-02-19 22:59   ` Nick Piggin
2008-02-20  0:46     ` Andrea Arcangeli
2008-02-27 22:55     ` Christoph Lameter
2008-02-19 23:11   ` Nick Piggin
2008-02-19 23:40     ` Jack Steiner
2008-02-21  4:42       ` Nick Piggin
2008-02-22 16:31         ` Jack Steiner
2008-02-20  1:09     ` Andrea Arcangeli
2008-02-20 10:39       ` [PATCH] mmu notifiers #v6 Andrea Arcangeli
2008-02-20 10:45         ` [PATCH] KVM swapping (+ seqlock fix) with " Andrea Arcangeli
2008-02-27 22:06           ` [PATCH] KVM swapping with mmu notifiers #v7 Andrea Arcangeli
2008-02-28  8:42             ` izik eidus
2008-02-20 11:33         ` [PATCH] mmu notifiers #v6 Robin Holt
2008-02-20 12:03           ` Andrea Arcangeli
2008-02-20 12:24             ` Robin Holt
2008-02-20 12:32               ` Andrea Arcangeli
2008-02-20 13:15                 ` Robin Holt
2008-02-21  5:02             ` Nick Piggin
2008-02-20 14:41         ` Robin Holt
2008-02-20 15:34           ` Andrea Arcangeli
2008-02-20 21:03         ` Jack Steiner
2008-02-21  4:54         ` Nick Piggin
2008-02-21 14:40           ` Andrea Arcangeli
2008-02-21 16:10             ` Jack Steiner
2008-02-27 19:26               ` [PATCH] mmu notifiers #v7 Andrea Arcangeli
2008-02-27 20:04                 ` Peter Zijlstra
2008-02-27 23:06                 ` Christoph Lameter
2008-02-27 23:43                   ` [kvm-devel] " Andrea Arcangeli
2008-02-28  0:08                     ` Christoph Lameter
2008-02-28  0:21                       ` Andrea Arcangeli
2008-02-28  0:24                         ` Christoph Lameter
2008-02-28 19:48                 ` Christoph Lameter
2008-02-28 21:52                   ` Andrea Arcangeli
2008-02-28 22:00                     ` Christoph Lameter
2008-02-28 23:17                     ` Jack Steiner
2008-02-29  0:24                       ` Andrea Arcangeli
2008-02-29  1:13                         ` Christoph Lameter
2008-02-28 23:05                 ` Christoph Lameter
2008-02-29  0:40                   ` Andrea Arcangeli
2008-02-29  0:56                     ` Andrew Morton
2008-02-29  1:03                     ` Christoph Lameter
2008-02-29 13:09                       ` Andrea Arcangeli
2008-02-29 19:46                         ` Christoph Lameter
2008-03-02 15:54                 ` [PATCH] mmu notifiers #v8 Andrea Arcangeli
2008-03-02 16:03                   ` [PATCH] mmu notifiers #v8 + xpmem Andrea Arcangeli
2008-03-02 16:23                     ` Peter Zijlstra
2008-03-03  3:29                   ` [PATCH] mmu notifiers #v8 Nick Piggin
2008-03-03 12:51                     ` Andrea Arcangeli
2008-03-03 13:10                       ` Nick Piggin
2008-03-03 13:24                         ` Andrea Arcangeli
2008-03-03 15:18                         ` Jack Steiner
2008-03-03 16:59                           ` Nick Piggin
2008-03-03 18:06                             ` Jack Steiner
2008-03-03 18:09                               ` Avi Kivity
2008-03-03 18:23                                 ` Jack Steiner
2008-03-03 18:45                               ` Nick Piggin
2008-03-03 19:15                                 ` Jack Steiner
2008-03-04 10:35                                   ` Peter Zijlstra
2008-03-04 14:44                                     ` Jack Steiner
2008-03-03 19:02                             ` Christoph Lameter
2008-03-03 19:01                     ` Christoph Lameter
2008-03-03 21:15                       ` Andrea Arcangeli
2008-03-05  0:37                       ` Nick Piggin
2008-03-05 18:48                         ` Christoph Lameter
2008-03-06  2:59                           ` Nick Piggin
2008-03-03  3:33                   ` Nick Piggin
2008-03-03 19:03                     ` Christoph Lameter
2008-03-03  3:34                   ` Nick Piggin
2008-03-03 19:04                     ` Christoph Lameter
2008-03-03  3:39                   ` Nick Piggin
2008-03-03 21:37                   ` [PATCH] mmu notifiers #v9 Andrea Arcangeli
2008-03-03 22:05                     ` [PATCH] KVM swapping with " Andrea Arcangeli
2008-03-04  0:44                       ` izik eidus
2008-03-04  7:31                         ` [RFC] Notifier for Externally Mapped Memory (EMM) Christoph Lameter
2008-03-04  7:34                           ` [Early draft] Conversion of i_mmap_lock to semaphore Christoph Lameter
2008-03-04 13:30                           ` [RFC] Notifier for Externally Mapped Memory (EMM) Andrea Arcangeli
2008-03-04 19:00                             ` Christoph Lameter
2008-03-04 22:20                               ` Andrea Arcangeli
2008-03-04 22:35                                 ` Christoph Lameter
2008-03-04 22:42                                   ` Peter Zijlstra
2008-03-04 23:14                                     ` Christoph Lameter
2008-03-04 23:25                                       ` Peter Zijlstra
2008-03-04 23:30                                         ` Peter Zijlstra
2008-03-05  5:09                                     ` Avi Kivity
2008-03-05  9:47                                       ` Robin Holt
2008-03-05  9:53                                         ` Avi Kivity
2008-03-05 10:02                                         ` [kvm-devel] " Dor Laor
2008-03-07 15:17                                   ` [PATCH] 2/4 move all invalidate_page outside of PT lock (#v9 was 1/4) Andrea Arcangeli
2008-03-07 15:23                                     ` [PATCH] 3/4 combine RCU with seqlock to allow mmu notifier methods to sleep " Andrea Arcangeli
2008-03-07 15:52                                       ` [PATCH] 4/4 i_mmap_lock spinlock2rwsem " Andrea Arcangeli
2008-03-07 20:03                                         ` Christoph Lameter
2008-03-19 21:27                                         ` Christoph Lameter
2008-03-07 16:52                                       ` [PATCH] 3/4 combine RCU with seqlock to allow mmu notifier methods to sleep " Peter Zijlstra
2008-03-07 17:50                                         ` Andrea Arcangeli
2008-03-07 18:01                                           ` Peter Zijlstra
2008-03-07 18:45                                             ` Andrea Arcangeli
2008-03-07 19:47                                               ` Andrea Arcangeli
2008-03-07 20:15                                                 ` Christoph Lameter
2008-03-07 20:12                                               ` Christoph Lameter
2008-03-07 20:10                                           ` Christoph Lameter
2008-03-07 20:00                                       ` Christoph Lameter
2008-03-07 19:54                                     ` [PATCH] 2/4 move all invalidate_page outside of PT lock " Christoph Lameter
2008-03-04 13:21                         ` [PATCH] KVM swapping with mmu notifiers #v9 Andrea Arcangeli
2008-02-21  4:47       ` [patch] my mmu notifiers Nick Piggin
2008-02-20  2:49     ` Robin Holt
2008-02-27 22:56     ` Christoph Lameter
  -- strict thread matches above, loose matches on Subject: below --
2008-02-15  6:48 [patch 0/6] MMU Notifiers V7 Christoph Lameter
2008-02-16 10:48 ` [PATCH] KVM swapping with " Andrea Arcangeli
2008-02-16 11:08   ` Andrew Morton
2008-02-18 12:17     ` Andrea Arcangeli
2008-02-16 11:51   ` Robin Holt
2008-02-18 12:35     ` Andrea Arcangeli

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).