LKML Archive on lore.kernel.org
* [PATCH 0 of 9] mmu notifier #v12
@ 2008-04-08 15:44 Andrea Arcangeli
2008-04-08 15:44 ` [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen Andrea Arcangeli
` (11 more replies)
0 siblings, 12 replies; 44+ messages in thread
From: Andrea Arcangeli @ 2008-04-08 15:44 UTC (permalink / raw)
To: Christoph Lameter
Cc: akpm, Nick Piggin, Steve Wise, Peter Zijlstra, linux-mm,
Kanoj Sarcar, Roland Dreier, Jack Steiner, linux-kernel,
Avi Kivity, kvm-devel, Robin Holt, general, Hugh Dickins
The difference from #v11 is a different implementation of mm_lock that
guarantees signals are handled in O(N). It's also more low-latency friendly.
Note that mmu_notifier_unregister may also fail with -EINTR if a signal is
pending or the system runs out of vmalloc space or physical memory; only
exit_mmap guarantees that any kernel module can be unloaded in the presence
of an OOM condition.
Either #v11 or the first three #v12 patches (1, 2, 3) are suitable for
inclusion in -mm; pick whichever you prefer after looking at the difference
in the mmu_notifier_register and mm_lock return values. I implemented and
slightly tested both. GRU and KVM only need patches 1-3; XPMEM needs the rest
of the patchset too (4, ...), but all patches from 4 to the end can be
deferred to a second merge window.
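For readers coming from the driver side, here is a minimal registration sketch (illustrative only, not part of the posted series; "my_notifier_ops" is a placeholder for a driver's callback table) showing how the return value discussed above would be handled:

static struct mmu_notifier my_notifier = {
	.ops = &my_notifier_ops,
};

static int my_driver_attach(struct mm_struct *mm)
{
	int err;

	/* can fail with -EINTR (signal pending) or -ENOMEM (kmalloc/vmalloc) */
	err = mmu_notifier_register(&my_notifier, mm);
	if (err)
		return err;	/* caller decides whether to retry or give up */
	return 0;
}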
* [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen
2008-04-08 15:44 [PATCH 0 of 9] mmu notifier #v12 Andrea Arcangeli
@ 2008-04-08 15:44 ` Andrea Arcangeli
2008-04-16 16:33 ` Robin Holt
2008-04-22 5:06 ` Rusty Russell
2008-04-08 15:44 ` [PATCH 2 of 9] Core of mmu notifiers Andrea Arcangeli
` (10 subsequent siblings)
11 siblings, 2 replies; 44+ messages in thread
From: Andrea Arcangeli @ 2008-04-08 15:44 UTC (permalink / raw)
To: Christoph Lameter
Cc: akpm, Nick Piggin, Steve Wise, Peter Zijlstra, linux-mm,
Kanoj Sarcar, Roland Dreier, Jack Steiner, linux-kernel,
Avi Kivity, kvm-devel, Robin Holt, general, Hugh Dickins
# HG changeset patch
# User Andrea Arcangeli <andrea@qumranet.com>
# Date 1207666462 -7200
# Node ID ec6d8f91b299cf26cce5c3d49bb25d35ee33c137
# Parent d4c25404de6376297ed34fada14cd6b894410eb0
Lock the entire mm to prevent any mmu related operation to happen.
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
diff --git a/include/linux/mm.h b/include/linux/mm.h
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1050,6 +1050,15 @@
unsigned long addr, unsigned long len,
unsigned long flags, struct page **pages);
+struct mm_lock_data {
+ spinlock_t **i_mmap_locks;
+ spinlock_t **anon_vma_locks;
+ unsigned long nr_i_mmap_locks;
+ unsigned long nr_anon_vma_locks;
+};
+extern struct mm_lock_data *mm_lock(struct mm_struct * mm);
+extern void mm_unlock(struct mm_struct *mm, struct mm_lock_data *data);
+
extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -26,6 +26,7 @@
#include <linux/mount.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>
+#include <linux/vmalloc.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -2242,3 +2243,140 @@
return 0;
}
+
+/*
+ * This operation locks against the VM for all pte/vma/mm related
+ * operations that could ever happen on a certain mm. This includes
+ * vmtruncate, try_to_unmap, and all page faults. The holder
+ * must not hold any mm related lock. A single task can't take more
+ * than one mm lock in a row or it would deadlock.
+ */
+struct mm_lock_data *mm_lock(struct mm_struct * mm)
+{
+ struct vm_area_struct *vma;
+ spinlock_t *i_mmap_lock_last, *anon_vma_lock_last;
+ unsigned long nr_i_mmap_locks, nr_anon_vma_locks, i;
+ struct mm_lock_data *data;
+ int err;
+
+ down_write(&mm->mmap_sem);
+
+ err = -EINTR;
+ nr_i_mmap_locks = nr_anon_vma_locks = 0;
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ cond_resched();
+ if (unlikely(signal_pending(current)))
+ goto out;
+
+ if (vma->vm_file && vma->vm_file->f_mapping)
+ nr_i_mmap_locks++;
+ if (vma->anon_vma)
+ nr_anon_vma_locks++;
+ }
+
+ err = -ENOMEM;
+ data = kmalloc(sizeof(struct mm_lock_data), GFP_KERNEL);
+ if (!data)
+ goto out;
+
+ if (nr_i_mmap_locks) {
+ data->i_mmap_locks = vmalloc(nr_i_mmap_locks *
+ sizeof(spinlock_t));
+ if (!data->i_mmap_locks)
+ goto out_kfree;
+ } else
+ data->i_mmap_locks = NULL;
+
+ if (nr_anon_vma_locks) {
+ data->anon_vma_locks = vmalloc(nr_anon_vma_locks *
+ sizeof(spinlock_t));
+ if (!data->anon_vma_locks)
+ goto out_vfree;
+ } else
+ data->anon_vma_locks = NULL;
+
+ err = -EINTR;
+ i_mmap_lock_last = NULL;
+ nr_i_mmap_locks = 0;
+ for (;;) {
+ spinlock_t *i_mmap_lock = (spinlock_t *) -1UL;
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ cond_resched();
+ if (unlikely(signal_pending(current)))
+ goto out_vfree_both;
+
+ if (!vma->vm_file || !vma->vm_file->f_mapping)
+ continue;
+ if ((unsigned long) i_mmap_lock >
+ (unsigned long)
+ &vma->vm_file->f_mapping->i_mmap_lock &&
+ (unsigned long)
+ &vma->vm_file->f_mapping->i_mmap_lock >
+ (unsigned long) i_mmap_lock_last)
+ i_mmap_lock =
+ &vma->vm_file->f_mapping->i_mmap_lock;
+ }
+ if (i_mmap_lock == (spinlock_t *) -1UL)
+ break;
+ i_mmap_lock_last = i_mmap_lock;
+ data->i_mmap_locks[nr_i_mmap_locks++] = i_mmap_lock;
+ }
+ data->nr_i_mmap_locks = nr_i_mmap_locks;
+
+ anon_vma_lock_last = NULL;
+ nr_anon_vma_locks = 0;
+ for (;;) {
+ spinlock_t *anon_vma_lock = (spinlock_t *) -1UL;
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ cond_resched();
+ if (unlikely(signal_pending(current)))
+ goto out_vfree_both;
+
+ if (!vma->anon_vma)
+ continue;
+ if ((unsigned long) anon_vma_lock >
+ (unsigned long) &vma->anon_vma->lock &&
+ (unsigned long) &vma->anon_vma->lock >
+ (unsigned long) anon_vma_lock_last)
+ anon_vma_lock = &vma->anon_vma->lock;
+ }
+ if (anon_vma_lock == (spinlock_t *) -1UL)
+ break;
+ anon_vma_lock_last = anon_vma_lock;
+ data->anon_vma_locks[nr_anon_vma_locks++] = anon_vma_lock;
+ }
+ data->nr_anon_vma_locks = nr_anon_vma_locks;
+
+ for (i = 0; i < nr_i_mmap_locks; i++)
+ spin_lock(data->i_mmap_locks[i]);
+ for (i = 0; i < nr_anon_vma_locks; i++)
+ spin_lock(data->anon_vma_locks[i]);
+
+ return data;
+
+out_vfree_both:
+ vfree(data->anon_vma_locks);
+out_vfree:
+ vfree(data->i_mmap_locks);
+out_kfree:
+ kfree(data);
+out:
+ up_write(&mm->mmap_sem);
+ return ERR_PTR(err);
+}
+
+void mm_unlock(struct mm_struct *mm, struct mm_lock_data *data)
+{
+ unsigned long i;
+
+ for (i = 0; i < data->nr_i_mmap_locks; i++)
+ spin_unlock(data->i_mmap_locks[i]);
+ for (i = 0; i < data->nr_anon_vma_locks; i++)
+ spin_unlock(data->anon_vma_locks[i]);
+
+ up_write(&mm->mmap_sem);
+
+ vfree(data->i_mmap_locks);
+ vfree(data->anon_vma_locks);
+ kfree(data);
+}
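For context, a minimal usage sketch of the interface added above (illustrative, not from the patch): mm_lock() returns an ERR_PTR on failure, and while the returned mm_lock_data is held, no page fault, vmtruncate or try_to_unmap can make progress on that mm.

static int update_under_mm_lock(struct mm_struct *mm)
{
	struct mm_lock_data *data;

	data = mm_lock(mm);	/* may return ERR_PTR(-EINTR) or ERR_PTR(-ENOMEM) */
	if (IS_ERR(data))
		return PTR_ERR(data);

	/* ... modify per-mm state that rmap/fault paths must not race with ... */

	mm_unlock(mm, data);
	return 0;
}

Note that the double loop in mm_lock() above effectively selection-sorts the lock pointers: each pass picks the lowest-addressed lock strictly greater than the last one taken, so duplicate i_mmap/anon_vma locks shared by several vmas are taken only once and always in ascending address order, while signal_pending() is checked at every vma step.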
* [PATCH 2 of 9] Core of mmu notifiers
2008-04-08 15:44 [PATCH 0 of 9] mmu notifier #v12 Andrea Arcangeli
2008-04-08 15:44 ` [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen Andrea Arcangeli
@ 2008-04-08 15:44 ` Andrea Arcangeli
2008-04-08 16:26 ` Robin Holt
` (2 more replies)
2008-04-08 15:44 ` [PATCH 3 of 9] Moves all mmu notifier methods outside the PT lock (first and not last Andrea Arcangeli
` (9 subsequent siblings)
11 siblings, 3 replies; 44+ messages in thread
From: Andrea Arcangeli @ 2008-04-08 15:44 UTC (permalink / raw)
To: Christoph Lameter
Cc: akpm, Nick Piggin, Steve Wise, Peter Zijlstra, linux-mm,
Kanoj Sarcar, Roland Dreier, Jack Steiner, linux-kernel,
Avi Kivity, kvm-devel, Robin Holt, general, Hugh Dickins
# HG changeset patch
# User Andrea Arcangeli <andrea@qumranet.com>
# Date 1207666462 -7200
# Node ID baceb322b45ed43280654dac6c964c9d3d8a936f
# Parent ec6d8f91b299cf26cce5c3d49bb25d35ee33c137
Core of mmu notifiers.
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -225,6 +225,9 @@
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
struct mem_cgroup *mem_cgroup;
#endif
+#ifdef CONFIG_MMU_NOTIFIER
+ struct hlist_head mmu_notifier_list;
+#endif
};
#endif /* _LINUX_MM_TYPES_H */
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
new file mode 100644
--- /dev/null
+++ b/include/linux/mmu_notifier.h
@@ -0,0 +1,177 @@
+#ifndef _LINUX_MMU_NOTIFIER_H
+#define _LINUX_MMU_NOTIFIER_H
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/mm_types.h>
+
+struct mmu_notifier;
+struct mmu_notifier_ops;
+
+#ifdef CONFIG_MMU_NOTIFIER
+
+struct mmu_notifier_ops {
+ /*
+ * Called when nobody can register any more notifier in the mm
+ * and after the "mn" notifier has been disarmed already.
+ */
+ void (*release)(struct mmu_notifier *mn,
+ struct mm_struct *mm);
+
+ /*
+ * clear_flush_young is called after the VM is
+ * test-and-clearing the young/accessed bitflag in the
+ * pte. This way the VM will provide proper aging to the
+ * accesses to the page through the secondary MMUs and not
+ * only to the ones through the Linux pte.
+ */
+ int (*clear_flush_young)(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address);
+
+ /*
+ * Before this is invoked any secondary MMU is still ok to
+ * read/write to the page previously pointed by the Linux pte
+ * because the old page hasn't been freed yet. If required
+ * set_page_dirty has to be called internally to this method.
+ */
+ void (*invalidate_page)(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address);
+
+ /*
+ * invalidate_range_start() and invalidate_range_end() must be
+ * paired. Multiple invalidate_range_start/ends may be nested
+ * or called concurrently.
+ */
+ void (*invalidate_range_start)(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end);
+ void (*invalidate_range_end)(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end);
+};
+
+struct mmu_notifier {
+ struct hlist_node hlist;
+ const struct mmu_notifier_ops *ops;
+};
+
+static inline int mm_has_notifiers(struct mm_struct *mm)
+{
+ return unlikely(!hlist_empty(&mm->mmu_notifier_list));
+}
+
+extern int mmu_notifier_register(struct mmu_notifier *mn,
+ struct mm_struct *mm);
+extern int mmu_notifier_unregister(struct mmu_notifier *mn,
+ struct mm_struct *mm);
+extern void __mmu_notifier_release(struct mm_struct *mm);
+extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
+ unsigned long address);
+extern void __mmu_notifier_invalidate_page(struct mm_struct *mm,
+ unsigned long address);
+extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
+ unsigned long start, unsigned long end);
+extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
+ unsigned long start, unsigned long end);
+
+
+static inline void mmu_notifier_release(struct mm_struct *mm)
+{
+ if (mm_has_notifiers(mm))
+ __mmu_notifier_release(mm);
+}
+
+static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
+ unsigned long address)
+{
+ if (mm_has_notifiers(mm))
+ return __mmu_notifier_clear_flush_young(mm, address);
+ return 0;
+}
+
+static inline void mmu_notifier_invalidate_page(struct mm_struct *mm,
+ unsigned long address)
+{
+ if (mm_has_notifiers(mm))
+ __mmu_notifier_invalidate_page(mm, address);
+}
+
+static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ if (mm_has_notifiers(mm))
+ __mmu_notifier_invalidate_range_start(mm, start, end);
+}
+
+static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ if (mm_has_notifiers(mm))
+ __mmu_notifier_invalidate_range_end(mm, start, end);
+}
+
+static inline void mmu_notifier_mm_init(struct mm_struct *mm)
+{
+ INIT_HLIST_HEAD(&mm->mmu_notifier_list);
+}
+
+#define ptep_clear_flush_notify(__vma, __address, __ptep) \
+({ \
+ pte_t __pte; \
+ struct vm_area_struct *___vma = __vma; \
+ unsigned long ___address = __address; \
+ __pte = ptep_clear_flush(___vma, ___address, __ptep); \
+ mmu_notifier_invalidate_page(___vma->vm_mm, ___address); \
+ __pte; \
+})
+
+#define ptep_clear_flush_young_notify(__vma, __address, __ptep) \
+({ \
+ int __young; \
+ struct vm_area_struct *___vma = __vma; \
+ unsigned long ___address = __address; \
+ __young = ptep_clear_flush_young(___vma, ___address, __ptep); \
+ __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \
+ ___address); \
+ __young; \
+})
+
+#else /* CONFIG_MMU_NOTIFIER */
+
+static inline void mmu_notifier_release(struct mm_struct *mm)
+{
+}
+
+static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
+ unsigned long address)
+{
+ return 0;
+}
+
+static inline void mmu_notifier_invalidate_page(struct mm_struct *mm,
+ unsigned long address)
+{
+}
+
+static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+}
+
+static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+}
+
+static inline void mmu_notifier_mm_init(struct mm_struct *mm)
+{
+}
+
+#define ptep_clear_flush_young_notify ptep_clear_flush_young
+#define ptep_clear_flush_notify ptep_clear_flush
+
+#endif /* CONFIG_MMU_NOTIFIER */
+
+#endif /* _LINUX_MMU_NOTIFIER_H */
diff --git a/kernel/fork.c b/kernel/fork.c
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -53,6 +53,7 @@
#include <linux/tty.h>
#include <linux/proc_fs.h>
#include <linux/blkdev.h>
+#include <linux/mmu_notifier.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -362,6 +363,7 @@
if (likely(!mm_alloc_pgd(mm))) {
mm->def_flags = 0;
+ mmu_notifier_mm_init(mm);
return mm;
}
diff --git a/mm/Kconfig b/mm/Kconfig
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -193,3 +193,7 @@
config VIRT_TO_BUS
def_bool y
depends on !ARCH_NO_VIRT_TO_BUS
+
+config MMU_NOTIFIER
+ def_bool y
+ bool "MMU notifier, for paging KVM/RDMA"
diff --git a/mm/Makefile b/mm/Makefile
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -33,4 +33,5 @@
obj-$(CONFIG_SMP) += allocpercpu.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
+obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -194,7 +194,7 @@
if (pte) {
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pte));
- pteval = ptep_clear_flush(vma, address, pte);
+ pteval = ptep_clear_flush_notify(vma, address, pte);
page_remove_rmap(page, vma);
dec_mm_counter(mm, file_rss);
BUG_ON(pte_dirty(pteval));
diff --git a/mm/fremap.c b/mm/fremap.c
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -15,6 +15,7 @@
#include <linux/rmap.h>
#include <linux/module.h>
#include <linux/syscalls.h>
+#include <linux/mmu_notifier.h>
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
@@ -214,7 +215,9 @@
spin_unlock(&mapping->i_mmap_lock);
}
+ mmu_notifier_invalidate_range_start(mm, start, start + size);
err = populate_range(mm, vma, start, size, pgoff);
+ mmu_notifier_invalidate_range_end(mm, start, start + size);
if (!err && !(flags & MAP_NONBLOCK)) {
if (unlikely(has_write_lock)) {
downgrade_write(&mm->mmap_sem);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -14,6 +14,7 @@
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
+#include <linux/mmu_notifier.h>
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -799,6 +800,7 @@
BUG_ON(start & ~HPAGE_MASK);
BUG_ON(end & ~HPAGE_MASK);
+ mmu_notifier_invalidate_range_start(mm, start, end);
spin_lock(&mm->page_table_lock);
for (address = start; address < end; address += HPAGE_SIZE) {
ptep = huge_pte_offset(mm, address);
@@ -819,6 +821,7 @@
}
spin_unlock(&mm->page_table_lock);
flush_tlb_range(vma, start, end);
+ mmu_notifier_invalidate_range_end(mm, start, end);
list_for_each_entry_safe(page, tmp, &page_list, lru) {
list_del(&page->lru);
put_page(page);
diff --git a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -51,6 +51,7 @@
#include <linux/init.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
+#include <linux/mmu_notifier.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
@@ -611,6 +612,9 @@
if (is_vm_hugetlb_page(vma))
return copy_hugetlb_page_range(dst_mm, src_mm, vma);
+ if (is_cow_mapping(vma->vm_flags))
+ mmu_notifier_invalidate_range_start(src_mm, addr, end);
+
dst_pgd = pgd_offset(dst_mm, addr);
src_pgd = pgd_offset(src_mm, addr);
do {
@@ -621,6 +625,11 @@
vma, addr, next))
return -ENOMEM;
} while (dst_pgd++, src_pgd++, addr = next, addr != end);
+
+ if (is_cow_mapping(vma->vm_flags))
+ mmu_notifier_invalidate_range_end(src_mm,
+ vma->vm_start, end);
+
return 0;
}
@@ -897,7 +906,9 @@
lru_add_drain();
tlb = tlb_gather_mmu(mm, 0);
update_hiwater_rss(mm);
+ mmu_notifier_invalidate_range_start(mm, address, end);
end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
+ mmu_notifier_invalidate_range_end(mm, address, end);
if (tlb)
tlb_finish_mmu(tlb, address, end);
return end;
@@ -1463,10 +1474,11 @@
{
pgd_t *pgd;
unsigned long next;
- unsigned long end = addr + size;
+ unsigned long start = addr, end = addr + size;
int err;
BUG_ON(addr >= end);
+ mmu_notifier_invalidate_range_start(mm, start, end);
pgd = pgd_offset(mm, addr);
do {
next = pgd_addr_end(addr, end);
@@ -1474,6 +1486,7 @@
if (err)
break;
} while (pgd++, addr = next, addr != end);
+ mmu_notifier_invalidate_range_end(mm, start, end);
return err;
}
EXPORT_SYMBOL_GPL(apply_to_page_range);
@@ -1675,7 +1688,7 @@
* seen in the presence of one thread doing SMC and another
* thread doing COW.
*/
- ptep_clear_flush(vma, address, page_table);
+ ptep_clear_flush_notify(vma, address, page_table);
set_pte_at(mm, address, page_table, entry);
update_mmu_cache(vma, address, entry);
lru_cache_add_active(new_page);
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -27,6 +27,7 @@
#include <linux/mempolicy.h>
#include <linux/rmap.h>
#include <linux/vmalloc.h>
+#include <linux/mmu_notifier.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -1748,11 +1749,13 @@
lru_add_drain();
tlb = tlb_gather_mmu(mm, 0);
update_hiwater_rss(mm);
+ mmu_notifier_invalidate_range_start(mm, start, end);
unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
next? next->vm_start: 0);
tlb_finish_mmu(tlb, start, end);
+ mmu_notifier_invalidate_range_end(mm, start, end);
}
/*
@@ -2038,6 +2041,7 @@
unsigned long end;
/* mm's last user has gone, and its about to be pulled down */
+ mmu_notifier_release(mm);
arch_exit_mmap(mm);
lru_add_drain();
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
new file mode 100644
--- /dev/null
+++ b/mm/mmu_notifier.c
@@ -0,0 +1,126 @@
+/*
+ * linux/mm/mmu_notifier.c
+ *
+ * Copyright (C) 2008 Qumranet, Inc.
+ * Copyright (C) 2008 SGI
+ * Christoph Lameter <clameter@sgi.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/mmu_notifier.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/err.h>
+
+/*
+ * No synchronization. This function can only be called when only a single
+ * process remains that performs teardown.
+ */
+void __mmu_notifier_release(struct mm_struct *mm)
+{
+ struct mmu_notifier *mn;
+
+ while (unlikely(!hlist_empty(&mm->mmu_notifier_list))) {
+ mn = hlist_entry(mm->mmu_notifier_list.first,
+ struct mmu_notifier,
+ hlist);
+ hlist_del(&mn->hlist);
+ if (mn->ops->release)
+ mn->ops->release(mn, mm);
+ }
+}
+
+/*
+ * If no young bitflag is supported by the hardware, ->clear_flush_young can
+ * unmap the address and return 1 or 0 depending if the mapping previously
+ * existed or not.
+ */
+int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
+ unsigned long address)
+{
+ struct mmu_notifier *mn;
+ struct hlist_node *n;
+ int young = 0;
+
+ hlist_for_each_entry(mn, n, &mm->mmu_notifier_list, hlist) {
+ if (mn->ops->clear_flush_young)
+ young |= mn->ops->clear_flush_young(mn, mm, address);
+ }
+
+ return young;
+}
+
+void __mmu_notifier_invalidate_page(struct mm_struct *mm,
+ unsigned long address)
+{
+ struct mmu_notifier *mn;
+ struct hlist_node *n;
+
+ hlist_for_each_entry(mn, n, &mm->mmu_notifier_list, hlist) {
+ if (mn->ops->invalidate_page)
+ mn->ops->invalidate_page(mn, mm, address);
+ }
+}
+
+void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ struct mmu_notifier *mn;
+ struct hlist_node *n;
+
+ hlist_for_each_entry(mn, n, &mm->mmu_notifier_list, hlist) {
+ if (mn->ops->invalidate_range_start)
+ mn->ops->invalidate_range_start(mn, mm, start, end);
+ }
+}
+
+void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ struct mmu_notifier *mn;
+ struct hlist_node *n;
+
+ hlist_for_each_entry(mn, n, &mm->mmu_notifier_list, hlist) {
+ if (mn->ops->invalidate_range_end)
+ mn->ops->invalidate_range_end(mn, mm, start, end);
+ }
+}
+
+/*
+ * Must not hold mmap_sem nor any other VM related lock when calling
+ * this registration function.
+ */
+int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+ struct mm_lock_data *data;
+
+ data = mm_lock(mm);
+ if (unlikely(IS_ERR(data)))
+ return PTR_ERR(data);
+ hlist_add_head(&mn->hlist, &mm->mmu_notifier_list);
+ mm_unlock(mm, data);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_register);
+
+/*
+ * mm_users can't go down to zero while mmu_notifier_unregister()
+ * runs or it can race with ->release. So a mm_users pin must
+ * be taken by the caller (if mm can be different from current->mm).
+ */
+int mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+ struct mm_lock_data *data;
+
+ BUG_ON(!atomic_read(&mm->mm_users));
+
+ data = mm_lock(mm);
+ if (unlikely(IS_ERR(data)))
+ return PTR_ERR(data);
+ hlist_del(&mn->hlist);
+ mm_unlock(mm, data);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
diff --git a/mm/mprotect.c b/mm/mprotect.c
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -21,6 +21,7 @@
#include <linux/syscalls.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/mmu_notifier.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
@@ -198,10 +199,12 @@
dirty_accountable = 1;
}
+ mmu_notifier_invalidate_range_start(mm, start, end);
if (is_vm_hugetlb_page(vma))
hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
else
change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
+ mmu_notifier_invalidate_range_end(mm, start, end);
vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
vm_stat_account(mm, newflags, vma->vm_file, nrpages);
return 0;
diff --git a/mm/mremap.c b/mm/mremap.c
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -18,6 +18,7 @@
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/syscalls.h>
+#include <linux/mmu_notifier.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -74,7 +75,11 @@
struct mm_struct *mm = vma->vm_mm;
pte_t *old_pte, *new_pte, pte;
spinlock_t *old_ptl, *new_ptl;
+ unsigned long old_start;
+ old_start = old_addr;
+ mmu_notifier_invalidate_range_start(vma->vm_mm,
+ old_start, old_end);
if (vma->vm_file) {
/*
* Subtle point from Rajesh Venkatasubramanian: before
@@ -116,6 +121,7 @@
pte_unmap_unlock(old_pte - 1, old_ptl);
if (mapping)
spin_unlock(&mapping->i_mmap_lock);
+ mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
}
#define LATENCY_LIMIT (64 * PAGE_SIZE)
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -49,6 +49,7 @@
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/memcontrol.h>
+#include <linux/mmu_notifier.h>
#include <asm/tlbflush.h>
@@ -287,7 +288,7 @@
if (vma->vm_flags & VM_LOCKED) {
referenced++;
*mapcount = 1; /* break early from loop */
- } else if (ptep_clear_flush_young(vma, address, pte))
+ } else if (ptep_clear_flush_young_notify(vma, address, pte))
referenced++;
/* Pretend the page is referenced if the task has the
@@ -456,7 +457,7 @@
pte_t entry;
flush_cache_page(vma, address, pte_pfn(*pte));
- entry = ptep_clear_flush(vma, address, pte);
+ entry = ptep_clear_flush_notify(vma, address, pte);
entry = pte_wrprotect(entry);
entry = pte_mkclean(entry);
set_pte_at(mm, address, pte, entry);
@@ -717,14 +718,14 @@
* skipped over this mm) then we should reactivate it.
*/
if (!migration && ((vma->vm_flags & VM_LOCKED) ||
- (ptep_clear_flush_young(vma, address, pte)))) {
+ (ptep_clear_flush_young_notify(vma, address, pte)))) {
ret = SWAP_FAIL;
goto out_unmap;
}
/* Nuke the page table entry. */
flush_cache_page(vma, address, page_to_pfn(page));
- pteval = ptep_clear_flush(vma, address, pte);
+ pteval = ptep_clear_flush_notify(vma, address, pte);
/* Move the dirty bit to the physical page now the pte is gone. */
if (pte_dirty(pteval))
@@ -849,12 +850,12 @@
page = vm_normal_page(vma, address, *pte);
BUG_ON(!page || PageAnon(page));
- if (ptep_clear_flush_young(vma, address, pte))
+ if (ptep_clear_flush_young_notify(vma, address, pte))
continue;
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pte));
- pteval = ptep_clear_flush(vma, address, pte);
+ pteval = ptep_clear_flush_notify(vma, address, pte);
/* If nonlinear, store the file page offset in the pte. */
if (page->index != linear_page_index(vma, address))
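To make the ops table above concrete, here is a sketch of what a secondary-MMU driver might plug in (hypothetical driver code, not part of the series); methods left NULL are simply skipped by the __mmu_notifier_* dispatchers:

static void my_invalidate_page(struct mmu_notifier *mn,
			       struct mm_struct *mm, unsigned long address)
{
	/* tear down the secondary-MMU mapping of "address" and flush its TLB */
}

static void my_invalidate_range_start(struct mmu_notifier *mn,
				      struct mm_struct *mm,
				      unsigned long start, unsigned long end)
{
	/* stop new secondary faults in [start, end) and zap existing sptes */
}

static void my_invalidate_range_end(struct mmu_notifier *mn,
				    struct mm_struct *mm,
				    unsigned long start, unsigned long end)
{
	/* allow secondary faults again; they will re-read the Linux ptes */
}

static const struct mmu_notifier_ops my_notifier_ops = {
	.invalidate_page	= my_invalidate_page,
	.invalidate_range_start	= my_invalidate_range_start,
	.invalidate_range_end	= my_invalidate_range_end,
};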
* [PATCH 3 of 9] Moves all mmu notifier methods outside the PT lock (first and not last
2008-04-08 15:44 [PATCH 0 of 9] mmu notifier #v12 Andrea Arcangeli
2008-04-08 15:44 ` [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen Andrea Arcangeli
2008-04-08 15:44 ` [PATCH 2 of 9] Core of mmu notifiers Andrea Arcangeli
@ 2008-04-08 15:44 ` Andrea Arcangeli
2008-04-14 19:57 ` Christoph Lameter
2008-04-08 15:44 ` [PATCH 4 of 9] Move the tlb flushing into free_pgtables. The conversion of the locks Andrea Arcangeli
` (8 subsequent siblings)
11 siblings, 1 reply; 44+ messages in thread
From: Andrea Arcangeli @ 2008-04-08 15:44 UTC (permalink / raw)
To: Christoph Lameter
Cc: akpm, Nick Piggin, Steve Wise, Peter Zijlstra, linux-mm,
Kanoj Sarcar, Roland Dreier, Jack Steiner, linux-kernel,
Avi Kivity, kvm-devel, Robin Holt, general, Hugh Dickins
# HG changeset patch
# User Andrea Arcangeli <andrea@qumranet.com>
# Date 1207666463 -7200
# Node ID 33de2e17d0f5670515833bf8d3d2ea19e2a85b09
# Parent baceb322b45ed43280654dac6c964c9d3d8a936f
Moves all mmu notifier methods outside the PT lock (first and not last
step to make them sleep capable).
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -117,27 +117,6 @@
INIT_HLIST_HEAD(&mm->mmu_notifier_list);
}
-#define ptep_clear_flush_notify(__vma, __address, __ptep) \
-({ \
- pte_t __pte; \
- struct vm_area_struct *___vma = __vma; \
- unsigned long ___address = __address; \
- __pte = ptep_clear_flush(___vma, ___address, __ptep); \
- mmu_notifier_invalidate_page(___vma->vm_mm, ___address); \
- __pte; \
-})
-
-#define ptep_clear_flush_young_notify(__vma, __address, __ptep) \
-({ \
- int __young; \
- struct vm_area_struct *___vma = __vma; \
- unsigned long ___address = __address; \
- __young = ptep_clear_flush_young(___vma, ___address, __ptep); \
- __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \
- ___address); \
- __young; \
-})
-
#else /* CONFIG_MMU_NOTIFIER */
static inline void mmu_notifier_release(struct mm_struct *mm)
@@ -169,9 +148,6 @@
{
}
-#define ptep_clear_flush_young_notify ptep_clear_flush_young
-#define ptep_clear_flush_notify ptep_clear_flush
-
#endif /* CONFIG_MMU_NOTIFIER */
#endif /* _LINUX_MMU_NOTIFIER_H */
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -194,11 +194,13 @@
if (pte) {
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pte));
- pteval = ptep_clear_flush_notify(vma, address, pte);
+ pteval = ptep_clear_flush(vma, address, pte);
page_remove_rmap(page, vma);
dec_mm_counter(mm, file_rss);
BUG_ON(pte_dirty(pteval));
pte_unmap_unlock(pte, ptl);
+ /* must invalidate_page _before_ freeing the page */
+ mmu_notifier_invalidate_page(mm, address);
page_cache_release(page);
}
}
diff --git a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1626,9 +1626,10 @@
*/
page_table = pte_offset_map_lock(mm, pmd, address,
&ptl);
- page_cache_release(old_page);
+ new_page = NULL;
if (!pte_same(*page_table, orig_pte))
goto unlock;
+ page_cache_release(old_page);
page_mkwrite = 1;
}
@@ -1644,6 +1645,7 @@
if (ptep_set_access_flags(vma, address, page_table, entry,1))
update_mmu_cache(vma, address, entry);
ret |= VM_FAULT_WRITE;
+ old_page = new_page = NULL;
goto unlock;
}
@@ -1688,7 +1690,7 @@
* seen in the presence of one thread doing SMC and another
* thread doing COW.
*/
- ptep_clear_flush_notify(vma, address, page_table);
+ ptep_clear_flush(vma, address, page_table);
set_pte_at(mm, address, page_table, entry);
update_mmu_cache(vma, address, entry);
lru_cache_add_active(new_page);
@@ -1700,12 +1702,18 @@
} else
mem_cgroup_uncharge_page(new_page);
- if (new_page)
+unlock:
+ pte_unmap_unlock(page_table, ptl);
+
+ if (new_page) {
+ if (new_page == old_page)
+ /* cow happened, notify before releasing old_page */
+ mmu_notifier_invalidate_page(mm, address);
page_cache_release(new_page);
+ }
if (old_page)
page_cache_release(old_page);
-unlock:
- pte_unmap_unlock(page_table, ptl);
+
if (dirty_page) {
if (vma->vm_file)
file_update_time(vma->vm_file);
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -275,7 +275,7 @@
unsigned long address;
pte_t *pte;
spinlock_t *ptl;
- int referenced = 0;
+ int referenced = 0, clear_flush_young = 0;
address = vma_address(page, vma);
if (address == -EFAULT)
@@ -288,8 +288,11 @@
if (vma->vm_flags & VM_LOCKED) {
referenced++;
*mapcount = 1; /* break early from loop */
- } else if (ptep_clear_flush_young_notify(vma, address, pte))
- referenced++;
+ } else {
+ clear_flush_young = 1;
+ if (ptep_clear_flush_young(vma, address, pte))
+ referenced++;
+ }
/* Pretend the page is referenced if the task has the
swap token and is in the middle of a page fault. */
@@ -299,6 +302,10 @@
(*mapcount)--;
pte_unmap_unlock(pte, ptl);
+
+ if (clear_flush_young)
+ referenced += mmu_notifier_clear_flush_young(mm, address);
+
out:
return referenced;
}
@@ -457,7 +464,7 @@
pte_t entry;
flush_cache_page(vma, address, pte_pfn(*pte));
- entry = ptep_clear_flush_notify(vma, address, pte);
+ entry = ptep_clear_flush(vma, address, pte);
entry = pte_wrprotect(entry);
entry = pte_mkclean(entry);
set_pte_at(mm, address, pte, entry);
@@ -465,6 +472,10 @@
}
pte_unmap_unlock(pte, ptl);
+
+ if (ret)
+ mmu_notifier_invalidate_page(mm, address);
+
out:
return ret;
}
@@ -717,15 +728,14 @@
* If it's recently referenced (perhaps page_referenced
* skipped over this mm) then we should reactivate it.
*/
- if (!migration && ((vma->vm_flags & VM_LOCKED) ||
- (ptep_clear_flush_young_notify(vma, address, pte)))) {
+ if (!migration && (vma->vm_flags & VM_LOCKED)) {
ret = SWAP_FAIL;
goto out_unmap;
}
/* Nuke the page table entry. */
flush_cache_page(vma, address, page_to_pfn(page));
- pteval = ptep_clear_flush_notify(vma, address, pte);
+ pteval = ptep_clear_flush(vma, address, pte);
/* Move the dirty bit to the physical page now the pte is gone. */
if (pte_dirty(pteval))
@@ -780,6 +790,8 @@
out_unmap:
pte_unmap_unlock(pte, ptl);
+ if (ret != SWAP_FAIL)
+ mmu_notifier_invalidate_page(mm, address);
out:
return ret;
}
@@ -818,7 +830,7 @@
spinlock_t *ptl;
struct page *page;
unsigned long address;
- unsigned long end;
+ unsigned long start, end;
address = (vma->vm_start + cursor) & CLUSTER_MASK;
end = address + CLUSTER_SIZE;
@@ -839,6 +851,8 @@
if (!pmd_present(*pmd))
return;
+ start = address;
+ mmu_notifier_invalidate_range_start(mm, start, end);
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
/* Update high watermark before we lower rss */
@@ -850,12 +864,12 @@
page = vm_normal_page(vma, address, *pte);
BUG_ON(!page || PageAnon(page));
- if (ptep_clear_flush_young_notify(vma, address, pte))
+ if (ptep_clear_flush_young(vma, address, pte))
continue;
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pte));
- pteval = ptep_clear_flush_notify(vma, address, pte);
+ pteval = ptep_clear_flush(vma, address, pte);
/* If nonlinear, store the file page offset in the pte. */
if (page->index != linear_page_index(vma, address))
@@ -871,6 +885,7 @@
(*mapcount)--;
}
pte_unmap_unlock(pte - 1, ptl);
+ mmu_notifier_invalidate_range_end(mm, start, end);
}
static int try_to_unmap_anon(struct page *page, int migration)
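The net effect of this patch on a typical rmap unmap path can be summarized in a short sketch (simplified, not literal kernel code; assumes the usual mm headers): the pte is cleared and flushed under the pte spinlock as before, but the notifier now runs only after the lock is dropped, which is what will eventually let implementations sleep there.

static void zap_one_pte_and_notify(struct mm_struct *mm,
				   struct vm_area_struct *vma,
				   pmd_t *pmd, unsigned long address)
{
	spinlock_t *ptl;
	pte_t *pte, pteval;
	struct page *page;

	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
	page = vm_normal_page(vma, address, *pte);
	flush_cache_page(vma, address, pte_pfn(*pte));
	pteval = ptep_clear_flush(vma, address, pte);	/* plain flush, no _notify */
	if (page && pte_dirty(pteval))
		set_page_dirty(page);		/* propagate the dirty bit as before */
	pte_unmap_unlock(pte, ptl);

	/* only now, outside the pte spinlock, tell the secondary MMUs */
	mmu_notifier_invalidate_page(mm, address);
}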
* [PATCH 4 of 9] Move the tlb flushing into free_pgtables. The conversion of the locks
2008-04-08 15:44 [PATCH 0 of 9] mmu notifier #v12 Andrea Arcangeli
` (2 preceding siblings ...)
2008-04-08 15:44 ` [PATCH 3 of 9] Moves all mmu notifier methods outside the PT lock (first and not last Andrea Arcangeli
@ 2008-04-08 15:44 ` Andrea Arcangeli
2008-04-08 15:44 ` [PATCH 5 of 9] The conversion to a rwsem allows callbacks during rmap traversal Andrea Arcangeli
` (7 subsequent siblings)
11 siblings, 0 replies; 44+ messages in thread
From: Andrea Arcangeli @ 2008-04-08 15:44 UTC (permalink / raw)
To: Christoph Lameter
Cc: akpm, Nick Piggin, Steve Wise, Peter Zijlstra, linux-mm,
Kanoj Sarcar, Roland Dreier, Jack Steiner, linux-kernel,
Avi Kivity, kvm-devel, Robin Holt, general, Hugh Dickins
# HG changeset patch
# User Andrea Arcangeli <andrea@qumranet.com>
# Date 1207666463 -7200
# Node ID 2c2ed514f294dbbfc66157f771bc900789ac6005
# Parent 33de2e17d0f5670515833bf8d3d2ea19e2a85b09
Move the tlb flushing into free_pgtables. The conversion of the locks
taken for reverse map scanning would require taking sleeping locks
in free_pgtables(). Moving the tlb flushing into free_pgtables allows
sleeping in parts of free_pgtables().
This means that we do a tlb_finish_mmu() before freeing the page tables.
Strictly speaking there may be no need for another tlb flush after freeing
the tables, but it's the only way to free a series of page table pages from
the tlb list, and we do not want to call into the page allocator for
performance reasons. Aim9 numbers look okay after this patch.
Signed-off-by: Christoph Lameter <clameter@sgi.com>
diff --git a/include/linux/mm.h b/include/linux/mm.h
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -751,8 +751,8 @@
void *private);
void free_pgd_range(struct mmu_gather **tlb, unsigned long addr,
unsigned long end, unsigned long floor, unsigned long ceiling);
-void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma,
- unsigned long floor, unsigned long ceiling);
+void free_pgtables(struct vm_area_struct *start_vma, unsigned long floor,
+ unsigned long ceiling);
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma);
void unmap_mapping_range(struct address_space *mapping,
diff --git a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -272,9 +272,11 @@
} while (pgd++, addr = next, addr != end);
}
-void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
- unsigned long floor, unsigned long ceiling)
+void free_pgtables(struct vm_area_struct *vma, unsigned long floor,
+ unsigned long ceiling)
{
+ struct mmu_gather *tlb;
+
while (vma) {
struct vm_area_struct *next = vma->vm_next;
unsigned long addr = vma->vm_start;
@@ -286,8 +288,10 @@
unlink_file_vma(vma);
if (is_vm_hugetlb_page(vma)) {
- hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
+ tlb = tlb_gather_mmu(vma->vm_mm, 0);
+ hugetlb_free_pgd_range(&tlb, addr, vma->vm_end,
floor, next? next->vm_start: ceiling);
+ tlb_finish_mmu(tlb, addr, vma->vm_end);
} else {
/*
* Optimization: gather nearby vmas into one call down
@@ -299,8 +303,10 @@
anon_vma_unlink(vma);
unlink_file_vma(vma);
}
- free_pgd_range(tlb, addr, vma->vm_end,
+ tlb = tlb_gather_mmu(vma->vm_mm, 0);
+ free_pgd_range(&tlb, addr, vma->vm_end,
floor, next? next->vm_start: ceiling);
+ tlb_finish_mmu(tlb, addr, vma->vm_end);
}
vma = next;
}
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1752,9 +1752,9 @@
mmu_notifier_invalidate_range_start(mm, start, end);
unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
- free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
+ tlb_finish_mmu(tlb, start, end);
+ free_pgtables(vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
next? next->vm_start: 0);
- tlb_finish_mmu(tlb, start, end);
mmu_notifier_invalidate_range_end(mm, start, end);
}
@@ -2051,8 +2051,8 @@
/* Use -1 here to ensure all VMAs in the mm are unmapped */
end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
- free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
tlb_finish_mmu(tlb, 0, end);
+ free_pgtables(vma, FIRST_USER_ADDRESS, 0);
/*
* Walk the list again, actually closing and freeing it,
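Condensed, the resulting unmap_region()-style flow looks as follows (sketch with a simplified parameter list, not the literal kernel function): the gather used for unmapping user pages is finished first, and only then are the page tables freed, with free_pgtables() building its own short-lived gathers per vma.

static void unmap_region_sketch(struct mm_struct *mm, struct vm_area_struct *vma,
				unsigned long start, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	struct mmu_gather *tlb;
	unsigned long nr_accounted = 0;

	mmu_notifier_invalidate_range_start(mm, start, end);
	tlb = tlb_gather_mmu(mm, 0);
	unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
	vm_unacct_memory(nr_accounted);
	tlb_finish_mmu(tlb, start, end);	/* flush the user pages first */
	free_pgtables(vma, floor, ceiling);	/* then free page tables; may sleep */
	mmu_notifier_invalidate_range_end(mm, start, end);
}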
* [PATCH 5 of 9] The conversion to a rwsem allows callbacks during rmap traversal
2008-04-08 15:44 [PATCH 0 of 9] mmu notifier #v12 Andrea Arcangeli
` (3 preceding siblings ...)
2008-04-08 15:44 ` [PATCH 4 of 9] Move the tlb flushing into free_pgtables. The conversion of the locks Andrea Arcangeli
@ 2008-04-08 15:44 ` Andrea Arcangeli
2008-04-08 15:44 ` [PATCH 6 of 9] We no longer abort unmapping in unmap vmas because we can reschedule while Andrea Arcangeli
` (6 subsequent siblings)
11 siblings, 0 replies; 44+ messages in thread
From: Andrea Arcangeli @ 2008-04-08 15:44 UTC (permalink / raw)
To: Christoph Lameter
Cc: akpm, Nick Piggin, Steve Wise, Peter Zijlstra, linux-mm,
Kanoj Sarcar, Roland Dreier, Jack Steiner, linux-kernel,
Avi Kivity, kvm-devel, Robin Holt, general, Hugh Dickins
# HG changeset patch
# User Andrea Arcangeli <andrea@qumranet.com>
# Date 1207666463 -7200
# Node ID 20e829e35dfeceeb55a816ef495afda10cd50b98
# Parent 2c2ed514f294dbbfc66157f771bc900789ac6005
The conversion to an rwsem allows callbacks during rmap traversal
for files in a non-atomic context. An rw-style lock also allows concurrent
walking of the reverse map. This is fairly straightforward if one removes
pieces of the resched checking.
[Restarting unmapping is an issue to be discussed.]
This slightly increases Aim9 performance results on an 8p.
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -69,7 +69,7 @@
if (!vma_shareable(vma, addr))
return;
- spin_lock(&mapping->i_mmap_lock);
+ down_read(&mapping->i_mmap_sem);
vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
if (svma == vma)
continue;
@@ -94,7 +94,7 @@
put_page(virt_to_page(spte));
spin_unlock(&mm->page_table_lock);
out:
- spin_unlock(&mapping->i_mmap_lock);
+ up_read(&mapping->i_mmap_sem);
}
/*
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -454,10 +454,10 @@
pgoff = offset >> PAGE_SHIFT;
i_size_write(inode, offset);
- spin_lock(&mapping->i_mmap_lock);
+ down_read(&mapping->i_mmap_sem);
if (!prio_tree_empty(&mapping->i_mmap))
hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
- spin_unlock(&mapping->i_mmap_lock);
+ up_read(&mapping->i_mmap_sem);
truncate_hugepages(inode, offset);
return 0;
}
diff --git a/fs/inode.c b/fs/inode.c
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -210,7 +210,7 @@
INIT_LIST_HEAD(&inode->i_devices);
INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
rwlock_init(&inode->i_data.tree_lock);
- spin_lock_init(&inode->i_data.i_mmap_lock);
+ init_rwsem(&inode->i_data.i_mmap_sem);
INIT_LIST_HEAD(&inode->i_data.private_list);
spin_lock_init(&inode->i_data.private_lock);
INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
diff --git a/include/linux/fs.h b/include/linux/fs.h
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -503,7 +503,7 @@
unsigned int i_mmap_writable;/* count VM_SHARED mappings */
struct prio_tree_root i_mmap; /* tree of private and shared mappings */
struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
- spinlock_t i_mmap_lock; /* protect tree, count, list */
+ struct rw_semaphore i_mmap_sem; /* protect tree, count, list */
unsigned int truncate_count; /* Cover race condition with truncate */
unsigned long nrpages; /* number of total pages */
pgoff_t writeback_index;/* writeback starts here */
diff --git a/include/linux/mm.h b/include/linux/mm.h
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -716,7 +716,7 @@
struct address_space *check_mapping; /* Check page->mapping if set */
pgoff_t first_index; /* Lowest page->index to unmap */
pgoff_t last_index; /* Highest page->index to unmap */
- spinlock_t *i_mmap_lock; /* For unmap_mapping_range: */
+ struct rw_semaphore *i_mmap_sem; /* For unmap_mapping_range: */
unsigned long truncate_count; /* Compare vm_truncate_count */
};
@@ -1051,9 +1051,9 @@
unsigned long flags, struct page **pages);
struct mm_lock_data {
- spinlock_t **i_mmap_locks;
+ struct rw_semaphore **i_mmap_sems;
spinlock_t **anon_vma_locks;
- unsigned long nr_i_mmap_locks;
+ unsigned long nr_i_mmap_sems;
unsigned long nr_anon_vma_locks;
};
extern struct mm_lock_data *mm_lock(struct mm_struct * mm);
diff --git a/kernel/fork.c b/kernel/fork.c
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -274,12 +274,12 @@
atomic_dec(&inode->i_writecount);
/* insert tmp into the share list, just after mpnt */
- spin_lock(&file->f_mapping->i_mmap_lock);
+ down_write(&file->f_mapping->i_mmap_sem);
tmp->vm_truncate_count = mpnt->vm_truncate_count;
flush_dcache_mmap_lock(file->f_mapping);
vma_prio_tree_add(tmp, mpnt);
flush_dcache_mmap_unlock(file->f_mapping);
- spin_unlock(&file->f_mapping->i_mmap_lock);
+ up_write(&file->f_mapping->i_mmap_sem);
}
/*
diff --git a/mm/filemap.c b/mm/filemap.c
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -61,16 +61,16 @@
/*
* Lock ordering:
*
- * ->i_mmap_lock (vmtruncate)
+ * ->i_mmap_sem (vmtruncate)
* ->private_lock (__free_pte->__set_page_dirty_buffers)
* ->swap_lock (exclusive_swap_page, others)
* ->mapping->tree_lock
*
* ->i_mutex
- * ->i_mmap_lock (truncate->unmap_mapping_range)
+ * ->i_mmap_sem (truncate->unmap_mapping_range)
*
* ->mmap_sem
- * ->i_mmap_lock
+ * ->i_mmap_sem
* ->page_table_lock or pte_lock (various, mainly in memory.c)
* ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
*
@@ -87,7 +87,7 @@
* ->sb_lock (fs/fs-writeback.c)
* ->mapping->tree_lock (__sync_single_inode)
*
- * ->i_mmap_lock
+ * ->i_mmap_sem
* ->anon_vma.lock (vma_adjust)
*
* ->anon_vma.lock
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -184,7 +184,7 @@
if (!page)
return;
- spin_lock(&mapping->i_mmap_lock);
+ down_read(&mapping->i_mmap_sem);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
mm = vma->vm_mm;
address = vma->vm_start +
@@ -204,7 +204,7 @@
page_cache_release(page);
}
}
- spin_unlock(&mapping->i_mmap_lock);
+ up_read(&mapping->i_mmap_sem);
}
/*
diff --git a/mm/fremap.c b/mm/fremap.c
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -206,13 +206,13 @@
}
goto out;
}
- spin_lock(&mapping->i_mmap_lock);
+ down_write(&mapping->i_mmap_sem);
flush_dcache_mmap_lock(mapping);
vma->vm_flags |= VM_NONLINEAR;
vma_prio_tree_remove(vma, &mapping->i_mmap);
vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
flush_dcache_mmap_unlock(mapping);
- spin_unlock(&mapping->i_mmap_lock);
+ up_write(&mapping->i_mmap_sem);
}
mmu_notifier_invalidate_range_start(mm, start, start + size);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -790,7 +790,7 @@
struct page *page;
struct page *tmp;
/*
- * A page gathering list, protected by per file i_mmap_lock. The
+ * A page gathering list, protected by per file i_mmap_sem. The
* lock is used to avoid list corruption from multiple unmapping
* of the same page since we are using page->lru.
*/
@@ -840,9 +840,9 @@
* do nothing in this case.
*/
if (vma->vm_file) {
- spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
+ down_write(&vma->vm_file->f_mapping->i_mmap_sem);
__unmap_hugepage_range(vma, start, end);
- spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
+ up_write(&vma->vm_file->f_mapping->i_mmap_sem);
}
}
@@ -1085,7 +1085,7 @@
BUG_ON(address >= end);
flush_cache_range(vma, address, end);
- spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
+ down_write(&vma->vm_file->f_mapping->i_mmap_sem);
spin_lock(&mm->page_table_lock);
for (; address < end; address += HPAGE_SIZE) {
ptep = huge_pte_offset(mm, address);
@@ -1100,7 +1100,7 @@
}
}
spin_unlock(&mm->page_table_lock);
- spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
+ up_write(&vma->vm_file->f_mapping->i_mmap_sem);
flush_tlb_range(vma, start, end);
}
diff --git a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -838,7 +838,6 @@
unsigned long tlb_start = 0; /* For tlb_finish_mmu */
int tlb_start_valid = 0;
unsigned long start = start_addr;
- spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
int fullmm = (*tlbp)->fullmm;
for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
@@ -875,22 +874,12 @@
}
tlb_finish_mmu(*tlbp, tlb_start, start);
-
- if (need_resched() ||
- (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
- if (i_mmap_lock) {
- *tlbp = NULL;
- goto out;
- }
- cond_resched();
- }
-
+ cond_resched();
*tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
tlb_start_valid = 0;
zap_work = ZAP_BLOCK_SIZE;
}
}
-out:
return start; /* which is now the end (or restart) address */
}
@@ -1752,7 +1741,7 @@
/*
* Helper functions for unmap_mapping_range().
*
- * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
+ * __ Notes on dropping i_mmap_sem to reduce latency while unmapping __
*
* We have to restart searching the prio_tree whenever we drop the lock,
* since the iterator is only valid while the lock is held, and anyway
@@ -1771,7 +1760,7 @@
* can't efficiently keep all vmas in step with mapping->truncate_count:
* so instead reset them all whenever it wraps back to 0 (then go to 1).
* mapping->truncate_count and vma->vm_truncate_count are protected by
- * i_mmap_lock.
+ * i_mmap_sem.
*
* In order to make forward progress despite repeatedly restarting some
* large vma, note the restart_addr from unmap_vmas when it breaks out:
@@ -1821,7 +1810,7 @@
restart_addr = zap_page_range(vma, start_addr,
end_addr - start_addr, details);
- need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
+ need_break = need_resched();
if (restart_addr >= end_addr) {
/* We have now completed this vma: mark it so */
@@ -1835,9 +1824,9 @@
goto again;
}
- spin_unlock(details->i_mmap_lock);
+ up_write(details->i_mmap_sem);
cond_resched();
- spin_lock(details->i_mmap_lock);
+ down_write(details->i_mmap_sem);
return -EINTR;
}
@@ -1931,9 +1920,9 @@
details.last_index = hba + hlen - 1;
if (details.last_index < details.first_index)
details.last_index = ULONG_MAX;
- details.i_mmap_lock = &mapping->i_mmap_lock;
+ details.i_mmap_sem = &mapping->i_mmap_sem;
- spin_lock(&mapping->i_mmap_lock);
+ down_write(&mapping->i_mmap_sem);
/* Protect against endless unmapping loops */
mapping->truncate_count++;
@@ -1948,7 +1937,7 @@
unmap_mapping_range_tree(&mapping->i_mmap, &details);
if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
- spin_unlock(&mapping->i_mmap_lock);
+ up_write(&mapping->i_mmap_sem);
}
EXPORT_SYMBOL(unmap_mapping_range);
diff --git a/mm/migrate.c b/mm/migrate.c
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -211,12 +211,12 @@
if (!mapping)
return;
- spin_lock(&mapping->i_mmap_lock);
+ down_read(&mapping->i_mmap_sem);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
remove_migration_pte(vma, old, new);
- spin_unlock(&mapping->i_mmap_lock);
+ up_read(&mapping->i_mmap_sem);
}
/*
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -188,7 +188,7 @@
}
/*
- * Requires inode->i_mapping->i_mmap_lock
+ * Requires inode->i_mapping->i_mmap_sem
*/
static void __remove_shared_vm_struct(struct vm_area_struct *vma,
struct file *file, struct address_space *mapping)
@@ -216,9 +216,9 @@
if (file) {
struct address_space *mapping = file->f_mapping;
- spin_lock(&mapping->i_mmap_lock);
+ down_write(&mapping->i_mmap_sem);
__remove_shared_vm_struct(vma, file, mapping);
- spin_unlock(&mapping->i_mmap_lock);
+ up_write(&mapping->i_mmap_sem);
}
}
@@ -441,7 +441,7 @@
mapping = vma->vm_file->f_mapping;
if (mapping) {
- spin_lock(&mapping->i_mmap_lock);
+ down_write(&mapping->i_mmap_sem);
vma->vm_truncate_count = mapping->truncate_count;
}
anon_vma_lock(vma);
@@ -451,7 +451,7 @@
anon_vma_unlock(vma);
if (mapping)
- spin_unlock(&mapping->i_mmap_lock);
+ up_write(&mapping->i_mmap_sem);
mm->map_count++;
validate_mm(mm);
@@ -538,7 +538,7 @@
mapping = file->f_mapping;
if (!(vma->vm_flags & VM_NONLINEAR))
root = &mapping->i_mmap;
- spin_lock(&mapping->i_mmap_lock);
+ down_write(&mapping->i_mmap_sem);
if (importer &&
vma->vm_truncate_count != next->vm_truncate_count) {
/*
@@ -622,7 +622,7 @@
if (anon_vma)
spin_unlock(&anon_vma->lock);
if (mapping)
- spin_unlock(&mapping->i_mmap_lock);
+ up_write(&mapping->i_mmap_sem);
if (remove_next) {
if (file)
@@ -2066,7 +2066,7 @@
/* Insert vm structure into process list sorted by address
* and into the inode's i_mmap tree. If vm_file is non-NULL
- * then i_mmap_lock is taken here.
+ * then i_mmap_sem is taken here.
*/
int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
{
@@ -2258,22 +2258,23 @@
struct mm_lock_data *mm_lock(struct mm_struct * mm)
{
struct vm_area_struct *vma;
- spinlock_t *i_mmap_lock_last, *anon_vma_lock_last;
- unsigned long nr_i_mmap_locks, nr_anon_vma_locks, i;
+ struct rw_semaphore *i_mmap_sem_last;
+ spinlock_t *anon_vma_lock_last;
+ unsigned long nr_i_mmap_sems, nr_anon_vma_locks, i;
struct mm_lock_data *data;
int err;
down_write(&mm->mmap_sem);
err = -EINTR;
- nr_i_mmap_locks = nr_anon_vma_locks = 0;
+ nr_i_mmap_sems = nr_anon_vma_locks = 0;
for (vma = mm->mmap; vma; vma = vma->vm_next) {
cond_resched();
if (unlikely(signal_pending(current)))
goto out;
if (vma->vm_file && vma->vm_file->f_mapping)
- nr_i_mmap_locks++;
+ nr_i_mmap_sems++;
if (vma->anon_vma)
nr_anon_vma_locks++;
}
@@ -2283,13 +2284,13 @@
if (!data)
goto out;
- if (nr_i_mmap_locks) {
- data->i_mmap_locks = vmalloc(nr_i_mmap_locks *
- sizeof(spinlock_t));
- if (!data->i_mmap_locks)
+ if (nr_i_mmap_sems) {
+ data->i_mmap_sems = vmalloc(nr_i_mmap_sems *
+ sizeof(struct rw_semaphore));
+ if (!data->i_mmap_sems)
goto out_kfree;
} else
- data->i_mmap_locks = NULL;
+ data->i_mmap_sems = NULL;
if (nr_anon_vma_locks) {
data->anon_vma_locks = vmalloc(nr_anon_vma_locks *
@@ -2300,10 +2301,11 @@
data->anon_vma_locks = NULL;
err = -EINTR;
- i_mmap_lock_last = NULL;
- nr_i_mmap_locks = 0;
+ i_mmap_sem_last = NULL;
+ nr_i_mmap_sems = 0;
for (;;) {
- spinlock_t *i_mmap_lock = (spinlock_t *) -1UL;
+ struct rw_semaphore *i_mmap_sem;
+ i_mmap_sem = (struct rw_semaphore *) -1UL;
for (vma = mm->mmap; vma; vma = vma->vm_next) {
cond_resched();
if (unlikely(signal_pending(current)))
@@ -2311,21 +2313,21 @@
if (!vma->vm_file || !vma->vm_file->f_mapping)
continue;
- if ((unsigned long) i_mmap_lock >
+ if ((unsigned long) i_mmap_sem >
(unsigned long)
- &vma->vm_file->f_mapping->i_mmap_lock &&
+ &vma->vm_file->f_mapping->i_mmap_sem &&
(unsigned long)
- &vma->vm_file->f_mapping->i_mmap_lock >
- (unsigned long) i_mmap_lock_last)
- i_mmap_lock =
- &vma->vm_file->f_mapping->i_mmap_lock;
+ &vma->vm_file->f_mapping->i_mmap_sem >
+ (unsigned long) i_mmap_sem_last)
+ i_mmap_sem =
+ &vma->vm_file->f_mapping->i_mmap_sem;
}
- if (i_mmap_lock == (spinlock_t *) -1UL)
+ if (i_mmap_sem == (struct rw_semaphore *) -1UL)
break;
- i_mmap_lock_last = i_mmap_lock;
- data->i_mmap_locks[nr_i_mmap_locks++] = i_mmap_lock;
+ i_mmap_sem_last = i_mmap_sem;
+ data->i_mmap_sems[nr_i_mmap_sems++] = i_mmap_sem;
}
- data->nr_i_mmap_locks = nr_i_mmap_locks;
+ data->nr_i_mmap_sems = nr_i_mmap_sems;
anon_vma_lock_last = NULL;
nr_anon_vma_locks = 0;
@@ -2351,8 +2353,8 @@
}
data->nr_anon_vma_locks = nr_anon_vma_locks;
- for (i = 0; i < nr_i_mmap_locks; i++)
- spin_lock(data->i_mmap_locks[i]);
+ for (i = 0; i < nr_i_mmap_sems; i++)
+ down_write(data->i_mmap_sems[i]);
for (i = 0; i < nr_anon_vma_locks; i++)
spin_lock(data->anon_vma_locks[i]);
@@ -2361,7 +2363,7 @@
out_vfree_both:
vfree(data->anon_vma_locks);
out_vfree:
- vfree(data->i_mmap_locks);
+ vfree(data->i_mmap_sems);
out_kfree:
kfree(data);
out:
@@ -2373,14 +2375,14 @@
{
unsigned long i;
- for (i = 0; i < data->nr_i_mmap_locks; i++)
- spin_unlock(data->i_mmap_locks[i]);
+ for (i = 0; i < data->nr_i_mmap_sems; i++)
+ up_write(data->i_mmap_sems[i]);
for (i = 0; i < data->nr_anon_vma_locks; i++)
spin_unlock(data->anon_vma_locks[i]);
up_write(&mm->mmap_sem);
- vfree(data->i_mmap_locks);
+ vfree(data->i_mmap_sems);
vfree(data->anon_vma_locks);
kfree(data);
}
diff --git a/mm/mremap.c b/mm/mremap.c
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -88,7 +88,7 @@
* and we propagate stale pages into the dst afterward.
*/
mapping = vma->vm_file->f_mapping;
- spin_lock(&mapping->i_mmap_lock);
+ down_write(&mapping->i_mmap_sem);
if (new_vma->vm_truncate_count &&
new_vma->vm_truncate_count != vma->vm_truncate_count)
new_vma->vm_truncate_count = 0;
@@ -120,7 +120,7 @@
pte_unmap_nested(new_pte - 1);
pte_unmap_unlock(old_pte - 1, old_ptl);
if (mapping)
- spin_unlock(&mapping->i_mmap_lock);
+ up_write(&mapping->i_mmap_sem);
mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
}
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -24,7 +24,7 @@
* inode->i_alloc_sem (vmtruncate_range)
* mm->mmap_sem
* page->flags PG_locked (lock_page)
- * mapping->i_mmap_lock
+ * mapping->i_mmap_sem
* anon_vma->lock
* mm->page_table_lock or pte_lock
* zone->lru_lock (in mark_page_accessed, isolate_lru_page)
@@ -373,14 +373,14 @@
* The page lock not only makes sure that page->mapping cannot
* suddenly be NULLified by truncation, it makes sure that the
* structure at mapping cannot be freed and reused yet,
- * so we can safely take mapping->i_mmap_lock.
+ * so we can safely take mapping->i_mmap_sem.
*/
BUG_ON(!PageLocked(page));
- spin_lock(&mapping->i_mmap_lock);
+ down_read(&mapping->i_mmap_sem);
/*
- * i_mmap_lock does not stabilize mapcount at all, but mapcount
+ * i_mmap_sem does not stabilize mapcount at all, but mapcount
* is more likely to be accurate if we note it after spinning.
*/
mapcount = page_mapcount(page);
@@ -403,7 +403,7 @@
break;
}
- spin_unlock(&mapping->i_mmap_lock);
+ up_read(&mapping->i_mmap_sem);
return referenced;
}
@@ -489,12 +489,12 @@
BUG_ON(PageAnon(page));
- spin_lock(&mapping->i_mmap_lock);
+ down_read(&mapping->i_mmap_sem);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
if (vma->vm_flags & VM_SHARED)
ret += page_mkclean_one(page, vma);
}
- spin_unlock(&mapping->i_mmap_lock);
+ up_read(&mapping->i_mmap_sem);
return ret;
}
@@ -930,7 +930,7 @@
unsigned long max_nl_size = 0;
unsigned int mapcount;
- spin_lock(&mapping->i_mmap_lock);
+ down_read(&mapping->i_mmap_sem);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
ret = try_to_unmap_one(page, vma, migration);
if (ret == SWAP_FAIL || !page_mapped(page))
@@ -967,7 +967,6 @@
mapcount = page_mapcount(page);
if (!mapcount)
goto out;
- cond_resched_lock(&mapping->i_mmap_lock);
max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
if (max_nl_cursor == 0)
@@ -989,7 +988,6 @@
}
vma->vm_private_data = (void *) max_nl_cursor;
}
- cond_resched_lock(&mapping->i_mmap_lock);
max_nl_cursor += CLUSTER_SIZE;
} while (max_nl_cursor <= max_nl_size);
@@ -1001,7 +999,7 @@
list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
vma->vm_private_data = NULL;
out:
- spin_unlock(&mapping->i_mmap_lock);
+ up_write(&mapping->i_mmap_sem);
return ret;
}
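With the semaphore in place, a file-backed rmap walk now follows this reader-side pattern (hypothetical helper shown for illustration, not in the patch): multiple walkers can traverse the prio tree concurrently, and whatever they call per vma is allowed to sleep.

static void walk_file_mappings(struct page *page,
			       void (*fn)(struct vm_area_struct *vma))
{
	struct address_space *mapping = page->mapping;
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;

	down_read(&mapping->i_mmap_sem);	/* shared: walkers can run concurrently */
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
		fn(vma);			/* may sleep under the rwsem */
	up_read(&mapping->i_mmap_sem);
}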
* [PATCH 6 of 9] We no longer abort unmapping in unmap vmas because we can reschedule while
2008-04-08 15:44 [PATCH 0 of 9] mmu notifier #v12 Andrea Arcangeli
` (4 preceding siblings ...)
2008-04-08 15:44 ` [PATCH 5 of 9] The conversion to a rwsem allows callbacks during rmap traversal Andrea Arcangeli
@ 2008-04-08 15:44 ` Andrea Arcangeli
2008-04-08 15:44 ` [PATCH 7 of 9] Convert the anon_vma spinlock to a rw semaphore. This allows concurrent Andrea Arcangeli
` (5 subsequent siblings)
11 siblings, 0 replies; 44+ messages in thread
From: Andrea Arcangeli @ 2008-04-08 15:44 UTC (permalink / raw)
To: Christoph Lameter
Cc: akpm, Nick Piggin, Steve Wise, Peter Zijlstra, linux-mm,
Kanoj Sarcar, Roland Dreier, Jack Steiner, linux-kernel,
Avi Kivity, kvm-devel, Robin Holt, general, Hugh Dickins
# HG changeset patch
# User Andrea Arcangeli <andrea@qumranet.com>
# Date 1207666893 -7200
# Node ID b0cb674314534b9cc4759603f123474d38427b2d
# Parent 20e829e35dfeceeb55a816ef495afda10cd50b98
We no longer abort unmapping in unmap vmas because we can reschedule while
unmapping since we are holding a semaphore. This would allow moving more
of the tlb flushing into unmap_vmas, reducing code in various places.
Signed-off-by: Christoph Lameter <clameter@sgi.com>
diff --git a/include/linux/mm.h b/include/linux/mm.h
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -723,8 +723,7 @@
struct page *vm_normal_page(struct vm_area_struct *, unsigned long, pte_t);
unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *);
-unsigned long unmap_vmas(struct mmu_gather **tlb,
- struct vm_area_struct *start_vma, unsigned long start_addr,
+unsigned long unmap_vmas(struct vm_area_struct *start_vma, unsigned long start_addr,
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *);
diff --git a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -805,7 +805,6 @@
/**
* unmap_vmas - unmap a range of memory covered by a list of vma's
- * @tlbp: address of the caller's struct mmu_gather
* @vma: the starting vma
* @start_addr: virtual address at which to start unmapping
* @end_addr: virtual address at which to end unmapping
@@ -817,20 +816,13 @@
* Unmap all pages in the vma list.
*
* We aim to not hold locks for too long (for scheduling latency reasons).
- * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
- * return the ending mmu_gather to the caller.
+ * So zap pages in ZAP_BLOCK_SIZE bytecounts.
*
* Only addresses between `start' and `end' will be unmapped.
*
* The VMA list must be sorted in ascending virtual address order.
- *
- * unmap_vmas() assumes that the caller will flush the whole unmapped address
- * range after unmap_vmas() returns. So the only responsibility here is to
- * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
- * drops the lock and schedules.
*/
-unsigned long unmap_vmas(struct mmu_gather **tlbp,
- struct vm_area_struct *vma, unsigned long start_addr,
+unsigned long unmap_vmas(struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *details)
{
@@ -838,7 +830,15 @@
unsigned long tlb_start = 0; /* For tlb_finish_mmu */
int tlb_start_valid = 0;
unsigned long start = start_addr;
- int fullmm = (*tlbp)->fullmm;
+ int fullmm;
+ struct mmu_gather *tlb;
+ struct mm_struct *mm = vma->vm_mm;
+
+ mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
+ lru_add_drain();
+ tlb = tlb_gather_mmu(mm, 0);
+ update_hiwater_rss(mm);
+ fullmm = tlb->fullmm;
for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
unsigned long end;
@@ -865,7 +865,7 @@
(HPAGE_SIZE / PAGE_SIZE);
start = end;
} else
- start = unmap_page_range(*tlbp, vma,
+ start = unmap_page_range(tlb, vma,
start, end, &zap_work, details);
if (zap_work > 0) {
@@ -873,13 +873,15 @@
break;
}
- tlb_finish_mmu(*tlbp, tlb_start, start);
+ tlb_finish_mmu(tlb, tlb_start, start);
cond_resched();
- *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
+ tlb = tlb_gather_mmu(vma->vm_mm, fullmm);
tlb_start_valid = 0;
zap_work = ZAP_BLOCK_SIZE;
}
}
+ tlb_finish_mmu(tlb, start_addr, end_addr);
+ mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
return start; /* which is now the end (or restart) address */
}
@@ -893,20 +895,10 @@
unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *details)
{
- struct mm_struct *mm = vma->vm_mm;
- struct mmu_gather *tlb;
unsigned long end = address + size;
unsigned long nr_accounted = 0;
- lru_add_drain();
- tlb = tlb_gather_mmu(mm, 0);
- update_hiwater_rss(mm);
- mmu_notifier_invalidate_range_start(mm, address, end);
- end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
- mmu_notifier_invalidate_range_end(mm, address, end);
- if (tlb)
- tlb_finish_mmu(tlb, address, end);
- return end;
+ return unmap_vmas(vma, address, end, &nr_accounted, details);
}
/*
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1743,19 +1743,12 @@
unsigned long start, unsigned long end)
{
struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
- struct mmu_gather *tlb;
unsigned long nr_accounted = 0;
- lru_add_drain();
- tlb = tlb_gather_mmu(mm, 0);
- update_hiwater_rss(mm);
- mmu_notifier_invalidate_range_start(mm, start, end);
- unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
+ unmap_vmas(vma, start, end, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
- tlb_finish_mmu(tlb, start, end);
free_pgtables(vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
next? next->vm_start: 0);
- mmu_notifier_invalidate_range_end(mm, start, end);
}
/*
@@ -2035,7 +2028,6 @@
/* Release all mmaps. */
void exit_mmap(struct mm_struct *mm)
{
- struct mmu_gather *tlb;
struct vm_area_struct *vma = mm->mmap;
unsigned long nr_accounted = 0;
unsigned long end;
@@ -2046,12 +2038,9 @@
lru_add_drain();
flush_cache_mm(mm);
- tlb = tlb_gather_mmu(mm, 1);
- /* Don't update_hiwater_rss(mm) here, do_exit already did */
- /* Use -1 here to ensure all VMAs in the mm are unmapped */
- end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
+
+ end = unmap_vmas(vma, 0, -1, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
- tlb_finish_mmu(tlb, 0, end);
free_pgtables(vma, FIRST_USER_ADDRESS, 0);
/*
^ permalink raw reply [flat|nested] 44+ messages in thread
* [PATCH 7 of 9] Convert the anon_vma spinlock to a rw semaphore. This allows concurrent
2008-04-08 15:44 [PATCH 0 of 9] mmu notifier #v12 Andrea Arcangeli
` (5 preceding siblings ...)
2008-04-08 15:44 ` [PATCH 6 of 9] We no longer abort unmapping in unmap vmas because we can reschedule while Andrea Arcangeli
@ 2008-04-08 15:44 ` Andrea Arcangeli
2008-04-08 15:44 ` [PATCH 8 of 9] XPMEM would have used sys_madvise() except that madvise_dontneed() Andrea Arcangeli
` (4 subsequent siblings)
11 siblings, 0 replies; 44+ messages in thread
From: Andrea Arcangeli @ 2008-04-08 15:44 UTC (permalink / raw)
To: Christoph Lameter
Cc: akpm, Nick Piggin, Steve Wise, Peter Zijlstra, linux-mm,
Kanoj Sarcar, Roland Dreier, Jack Steiner, linux-kernel,
Avi Kivity, kvm-devel, Robin Holt, general, Hugh Dickins
# HG changeset patch
# User Andrea Arcangeli <andrea@qumranet.com>
# Date 1207666968 -7200
# Node ID a0c52e4b9b71e2627238b69c0a58905097973279
# Parent b0cb674314534b9cc4759603f123474d38427b2d
Convert the anon_vma spinlock to a rw semaphore. This allows concurrent
traversal of reverse maps for try_to_unmap and page_mkclean. It also
allows the calling of sleeping functions from reverse map traversal.
An additional complication is that RCU is used in some contexts to guarantee
the presence of the anon_vma while we acquire the lock. We cannot take a
semaphore within an RCU critical section. Add a refcount to the anon_vma
structure which allows us to give an existence guarantee for the anon_vma
structure independent of the spinlock or the list contents.
The refcount can then be taken within the RCU section. If it has been
taken successfully then the refcount guarantees the existence of the
anon_vma. The refcount in anon_vma also allows us to fix a nasty
issue in page migration where we fudged by using RCU for a long code
path to guarantee the existence of the anon_vma.
The refcount in general allows a shortening of RCU critical sections since
we can do an rcu_read_unlock() after taking the refcount. This is particularly
relevant if the anon_vma chains contain hundreds of entries.
Issues:
- Atomic overhead increases in situations where a new reference
to the anon_vma has to be established or removed. Overhead also increases
when a speculative reference is used (try_to_unmap,
page_mkclean, page migration). There are also more frequent processor
changes due to up_xxx letting waiting tasks run first.
This results in e.g. the Aim9 brk performance test going down by 10-15%.
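In caller terms, the refcount enables the following pattern (a minimal sketch
only, using the grab_anon_vma()/put_anon_vma() helpers introduced by the rmap.c
hunks below; it mirrors what page_lock_anon_vma() becomes in this patch):

	struct anon_vma *anon_vma;

	/* take a speculative reference under RCU (atomic_inc_not_zero) */
	anon_vma = grab_anon_vma(page);
	if (anon_vma) {
		/* sleeping is now allowed: the refcount pins the anon_vma */
		down_read(&anon_vma->sem);
		/* ... traverse anon_vma->head ... */
		up_read(&anon_vma->sem);
		/* drop the reference; frees the anon_vma if it was the last */
		put_anon_vma(anon_vma);
	}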
Signed-off-by: Christoph Lameter <clameter@sgi.com>
diff --git a/include/linux/mm.h b/include/linux/mm.h
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1051,9 +1051,9 @@
struct mm_lock_data {
struct rw_semaphore **i_mmap_sems;
- spinlock_t **anon_vma_locks;
+ struct rw_semaphore **anon_vma_sems;
unsigned long nr_i_mmap_sems;
- unsigned long nr_anon_vma_locks;
+ unsigned long nr_anon_vma_sems;
};
extern struct mm_lock_data *mm_lock(struct mm_struct * mm);
extern void mm_unlock(struct mm_struct *mm, struct mm_lock_data *data);
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -25,7 +25,8 @@
* pointing to this anon_vma once its vma list is empty.
*/
struct anon_vma {
- spinlock_t lock; /* Serialize access to vma list */
+ atomic_t refcount; /* vmas on the list */
+ struct rw_semaphore sem;/* Serialize access to vma list */
struct list_head head; /* List of private "related" vmas */
};
@@ -43,18 +44,31 @@
kmem_cache_free(anon_vma_cachep, anon_vma);
}
+struct anon_vma *grab_anon_vma(struct page *page);
+
+static inline void get_anon_vma(struct anon_vma *anon_vma)
+{
+ atomic_inc(&anon_vma->refcount);
+}
+
+static inline void put_anon_vma(struct anon_vma *anon_vma)
+{
+ if (atomic_dec_and_test(&anon_vma->refcount))
+ anon_vma_free(anon_vma);
+}
+
static inline void anon_vma_lock(struct vm_area_struct *vma)
{
struct anon_vma *anon_vma = vma->anon_vma;
if (anon_vma)
- spin_lock(&anon_vma->lock);
+ down_write(&anon_vma->sem);
}
static inline void anon_vma_unlock(struct vm_area_struct *vma)
{
struct anon_vma *anon_vma = vma->anon_vma;
if (anon_vma)
- spin_unlock(&anon_vma->lock);
+ up_write(&anon_vma->sem);
}
/*
diff --git a/mm/migrate.c b/mm/migrate.c
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -235,15 +235,16 @@
return;
/*
- * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
+ * We hold either the mmap_sem lock or a reference on the
+ * anon_vma. So no need to call page_lock_anon_vma.
*/
anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
- spin_lock(&anon_vma->lock);
+ down_read(&anon_vma->sem);
list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
remove_migration_pte(vma, old, new);
- spin_unlock(&anon_vma->lock);
+ up_read(&anon_vma->sem);
}
/*
@@ -623,7 +624,7 @@
int rc = 0;
int *result = NULL;
struct page *newpage = get_new_page(page, private, &result);
- int rcu_locked = 0;
+ struct anon_vma *anon_vma = NULL;
int charge = 0;
if (!newpage)
@@ -647,16 +648,14 @@
}
/*
* By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
- * we cannot notice that anon_vma is freed while we migrates a page.
+ * we cannot notice that anon_vma is freed while we migrate a page.
* This rcu_read_lock() delays freeing anon_vma pointer until the end
* of migration. File cache pages are no problem because of page_lock()
* File Caches may use write_page() or lock_page() in migration, then,
* just care Anon page here.
*/
- if (PageAnon(page)) {
- rcu_read_lock();
- rcu_locked = 1;
- }
+ if (PageAnon(page))
+ anon_vma = grab_anon_vma(page);
/*
* Corner case handling:
@@ -674,10 +673,7 @@
if (!PageAnon(page) && PagePrivate(page)) {
/*
* Go direct to try_to_free_buffers() here because
- * a) that's what try_to_release_page() would do anyway
- * b) we may be under rcu_read_lock() here, so we can't
- * use GFP_KERNEL which is what try_to_release_page()
- * needs to be effective.
+ * that's what try_to_release_page() would do anyway
*/
try_to_free_buffers(page);
}
@@ -698,8 +694,8 @@
} else if (charge)
mem_cgroup_end_migration(newpage);
rcu_unlock:
- if (rcu_locked)
- rcu_read_unlock();
+ if (anon_vma)
+ put_anon_vma(anon_vma);
unlock:
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -566,7 +566,7 @@
if (vma->anon_vma)
anon_vma = vma->anon_vma;
if (anon_vma) {
- spin_lock(&anon_vma->lock);
+ down_write(&anon_vma->sem);
/*
* Easily overlooked: when mprotect shifts the boundary,
* make sure the expanding vma has anon_vma set if the
@@ -620,7 +620,7 @@
}
if (anon_vma)
- spin_unlock(&anon_vma->lock);
+ up_write(&anon_vma->sem);
if (mapping)
up_write(&mapping->i_mmap_sem);
@@ -2247,16 +2247,15 @@
struct mm_lock_data *mm_lock(struct mm_struct * mm)
{
struct vm_area_struct *vma;
- struct rw_semaphore *i_mmap_sem_last;
- spinlock_t *anon_vma_lock_last;
- unsigned long nr_i_mmap_sems, nr_anon_vma_locks, i;
+ struct rw_semaphore *i_mmap_sem_last, *anon_vma_sem_last;
+ unsigned long nr_i_mmap_sems, nr_anon_vma_sems, i;
struct mm_lock_data *data;
int err;
down_write(&mm->mmap_sem);
err = -EINTR;
- nr_i_mmap_sems = nr_anon_vma_locks = 0;
+ nr_i_mmap_sems = nr_anon_vma_sems = 0;
for (vma = mm->mmap; vma; vma = vma->vm_next) {
cond_resched();
if (unlikely(signal_pending(current)))
@@ -2265,7 +2264,7 @@
if (vma->vm_file && vma->vm_file->f_mapping)
nr_i_mmap_sems++;
if (vma->anon_vma)
- nr_anon_vma_locks++;
+ nr_anon_vma_sems++;
}
err = -ENOMEM;
@@ -2281,13 +2280,13 @@
} else
data->i_mmap_sems = NULL;
- if (nr_anon_vma_locks) {
- data->anon_vma_locks = vmalloc(nr_anon_vma_locks *
- sizeof(spinlock_t));
- if (!data->anon_vma_locks)
+ if (nr_anon_vma_sems) {
+ data->anon_vma_sems = vmalloc(nr_anon_vma_sems *
+ sizeof(struct rw_semaphore));
+ if (!data->anon_vma_sems)
goto out_vfree;
} else
- data->anon_vma_locks = NULL;
+ data->anon_vma_sems = NULL;
err = -EINTR;
i_mmap_sem_last = NULL;
@@ -2318,10 +2317,11 @@
}
data->nr_i_mmap_sems = nr_i_mmap_sems;
- anon_vma_lock_last = NULL;
- nr_anon_vma_locks = 0;
+ anon_vma_sem_last = NULL;
+ nr_anon_vma_sems = 0;
for (;;) {
- spinlock_t *anon_vma_lock = (spinlock_t *) -1UL;
+ struct rw_semaphore *anon_vma_sem;
+ anon_vma_sem = (struct rw_semaphore *) -1UL;
for (vma = mm->mmap; vma; vma = vma->vm_next) {
cond_resched();
if (unlikely(signal_pending(current)))
@@ -2329,28 +2329,28 @@
if (!vma->anon_vma)
continue;
- if ((unsigned long) anon_vma_lock >
- (unsigned long) &vma->anon_vma->lock &&
- (unsigned long) &vma->anon_vma->lock >
- (unsigned long) anon_vma_lock_last)
- anon_vma_lock = &vma->anon_vma->lock;
+ if ((unsigned long) anon_vma_sem >
+ (unsigned long) &vma->anon_vma->sem &&
+ (unsigned long) &vma->anon_vma->sem >
+ (unsigned long) anon_vma_sem_last)
+ anon_vma_sem = &vma->anon_vma->sem;
}
- if (anon_vma_lock == (spinlock_t *) -1UL)
+ if (anon_vma_sem == (struct rw_semaphore *) -1UL)
break;
- anon_vma_lock_last = anon_vma_lock;
- data->anon_vma_locks[nr_anon_vma_locks++] = anon_vma_lock;
+ anon_vma_sem_last = anon_vma_sem;
+ data->anon_vma_sems[nr_anon_vma_sems++] = anon_vma_sem;
}
- data->nr_anon_vma_locks = nr_anon_vma_locks;
+ data->nr_anon_vma_sems = nr_anon_vma_sems;
for (i = 0; i < nr_i_mmap_sems; i++)
down_write(data->i_mmap_sems[i]);
- for (i = 0; i < nr_anon_vma_locks; i++)
- spin_lock(data->anon_vma_locks[i]);
+ for (i = 0; i < nr_anon_vma_sems; i++)
+ down_write(data->anon_vma_sems[i]);
return data;
out_vfree_both:
- vfree(data->anon_vma_locks);
+ vfree(data->anon_vma_sems);
out_vfree:
vfree(data->i_mmap_sems);
out_kfree:
@@ -2366,12 +2366,12 @@
for (i = 0; i < data->nr_i_mmap_sems; i++)
up_write(data->i_mmap_sems[i]);
- for (i = 0; i < data->nr_anon_vma_locks; i++)
- spin_unlock(data->anon_vma_locks[i]);
+ for (i = 0; i < data->nr_anon_vma_sems; i++)
+ up_write(data->anon_vma_sems[i]);
up_write(&mm->mmap_sem);
vfree(data->i_mmap_sems);
- vfree(data->anon_vma_locks);
+ vfree(data->anon_vma_sems);
kfree(data);
}
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -69,7 +69,7 @@
if (anon_vma) {
allocated = NULL;
locked = anon_vma;
- spin_lock(&locked->lock);
+ down_write(&locked->sem);
} else {
anon_vma = anon_vma_alloc();
if (unlikely(!anon_vma))
@@ -81,6 +81,7 @@
/* page_table_lock to protect against threads */
spin_lock(&mm->page_table_lock);
if (likely(!vma->anon_vma)) {
+ get_anon_vma(anon_vma);
vma->anon_vma = anon_vma;
list_add_tail(&vma->anon_vma_node, &anon_vma->head);
allocated = NULL;
@@ -88,7 +89,7 @@
spin_unlock(&mm->page_table_lock);
if (locked)
- spin_unlock(&locked->lock);
+ up_write(&locked->sem);
if (unlikely(allocated))
anon_vma_free(allocated);
}
@@ -99,14 +100,17 @@
{
BUG_ON(vma->anon_vma != next->anon_vma);
list_del(&next->anon_vma_node);
+ put_anon_vma(vma->anon_vma);
}
void __anon_vma_link(struct vm_area_struct *vma)
{
struct anon_vma *anon_vma = vma->anon_vma;
- if (anon_vma)
+ if (anon_vma) {
+ get_anon_vma(anon_vma);
list_add_tail(&vma->anon_vma_node, &anon_vma->head);
+ }
}
void anon_vma_link(struct vm_area_struct *vma)
@@ -114,36 +118,32 @@
struct anon_vma *anon_vma = vma->anon_vma;
if (anon_vma) {
- spin_lock(&anon_vma->lock);
+ get_anon_vma(anon_vma);
+ down_write(&anon_vma->sem);
list_add_tail(&vma->anon_vma_node, &anon_vma->head);
- spin_unlock(&anon_vma->lock);
+ up_write(&anon_vma->sem);
}
}
void anon_vma_unlink(struct vm_area_struct *vma)
{
struct anon_vma *anon_vma = vma->anon_vma;
- int empty;
if (!anon_vma)
return;
- spin_lock(&anon_vma->lock);
+ down_write(&anon_vma->sem);
list_del(&vma->anon_vma_node);
-
- /* We must garbage collect the anon_vma if it's empty */
- empty = list_empty(&anon_vma->head);
- spin_unlock(&anon_vma->lock);
-
- if (empty)
- anon_vma_free(anon_vma);
+ up_write(&anon_vma->sem);
+ put_anon_vma(anon_vma);
}
static void anon_vma_ctor(struct kmem_cache *cachep, void *data)
{
struct anon_vma *anon_vma = data;
- spin_lock_init(&anon_vma->lock);
+ init_rwsem(&anon_vma->sem);
+ atomic_set(&anon_vma->refcount, 0);
INIT_LIST_HEAD(&anon_vma->head);
}
@@ -157,9 +157,9 @@
* Getting a lock on a stable anon_vma from a page off the LRU is
* tricky: page_lock_anon_vma rely on RCU to guard against the races.
*/
-static struct anon_vma *page_lock_anon_vma(struct page *page)
+struct anon_vma *grab_anon_vma(struct page *page)
{
- struct anon_vma *anon_vma;
+ struct anon_vma *anon_vma = NULL;
unsigned long anon_mapping;
rcu_read_lock();
@@ -170,17 +170,26 @@
goto out;
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
- spin_lock(&anon_vma->lock);
- return anon_vma;
+ if (!atomic_inc_not_zero(&anon_vma->refcount))
+ anon_vma = NULL;
out:
rcu_read_unlock();
- return NULL;
+ return anon_vma;
+}
+
+static struct anon_vma *page_lock_anon_vma(struct page *page)
+{
+ struct anon_vma *anon_vma = grab_anon_vma(page);
+
+ if (anon_vma)
+ down_read(&anon_vma->sem);
+ return anon_vma;
}
static void page_unlock_anon_vma(struct anon_vma *anon_vma)
{
- spin_unlock(&anon_vma->lock);
- rcu_read_unlock();
+ up_read(&anon_vma->sem);
+ put_anon_vma(anon_vma);
}
/*
^ permalink raw reply [flat|nested] 44+ messages in thread
* [PATCH 8 of 9] XPMEM would have used sys_madvise() except that madvise_dontneed()
2008-04-08 15:44 [PATCH 0 of 9] mmu notifier #v12 Andrea Arcangeli
` (6 preceding siblings ...)
2008-04-08 15:44 ` [PATCH 7 of 9] Convert the anon_vma spinlock to a rw semaphore. This allows concurrent Andrea Arcangeli
@ 2008-04-08 15:44 ` Andrea Arcangeli
2008-04-08 15:44 ` [PATCH 9 of 9] This patch adds a lock ordering rule to avoid a potential deadlock when Andrea Arcangeli
` (3 subsequent siblings)
11 siblings, 0 replies; 44+ messages in thread
From: Andrea Arcangeli @ 2008-04-08 15:44 UTC (permalink / raw)
To: Christoph Lameter
Cc: akpm, Nick Piggin, Steve Wise, Peter Zijlstra, linux-mm,
Kanoj Sarcar, Roland Dreier, Jack Steiner, linux-kernel,
Avi Kivity, kvm-devel, Robin Holt, general, Hugh Dickins
# HG changeset patch
# User Andrea Arcangeli <andrea@qumranet.com>
# Date 1207666972 -7200
# Node ID 3b14e26a4e0491f00bb989be04d8b7e0755ed2d7
# Parent a0c52e4b9b71e2627238b69c0a58905097973279
XPMEM would have used sys_madvise() except that madvise_dontneed()
returns -EINVAL if VM_PFNMAP is set, which is always true for the pages
XPMEM imports from other partitions and is also true for uncached pages
allocated locally via the mspec allocator. XPMEM needs zap_page_range()
functionality for these types of pages as well as 'normal' pages.
Signed-off-by: Dean Nelson <dcn@sgi.com>
diff --git a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -900,6 +900,7 @@
return unmap_vmas(vma, address, end, &nr_accounted, details);
}
+EXPORT_SYMBOL_GPL(zap_page_range);
/*
* Do a quick page-table lookup for a single page.
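For illustration, a minimal sketch (hypothetical module code, not part of the
patch) of what the export above makes possible for a GPL module such as XPMEM:

	/* recall a granted range by zapping the attacher's PTEs */
	static void my_recall_range(struct vm_area_struct *vma,
				    unsigned long start, unsigned long size)
	{
		/* zap_page_range() is callable from modules once exported */
		zap_page_range(vma, start, size, NULL);
	}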
^ permalink raw reply [flat|nested] 44+ messages in thread
* [PATCH 9 of 9] This patch adds a lock ordering rule to avoid a potential deadlock when
2008-04-08 15:44 [PATCH 0 of 9] mmu notifier #v12 Andrea Arcangeli
` (7 preceding siblings ...)
2008-04-08 15:44 ` [PATCH 8 of 9] XPMEM would have used sys_madvise() except that madvise_dontneed() Andrea Arcangeli
@ 2008-04-08 15:44 ` Andrea Arcangeli
2008-04-08 21:46 ` [PATCH 0 of 9] mmu notifier #v12 Avi Kivity
` (2 subsequent siblings)
11 siblings, 0 replies; 44+ messages in thread
From: Andrea Arcangeli @ 2008-04-08 15:44 UTC (permalink / raw)
To: Christoph Lameter
Cc: akpm, Nick Piggin, Steve Wise, Peter Zijlstra, linux-mm,
Kanoj Sarcar, Roland Dreier, Jack Steiner, linux-kernel,
Avi Kivity, kvm-devel, Robin Holt, general, Hugh Dickins
# HG changeset patch
# User Andrea Arcangeli <andrea@qumranet.com>
# Date 1207666972 -7200
# Node ID bd55023b22769ecb14b26c2347947f7d6d63bcea
# Parent 3b14e26a4e0491f00bb989be04d8b7e0755ed2d7
This patch adds a lock ordering rule to avoid a potential deadlock when
multiple mmap_sems need to be locked.
Signed-off-by: Dean Nelson <dcn@sgi.com>
diff --git a/mm/filemap.c b/mm/filemap.c
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -79,6 +79,9 @@
*
* ->i_mutex (generic_file_buffered_write)
* ->mmap_sem (fault_in_pages_readable->do_page_fault)
+ *
+ * When taking multiple mmap_sems, one should lock the lowest-addressed
+ * one first proceeding on up to the highest-addressed one.
*
* ->i_mutex
* ->i_alloc_sem (various)
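As a minimal illustration of the ordering rule added above (hypothetical helper
name, not part of the patch), a caller needing two mms' mmap_sem would do:

	static void lock_two_mmap_sems(struct mm_struct *mm1, struct mm_struct *mm2)
	{
		/* always lock the lowest-addressed mm first */
		if (mm1 == mm2) {
			down_write(&mm1->mmap_sem);
		} else if (mm1 < mm2) {
			down_write(&mm1->mmap_sem);
			down_write(&mm2->mmap_sem);
		} else {
			down_write(&mm2->mmap_sem);
			down_write(&mm1->mmap_sem);
		}
	}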
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 2 of 9] Core of mmu notifiers
2008-04-08 15:44 ` [PATCH 2 of 9] Core of mmu notifiers Andrea Arcangeli
@ 2008-04-08 16:26 ` Robin Holt
2008-04-08 17:05 ` Andrea Arcangeli
2008-04-14 19:57 ` Christoph Lameter
2008-04-14 19:59 ` Christoph Lameter
2 siblings, 1 reply; 44+ messages in thread
From: Robin Holt @ 2008-04-08 16:26 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Christoph Lameter, akpm, Nick Piggin, Steve Wise, Peter Zijlstra,
linux-mm, Kanoj Sarcar, Roland Dreier, Jack Steiner,
linux-kernel, Avi Kivity, kvm-devel, Robin Holt, general,
Hugh Dickins
This one does not build on ia64. I get the following:
[holt@attica mmu_v12_xpmem_v003_v1]$ make compressed
CHK include/linux/version.h
CHK include/linux/utsrelease.h
CALL scripts/checksyscalls.sh
CHK include/linux/compile.h
CC mm/mmu_notifier.o
In file included from include/linux/mmu_notifier.h:6,
from mm/mmu_notifier.c:12:
include/linux/mm_types.h:200: error: expected specifier-qualifier-list before ‘cpumask_t’
In file included from mm/mmu_notifier.c:12:
include/linux/mmu_notifier.h: In function ‘mm_has_notifiers’:
include/linux/mmu_notifier.h:62: error: ‘struct mm_struct’ has no member named ‘mmu_notifier_list’
include/linux/mmu_notifier.h: In function ‘mmu_notifier_mm_init’:
include/linux/mmu_notifier.h:117: error: ‘struct mm_struct’ has no member named ‘mmu_notifier_list’
In file included from include/asm/pgtable.h:155,
from include/linux/mm.h:39,
from mm/mmu_notifier.c:14:
include/asm/mmu_context.h: In function ‘get_mmu_context’:
include/asm/mmu_context.h:81: error: ‘struct mm_struct’ has no member named ‘context’
include/asm/mmu_context.h:88: error: ‘struct mm_struct’ has no member named ‘context’
include/asm/mmu_context.h:90: error: ‘struct mm_struct’ has no member named ‘cpu_vm_mask’
include/asm/mmu_context.h:99: error: ‘struct mm_struct’ has no member named ‘context’
include/asm/mmu_context.h: In function ‘init_new_context’:
include/asm/mmu_context.h:120: error: ‘struct mm_struct’ has no member named ‘context’
include/asm/mmu_context.h: In function ‘activate_context’:
include/asm/mmu_context.h:173: error: ‘struct mm_struct’ has no member named ‘cpu_vm_mask’
include/asm/mmu_context.h:174: error: ‘struct mm_struct’ has no member named ‘cpu_vm_mask’
include/asm/mmu_context.h:180: error: ‘struct mm_struct’ has no member named ‘context’
mm/mmu_notifier.c: In function ‘__mmu_notifier_release’:
mm/mmu_notifier.c:25: error: ‘struct mm_struct’ has no member named ‘mmu_notifier_list’
mm/mmu_notifier.c:26: error: ‘struct mm_struct’ has no member named ‘mmu_notifier_list’
mm/mmu_notifier.c: In function ‘__mmu_notifier_clear_flush_young’:
mm/mmu_notifier.c:47: error: ‘struct mm_struct’ has no member named ‘mmu_notifier_list’
mm/mmu_notifier.c: In function ‘__mmu_notifier_invalidate_page’:
mm/mmu_notifier.c:61: error: ‘struct mm_struct’ has no member named ‘mmu_notifier_list’
mm/mmu_notifier.c: In function ‘__mmu_notifier_invalidate_range_start’:
mm/mmu_notifier.c:73: error: ‘struct mm_struct’ has no member named ‘mmu_notifier_list’
mm/mmu_notifier.c: In function ‘__mmu_notifier_invalidate_range_end’:
mm/mmu_notifier.c:85: error: ‘struct mm_struct’ has no member named ‘mmu_notifier_list’
mm/mmu_notifier.c: In function ‘mmu_notifier_register’:
mm/mmu_notifier.c:102: error: ‘struct mm_struct’ has no member named ‘mmu_notifier_list’
make[1]: *** [mm/mmu_notifier.o] Error 1
make: *** [mm] Error 2
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 2 of 9] Core of mmu notifiers
2008-04-08 16:26 ` Robin Holt
@ 2008-04-08 17:05 ` Andrea Arcangeli
0 siblings, 0 replies; 44+ messages in thread
From: Andrea Arcangeli @ 2008-04-08 17:05 UTC (permalink / raw)
To: Robin Holt
Cc: Christoph Lameter, akpm, Nick Piggin, Steve Wise, Peter Zijlstra,
linux-mm, Kanoj Sarcar, Roland Dreier, Jack Steiner,
linux-kernel, Avi Kivity, kvm-devel, general, Hugh Dickins
On Tue, Apr 08, 2008 at 11:26:19AM -0500, Robin Holt wrote:
> This one does not build on ia64. I get the following:
I think it's a common code compilation bug not related to my
patch. Can you test this?
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -10,6 +10,7 @@
#include <linux/rbtree.h>
#include <linux/rwsem.h>
#include <linux/completion.h>
+#include <linux/cpumask.h>
#include <asm/page.h>
#include <asm/mmu.h>
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 0 of 9] mmu notifier #v12
2008-04-08 15:44 [PATCH 0 of 9] mmu notifier #v12 Andrea Arcangeli
` (8 preceding siblings ...)
2008-04-08 15:44 ` [PATCH 9 of 9] This patch adds a lock ordering rule to avoid a potential deadlock when Andrea Arcangeli
@ 2008-04-08 21:46 ` Avi Kivity
2008-04-08 22:06 ` Andrea Arcangeli
2008-04-09 13:17 ` Robin Holt
2008-04-14 23:09 ` Christoph Lameter
11 siblings, 1 reply; 44+ messages in thread
From: Avi Kivity @ 2008-04-08 21:46 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Christoph Lameter, akpm, Nick Piggin, Steve Wise, Peter Zijlstra,
linux-mm, Kanoj Sarcar, Roland Dreier, Jack Steiner,
linux-kernel, kvm-devel, Robin Holt, general, Hugh Dickins
Andrea Arcangeli wrote:
> Note that mmu_notifier_unregister may also fail with -EINTR if there are
> signal pending or the system runs out of vmalloc space or physical memory,
> only exit_mmap guarantees that any kernel module can be unloaded in presence
> of an oom condition.
>
>
That's unusual. What happens to the notifier? Suppose I destroy a vm
without exiting the process, what happens if it fires?
--
Any sufficiently difficult bug is indistinguishable from a feature.
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 0 of 9] mmu notifier #v12
2008-04-08 21:46 ` [PATCH 0 of 9] mmu notifier #v12 Avi Kivity
@ 2008-04-08 22:06 ` Andrea Arcangeli
0 siblings, 0 replies; 44+ messages in thread
From: Andrea Arcangeli @ 2008-04-08 22:06 UTC (permalink / raw)
To: Avi Kivity
Cc: Christoph Lameter, akpm, Nick Piggin, Steve Wise, Peter Zijlstra,
linux-mm, Kanoj Sarcar, Roland Dreier, Jack Steiner,
linux-kernel, kvm-devel, Robin Holt, general, Hugh Dickins
On Wed, Apr 09, 2008 at 12:46:49AM +0300, Avi Kivity wrote:
> That's unusual. What happens to the notifier? Suppose I destroy a vm
Yes it's quite unusual.
> without exiting the process, what happens if it fires?
The mmu notifier ops should stop doing stuff (if there are no
memslots they will be noops), or the ops can be replaced atomically
with null pointers. The important thing is that the module can't go
away until ->release is invoked or until mmu_notifier_unregister
returns 0.
Previously there was no mmu_notifier_unregister, so adding it can't be
a regression compared to #v11, even if it can fail and you may have to
retry later after returning to userland. Retrying from userland is
always safe in OOM-kill terms; only looping inside the kernel isn't
safe, as do_exit has no chance to run.
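In driver terms, a caller might handle the fallible unregister along these
lines (a hypothetical sketch only; my_ctx and the dead flag are made up, the
mmu_notifier_unregister() signature is the one from patch 2):

	int my_detach(struct my_ctx *ctx)
	{
		int err = mmu_notifier_unregister(&ctx->mn, ctx->mm);
		if (err) {
			/* -EINTR or -ENOMEM: make the ops noops and let
			 * userland retry, or wait for ->release at exit_mmap */
			ctx->dead = 1;
			return err;
		}
		/* notifier fully disarmed, safe to tear down */
		return 0;
	}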
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 0 of 9] mmu notifier #v12
2008-04-08 15:44 [PATCH 0 of 9] mmu notifier #v12 Andrea Arcangeli
` (9 preceding siblings ...)
2008-04-08 21:46 ` [PATCH 0 of 9] mmu notifier #v12 Avi Kivity
@ 2008-04-09 13:17 ` Robin Holt
2008-04-09 14:44 ` Andrea Arcangeli
2008-04-14 23:09 ` Christoph Lameter
11 siblings, 1 reply; 44+ messages in thread
From: Robin Holt @ 2008-04-09 13:17 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Christoph Lameter, akpm, Nick Piggin, Steve Wise, Peter Zijlstra,
linux-mm, Kanoj Sarcar, Roland Dreier, Jack Steiner,
linux-kernel, Avi Kivity, kvm-devel, Robin Holt, general,
Hugh Dickins
I applied this patch set with the xpmem version I am working up for
submission and the basic level-1 and level-2 tests passed. The full mpi
regression test still tends to hang, but that appears to be a failure
common to both emm and mmu notifiers and therefore, I am certain, a
problem in my code.
Please note this is not an endorsement of one method over the other,
merely that under conditions where we would expect xpmem to pass the
regression tests, it does pass those tests.
Thanks,
Robin
On Tue, Apr 08, 2008 at 05:44:03PM +0200, Andrea Arcangeli wrote:
> The difference with #v11 is a different implementation of mm_lock that
> guarantees handling signals in O(N). It's also more lowlatency friendly.
>
> Note that mmu_notifier_unregister may also fail with -EINTR if there are
> signal pending or the system runs out of vmalloc space or physical memory,
> only exit_mmap guarantees that any kernel module can be unloaded in presence
> of an oom condition.
>
> Either #v11 or the first three #v12 1,2,3 patches are suitable for inclusion
> in -mm, pick what you prefer looking at the mmu_notifier_register retval and
> mm_lock retval difference, I implemented and slighty tested both. GRU and KVM
> only needs 1,2,3, XPMEM needs the rest of the patchset too (4, ...) but all
> patches from 4 to the end can be deffered to a second merge window.
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 0 of 9] mmu notifier #v12
2008-04-09 13:17 ` Robin Holt
@ 2008-04-09 14:44 ` Andrea Arcangeli
2008-04-09 18:55 ` Robin Holt
0 siblings, 1 reply; 44+ messages in thread
From: Andrea Arcangeli @ 2008-04-09 14:44 UTC (permalink / raw)
To: Robin Holt
Cc: Christoph Lameter, akpm, Nick Piggin, Steve Wise, Peter Zijlstra,
linux-mm, Kanoj Sarcar, Roland Dreier, Jack Steiner,
linux-kernel, Avi Kivity, kvm-devel, general, Hugh Dickins
On Wed, Apr 09, 2008 at 08:17:09AM -0500, Robin Holt wrote:
> I applied this patch set with the xpmem version I am working up for
> submission and the basic level-1 and level-2 tests passed. The full mpi
> regression test still tends to hang, but that appears to be a common
> problem failure affecting either emm or mmu notifiers and therefore, I
> am certain is a problem in my code.
>
> Please note this is not an endorsement of one method over the other,
> merely that under conditions where we would expect xpmem to pass the
> regression tests, it does pass those tests.
Thanks a lot for testing! #v12 works great with KVM too. (I'm now in
the process of changing the KVM patch to drop the page pinning.)
BTW, how did you implement invalidate_page? As this?
invalidate_page() {
invalidate_range_begin()
invalidate_range_end()
}
If yes, I prefer to remind you that normally invalidate_range_begin is
always called before zapping the pte. In the invalidate_page case
instead, invalidate_range_begin is called _after_ the pte has been
zapped already.
Now there's no problem if the pte is established and the spte isn't
established. But it must never happen that the spte is established and
the pte isn't established (with page-pinning that means unswappable
memlock leak, without page-pinning it would mean memory corruption).
So the range_begin must serialize against the secondary mmu page fault
so that it can't establish the spte on a pte that was zapped by the
rmap code after get_user_pages/follow_page returned. I think your
range_begin already does that, so you should be ok, but I wanted to
remind you about this slight difference in implementing invalidate_page
as I suggested above in a previous email, just to be sure ;).
This is the race you must guard against in invalidate_page:
CPU0 CPU1
try_to_unmap on page
secondary mmu page fault
get_user_pages()/follow_page found a page
ptep_clear_flush
invalidate_page()
invalidate_range_begin()
invalidate_range_end()
return from invalidate_page
establish spte on page
return from secondary mmu page fault
If your range_begin already serializes in a hard way against the
secondary mmu page fault, my previously suggested "trivial"
implementation of invalidate_page should work just fine, and this
saves 1 branch for each try_to_unmap_one compared to the emm
implementation. The branch check is inlined and it checks against the
mmu_notifier_head, which is the hot cacheline; no new cacheline is
checked, just one branch is saved, so it's worth it IMHO even if it
doesn't provide any other advantage if you implement it the way above.
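For clarity, the "trivial" implementation being discussed is just a bracket of
the two range callouts (sketch only; the my_* names are hypothetical driver
hooks, assuming the #v12 hook signature of mn/mm/address):

	static void my_invalidate_page(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long address)
	{
		/* only safe if range_start serializes against the secondary
		 * mmu page fault, since the pte is already zapped here */
		my_invalidate_range_start(mn, mm, address, address + PAGE_SIZE);
		my_invalidate_range_end(mn, mm, address, address + PAGE_SIZE);
	}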
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 0 of 9] mmu notifier #v12
2008-04-09 14:44 ` Andrea Arcangeli
@ 2008-04-09 18:55 ` Robin Holt
2008-04-22 7:20 ` Andrea Arcangeli
0 siblings, 1 reply; 44+ messages in thread
From: Robin Holt @ 2008-04-09 18:55 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Robin Holt, Christoph Lameter, akpm, Nick Piggin, Steve Wise,
Peter Zijlstra, linux-mm, Kanoj Sarcar, Roland Dreier,
Jack Steiner, linux-kernel, Avi Kivity, kvm-devel, general,
Hugh Dickins
On Wed, Apr 09, 2008 at 04:44:01PM +0200, Andrea Arcangeli wrote:
> BTW, how did you implement invalidate_page? As this?
>
> invalidate_page() {
> invalidate_range_begin()
> invalidate_range_end()
> }
Essentially, I did the work of each step without releasing and
reacquiring locks.
> If yes, I prefer to remind you that normally invalidate_range_begin is
> always called before zapping the pte. In the invalidate_page case
> instead, invalidate_range_begin is called _after_ the pte has been
> zapped already.
>
> Now there's no problem if the pte is established and the spte isn't
> established. But it must never happen that the spte is established and
> the pte isn't established (with page-pinning that means unswappable
> memlock leak, without page-pinning it would mean memory corruption).
I am not sure I follow what you are saying. Here is a very terse
breakdown of how PFNs flow through xpmem's structures.
We have a PFN table associated with our structure describing a grant.
We use get_user_pages() to acquire information for that table and we
fill the table in under a mutex. Remote hosts (on the same numa network
so they have direct access to the users memory) have a PROXY version of
that structure. It is filled out in a similar fashion to the local
table. PTEs are created for the other processes while holding the mutex
for this table (either local or remote). During the process of
faulting, we have a simple linked list of ongoing faults that is
maintained whenever the mutex is going to be released.
Our version of a zap_page_range is called recall_PFNs. The recall
process grabs the mutex, scans the faulters list for any that cover the
range and marks them as needing a retry. It then calls zap_page_range
for any processes that have attached the granted memory to clear out
their page tables. Finally, we release the mutex and proceed.
The locking is more complex than this, but that is the essential idea.
What that means for mmu_notifiers is we have a single reference on the
page for all the remote processes using it. When the callout to
invalidate_page() is made, we will still have processes with that PTE in
their page tables and potentially TLB entries. When we return from the
invalidate_page() callout, we will have removed all those page table
entries, we will have no in-progress page table or tlb insertions that
will complete, and we will have released all our references to the page.
Does that meet your expectations?
Thanks,
Robin
>
> So the range_begin must serialize against the secondary mmu page fault
> so that it can't establish the spte on a pte that was zapped by the
> rmap code after get_user_pages/follow_page returned. I think your
> range_begin already does that, so you should be ok, but I wanted to
> remind you about this slight difference in implementing invalidate_page
> as I suggested above in a previous email, just to be sure ;).
>
> This is the race you must guard against in invalidate_page:
>
>
> CPU0 CPU1
> try_to_unmap on page
> secondary mmu page fault
> get_user_pages()/follow_page found a page
> ptep_clear_flush
> invalidate_page()
> invalidate_range_begin()
> invalidate_range_end()
> return from invalidate_page
> establish spte on page
> return from secondary mmu page fault
>
> If your range_begin already serializes in a hard way against the
> secondary mmu page fault, my previously suggested "trivial"
> implementation of invalidate_page should work just fine, and this
> saves 1 branch for each try_to_unmap_one compared to the emm
> implementation. The branch check is inlined and it checks against the
> mmu_notifier_head, which is the hot cacheline; no new cacheline is
> checked, just one branch is saved, so it's worth it IMHO even if it
> doesn't provide any other advantage if you implement it the way above.
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 2 of 9] Core of mmu notifiers
2008-04-08 15:44 ` [PATCH 2 of 9] Core of mmu notifiers Andrea Arcangeli
2008-04-08 16:26 ` Robin Holt
@ 2008-04-14 19:57 ` Christoph Lameter
2008-04-14 19:59 ` Christoph Lameter
2 siblings, 0 replies; 44+ messages in thread
From: Christoph Lameter @ 2008-04-14 19:57 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: akpm, Nick Piggin, Steve Wise, Peter Zijlstra, linux-mm,
Kanoj Sarcar, Roland Dreier, Jack Steiner, linux-kernel,
Avi Kivity, kvm-devel, Robin Holt, general, Hugh Dickins
On Tue, 8 Apr 2008, Andrea Arcangeli wrote:
> + /*
> + * Called when nobody can register any more notifier in the mm
> + * and after the "mn" notifier has been disarmed already.
> + */
> + void (*release)(struct mmu_notifier *mn,
> + struct mm_struct *mm);
Hmmm... The unregister function does not call this. I guess the driver calls
the unregister function and does release-like stuff on its own.
> + /*
> + * invalidate_range_start() and invalidate_range_end() must be
> + * paired. Multiple invalidate_range_start/ends may be nested
> + * or called concurrently.
> + */
How could they be nested or called concurrently?
> +/*
> + * mm_users can't go down to zero while mmu_notifier_unregister()
> + * runs or it can race with ->release. So a mm_users pin must
> + * be taken by the caller (if mm can be different from current->mm).
> + */
> +int mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
> +{
> + struct mm_lock_data *data;
> +
> + BUG_ON(!atomic_read(&mm->mm_users));
> +
> + data = mm_lock(mm);
> + if (unlikely(IS_ERR(data)))
> + return PTR_ERR(data);
> + hlist_del(&mn->hlist);
> + mm_unlock(mm, data);
> + return 0;
Hmmm.. Ok, the user of the notifier does not get notified that it was
unregistered.
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 3 of 9] Moves all mmu notifier methods outside the PT lock (first and not last
2008-04-08 15:44 ` [PATCH 3 of 9] Moves all mmu notifier methods outside the PT lock (first and not last Andrea Arcangeli
@ 2008-04-14 19:57 ` Christoph Lameter
0 siblings, 0 replies; 44+ messages in thread
From: Christoph Lameter @ 2008-04-14 19:57 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: akpm, Nick Piggin, Steve Wise, Peter Zijlstra, linux-mm,
Kanoj Sarcar, Roland Dreier, Jack Steiner, linux-kernel,
Avi Kivity, kvm-devel, Robin Holt, general, Hugh Dickins
Not sure why this patch is not merged into 2 of 9. Same comment as last
round.
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 2 of 9] Core of mmu notifiers
2008-04-08 15:44 ` [PATCH 2 of 9] Core of mmu notifiers Andrea Arcangeli
2008-04-08 16:26 ` Robin Holt
2008-04-14 19:57 ` Christoph Lameter
@ 2008-04-14 19:59 ` Christoph Lameter
2 siblings, 0 replies; 44+ messages in thread
From: Christoph Lameter @ 2008-04-14 19:59 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: akpm, Nick Piggin, Steve Wise, Peter Zijlstra, linux-mm,
Kanoj Sarcar, Roland Dreier, Jack Steiner, linux-kernel,
Avi Kivity, kvm-devel, Robin Holt, general, Hugh Dickins
Where is the documentation on locking that you wanted to provide?
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 0 of 9] mmu notifier #v12
2008-04-08 15:44 [PATCH 0 of 9] mmu notifier #v12 Andrea Arcangeli
` (10 preceding siblings ...)
2008-04-09 13:17 ` Robin Holt
@ 2008-04-14 23:09 ` Christoph Lameter
11 siblings, 0 replies; 44+ messages in thread
From: Christoph Lameter @ 2008-04-14 23:09 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: akpm, Nick Piggin, Steve Wise, Peter Zijlstra, linux-mm,
Kanoj Sarcar, Roland Dreier, Jack Steiner, linux-kernel,
Avi Kivity, kvm-devel, Robin Holt, general, Hugh Dickins
On Tue, 8 Apr 2008, Andrea Arcangeli wrote:
> The difference with #v11 is a different implementation of mm_lock that
> guarantees handling signals in O(N). It's also more lowlatency friendly.
Ok. So the rest of the issues remain unaddressed? I am glad that we
finally settled on the locking. But now I will have to clean this up,
address the remaining issues, sequence the patches right, provide docs,
handle the merging issue etc etc? I have seen no detailed review of my
patches that you include here.
Are we going down the same road as we had to go with the OOM patches, where
David Rientjes and I had to deal with the issues you raised?
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen
2008-04-08 15:44 ` [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen Andrea Arcangeli
@ 2008-04-16 16:33 ` Robin Holt
2008-04-16 18:35 ` Christoph Lameter
2008-04-22 5:06 ` Rusty Russell
1 sibling, 1 reply; 44+ messages in thread
From: Robin Holt @ 2008-04-16 16:33 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Christoph Lameter, akpm, Nick Piggin, Steve Wise, Peter Zijlstra,
linux-mm, Kanoj Sarcar, Roland Dreier, Jack Steiner,
linux-kernel, Avi Kivity, kvm-devel, Robin Holt, general,
Hugh Dickins
I don't think this lock mechanism is completely working. I have
gotten a few failures trying to dereference 0x100100 which appears to
be LIST_POISON1.
Thanks,
Robin
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen
2008-04-16 16:33 ` Robin Holt
@ 2008-04-16 18:35 ` Christoph Lameter
2008-04-16 19:02 ` Robin Holt
2008-04-17 15:51 ` Andrea Arcangeli
0 siblings, 2 replies; 44+ messages in thread
From: Christoph Lameter @ 2008-04-16 18:35 UTC (permalink / raw)
To: Robin Holt
Cc: Andrea Arcangeli, akpm, Nick Piggin, Steve Wise, Peter Zijlstra,
linux-mm, Kanoj Sarcar, Roland Dreier, Jack Steiner,
linux-kernel, Avi Kivity, kvm-devel, general, Hugh Dickins
On Wed, 16 Apr 2008, Robin Holt wrote:
> I don't think this lock mechanism is completely working. I have
> gotten a few failures trying to dereference 0x100100 which appears to
> be LIST_POISON1.
How does xpmem unregistering of notifiers work?
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen
2008-04-16 18:35 ` Christoph Lameter
@ 2008-04-16 19:02 ` Robin Holt
2008-04-16 19:15 ` Christoph Lameter
2008-04-17 15:51 ` Andrea Arcangeli
1 sibling, 1 reply; 44+ messages in thread
From: Robin Holt @ 2008-04-16 19:02 UTC (permalink / raw)
To: Christoph Lameter
Cc: Robin Holt, Andrea Arcangeli, akpm, Nick Piggin, Steve Wise,
Peter Zijlstra, linux-mm, Kanoj Sarcar, Roland Dreier,
Jack Steiner, linux-kernel, Avi Kivity, kvm-devel, general,
Hugh Dickins
On Wed, Apr 16, 2008 at 11:35:38AM -0700, Christoph Lameter wrote:
> On Wed, 16 Apr 2008, Robin Holt wrote:
>
> > I don't think this lock mechanism is completely working. I have
> > gotten a few failures trying to dereference 0x100100 which appears to
> > be LIST_POISON1.
>
> How does xpmem unregistering of notifiers work?
For the tests I have been running, we are waiting for the release
callout as part of exit.
Thanks,
Robin
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen
2008-04-16 19:02 ` Robin Holt
@ 2008-04-16 19:15 ` Christoph Lameter
2008-04-17 11:14 ` Robin Holt
0 siblings, 1 reply; 44+ messages in thread
From: Christoph Lameter @ 2008-04-16 19:15 UTC (permalink / raw)
To: Robin Holt
Cc: Andrea Arcangeli, akpm, Nick Piggin, Steve Wise, Peter Zijlstra,
linux-mm, Kanoj Sarcar, Roland Dreier, Jack Steiner,
linux-kernel, Avi Kivity, kvm-devel, general, Hugh Dickins
On Wed, 16 Apr 2008, Robin Holt wrote:
> On Wed, Apr 16, 2008 at 11:35:38AM -0700, Christoph Lameter wrote:
> > On Wed, 16 Apr 2008, Robin Holt wrote:
> >
> > > I don't think this lock mechanism is completely working. I have
> > > gotten a few failures trying to dereference 0x100100 which appears to
> > > be LIST_POISON1.
> >
> > How does xpmem unregistering of notifiers work?
>
> For the tests I have been running, we are waiting for the release
> callout as part of exit.
Some more details on the failure may be useful. AFAICT list_del[_rcu] is
the culprit here and that is only used on release or unregister.
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen
2008-04-16 19:15 ` Christoph Lameter
@ 2008-04-17 11:14 ` Robin Holt
0 siblings, 0 replies; 44+ messages in thread
From: Robin Holt @ 2008-04-17 11:14 UTC (permalink / raw)
To: Christoph Lameter
Cc: Robin Holt, Andrea Arcangeli, akpm, Nick Piggin, Steve Wise,
Peter Zijlstra, linux-mm, Kanoj Sarcar, Roland Dreier,
Jack Steiner, linux-kernel, Avi Kivity, kvm-devel, general,
Hugh Dickins
On Wed, Apr 16, 2008 at 12:15:08PM -0700, Christoph Lameter wrote:
> On Wed, 16 Apr 2008, Robin Holt wrote:
>
> > On Wed, Apr 16, 2008 at 11:35:38AM -0700, Christoph Lameter wrote:
> > > On Wed, 16 Apr 2008, Robin Holt wrote:
> > >
> > > > I don't think this lock mechanism is completely working. I have
> > > > gotten a few failures trying to dereference 0x100100 which appears to
> > > > be LIST_POISON1.
> > >
> > > How does xpmem unregistering of notifiers work?
> >
> > For the tests I have been running, we are waiting for the release
> > callout as part of exit.
>
> Some more details on the failure may be useful. AFAICT list_del[_rcu] is
> the culprit here and that is only used on release or unregister.
I think I have this understood now. It happens quite quickly (within
10 minutes) on a 128 rank job of small data set in a loop.
In these failing jobs, all the ranks are nearly symmetric. There is
a certain part of each ranks address space that has access granted.
All the ranks have included all the other ranks including themselves in
exactly the same layout at exactly the same virtual address.
Rank 3 has hit _release and is beginning to clean up, but has not deleted
the notifier from its list.
Rank 9 calls the xpmem_invalidate_page() callout. That page was attached
by rank 3 so we call zap_page_range on rank 3 which then calls back into
xpmem's invalidate_range_start callout.
The rank 3 _release callout begins and deletes its notifier from the list.
Rank 9's call to rank 3's zap_page_range notifier returns and dereferences
LIST_POISON1.
I often confuse myself while trying to explain these so please kick me
where the holes in the flow appear. The console output from the simple
debugging stuff I put in is a bit overwhelming.
I am trying to figure out now which locks we hold as part of the zap
callout that should have prevented the _release callout.
Thanks,
Robin
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen
2008-04-16 18:35 ` Christoph Lameter
2008-04-16 19:02 ` Robin Holt
@ 2008-04-17 15:51 ` Andrea Arcangeli
2008-04-17 16:36 ` Robin Holt
1 sibling, 1 reply; 44+ messages in thread
From: Andrea Arcangeli @ 2008-04-17 15:51 UTC (permalink / raw)
To: Christoph Lameter
Cc: Robin Holt, akpm, Nick Piggin, Steve Wise, Peter Zijlstra,
linux-mm, Kanoj Sarcar, Roland Dreier, Jack Steiner,
linux-kernel, Avi Kivity, kvm-devel, general, Hugh Dickins
On Wed, Apr 16, 2008 at 11:35:38AM -0700, Christoph Lameter wrote:
> On Wed, 16 Apr 2008, Robin Holt wrote:
>
> > I don't think this lock mechanism is completely working. I have
> > gotten a few failures trying to dereference 0x100100 which appears to
> > be LIST_POISON1.
>
> How does xpmem unregistering of notifiers work?
Especially are you using mmu_notifier_unregister?
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen
2008-04-17 15:51 ` Andrea Arcangeli
@ 2008-04-17 16:36 ` Robin Holt
2008-04-17 17:14 ` Andrea Arcangeli
0 siblings, 1 reply; 44+ messages in thread
From: Robin Holt @ 2008-04-17 16:36 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Christoph Lameter, Robin Holt, akpm, Nick Piggin, Steve Wise,
Peter Zijlstra, linux-mm, Kanoj Sarcar, Roland Dreier,
Jack Steiner, linux-kernel, Avi Kivity, kvm-devel, general,
Hugh Dickins
On Thu, Apr 17, 2008 at 05:51:57PM +0200, Andrea Arcangeli wrote:
> On Wed, Apr 16, 2008 at 11:35:38AM -0700, Christoph Lameter wrote:
> > On Wed, 16 Apr 2008, Robin Holt wrote:
> >
> > > I don't think this lock mechanism is completely working. I have
> > > gotten a few failures trying to dereference 0x100100 which appears to
> > > be LIST_POISON1.
> >
> > How does xpmem unregistering of notifiers work?
>
> Especially are you using mmu_notifier_unregister?
In this case, we are not making the call to unregister, we are waiting
for the _release callout which has already removed it from the list.
In the event that the user has removed all the grants, we use unregister.
That typically does not occur. We merely wait for exit processing to
clean up the structures.
Thanks,
Robin
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen
2008-04-17 16:36 ` Robin Holt
@ 2008-04-17 17:14 ` Andrea Arcangeli
2008-04-17 17:25 ` Robin Holt
2008-04-17 19:10 ` Christoph Lameter
0 siblings, 2 replies; 44+ messages in thread
From: Andrea Arcangeli @ 2008-04-17 17:14 UTC (permalink / raw)
To: Robin Holt
Cc: Christoph Lameter, akpm, Nick Piggin, Steve Wise, Peter Zijlstra,
linux-mm, Kanoj Sarcar, Roland Dreier, Jack Steiner,
linux-kernel, Avi Kivity, kvm-devel, general, Hugh Dickins
On Thu, Apr 17, 2008 at 11:36:42AM -0500, Robin Holt wrote:
> In this case, we are not making the call to unregister, we are waiting
> for the _release callout which has already removed it from the list.
>
> In the event that the user has removed all the grants, we use unregister.
> That typically does not occur. We merely wait for exit processing to
> clean up the structures.
Then it's very strange. LIST_POISON1 is set in n->next. If it was a
second hlist_del triggering the bug, in theory LIST_POISON2 should
trigger first, so perhaps it's really a notifier running despite
mm_lock being taken? Could you post a full stack trace so I can see who's
running into LIST_POISON1? If it's really a notifier running outside
of some mm_lock, that will be _immediately_ visible from the stack
trace that triggered the LIST_POISON1!
Also note, EMM isn't using the clean hlist_del, it's implementing the list
by hand (with zero runtime gain), so all that debugging may not be
present in EMM; so even if it's really a mm_lock race and it only
triggers with mmu notifiers and not with EMM, it doesn't necessarily
mean EMM is bug free. If you have a full stack trace it would greatly
help to verify what is mangling the list when the oops triggers.
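For reference, hlist_del() (include/linux/list.h) poisons the node roughly
like this, which is why a stale traversal faults on 0x100100 through ->next:

	static inline void hlist_del(struct hlist_node *n)
	{
		__hlist_del(n);
		n->next = LIST_POISON1;	/* 0x00100100: catches use after del */
		n->pprev = LIST_POISON2;
	}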
Thanks!
Andrea
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen
2008-04-17 17:14 ` Andrea Arcangeli
@ 2008-04-17 17:25 ` Robin Holt
2008-04-17 19:10 ` Christoph Lameter
1 sibling, 0 replies; 44+ messages in thread
From: Robin Holt @ 2008-04-17 17:25 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Robin Holt, Christoph Lameter, akpm, Nick Piggin, Steve Wise,
Peter Zijlstra, linux-mm, Kanoj Sarcar, Roland Dreier,
Jack Steiner, linux-kernel, Avi Kivity, kvm-devel, general,
Hugh Dickins
On Thu, Apr 17, 2008 at 07:14:43PM +0200, Andrea Arcangeli wrote:
> On Thu, Apr 17, 2008 at 11:36:42AM -0500, Robin Holt wrote:
> > In this case, we are not making the call to unregister, we are waiting
> > for the _release callout which has already removed it from the list.
> >
> > In the event that the user has removed all the grants, we use unregister.
> > That typically does not occur. We merely wait for exit processing to
> > clean up the structures.
>
> Then it's very strange. LIST_POISON1 is set in n->next. If it was a
> second hlist_del triggering the bug in theory list_poison2 should
> trigger first, so perhaps it's really a notifier running despite a
> mm_lock is taken? Could you post a full stack trace so I can see who's
> running into LIST_POISON1? If it's really a notifier running outside
> of some mm_lock that will be _immediately_ visible from the stack
> trace that triggered the LIST_POISON1!
>
> Also note, EMM isn't using the clean hlist_del, it's implementing list
> by hand (with zero runtime gain) so all the debugging may not be
> existent in EMM, so if it's really a mm_lock race, and it only
> triggers with mmu notifiers and not with EMM, it doesn't necessarily
> mean EMM is bug free. If you've a full stack trace it would greatly
> help to verify what is mangling over the list when the oops triggers.
The stack trace is below. I did not do this level of testing on emm so
I can not compare the two in this area.
This is for a different, but equivalent failure. I just reproduced the
LIST_POISON1 failure without trying to reproduce the exact same failure
as I had documented earlier (lost that stack trace, sorry).
Thanks,
Robin
<1>Unable to handle kernel paging request at virtual address 0000000000100100
<4>mpi006.f.x[23403]: Oops 11012296146944 [1]
<4>Modules linked in: nfs lockd sunrpc binfmt_misc thermal processor fan button loop md_mod dm_mod xpmem xp mspec sg
<4>
<4>Pid: 23403, CPU 114, comm: mpi006.f.x
<4>psr : 0000121008526010 ifs : 800000000000038b ip : [<a00000010015d6a1>] Not tainted (2.6.25-rc8)
<4>ip is at __mmu_notifier_invalidate_range_start+0x81/0x120
<4>unat: 0000000000000000 pfs : 000000000000038b rsc : 0000000000000003
<4>rnat: a000000100149a00 bsps: a000000000010740 pr : 66555666a9599aa9
<4>ldrs: 0000000000000000 ccv : 0000000000000000 fpsr: 0009804c0270033f
<4>csd : 0000000000000000 ssd : 0000000000000000
<4>b0 : a00000010015d670 b6 : a0000002101ddb40 b7 : a00000010000eb50
<4>f6 : 1003e2222222222222222 f7 : 000000000000000000000
<4>f8 : 000000000000000000000 f9 : 000000000000000000000
<4>f10 : 000000000000000000000 f11 : 000000000000000000000
<4>r1 : a000000100ef1190 r2 : e0000e6080cc1940 r3 : a0000002101edd10
<4>r8 : e0000e6080cc1970 r9 : 0000000000000000 r10 : e0000e6080cc19c8
<4>r11 : 20000003a6480000 r12 : e0000c60d31efb90 r13 : e0000c60d31e0000
<4>r14 : 000000000000004d r15 : e0000e6080cc1914 r16 : e0000e6080cc1970
<4>r17 : 20000003a6480000 r18 : 20000007bf900000 r19 : 0000000000040000
<4>r20 : e0000c60d31e0000 r21 : 0000000000000010 r22 : e0000e6080cc19a8
<4>r23 : e0000c60c55f1120 r24 : e0000c60d31efda0 r25 : e0000c60d31efd98
<4>r26 : e0000e60812166d0 r27 : e0000c60d31efdc0 r28 : e0000c60d31efdb8
<4>r29 : e0000c60d31e0b60 r30 : 0000000000000000 r31 : 0000000000000081
<4>
<4>Call Trace:
<4> [<a000000100014a20>] show_stack+0x40/0xa0
<4> sp=e0000c60d31ef760 bsp=e0000c60d31e11f0
<4> [<a000000100015330>] show_regs+0x850/0x8a0
<4> sp=e0000c60d31ef930 bsp=e0000c60d31e1198
<4> [<a000000100035ed0>] die+0x1b0/0x2e0
<4> sp=e0000c60d31ef930 bsp=e0000c60d31e1150
<4> [<a000000100060e90>] ia64_do_page_fault+0x8d0/0xa40
<4> sp=e0000c60d31ef930 bsp=e0000c60d31e1100
<4> [<a00000010000ab00>] ia64_leave_kernel+0x0/0x270
<4> sp=e0000c60d31ef9c0 bsp=e0000c60d31e1100
<4> [<a00000010015d6a0>] __mmu_notifier_invalidate_range_start+0x80/0x120
<4> sp=e0000c60d31efb90 bsp=e0000c60d31e10a8
<4> [<a00000010011b1d0>] unmap_vmas+0x70/0x14c0
<4> sp=e0000c60d31efb90 bsp=e0000c60d31e0fa8
<4> [<a00000010011c660>] zap_page_range+0x40/0x60
<4> sp=e0000c60d31efda0 bsp=e0000c60d31e0f70
<4> [<a0000002101d62d0>] xpmem_clear_PTEs+0x350/0x560 [xpmem]
<4> sp=e0000c60d31efdb0 bsp=e0000c60d31e0ef0
<4> [<a0000002101d1e30>] xpmem_remove_seg+0x3f0/0x700 [xpmem]
<4> sp=e0000c60d31efde0 bsp=e0000c60d31e0ea8
<4> [<a0000002101d2500>] xpmem_remove_segs_of_tg+0x80/0x140 [xpmem]
<4> sp=e0000c60d31efe10 bsp=e0000c60d31e0e78
<4> [<a0000002101dda40>] xpmem_mmu_notifier_release+0x40/0x80 [xpmem]
<4> sp=e0000c60d31efe10 bsp=e0000c60d31e0e58
<4> [<a00000010015d7f0>] __mmu_notifier_release+0xb0/0x100
<4> sp=e0000c60d31efe10 bsp=e0000c60d31e0e38
<4> [<a000000100124430>] exit_mmap+0x50/0x180
<4> sp=e0000c60d31efe10 bsp=e0000c60d31e0e10
<4> [<a00000010008fb30>] mmput+0x70/0x180
<4> sp=e0000c60d31efe20 bsp=e0000c60d31e0dd8
<4> [<a000000100098df0>] exit_mm+0x1f0/0x220
<4> sp=e0000c60d31efe20 bsp=e0000c60d31e0da0
<4> [<a00000010009ca60>] do_exit+0x4e0/0xf40
<4> sp=e0000c60d31efe20 bsp=e0000c60d31e0d58
<4> [<a00000010009d640>] do_group_exit+0x180/0x1c0
<4> sp=e0000c60d31efe30 bsp=e0000c60d31e0d20
<4> [<a00000010009d6a0>] sys_exit_group+0x20/0x40
<4> sp=e0000c60d31efe30 bsp=e0000c60d31e0cc8
<4> [<a00000010000a960>] ia64_ret_from_syscall+0x0/0x20
<4> sp=e0000c60d31efe30 bsp=e0000c60d31e0cc8
<4> [<a000000000010720>] __kernel_syscall_via_break+0x0/0x20
<4> sp=e0000c60d31f0000 bsp=e0000c60d31e0cc8
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen
2008-04-17 17:14 ` Andrea Arcangeli
2008-04-17 17:25 ` Robin Holt
@ 2008-04-17 19:10 ` Christoph Lameter
2008-04-17 22:16 ` Andrea Arcangeli
1 sibling, 1 reply; 44+ messages in thread
From: Christoph Lameter @ 2008-04-17 19:10 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Robin Holt, akpm, Nick Piggin, Steve Wise, Peter Zijlstra,
linux-mm, Kanoj Sarcar, Roland Dreier, Jack Steiner,
linux-kernel, Avi Kivity, kvm-devel, general, Hugh Dickins
On Thu, 17 Apr 2008, Andrea Arcangeli wrote:
> Also note, EMM isn't using the clean hlist_del, it's implementing list
> by hand (with zero runtime gain) so all the debugging may not be
> existent in EMM, so if it's really a mm_lock race, and it only
> triggers with mmu notifiers and not with EMM, it doesn't necessarily
> mean EMM is bug free. If you've a full stack trace it would greatly
> help to verify what is mangling over the list when the oops triggers.
EMM was/is using a singly linked list, which allows atomic updates. It looked
cleaner to me since a doubly linked list must update two pointers.
I have not seen docs on the locking, so I am not sure why you use RCU
operations here. Isn't the requirement to have either the rmap locks or
mmap_sem held enough to guarantee the consistency of the doubly linked list?
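For reference, the debugging difference being argued here looks roughly like
the sketch below. It is not code from either patchset and the structure names
are invented; it only contrasts what hlist_del() from list.h leaves behind
(the source of the recognizable 0x...100100 LIST_POISON1 address in the oops
above) with a hand-rolled singly linked unlink, which leaves no such trace.
#include <linux/list.h>
#include <linux/poison.h>
/* invented structure, for illustration only */
struct dbg_notifier {
	struct hlist_node hlist;
};
static void dbg_unlink(struct dbg_notifier *n)
{
	/*
	 * hlist_del() rewrites the unlinked node:
	 *   n->hlist.next  = LIST_POISON1 (0x00100100)
	 *   n->hlist.pprev = LIST_POISON2 (0x00200200)
	 * so a traversal that still reaches this node oopses at a
	 * well-known address instead of silently walking stale memory.
	 */
	hlist_del(&n->hlist);
}
/* a hand-rolled singly linked unlink of the kind being discussed */
struct sl_notifier {
	struct sl_notifier *next;
};
static void sl_unlink(struct sl_notifier **head, struct sl_notifier *n)
{
	struct sl_notifier **p = head;
	while (*p != n)
		p = &(*p)->next;
	*p = n->next;		/* n->next still points into the live list */
}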
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen
2008-04-17 19:10 ` Christoph Lameter
@ 2008-04-17 22:16 ` Andrea Arcangeli
0 siblings, 0 replies; 44+ messages in thread
From: Andrea Arcangeli @ 2008-04-17 22:16 UTC (permalink / raw)
To: Christoph Lameter
Cc: Robin Holt, akpm, Nick Piggin, Steve Wise, Peter Zijlstra,
linux-mm, Kanoj Sarcar, Roland Dreier, Jack Steiner,
linux-kernel, Avi Kivity, kvm-devel, general, Hugh Dickins
On Thu, Apr 17, 2008 at 12:10:52PM -0700, Christoph Lameter wrote:
> EMM was/is using a singly linked list, which allows atomic updates. It looked
> cleaner to me since a doubly linked list must update two pointers.
It would be cleaner if it provided an abstraction in list.h. The
important thing here is the memory taken by the list head for this usage.
> I have not seen docs on the locking, so I am not sure why you use RCU
> operations here. Isn't the requirement to have either the rmap locks or
> mmap_sem held enough to guarantee the consistency of the doubly linked list?
Yes, exactly, I'm not using rcu anymore.
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen
2008-04-08 15:44 ` [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen Andrea Arcangeli
2008-04-16 16:33 ` Robin Holt
@ 2008-04-22 5:06 ` Rusty Russell
2008-04-25 16:56 ` Andrea Arcangeli
1 sibling, 1 reply; 44+ messages in thread
From: Rusty Russell @ 2008-04-22 5:06 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Christoph Lameter, akpm, Nick Piggin, Steve Wise, Peter Zijlstra,
linux-mm, Kanoj Sarcar, Roland Dreier, Jack Steiner,
linux-kernel, Avi Kivity, kvm-devel, Robin Holt, general,
Hugh Dickins
On Wednesday 09 April 2008 01:44:04 Andrea Arcangeli wrote:
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1050,6 +1050,15 @@
> unsigned long addr, unsigned long len,
> unsigned long flags, struct page **pages);
>
> +struct mm_lock_data {
> + spinlock_t **i_mmap_locks;
> + spinlock_t **anon_vma_locks;
> + unsigned long nr_i_mmap_locks;
> + unsigned long nr_anon_vma_locks;
> +};
> +extern struct mm_lock_data *mm_lock(struct mm_struct * mm);
> +extern void mm_unlock(struct mm_struct *mm, struct mm_lock_data *data);
As far as I can tell you don't actually need to expose this struct at all?
> + data->i_mmap_locks = vmalloc(nr_i_mmap_locks *
> + sizeof(spinlock_t));
This is why non-typesafe allocators suck. You want 'sizeof(spinlock_t *)'
here.
> + data->anon_vma_locks = vmalloc(nr_anon_vma_locks *
> + sizeof(spinlock_t));
and here.
> + err = -EINTR;
> + i_mmap_lock_last = NULL;
> + nr_i_mmap_locks = 0;
> + for (;;) {
> + spinlock_t *i_mmap_lock = (spinlock_t *) -1UL;
> + for (vma = mm->mmap; vma; vma = vma->vm_next) {
...
> + data->i_mmap_locks[nr_i_mmap_locks++] = i_mmap_lock;
> + }
> + data->nr_i_mmap_locks = nr_i_mmap_locks;
How about you track your running counter in data->nr_i_mmap_locks, leave
nr_i_mmap_locks alone, and BUG_ON(data->nr_i_mmap_locks != nr_i_mmap_locks)?
Even nicer would be to wrap this in a "get_sorted_mmap_locks()" function.
Similarly for anon_vma locks.
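A rough sketch of how those two points could fit together, for the i_mmap
side (illustrative only: the helper borrows the suggested
get_sorted_mmap_locks() name but skips the sorting and deduplication the
real code would still need, and the error handling is minimal):
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>
static int get_sorted_mmap_locks(struct mm_struct *mm,
				 struct mm_lock_data *data,
				 unsigned long nr_i_mmap_locks)
{
	struct vm_area_struct *vma;
	/* sizeof(spinlock_t *): the array holds pointers, not locks */
	data->i_mmap_locks = vmalloc(nr_i_mmap_locks * sizeof(spinlock_t *));
	if (!data->i_mmap_locks)
		return -ENOMEM;
	data->nr_i_mmap_locks = 0;
	for (vma = mm->mmap; vma; vma = vma->vm_next)
		if (vma->vm_file && vma->vm_file->f_mapping)
			data->i_mmap_locks[data->nr_i_mmap_locks++] =
				&vma->vm_file->f_mapping->i_mmap_lock;
	/* the pre-counted total and the collected total must agree */
	BUG_ON(data->nr_i_mmap_locks != nr_i_mmap_locks);
	return 0;
}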
Unfortunately, I just don't think we can fail locking like this. In your next
patch unregistering a notifier can fail because of it: that's not usable.
I think it means you need to add a linked list element to the vma for the
CONFIG_MMU_NOTIFIER case. Or track the max number of vmas for any mm, and
keep a pool to handle mm_lock for this number (ie. if you can't enlarge the
pool, fail the vma allocation).
Both have their problems though...
Rusty.
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 0 of 9] mmu notifier #v12
2008-04-09 18:55 ` Robin Holt
@ 2008-04-22 7:20 ` Andrea Arcangeli
2008-04-22 12:00 ` Andrea Arcangeli
0 siblings, 1 reply; 44+ messages in thread
From: Andrea Arcangeli @ 2008-04-22 7:20 UTC (permalink / raw)
To: Robin Holt
Cc: Christoph Lameter, akpm, Nick Piggin, Steve Wise, Peter Zijlstra,
linux-mm, Kanoj Sarcar, Roland Dreier, Jack Steiner,
linux-kernel, Avi Kivity, kvm-devel, general, Hugh Dickins
This is a follow-up on the locking of the mmu-notifier methods against
the secondary-mmu page fault. Each driver can implement it differently,
but this shows an example of what I planned for KVM; others may follow
it closely if they find it useful. I post this as pseudocode to hide 99%
of the KVM internal complexity and to focus only on the locking. The KVM
locking scheme should be something along these lines:
invalidate_range_start {
spin_lock(&kvm->mmu_lock);
kvm->invalidate_range_count++;
rmap-invalidate of sptes in range
spin_unlock(&kvm->mmu_lock)
}
invalidate_range_end {
spin_lock(&kvm->mmu_lock);
kvm->invalidate_range_count--;
spin_unlock(&kvm->mmu_lock)
}
invalidate_page {
spin_lock(&kvm->mmu_lock);
write_seqlock()
rmap-invalidate of sptes of page
write_sequnlock()
spin_unlock(&kvm->mmu_lock)
}
kvm_page_fault {
seq = read_seqlock()
get_user_pages() (aka gfn_to_pfn() in kvm terms)
spin_lock(&kvm->mmu_lock)
if (seq_trylock(seq) || kvm->invalidate_range_count)
goto out; /* retry the page fault */
map sptes and build rmap
out:
spin_unlock(&kvm->mmu_lock)
}
This will allow removing the page pinning from KVM. I'd appreciate it if
you, Robin and Christoph, could have a second look and pinpoint any
potential issue in my plan.
invalidate_page, as you can notice, allows decreasing the fixed-cost
overhead in all VM code that works on a single page and where freeing
the page _after_ calling invalidate_page has zero runtime/TLB cost. We
need invalidate_range_begin/end because when we work on multiple pages
we can reduce CPU utilization and avoid many TLB flushes by holding off
the kvm page fault while we work on the range.
invalidate_page also shrinks the window where the kvm page fault may
need to be retried (the ptep_clear_flush <-> invalidate_page window is
shorter than an invalidate_range_begin(PAGE_SIZE) <->
invalidate_range_end(PAGE_SIZE) window).
So even if only as a micro-optimization, it is worth it to decrease the
impact on the common VM code. The cost of having both a seqlock and a
range_count is irrelevant in KVM terms, as they'll be in the same
cacheline, they're checked at the same time by the page fault, and they
don't require any additional blocking (or write) lock.
Note that the kvm page fault can't happen unless the cpu switches to
guest mode, and it can't switch to guest mode if we're in the
begin/end critical section, so in theory I could loop inside the page
fault too without risking deadlocking, but retrying it by restarting
guest mode sounds nicer in sigkill/scheduling terms.
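To make the pseudocode above slightly more concrete, here is a simplified C
sketch of the same scheme. It is not taken from the KVM patches: the
structure layout and every name in it (kvm_sketch, mmu_seq,
invalidate_range_count, the sketch_* helpers) are invented for illustration,
and the real code has to zap sptes and build rmaps where the comments stand
in.
#include <linux/spinlock.h>
#include <linux/errno.h>
struct kvm_sketch {
	spinlock_t	mmu_lock;
	unsigned long	mmu_seq;		/* stands in for the seqlock */
	long		invalidate_range_count;	/* > 0 while a range op runs */
};
/* invalidate_range_start/end: only the counter, as in the pseudocode */
static void sketch_range_start(struct kvm_sketch *kvm)
{
	spin_lock(&kvm->mmu_lock);
	kvm->invalidate_range_count++;
	/* rmap-invalidate the sptes in the range here */
	spin_unlock(&kvm->mmu_lock);
}
static void sketch_range_end(struct kvm_sketch *kvm)
{
	spin_lock(&kvm->mmu_lock);
	kvm->invalidate_range_count--;
	spin_unlock(&kvm->mmu_lock);
}
/* invalidate_page: bump the sequence so a racing fault retries */
static void sketch_invalidate_page(struct kvm_sketch *kvm)
{
	spin_lock(&kvm->mmu_lock);
	kvm->mmu_seq++;
	/* rmap-invalidate the sptes of the page here */
	spin_unlock(&kvm->mmu_lock);
}
/* fault side: 0 on success, -EAGAIN to retry by re-entering guest mode */
static int sketch_page_fault(struct kvm_sketch *kvm)
{
	unsigned long seq;
	int ret = -EAGAIN;
	seq = kvm->mmu_seq;
	smp_rmb();	/* sample the sequence before get_user_pages */
	/* get_user_pages()/gfn_to_pfn() runs here and may sleep */
	spin_lock(&kvm->mmu_lock);
	if (kvm->invalidate_range_count || kvm->mmu_seq != seq)
		goto out;	/* an invalidate ran or is running: retry */
	/* map the sptes and build the rmap here */
	ret = 0;
out:
	spin_unlock(&kvm->mmu_lock);
	return ret;
}
When the check fails the fault simply returns and is retried from guest
mode, which is where the sigkill/scheduling friendliness mentioned above
comes from.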
Soon I'll release a new mmu notifier patchset, with patch 1 being the
self-contained mmu-notifier-core, ready to go in -mm and mainline in
time for 2.6.26. Then I'll be glad to help merge any further patches in
the patchset that allow the methods to sleep, so XPMEM can run on
mainline 2.6.27 the same way GRU/KVM/Quadrics will run fine on 2.6.26,
in a way fully backwards compatible with 2.6.26 (and of course it
doesn't really need to be backwards compatible, because this is a
kernel-internal API only, ask Greg etc... ;). But that will likely
require a new config option to avoid hurting AIM fork performance,
because the anon_vma critical sections are so short in the fast path.
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 0 of 9] mmu notifier #v12
2008-04-22 7:20 ` Andrea Arcangeli
@ 2008-04-22 12:00 ` Andrea Arcangeli
2008-04-22 13:01 ` Robin Holt
0 siblings, 1 reply; 44+ messages in thread
From: Andrea Arcangeli @ 2008-04-22 12:00 UTC (permalink / raw)
To: Robin Holt
Cc: Christoph Lameter, akpm, Nick Piggin, Steve Wise, Peter Zijlstra,
linux-mm, Kanoj Sarcar, Roland Dreier, Jack Steiner,
linux-kernel, Avi Kivity, kvm-devel, general, Hugh Dickins
On Tue, Apr 22, 2008 at 09:20:26AM +0200, Andrea Arcangeli wrote:
> invalidate_range_start {
> spin_lock(&kvm->mmu_lock);
>
> kvm->invalidate_range_count++;
> rmap-invalidate of sptes in range
>
write_seqlock; write_sequnlock;
> spin_unlock(&kvm->mmu_lock)
> }
>
> invalidate_range_end {
> spin_lock(&kvm->mmu_lock);
>
> kvm->invalidate_range_count--;
write_seqlock; write_sequnlock;
>
> spin_unlock(&kvm->mmu_lock)
> }
Robin correctly pointed out by PM that there should be a seqlock in
range_begin/end too, as corrected above.
I guess it's better to use an explicit sequence counter, so we avoid the
useless spinlock of the write_seqlock (mmu_lock is already enough in all
places) and we can increase it with a single op, with += 2, in
range_begin/end. The above is a lower-performance version of the final
locking, but simpler for reading purposes.
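Using the same invented kvm_sketch fields as in the sketch above, the
explicit-counter variant could look roughly like this (illustrative only; a
write_seqlock/write_sequnlock pair bumps the sequence twice, so a single
+= 2 under mmu_lock mimics it without the seqlock's own spinlock):
static void sketch_range_start(struct kvm_sketch *kvm)
{
	spin_lock(&kvm->mmu_lock);
	kvm->invalidate_range_count++;
	kvm->mmu_seq += 2;	/* single op instead of write_seqlock/sequnlock */
	/* rmap-invalidate the sptes in the range here */
	spin_unlock(&kvm->mmu_lock);
}
static void sketch_range_end(struct kvm_sketch *kvm)
{
	spin_lock(&kvm->mmu_lock);
	kvm->invalidate_range_count--;
	kvm->mmu_seq += 2;	/* mmu_lock already serializes the writers */
	spin_unlock(&kvm->mmu_lock);
}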
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 0 of 9] mmu notifier #v12
2008-04-22 12:00 ` Andrea Arcangeli
@ 2008-04-22 13:01 ` Robin Holt
2008-04-22 13:21 ` Andrea Arcangeli
0 siblings, 1 reply; 44+ messages in thread
From: Robin Holt @ 2008-04-22 13:01 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Robin Holt, Christoph Lameter, akpm, Nick Piggin, Steve Wise,
Peter Zijlstra, linux-mm, Kanoj Sarcar, Roland Dreier,
Jack Steiner, linux-kernel, Avi Kivity, kvm-devel, general,
Hugh Dickins
On Tue, Apr 22, 2008 at 02:00:56PM +0200, Andrea Arcangeli wrote:
> On Tue, Apr 22, 2008 at 09:20:26AM +0200, Andrea Arcangeli wrote:
> > invalidate_range_start {
> > spin_lock(&kvm->mmu_lock);
> >
> > kvm->invalidate_range_count++;
> > rmap-invalidate of sptes in range
> >
>
> write_seqlock; write_sequnlock;
I don't think you need it here since invalidate_range_count is already
elevated which will accomplish the same effect.
Thanks,
Robin
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 0 of 9] mmu notifier #v12
2008-04-22 13:01 ` Robin Holt
@ 2008-04-22 13:21 ` Andrea Arcangeli
2008-04-22 13:36 ` Robin Holt
0 siblings, 1 reply; 44+ messages in thread
From: Andrea Arcangeli @ 2008-04-22 13:21 UTC (permalink / raw)
To: Robin Holt
Cc: Christoph Lameter, akpm, Nick Piggin, Steve Wise, Peter Zijlstra,
linux-mm, Kanoj Sarcar, Roland Dreier, Jack Steiner,
linux-kernel, Avi Kivity, kvm-devel, general, Hugh Dickins
On Tue, Apr 22, 2008 at 08:01:20AM -0500, Robin Holt wrote:
> On Tue, Apr 22, 2008 at 02:00:56PM +0200, Andrea Arcangeli wrote:
> > On Tue, Apr 22, 2008 at 09:20:26AM +0200, Andrea Arcangeli wrote:
> > > invalidate_range_start {
> > > spin_lock(&kvm->mmu_lock);
> > >
> > > kvm->invalidate_range_count++;
> > > rmap-invalidate of sptes in range
> > >
> >
> > write_seqlock; write_sequnlock;
>
> I don't think you need it here since invalidate_range_count is already
> elevated which will accomplish the same effect.
Agreed, seqlock only in range_end should be enough. BTW, the fact
seqlock is needed regardless of invalidate_page existing or not,
really makes invalidate_page a no brainer not just from the core VM
point of view, but from the driver point of view too. The
kvm_page_fault logic would be the same even if I remove
invalidate_page from the mmu notifier patch but it'd run slower both
when armed and disarmed.
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 0 of 9] mmu notifier #v12
2008-04-22 13:21 ` Andrea Arcangeli
@ 2008-04-22 13:36 ` Robin Holt
2008-04-22 13:48 ` Andrea Arcangeli
0 siblings, 1 reply; 44+ messages in thread
From: Robin Holt @ 2008-04-22 13:36 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Robin Holt, Christoph Lameter, akpm, Nick Piggin, Steve Wise,
Peter Zijlstra, linux-mm, Kanoj Sarcar, Roland Dreier,
Jack Steiner, linux-kernel, Avi Kivity, kvm-devel, general,
Hugh Dickins
On Tue, Apr 22, 2008 at 03:21:43PM +0200, Andrea Arcangeli wrote:
> On Tue, Apr 22, 2008 at 08:01:20AM -0500, Robin Holt wrote:
> > On Tue, Apr 22, 2008 at 02:00:56PM +0200, Andrea Arcangeli wrote:
> > > On Tue, Apr 22, 2008 at 09:20:26AM +0200, Andrea Arcangeli wrote:
> > > > invalidate_range_start {
> > > > spin_lock(&kvm->mmu_lock);
> > > >
> > > > kvm->invalidate_range_count++;
> > > > rmap-invalidate of sptes in range
> > > >
> > >
> > > write_seqlock; write_sequnlock;
> >
> > I don't think you need it here since invalidate_range_count is already
> > elevated which will accomplish the same effect.
>
> Agreed, seqlock only in range_end should be enough. BTW, the fact
I am a little confused about the value of the seq_lock versus a simple
atomic, but I assumed there is a reason and left it at that.
> seqlock is needed regardless of invalidate_page existing or not,
> really makes invalidate_page a no brainer not just from the core VM
> point of view, but from the driver point of view too. The
> kvm_page_fault logic would be the same even if I remove
> invalidate_page from the mmu notifier patch but it'd run slower both
> when armed and disarmed.
I don't know what you mean by "it'd" run slower and what you mean by
"armed and disarmed".
For the sake of this discussion, I will assume "it'd" means the kernel in
general and not KVM. With the two call sites for range_begin/range_end,
I would agree we have more call sites, but the second is extremely likely
to be cache hot.
By disarmed, I will assume you mean no notifiers registered for a
particular mm. In that case, the cache will make the second call
effectively free. So, for the disarmed case, I see no measurable
difference.
For the case where there is a notifier registered, I certainly can see
a difference. I am not certain how to quantify the difference as it
depends on the callee. In the case of xpmem, our callout is always very
expensive for the _start case. Our _end case is very light, but it is
essentially the exact same steps we would perform for the _page callout.
When I was discussing this difference with Jack, he reminded me that
the GRU, due to its hardware, does not have any race issues with the
invalidate_page callout simply doing the tlb shootdown and not modifying
any of its internal structures. He then put a caveat on the discussion
that _either_ method was acceptable as far as he was concerned. The real
issue is getting a patch in that satisfies all needs and not whether
there is a separate invalidate_page callout.
Thanks,
Robin
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 0 of 9] mmu notifier #v12
2008-04-22 13:36 ` Robin Holt
@ 2008-04-22 13:48 ` Andrea Arcangeli
2008-04-22 15:26 ` Robin Holt
0 siblings, 1 reply; 44+ messages in thread
From: Andrea Arcangeli @ 2008-04-22 13:48 UTC (permalink / raw)
To: Robin Holt
Cc: Christoph Lameter, akpm, Nick Piggin, Steve Wise, Peter Zijlstra,
linux-mm, Kanoj Sarcar, Roland Dreier, Jack Steiner,
linux-kernel, Avi Kivity, kvm-devel, general, Hugh Dickins
On Tue, Apr 22, 2008 at 08:36:04AM -0500, Robin Holt wrote:
> I am a little confused about the value of the seq_lock versus a simple
> atomic, but I assumed there is a reason and left it at that.
There's no value for anything but get_user_pages (get_user_pages takes
its own lock internally though). I preferred to explain it as a
seqlock because it was simpler for reading, but I totally agree in the
final implementation it shouldn't be a seqlock. My code was meant to
be pseudo-code only. It doesn't even need to be atomic ;).
> I don't know what you mean by "it'd" run slower and what you mean by
> "armed and disarmed".
1) when armed the time-window where the kvm-page-fault would be
blocked would be a bit larger without invalidate_page for no good
reason
2) if you were to remove invalidate_page, when disarmed the VM code
would need two branches instead of one in various places
I don't want to waste cycles if not wasting them improves performance
both when armed and disarmed.
> For the sake of this discussion, I will assume "it'd" means the kernel in
> general and not KVM. With the two call sites for range_begin/range_end,
I actually meant for both.
> By disarmed, I will assume you mean no notifiers registered for a
> particular mm. In that case, the cache will make the second call
> effectively free. So, for the disarmed case, I see no measurable
> difference.
For rmap it is surely effectively free; for do_wp_page it costs one
branch for no good reason.
> For the case where there is a notifier registered, I certainly can see
> a difference. I am not certain how to quantify the difference as it
Agreed.
> When I was discussing this difference with Jack, he reminded me that
> the GRU, due to its hardware, does not have any race issues with the
> invalidate_page callout simply doing the tlb shootdown and not modifying
> any of its internal structures. He then put a caveat on the discussion
> that _either_ method was acceptable as far as he was concerned. The real
> issue is getting a patch in that satisfies all needs and not whether
> there is a separate invalidate_page callout.
Sure, we have that patch now, I'll send it out in a minute, I was just
trying to explain why it makes sense to have an invalidate_page too
(which remains the only difference by now), removing it would be a
regression on all sides, even if a minor one.
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 0 of 9] mmu notifier #v12
2008-04-22 13:48 ` Andrea Arcangeli
@ 2008-04-22 15:26 ` Robin Holt
0 siblings, 0 replies; 44+ messages in thread
From: Robin Holt @ 2008-04-22 15:26 UTC (permalink / raw)
To: Andrea Arcangeli, akpm
Cc: Robin Holt, Christoph Lameter, Nick Piggin, Steve Wise,
Peter Zijlstra, linux-mm, Kanoj Sarcar, Roland Dreier,
Jack Steiner, linux-kernel, Avi Kivity, kvm-devel, general,
Hugh Dickins
Andrew, could we get direction/guidance from you as regards
the invalidate_page() callout of Andrea's patch set versus the
invalidate_range_start/invalidate_range_end callout pairs of Christoph's
patchset? This is only in the context of the __xip_unmap, do_wp_page,
page_mkclean_one, and try_to_unmap_one call sites.
On Tue, Apr 22, 2008 at 03:48:47PM +0200, Andrea Arcangeli wrote:
> On Tue, Apr 22, 2008 at 08:36:04AM -0500, Robin Holt wrote:
> > I am a little confused about the value of the seq_lock versus a simple
> > atomic, but I assumed there is a reason and left it at that.
>
> There's no value for anything but get_user_pages (get_user_pages takes
> its own lock internally though). I preferred to explain it as a
> seqlock because it was simpler for reading, but I totally agree in the
> final implementation it shouldn't be a seqlock. My code was meant to
> be pseudo-code only. It doesn't even need to be atomic ;).
Unless there is additional locking in your fault path, I think it does
need to be atomic.
> > I don't know what you mean by "it'd" run slower and what you mean by
> > "armed and disarmed".
>
> 1) when armed the time-window where the kvm-page-fault would be
> blocked would be a bit larger without invalidate_page for no good
> reason
But that is a distinction without a difference. In the _start/_end
case, kvm's fault handler will not have any _DIRECT_ blocking, but
get_user_pages() had certainly better block waiting for some other lock
to prevent the process's pages being refaulted.
I am no VM expert, but that seems like it is critical to having a
consistent virtual address space. Effectively, you have a delay on the
kvm fault handler beginning when either invalidate_page() is entered
or invalidate_range_start() is entered until when the _CALLER_ of the
invalidate* method has unlocked. That time will remain essentially
identical for either case. I would argue you would be hard pressed to
even measure the difference.
> 2) if you were to remove invalidate_page, when disarmed the VM code
> would need two branches instead of one in various places
Those branches are conditional upon there being list entries. That check
should be extremely cheap. The vast majority of cases will have no
registered notifiers. The second check for the _end callout will be
from cpu cache.
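For reference, the check being discussed is just an inline test on the
per-mm notifier list; the sketch below assumes a field and helper name
rather than taking them from either patchset:
/* assumed: an hlist of registered notifiers reachable from mm_struct */
static inline int sketch_mm_has_notifiers(struct mm_struct *mm)
{
	return unlikely(!hlist_empty(&mm->mmu_notifier_list));
}
/*
 * Call sites would then look like:
 *
 *	if (sketch_mm_has_notifiers(mm))
 *		mmu_notifier_invalidate_range_end(mm, start, end);
 *
 * so with no notifiers registered each callout costs one
 * predicted-not-taken branch on data already in the cpu cache.
 */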
> I don't want to waste cycles if not wasting them improves performance
> both when armed and disarmed.
In summary, I think we have narrowed down the case of no registered
notifiers to being infinitesimal, and the case of registered notifiers
to being a distinction without a difference.
> > When I was discussing this difference with Jack, he reminded me that
> > the GRU, due to its hardware, does not have any race issues with the
> > invalidate_page callout simply doing the tlb shootdown and not modifying
> > any of its internal structures. He then put a caveat on the discussion
> > that _either_ method was acceptable as far as he was concerned. The real
> > issue is getting a patch in that satisfies all needs and not whether
> there is a separate invalidate_page callout.
>
> Sure, we have that patch now, I'll send it out in a minute, I was just
> trying to explain why it makes sense to have an invalidate_page too
> (which remains the only difference by now), removing it would be a
> regression on all sides, even if a minor one.
I think GRU is the only compelling case I have heard for having the
invalidate_page separate. In the case of the GRU, the hardware enforces a
lifetime of the invalidate which covers all in-progress faults, including
ones where the hardware is informed after the flush of a PTE. In all
cases, once the GRU invalidate instruction is issued, all active requests
are invalidated. Future faults will be blocked in get_user_pages().
Without that special feature of the hardware, I don't think any code
simplification exists. I, of course, reserve the right to be wrong.
I believe the argument against a separate invalidate_page() callout was
Christoph's interpretation of Andrew's comments. I am not certain Andrew
was aware of these special aspects of the GRU hardware and whether they
had been factored into the discussion at that point in time.
Thanks,
Robin
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen
2008-04-22 5:06 ` Rusty Russell
@ 2008-04-25 16:56 ` Andrea Arcangeli
2008-04-25 17:04 ` Andrea Arcangeli
2008-04-25 19:25 ` Robin Holt
0 siblings, 2 replies; 44+ messages in thread
From: Andrea Arcangeli @ 2008-04-25 16:56 UTC (permalink / raw)
To: Rusty Russell
Cc: Christoph Lameter, akpm, Nick Piggin, Steve Wise, Peter Zijlstra,
linux-mm, Kanoj Sarcar, Roland Dreier, Jack Steiner,
linux-kernel, Avi Kivity, kvm-devel, Robin Holt, general,
Hugh Dickins
I somehow missed this email in my inbox, and found it now because it
was strangely still unread... Sorry for the late reply!
On Tue, Apr 22, 2008 at 03:06:24PM +1000, Rusty Russell wrote:
> On Wednesday 09 April 2008 01:44:04 Andrea Arcangeli wrote:
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -1050,6 +1050,15 @@
> > unsigned long addr, unsigned long len,
> > unsigned long flags, struct page **pages);
> >
> > +struct mm_lock_data {
> > + spinlock_t **i_mmap_locks;
> > + spinlock_t **anon_vma_locks;
> > + unsigned long nr_i_mmap_locks;
> > + unsigned long nr_anon_vma_locks;
> > +};
> > +extern struct mm_lock_data *mm_lock(struct mm_struct * mm);
> > +extern void mm_unlock(struct mm_struct *mm, struct mm_lock_data *data);
>
> As far as I can tell you don't actually need to expose this struct at all?
Yes, it should be possible to only expose 'struct mm_lock_data;'.
> > + data->i_mmap_locks = vmalloc(nr_i_mmap_locks *
> > + sizeof(spinlock_t));
>
> This is why non-typesafe allocators suck. You want 'sizeof(spinlock_t *)'
> here.
>
> > + data->anon_vma_locks = vmalloc(nr_anon_vma_locks *
> > + sizeof(spinlock_t));
>
> and here.
Great catch! (it was temporarily wasting some ram which isn't nice at all)
> > + err = -EINTR;
> > + i_mmap_lock_last = NULL;
> > + nr_i_mmap_locks = 0;
> > + for (;;) {
> > + spinlock_t *i_mmap_lock = (spinlock_t *) -1UL;
> > + for (vma = mm->mmap; vma; vma = vma->vm_next) {
> ...
> > + data->i_mmap_locks[nr_i_mmap_locks++] = i_mmap_lock;
> > + }
> > + data->nr_i_mmap_locks = nr_i_mmap_locks;
>
> How about you track your running counter in data->nr_i_mmap_locks, leave
> nr_i_mmap_locks alone, and BUG_ON(data->nr_i_mmap_locks != nr_i_mmap_locks)?
>
> Even nicer would be to wrap this in a "get_sorted_mmap_locks()" function.
I'll try to clean this up further and post an updated version for review.
> Unfortunately, I just don't think we can fail locking like this. In your next
> patch unregistering a notifier can fail because of it: that not usable.
Fortunately I figured out we don't really need mm_lock in unregister,
because it's ok to unregister in the middle of the range_begin/end
critical section (that's definitely not ok for register, which is why
register needs mm_lock). And it's perfectly ok to fail in register().
Also, it wasn't ok to unpin the module count in ->release, as ->release
needs to 'ret' to get back to the mmu notifier code. And without any
unregister at all, the module can't be unloaded at all, which
is quite unacceptable...
The logic is to prevent mmu_notifier_register from racing with
mmu_notifier_release, because register requires the mm_users pin to be
held (implicitly or explicitly, with mmput run just after
mmu_notifier_register returns). Then _register serializes against all
the mmu notifier methods (except ->release) with srcu (->release can't
run thanks to the mm_users pin). The mmu_notifier_mm->lock then
serializes the modifications to the list (register vs unregister) and
it ensures that exactly one of _unregister and _release calls ->release
before _unregister returns. All other methods run freely under srcu.
Having the guarantee that ->release is called either just before all
pages are freed or inside _unregister allows the module to zap and
freeze its secondary mmu inside ->release, with the race condition of
exit() against mmu_notifier_unregister handled internally by the mmu
notifier code, and without any dependency on exit_files/exit_mm
ordering, i.e. regardless of whether the driver's file is held open in
the file table or only by a vma. The mmu_notifier_mm can be reset
to 0 only after the last mmdrop.
About the mm_count refcounting for _release and _unregister: no mmu
notifier method, and not even mmu_notifier_unregister and _release
themselves, can cope with the mmu_notifier_mm list and srcu structures
going away out of order. exit_mmap is safe as it holds an mm_count pin
implicitly, because mmdrop is run after exit_mmap returns.
mmu_notifier_unregister is safe too, as _register takes the mm_count
pin. We can't prevent mmu_notifier_mm from going away by holding
mm_users, as that would screw up the vma file-descriptor closure that
only happens inside exit_mmap (a pinned mm_users prevents exit_mmap
from running, and it can only be taken temporarily, until _register
returns).
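A condensed sketch of the register side of those rules (not the actual
patch: the mmu_notifier_mm allocation, error handling and out_unlock path
are omitted, and the int-returning mm_lock(mm, &data) calling convention is
assumed from the v14-pre3 diff quoted further down):
int sketch_mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
{
	struct mm_lock_data data;
	int ret;
	/* caller must hold mm_users, implicitly or via get_task_mm() */
	BUG_ON(atomic_read(&mm->mm_users) <= 0);
	/* serialize against every mmu notifier method except ->release */
	ret = mm_lock(mm, &data);
	if (ret)
		return ret;
	/* mm_count pin: keeps mmu_notifier_mm/srcu alive for _unregister */
	atomic_inc(&mm->mm_count);
	spin_lock(&mm->mmu_notifier_mm->lock);
	hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list);
	spin_unlock(&mm->mmu_notifier_mm->lock);
	mm_unlock(mm, &data);
	/* the caller then does mmput(), as described above */
	return 0;
}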
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen
2008-04-25 16:56 ` Andrea Arcangeli
@ 2008-04-25 17:04 ` Andrea Arcangeli
2008-04-25 19:25 ` Robin Holt
1 sibling, 0 replies; 44+ messages in thread
From: Andrea Arcangeli @ 2008-04-25 17:04 UTC (permalink / raw)
To: Rusty Russell
Cc: Christoph Lameter, akpm, Nick Piggin, Steve Wise, Peter Zijlstra,
linux-mm, Kanoj Sarcar, Roland Dreier, Jack Steiner,
linux-kernel, Avi Kivity, kvm-devel, Robin Holt, general,
Hugh Dickins
On Fri, Apr 25, 2008 at 06:56:39PM +0200, Andrea Arcangeli wrote:
> > > + data->i_mmap_locks = vmalloc(nr_i_mmap_locks *
> > > + sizeof(spinlock_t));
> >
> > This is why non-typesafe allocators suck. You want 'sizeof(spinlock_t *)'
> > here.
> >
> > > + data->anon_vma_locks = vmalloc(nr_anon_vma_locks *
> > > + sizeof(spinlock_t));
> >
> > and here.
>
> Great catch! (it was temporarily wasting some ram which isn't nice at all)
As I went into the editor I found the above already fixed in
#v14-pre3. And I can't move the structure into the .c file anymore
without kmallocing it. Exposing that structure avoids the
ERR_PTR/PTR_ERR on the retvals and one kmalloc, so I think it makes the
code simpler in the end to keep it as it is now. I'd rather avoid
further changes to the 1/N patch as long as they don't make any
difference at runtime and involve more than cut-and-pasting a
structure from the .h to the .c file.
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen
2008-04-25 16:56 ` Andrea Arcangeli
2008-04-25 17:04 ` Andrea Arcangeli
@ 2008-04-25 19:25 ` Robin Holt
2008-04-26 0:57 ` Andrea Arcangeli
1 sibling, 1 reply; 44+ messages in thread
From: Robin Holt @ 2008-04-25 19:25 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Rusty Russell, Christoph Lameter, akpm, Nick Piggin, Steve Wise,
Peter Zijlstra, linux-mm, Kanoj Sarcar, Roland Dreier,
Jack Steiner, linux-kernel, Avi Kivity, kvm-devel, Robin Holt,
general, Hugh Dickins
On Fri, Apr 25, 2008 at 06:56:40PM +0200, Andrea Arcangeli wrote:
> Fortunately I figured out we don't really need mm_lock in unregister,
> because it's ok to unregister in the middle of the range_begin/end
> critical section (that's definitely not ok for register, which is why
> register needs mm_lock). And it's perfectly ok to fail in register().
I think you still need mm_lock (unless I miss something). What happens
when one callout is scanning mmu_notifier_invalidate_range_start() and
you unlink? The list next pointer is set to LIST_POISON1, which is a really
bad address for the processor to track.
Maybe I misunderstood your description.
Thanks,
Robin
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen
2008-04-25 19:25 ` Robin Holt
@ 2008-04-26 0:57 ` Andrea Arcangeli
0 siblings, 0 replies; 44+ messages in thread
From: Andrea Arcangeli @ 2008-04-26 0:57 UTC (permalink / raw)
To: Robin Holt
Cc: Rusty Russell, Christoph Lameter, akpm, Nick Piggin, Steve Wise,
Peter Zijlstra, linux-mm, Kanoj Sarcar, Roland Dreier,
Jack Steiner, linux-kernel, Avi Kivity, kvm-devel, general,
Hugh Dickins
On Fri, Apr 25, 2008 at 02:25:32PM -0500, Robin Holt wrote:
> I think you still need mm_lock (unless I miss something). What happens
> when one callout is scanning mmu_notifier_invalidate_range_start() and
> you unlink? The list next pointer is set to LIST_POISON1, which is a really
> bad address for the processor to track.
Ok, the _release list_del_init can't race with that because it happens in
exit_mmap, when no other mmu notifier can trigger anymore.
_unregister can run concurrently but it does list_del_rcu, which only
overwrites the pprev pointer with LIST_POISON2. The
mmu_notifier_invalidate_range_start won't crash on LIST_POISON1 thanks
to srcu.
Actually I did more changes than necessary; for example I noticed that
mmu_notifier_register can use a plain list_add_head instead of
list_add_head_rcu. _register can't race against _release thanks to the
temporary or implicit mm_users pin. _register can't race against
_unregister thanks to the mmu_notifier_mm->lock. And register can't
race against any other mmu notifier method thanks to mm_lock.
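In other words, the traversal survives a concurrent unregister because of
the usual RCU pattern; here is a sketch with 2.6.25-era list helpers (the
srcu_struct placement and the sketch_* names are illustrative, not the
patch):
#include <linux/srcu.h>
#include <linux/list.h>
static struct srcu_struct sketch_srcu;
void sketch_invalidate_range_start(struct mm_struct *mm,
				   unsigned long start, unsigned long end)
{
	struct mmu_notifier *mn;
	struct hlist_node *pos;
	int idx;
	idx = srcu_read_lock(&sketch_srcu);
	hlist_for_each_entry_rcu(mn, pos, &mm->mmu_notifier_mm->list, hlist)
		if (mn->ops->invalidate_range_start)
			mn->ops->invalidate_range_start(mn, mm, start, end);
	srcu_read_unlock(&sketch_srcu, idx);
}
void sketch_mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
{
	spin_lock(&mm->mmu_notifier_mm->lock);
	/*
	 * hlist_del_rcu only poisons ->pprev (LIST_POISON2); a reader
	 * already past this node still has a valid ->next, and srcu keeps
	 * the node's memory around until the reader finishes.
	 */
	hlist_del_rcu(&mn->hlist);
	spin_unlock(&mm->mmu_notifier_mm->lock);
	synchronize_srcu(&sketch_srcu);	/* wait for in-flight callouts */
	/* ->release handling and the mm_count drop are omitted here */
}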
At this time I've no other pending patches on top of v14-pre3 other
than the below micro-optimizing cleanup. It'd be great to have
confirmation that v14-pre3 passes the GRU/XPMEM regression tests, the
way my KVM testing has already passed successfully on it. I'll forward
the v14-pre3 mmu-notifier-core plus the below to Andrew tomorrow; I'm
trying to be optimistic here! ;)
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -187,7 +187,7 @@ int mmu_notifier_register(struct mmu_not
* current->mm or explicitly with get_task_mm() or similar).
*/
spin_lock(&mm->mmu_notifier_mm->lock);
- hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list);
+ hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list);
spin_unlock(&mm->mmu_notifier_mm->lock);
out_unlock:
mm_unlock(mm, &data);
^ permalink raw reply [flat|nested] 44+ messages in thread
end of thread, other threads:[~2008-04-26 0:57 UTC | newest]
Thread overview: 44+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-04-08 15:44 [PATCH 0 of 9] mmu notifier #v12 Andrea Arcangeli
2008-04-08 15:44 ` [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen Andrea Arcangeli
2008-04-16 16:33 ` Robin Holt
2008-04-16 18:35 ` Christoph Lameter
2008-04-16 19:02 ` Robin Holt
2008-04-16 19:15 ` Christoph Lameter
2008-04-17 11:14 ` Robin Holt
2008-04-17 15:51 ` Andrea Arcangeli
2008-04-17 16:36 ` Robin Holt
2008-04-17 17:14 ` Andrea Arcangeli
2008-04-17 17:25 ` Robin Holt
2008-04-17 19:10 ` Christoph Lameter
2008-04-17 22:16 ` Andrea Arcangeli
2008-04-22 5:06 ` Rusty Russell
2008-04-25 16:56 ` Andrea Arcangeli
2008-04-25 17:04 ` Andrea Arcangeli
2008-04-25 19:25 ` Robin Holt
2008-04-26 0:57 ` Andrea Arcangeli
2008-04-08 15:44 ` [PATCH 2 of 9] Core of mmu notifiers Andrea Arcangeli
2008-04-08 16:26 ` Robin Holt
2008-04-08 17:05 ` Andrea Arcangeli
2008-04-14 19:57 ` Christoph Lameter
2008-04-14 19:59 ` Christoph Lameter
2008-04-08 15:44 ` [PATCH 3 of 9] Moves all mmu notifier methods outside the PT lock (first and not last Andrea Arcangeli
2008-04-14 19:57 ` Christoph Lameter
2008-04-08 15:44 ` [PATCH 4 of 9] Move the tlb flushing into free_pgtables. The conversion of the locks Andrea Arcangeli
2008-04-08 15:44 ` [PATCH 5 of 9] The conversion to a rwsem allows callbacks during rmap traversal Andrea Arcangeli
2008-04-08 15:44 ` [PATCH 6 of 9] We no longer abort unmapping in unmap vmas because we can reschedule while Andrea Arcangeli
2008-04-08 15:44 ` [PATCH 7 of 9] Convert the anon_vma spinlock to a rw semaphore. This allows concurrent Andrea Arcangeli
2008-04-08 15:44 ` [PATCH 8 of 9] XPMEM would have used sys_madvise() except that madvise_dontneed() Andrea Arcangeli
2008-04-08 15:44 ` [PATCH 9 of 9] This patch adds a lock ordering rule to avoid a potential deadlock when Andrea Arcangeli
2008-04-08 21:46 ` [PATCH 0 of 9] mmu notifier #v12 Avi Kivity
2008-04-08 22:06 ` Andrea Arcangeli
2008-04-09 13:17 ` Robin Holt
2008-04-09 14:44 ` Andrea Arcangeli
2008-04-09 18:55 ` Robin Holt
2008-04-22 7:20 ` Andrea Arcangeli
2008-04-22 12:00 ` Andrea Arcangeli
2008-04-22 13:01 ` Robin Holt
2008-04-22 13:21 ` Andrea Arcangeli
2008-04-22 13:36 ` Robin Holt
2008-04-22 13:48 ` Andrea Arcangeli
2008-04-22 15:26 ` Robin Holt
2008-04-14 23:09 ` Christoph Lameter