LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH] mm: remove global locks from mm/highmem.c
@ 2007-01-28 14:11 Peter Zijlstra
  2007-01-28 14:49 ` Christoph Hellwig
  2007-01-28 22:29 ` Andrew Morton
  0 siblings, 2 replies; 26+ messages in thread
From: Peter Zijlstra @ 2007-01-28 14:11 UTC (permalink / raw)
  To: linux-kernel, linux-mm; +Cc: Andrew Morton, Ingo Molnar

Eradicate global locks.

 - kmap_lock is removed by extensive use of atomic_t, a new flush
   scheme and modifying set_page_address to only allow NULL<->virt
   transitions.

A count of 0 is an exclusive state acting as an entry lock. This is done
using inc_not_zero and cmpxchg. The restriction on changing the virtual
address closes the gap with concurrent additions of the same entry.

 - pool_lock is removed by using the pkmap index for the
   page_address_maps.

By using the pkmap index for the hash entries it is no longer needed to
keep a free list.

This patch has been in -rt for a while but should also help regular
highmem machines with multiple cores/cpus.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/mm.h |   32 ++-
 mm/highmem.c       |  433 ++++++++++++++++++++++++++++++-----------------------
 2 files changed, 276 insertions(+), 189 deletions(-)

Index: linux/include/linux/mm.h
===================================================================
--- linux.orig/include/linux/mm.h
+++ linux/include/linux/mm.h
@@ -543,23 +543,39 @@ static __always_inline void *lowmem_page
 #endif
 
 #if defined(WANT_PAGE_VIRTUAL)
-#define page_address(page) ((page)->virtual)
-#define set_page_address(page, address)			\
-	do {						\
-		(page)->virtual = (address);		\
-	} while(0)
-#define page_address_init()  do { } while(0)
+/*
+ * wrap page->virtual so it is safe to set/read locklessly
+ */
+#define page_address(page) \
+	({ typeof((page)->virtual) v = (page)->virtual; \
+	 smp_read_barrier_depends(); \
+	 v; })
+
+static inline int set_page_address(struct page *page, void *address)
+{
+	if (address)
+		return cmpxchg(&page->virtual, NULL, address) == NULL;
+	else {
+		/*
+		 * cmpxchg is a bit abused because it is not guaranteed
+		 * safe wrt direct assignment on all platforms.
+		 */
+		void *virt = page->virtual;
+		return cmpxchg(&page->vitrual, virt, NULL) == virt;
+	}
+}
+void page_address_init(void);
 #endif
 
 #if defined(HASHED_PAGE_VIRTUAL)
 void *page_address(struct page *page);
-void set_page_address(struct page *page, void *virtual);
+int set_page_address(struct page *page, void *virtual);
 void page_address_init(void);
 #endif
 
 #if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL)
 #define page_address(page) lowmem_page_address(page)
-#define set_page_address(page, address)  do { } while(0)
+#define set_page_address(page, address)  (0)
 #define page_address_init()  do { } while(0)
 #endif
 
Index: linux/mm/highmem.c
===================================================================
--- linux.orig/mm/highmem.c
+++ linux/mm/highmem.c
@@ -14,6 +14,11 @@
  * based on Linus' idea.
  *
  * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ *
+ * Largely rewritten to get rid of all global locks
+ *
+ * Copyright (C) 2006 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
  */
 
 #include <linux/mm.h>
@@ -27,18 +32,14 @@
 #include <linux/hash.h>
 #include <linux/highmem.h>
 #include <linux/blktrace_api.h>
+
 #include <asm/tlbflush.h>
+#include <asm/pgtable.h>
 
-/*
- * Virtual_count is not a pure "count".
- *  0 means that it is not mapped, and has not been mapped
- *    since a TLB flush - it is usable.
- *  1 means that there are no users, but it has been mapped
- *    since the last TLB flush - so we can't use it.
- *  n means that there are (n-1) current users of it.
- */
 #ifdef CONFIG_HIGHMEM
 
+static int __set_page_address(struct page *page, void *virtual, int pos);
+
 unsigned long totalhigh_pages __read_mostly;
 
 unsigned int nr_free_highpages (void)
@@ -52,164 +53,208 @@ unsigned int nr_free_highpages (void)
 	return pages;
 }
 
-static int pkmap_count[LAST_PKMAP];
-static unsigned int last_pkmap_nr;
-static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
+/*
+ * count is not a pure "count".
+ *  0 means its owned exclusively by someone
+ *  1 means its free for use - either mapped or not.
+ *  n means that there are (n-1) current users of it.
+ */
+static atomic_t pkmap_count[LAST_PKMAP];
+static atomic_t pkmap_hand;
 
 pte_t * pkmap_page_table;
 
 static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
 
-static void flush_all_zero_pkmaps(void)
+/*
+ * Try to free a given kmap slot.
+ *
+ * Returns:
+ *  -1 - in use
+ *   0 - free, no TLB flush needed
+ *   1 - free, needs TLB flush
+ */
+static int pkmap_try_free(int pos)
 {
-	int i;
-
-	flush_cache_kmaps();
+	if (atomic_cmpxchg(&pkmap_count[pos], 1, 0) != 1)
+		return -1;
 
-	for (i = 0; i < LAST_PKMAP; i++) {
-		struct page *page;
+	/*
+	 * TODO: add a young bit to make it CLOCK
+	 */
+	if (!pte_none(pkmap_page_table[pos])) {
+		struct page *page = pte_page(pkmap_page_table[pos]);
+		unsigned long addr = PKMAP_ADDR(pos);
+		pte_t *ptep = &pkmap_page_table[pos];
+
+		VM_BUG_ON(addr != (unsigned long)page_address(page));
+
+		if (!__set_page_address(page, NULL, pos))
+			BUG();
+		flush_kernel_dcache_page(page);
+		pte_clear(&init_mm, addr, ptep);
 
-		/*
-		 * zero means we don't have anything to do,
-		 * >1 means that it is still in use. Only
-		 * a count of 1 means that it is free but
-		 * needs to be unmapped
-		 */
-		if (pkmap_count[i] != 1)
-			continue;
-		pkmap_count[i] = 0;
+		return 1;
+	}
 
-		/* sanity check */
-		BUG_ON(pte_none(pkmap_page_table[i]));
+	return 0;
+}
 
-		/*
-		 * Don't need an atomic fetch-and-clear op here;
-		 * no-one has the page mapped, and cannot get at
-		 * its virtual address (and hence PTE) without first
-		 * getting the kmap_lock (which is held here).
-		 * So no dangers, even with speculative execution.
-		 */
-		page = pte_page(pkmap_page_table[i]);
-		pte_clear(&init_mm, (unsigned long)page_address(page),
-			  &pkmap_page_table[i]);
+static inline void pkmap_put(atomic_t *counter)
+{
+	switch (atomic_dec_return(counter)) {
+	case 0:
+		BUG();
 
-		set_page_address(page, NULL);
+	case 1:
+		wake_up(&pkmap_map_wait);
 	}
-	flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
 }
 
-static inline unsigned long map_new_virtual(struct page *page)
+#define TLB_BATCH	32
+
+static int pkmap_get_free(void)
 {
-	unsigned long vaddr;
-	int count;
+	int i, pos, flush;
+	DECLARE_WAITQUEUE(wait, current);
 
-start:
-	count = LAST_PKMAP;
-	/* Find an empty entry */
-	for (;;) {
-		last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
-		if (!last_pkmap_nr) {
-			flush_all_zero_pkmaps();
-			count = LAST_PKMAP;
-		}
-		if (!pkmap_count[last_pkmap_nr])
-			break;	/* Found a usable entry */
-		if (--count)
-			continue;
+restart:
+	for (i = 0; i < LAST_PKMAP; i++) {
+		pos = atomic_inc_return(&pkmap_hand) % LAST_PKMAP;
+		flush = pkmap_try_free(pos);
+		if (flush >= 0)
+			goto got_one;
+	}
+
+	/*
+	 * wait for somebody else to unmap their entries
+	 */
+	__set_current_state(TASK_UNINTERRUPTIBLE);
+	add_wait_queue(&pkmap_map_wait, &wait);
+	schedule();
+	remove_wait_queue(&pkmap_map_wait, &wait);
+
+	goto restart;
+
+got_one:
+	if (flush) {
+#if 0
+		flush_tlb_kernel_range(PKMAP_ADDR(pos), PKMAP_ADDR(pos+1));
+#else
+		int pos2 = (pos + 1) % LAST_PKMAP;
+		int nr;
+		int entries[TLB_BATCH];
 
 		/*
-		 * Sleep for somebody else to unmap their entries
+		 * For those architectures that cannot help but flush the
+		 * whole TLB, flush some more entries to make it worthwhile.
+		 * Scan ahead of the hand to minimise search distances.
 		 */
-		{
-			DECLARE_WAITQUEUE(wait, current);
+		for (i = 0, nr = 0; i < LAST_PKMAP && nr < TLB_BATCH;
+				i++, pos2 = (pos2 + 1) % LAST_PKMAP) {
 
-			__set_current_state(TASK_UNINTERRUPTIBLE);
-			add_wait_queue(&pkmap_map_wait, &wait);
-			spin_unlock(&kmap_lock);
-			schedule();
-			remove_wait_queue(&pkmap_map_wait, &wait);
-			spin_lock(&kmap_lock);
-
-			/* Somebody else might have mapped it while we slept */
-			if (page_address(page))
-				return (unsigned long)page_address(page);
+			flush = pkmap_try_free(pos2);
+			if (flush < 0)
+				continue;
+
+			if (!flush) {
+				atomic_t *counter = &pkmap_count[pos2];
+				VM_BUG_ON(atomic_read(counter) != 0);
+				atomic_set(counter, 2);
+				pkmap_put(counter);
+			} else
+				entries[nr++] = pos2;
+		}
+		flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
 
-			/* Re-start */
-			goto start;
+		for (i = 0; i < nr; i++) {
+			atomic_t *counter = &pkmap_count[entries[i]];
+			VM_BUG_ON(atomic_read(counter) != 0);
+			atomic_set(counter, 2);
+			pkmap_put(counter);
 		}
+#endif
 	}
-	vaddr = PKMAP_ADDR(last_pkmap_nr);
-	set_pte_at(&init_mm, vaddr,
-		   &(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));
+	return pos;
+}
+
+static unsigned long pkmap_insert(struct page *page)
+{
+	int pos = pkmap_get_free();
+	unsigned long vaddr = PKMAP_ADDR(pos);
+	pte_t *ptep = &pkmap_page_table[pos];
+	pte_t entry = mk_pte(page, kmap_prot);
+	atomic_t *counter = &pkmap_count[pos];
+
+	VM_BUG_ON(atomic_read(counter) != 0);
 
-	pkmap_count[last_pkmap_nr] = 1;
-	set_page_address(page, (void *)vaddr);
+	set_pte_at(&init_mm, vaddr, ptep, entry);
+	if (unlikely(!__set_page_address(page, (void *)vaddr, pos))) {
+		/*
+		 * concurrent pkmap_inserts for this page -
+		 * the other won the race, release this entry.
+		 *
+		 * we can still clear the pte without a tlb flush since
+		 * it couldn't have been used yet.
+		 */
+		pte_clear(&init_mm, vaddr, ptep);
+		VM_BUG_ON(atomic_read(counter) != 0);
+		atomic_set(counter, 2);
+		pkmap_put(counter);
+		vaddr = 0;
+	} else
+		atomic_set(counter, 2);
 
 	return vaddr;
 }
 
-void fastcall *kmap_high(struct page *page)
+fastcall void *kmap_high(struct page *page)
 {
 	unsigned long vaddr;
-
-	/*
-	 * For highmem pages, we can't trust "virtual" until
-	 * after we have the lock.
-	 *
-	 * We cannot call this from interrupts, as it may block
-	 */
-	spin_lock(&kmap_lock);
+again:
 	vaddr = (unsigned long)page_address(page);
+	if (vaddr) {
+		atomic_t *counter = &pkmap_count[PKMAP_NR(vaddr)];
+		if (atomic_inc_not_zero(counter)) {
+			/*
+			 * atomic_inc_not_zero implies a (memory) barrier on success
+			 * so page address will be reloaded.
+			 */
+			unsigned long vaddr2 = (unsigned long)page_address(page);
+			if (likely(vaddr == vaddr2))
+				return (void *)vaddr;
+
+			/*
+			 * Oops, we got someone else.
+			 *
+			 * This can happen if we get preempted after
+			 * page_address() and before atomic_inc_not_zero()
+			 * and during that preemption this slot is freed and
+			 * reused.
+			 */
+			pkmap_put(counter);
+			goto again;
+		}
+	}
+
+	vaddr = pkmap_insert(page);
 	if (!vaddr)
-		vaddr = map_new_virtual(page);
-	pkmap_count[PKMAP_NR(vaddr)]++;
-	BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2);
-	spin_unlock(&kmap_lock);
-	return (void*) vaddr;
+		goto again;
+
+	return (void *)vaddr;
 }
 
 EXPORT_SYMBOL(kmap_high);
 
-void fastcall kunmap_high(struct page *page)
+fastcall void kunmap_high(struct page *page)
 {
-	unsigned long vaddr;
-	unsigned long nr;
-	int need_wakeup;
-
-	spin_lock(&kmap_lock);
-	vaddr = (unsigned long)page_address(page);
+	unsigned long vaddr = (unsigned long)page_address(page);
 	BUG_ON(!vaddr);
-	nr = PKMAP_NR(vaddr);
-
-	/*
-	 * A count must never go down to zero
-	 * without a TLB flush!
-	 */
-	need_wakeup = 0;
-	switch (--pkmap_count[nr]) {
-	case 0:
-		BUG();
-	case 1:
-		/*
-		 * Avoid an unnecessary wake_up() function call.
-		 * The common case is pkmap_count[] == 1, but
-		 * no waiters.
-		 * The tasks queued in the wait-queue are guarded
-		 * by both the lock in the wait-queue-head and by
-		 * the kmap_lock.  As the kmap_lock is held here,
-		 * no need for the wait-queue-head's lock.  Simply
-		 * test if the queue is empty.
-		 */
-		need_wakeup = waitqueue_active(&pkmap_map_wait);
-	}
-	spin_unlock(&kmap_lock);
-
-	/* do wake-up, if needed, race-free outside of the spin lock */
-	if (need_wakeup)
-		wake_up(&pkmap_map_wait);
+	pkmap_put(&pkmap_count[PKMAP_NR(vaddr)]);
 }
 
 EXPORT_SYMBOL(kunmap_high);
+
 #endif
 
 #if defined(HASHED_PAGE_VIRTUAL)
@@ -217,19 +262,13 @@ EXPORT_SYMBOL(kunmap_high);
 #define PA_HASH_ORDER	7
 
 /*
- * Describes one page->virtual association
+ * Describes one page->virtual address association.
  */
-struct page_address_map {
+static struct page_address_map {
 	struct page *page;
 	void *virtual;
 	struct list_head list;
-};
-
-/*
- * page_address_map freelist, allocated from page_address_maps.
- */
-static struct list_head page_address_pool;	/* freelist */
-static spinlock_t pool_lock;			/* protects page_address_pool */
+} page_address_maps[LAST_PKMAP];
 
 /*
  * Hash table bucket
@@ -244,91 +283,123 @@ static struct page_address_slot *page_sl
 	return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)];
 }
 
-void *page_address(struct page *page)
+static void *__page_address(struct page_address_slot *pas, struct page *page)
 {
-	unsigned long flags;
-	void *ret;
-	struct page_address_slot *pas;
-
-	if (!PageHighMem(page))
-		return lowmem_page_address(page);
+	void *ret = NULL;
 
-	pas = page_slot(page);
-	ret = NULL;
-	spin_lock_irqsave(&pas->lock, flags);
 	if (!list_empty(&pas->lh)) {
 		struct page_address_map *pam;
 
 		list_for_each_entry(pam, &pas->lh, list) {
 			if (pam->page == page) {
 				ret = pam->virtual;
-				goto done;
+				break;
 			}
 		}
 	}
-done:
+
+	return ret;
+}
+
+void *page_address(struct page *page)
+{
+	unsigned long flags;
+	void *ret;
+	struct page_address_slot *pas;
+
+	if (!PageHighMem(page))
+		return lowmem_page_address(page);
+
+	pas = page_slot(page);
+	spin_lock_irqsave(&pas->lock, flags);
+	ret = __page_address(pas, page);
 	spin_unlock_irqrestore(&pas->lock, flags);
 	return ret;
 }
 
 EXPORT_SYMBOL(page_address);
 
-void set_page_address(struct page *page, void *virtual)
+static int __set_page_address(struct page *page, void *virtual, int pos)
 {
+	int ret = 0;
 	unsigned long flags;
 	struct page_address_slot *pas;
 	struct page_address_map *pam;
 
-	BUG_ON(!PageHighMem(page));
+	VM_BUG_ON(!PageHighMem(page));
+	VM_BUG_ON(atomic_read(&pkmap_count[pos]) != 0);
+	VM_BUG_ON(pos < 0 || pos >= LAST_PKMAP);
 
 	pas = page_slot(page);
-	if (virtual) {		/* Add */
-		BUG_ON(list_empty(&page_address_pool));
+	pam = &page_address_maps[pos];
 
-		spin_lock_irqsave(&pool_lock, flags);
-		pam = list_entry(page_address_pool.next,
-				struct page_address_map, list);
-		list_del(&pam->list);
-		spin_unlock_irqrestore(&pool_lock, flags);
-
-		pam->page = page;
-		pam->virtual = virtual;
-
-		spin_lock_irqsave(&pas->lock, flags);
-		list_add_tail(&pam->list, &pas->lh);
-		spin_unlock_irqrestore(&pas->lock, flags);
-	} else {		/* Remove */
-		spin_lock_irqsave(&pas->lock, flags);
-		list_for_each_entry(pam, &pas->lh, list) {
-			if (pam->page == page) {
-				list_del(&pam->list);
-				spin_unlock_irqrestore(&pas->lock, flags);
-				spin_lock_irqsave(&pool_lock, flags);
-				list_add_tail(&pam->list, &page_address_pool);
-				spin_unlock_irqrestore(&pool_lock, flags);
-				goto done;
-			}
+	spin_lock_irqsave(&pas->lock, flags);
+	if (virtual) { /* add */
+		VM_BUG_ON(!list_empty(&pam->list));
+
+		if (!__page_address(pas, page)) {
+			pam->page = page;
+			pam->virtual = virtual;
+			list_add_tail(&pam->list, &pas->lh);
+			ret = 1;
+		}
+	} else { /* remove */
+		if (!list_empty(&pam->list)) {
+			list_del_init(&pam->list);
+			ret = 1;
 		}
-		spin_unlock_irqrestore(&pas->lock, flags);
 	}
-done:
-	return;
+	spin_unlock_irqrestore(&pas->lock, flags);
+
+	return ret;
 }
 
-static struct page_address_map page_address_maps[LAST_PKMAP];
+int set_page_address(struct page *page, void *virtual)
+{
+	/*
+	 * set_page_address is not supposed to be called when using
+	 * hashed virtual addresses.
+	 */
+	BUG();
+	return 0;
+}
 
-void __init page_address_init(void)
+void __init __page_address_init(void)
 {
 	int i;
 
-	INIT_LIST_HEAD(&page_address_pool);
 	for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
-		list_add(&page_address_maps[i].list, &page_address_pool);
+		INIT_LIST_HEAD(&page_address_maps[i].list);
+
 	for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
 		INIT_LIST_HEAD(&page_address_htable[i].lh);
 		spin_lock_init(&page_address_htable[i].lock);
 	}
-	spin_lock_init(&pool_lock);
+}
+
+#elif defined (CONFIG_HIGHMEM) /* HASHED_PAGE_VIRTUAL */
+
+static int __set_page_address(struct page *page, void *virtual, int pos)
+{
+	return set_page_address(page, virtual);
 }
 
 #endif	/* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
+
+#if defined(CONFIG_HIGHMEM) || defined(HASHED_PAGE_VIRTUAL)
+
+void __init page_address_init(void)
+{
+#ifdef CONFIG_HIGHMEM
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(pkmap_count); i++)
+		atomic_set(&pkmap_count[i], 1);
+#endif
+
+#ifdef HASHED_PAGE_VIRTUAL
+	__page_address_init();
+#endif
+}
+
+#endif



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm: remove global locks from mm/highmem.c
  2007-01-28 14:11 [PATCH] mm: remove global locks from mm/highmem.c Peter Zijlstra
@ 2007-01-28 14:49 ` Christoph Hellwig
  2007-01-28 15:17   ` Ingo Molnar
  2007-01-28 22:29 ` Andrew Morton
  1 sibling, 1 reply; 26+ messages in thread
From: Christoph Hellwig @ 2007-01-28 14:49 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: linux-kernel, linux-mm, Andrew Morton, Ingo Molnar

On Sun, Jan 28, 2007 at 03:11:34PM +0100, Peter Zijlstra wrote:
> Eradicate global locks.
> 
>  - kmap_lock is removed by extensive use of atomic_t, a new flush
>    scheme and modifying set_page_address to only allow NULL<->virt
>    transitions.

What's the point for this?  Extensive atomic_t use is usually much worse
than spinlocks.  A spinlock region is just a single atomic instruction,
as soon as you do more than one atomic_t you tend to make scalability
worse.  Not to mention that atomic_t are much worse when you try to
profile scalability issues.

What benchmark shows a problem with the current locking, and from what
caller?  In doubt we just need to convert that caller to kmap_atomic.


> 
> A count of 0 is an exclusive state acting as an entry lock. This is done
> using inc_not_zero and cmpxchg. The restriction on changing the virtual
> address closes the gap with concurrent additions of the same entry.
> 
>  - pool_lock is removed by using the pkmap index for the
>    page_address_maps.
> 
> By using the pkmap index for the hash entries it is no longer needed to
> keep a free list.
> 
> This patch has been in -rt for a while but should also help regular
> highmem machines with multiple cores/cpus.
> 
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> ---
>  include/linux/mm.h |   32 ++-
>  mm/highmem.c       |  433 ++++++++++++++++++++++++++++++-----------------------
>  2 files changed, 276 insertions(+), 189 deletions(-)
> 
> Index: linux/include/linux/mm.h
> ===================================================================
> --- linux.orig/include/linux/mm.h
> +++ linux/include/linux/mm.h
> @@ -543,23 +543,39 @@ static __always_inline void *lowmem_page
>  #endif
>  
>  #if defined(WANT_PAGE_VIRTUAL)
> -#define page_address(page) ((page)->virtual)
> -#define set_page_address(page, address)			\
> -	do {						\
> -		(page)->virtual = (address);		\
> -	} while(0)
> -#define page_address_init()  do { } while(0)
> +/*
> + * wrap page->virtual so it is safe to set/read locklessly
> + */
> +#define page_address(page) \
> +	({ typeof((page)->virtual) v = (page)->virtual; \
> +	 smp_read_barrier_depends(); \
> +	 v; })
> +
> +static inline int set_page_address(struct page *page, void *address)
> +{
> +	if (address)
> +		return cmpxchg(&page->virtual, NULL, address) == NULL;
> +	else {
> +		/*
> +		 * cmpxchg is a bit abused because it is not guaranteed
> +		 * safe wrt direct assignment on all platforms.
> +		 */
> +		void *virt = page->virtual;
> +		return cmpxchg(&page->vitrual, virt, NULL) == virt;
> +	}
> +}
> +void page_address_init(void);
>  #endif
>  
>  #if defined(HASHED_PAGE_VIRTUAL)
>  void *page_address(struct page *page);
> -void set_page_address(struct page *page, void *virtual);
> +int set_page_address(struct page *page, void *virtual);
>  void page_address_init(void);
>  #endif
>  
>  #if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL)
>  #define page_address(page) lowmem_page_address(page)
> -#define set_page_address(page, address)  do { } while(0)
> +#define set_page_address(page, address)  (0)
>  #define page_address_init()  do { } while(0)
>  #endif
>  
> Index: linux/mm/highmem.c
> ===================================================================
> --- linux.orig/mm/highmem.c
> +++ linux/mm/highmem.c
> @@ -14,6 +14,11 @@
>   * based on Linus' idea.
>   *
>   * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
> + *
> + * Largely rewritten to get rid of all global locks
> + *
> + * Copyright (C) 2006 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
> + *
>   */
>  
>  #include <linux/mm.h>
> @@ -27,18 +32,14 @@
>  #include <linux/hash.h>
>  #include <linux/highmem.h>
>  #include <linux/blktrace_api.h>
> +
>  #include <asm/tlbflush.h>
> +#include <asm/pgtable.h>
>  
> -/*
> - * Virtual_count is not a pure "count".
> - *  0 means that it is not mapped, and has not been mapped
> - *    since a TLB flush - it is usable.
> - *  1 means that there are no users, but it has been mapped
> - *    since the last TLB flush - so we can't use it.
> - *  n means that there are (n-1) current users of it.
> - */
>  #ifdef CONFIG_HIGHMEM
>  
> +static int __set_page_address(struct page *page, void *virtual, int pos);
> +
>  unsigned long totalhigh_pages __read_mostly;
>  
>  unsigned int nr_free_highpages (void)
> @@ -52,164 +53,208 @@ unsigned int nr_free_highpages (void)
>  	return pages;
>  }
>  
> -static int pkmap_count[LAST_PKMAP];
> -static unsigned int last_pkmap_nr;
> -static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
> +/*
> + * count is not a pure "count".
> + *  0 means its owned exclusively by someone
> + *  1 means its free for use - either mapped or not.
> + *  n means that there are (n-1) current users of it.
> + */
> +static atomic_t pkmap_count[LAST_PKMAP];
> +static atomic_t pkmap_hand;
>  
>  pte_t * pkmap_page_table;
>  
>  static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
>  
> -static void flush_all_zero_pkmaps(void)
> +/*
> + * Try to free a given kmap slot.
> + *
> + * Returns:
> + *  -1 - in use
> + *   0 - free, no TLB flush needed
> + *   1 - free, needs TLB flush
> + */
> +static int pkmap_try_free(int pos)
>  {
> -	int i;
> -
> -	flush_cache_kmaps();
> +	if (atomic_cmpxchg(&pkmap_count[pos], 1, 0) != 1)
> +		return -1;
>  
> -	for (i = 0; i < LAST_PKMAP; i++) {
> -		struct page *page;
> +	/*
> +	 * TODO: add a young bit to make it CLOCK
> +	 */
> +	if (!pte_none(pkmap_page_table[pos])) {
> +		struct page *page = pte_page(pkmap_page_table[pos]);
> +		unsigned long addr = PKMAP_ADDR(pos);
> +		pte_t *ptep = &pkmap_page_table[pos];
> +
> +		VM_BUG_ON(addr != (unsigned long)page_address(page));
> +
> +		if (!__set_page_address(page, NULL, pos))
> +			BUG();
> +		flush_kernel_dcache_page(page);
> +		pte_clear(&init_mm, addr, ptep);
>  
> -		/*
> -		 * zero means we don't have anything to do,
> -		 * >1 means that it is still in use. Only
> -		 * a count of 1 means that it is free but
> -		 * needs to be unmapped
> -		 */
> -		if (pkmap_count[i] != 1)
> -			continue;
> -		pkmap_count[i] = 0;
> +		return 1;
> +	}
>  
> -		/* sanity check */
> -		BUG_ON(pte_none(pkmap_page_table[i]));
> +	return 0;
> +}
>  
> -		/*
> -		 * Don't need an atomic fetch-and-clear op here;
> -		 * no-one has the page mapped, and cannot get at
> -		 * its virtual address (and hence PTE) without first
> -		 * getting the kmap_lock (which is held here).
> -		 * So no dangers, even with speculative execution.
> -		 */
> -		page = pte_page(pkmap_page_table[i]);
> -		pte_clear(&init_mm, (unsigned long)page_address(page),
> -			  &pkmap_page_table[i]);
> +static inline void pkmap_put(atomic_t *counter)
> +{
> +	switch (atomic_dec_return(counter)) {
> +	case 0:
> +		BUG();
>  
> -		set_page_address(page, NULL);
> +	case 1:
> +		wake_up(&pkmap_map_wait);
>  	}
> -	flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
>  }
>  
> -static inline unsigned long map_new_virtual(struct page *page)
> +#define TLB_BATCH	32
> +
> +static int pkmap_get_free(void)
>  {
> -	unsigned long vaddr;
> -	int count;
> +	int i, pos, flush;
> +	DECLARE_WAITQUEUE(wait, current);
>  
> -start:
> -	count = LAST_PKMAP;
> -	/* Find an empty entry */
> -	for (;;) {
> -		last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
> -		if (!last_pkmap_nr) {
> -			flush_all_zero_pkmaps();
> -			count = LAST_PKMAP;
> -		}
> -		if (!pkmap_count[last_pkmap_nr])
> -			break;	/* Found a usable entry */
> -		if (--count)
> -			continue;
> +restart:
> +	for (i = 0; i < LAST_PKMAP; i++) {
> +		pos = atomic_inc_return(&pkmap_hand) % LAST_PKMAP;
> +		flush = pkmap_try_free(pos);
> +		if (flush >= 0)
> +			goto got_one;
> +	}
> +
> +	/*
> +	 * wait for somebody else to unmap their entries
> +	 */
> +	__set_current_state(TASK_UNINTERRUPTIBLE);
> +	add_wait_queue(&pkmap_map_wait, &wait);
> +	schedule();
> +	remove_wait_queue(&pkmap_map_wait, &wait);
> +
> +	goto restart;
> +
> +got_one:
> +	if (flush) {
> +#if 0
> +		flush_tlb_kernel_range(PKMAP_ADDR(pos), PKMAP_ADDR(pos+1));
> +#else
> +		int pos2 = (pos + 1) % LAST_PKMAP;
> +		int nr;
> +		int entries[TLB_BATCH];
>  
>  		/*
> -		 * Sleep for somebody else to unmap their entries
> +		 * For those architectures that cannot help but flush the
> +		 * whole TLB, flush some more entries to make it worthwhile.
> +		 * Scan ahead of the hand to minimise search distances.
>  		 */
> -		{
> -			DECLARE_WAITQUEUE(wait, current);
> +		for (i = 0, nr = 0; i < LAST_PKMAP && nr < TLB_BATCH;
> +				i++, pos2 = (pos2 + 1) % LAST_PKMAP) {
>  
> -			__set_current_state(TASK_UNINTERRUPTIBLE);
> -			add_wait_queue(&pkmap_map_wait, &wait);
> -			spin_unlock(&kmap_lock);
> -			schedule();
> -			remove_wait_queue(&pkmap_map_wait, &wait);
> -			spin_lock(&kmap_lock);
> -
> -			/* Somebody else might have mapped it while we slept */
> -			if (page_address(page))
> -				return (unsigned long)page_address(page);
> +			flush = pkmap_try_free(pos2);
> +			if (flush < 0)
> +				continue;
> +
> +			if (!flush) {
> +				atomic_t *counter = &pkmap_count[pos2];
> +				VM_BUG_ON(atomic_read(counter) != 0);
> +				atomic_set(counter, 2);
> +				pkmap_put(counter);
> +			} else
> +				entries[nr++] = pos2;
> +		}
> +		flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
>  
> -			/* Re-start */
> -			goto start;
> +		for (i = 0; i < nr; i++) {
> +			atomic_t *counter = &pkmap_count[entries[i]];
> +			VM_BUG_ON(atomic_read(counter) != 0);
> +			atomic_set(counter, 2);
> +			pkmap_put(counter);
>  		}
> +#endif
>  	}
> -	vaddr = PKMAP_ADDR(last_pkmap_nr);
> -	set_pte_at(&init_mm, vaddr,
> -		   &(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));
> +	return pos;
> +}
> +
> +static unsigned long pkmap_insert(struct page *page)
> +{
> +	int pos = pkmap_get_free();
> +	unsigned long vaddr = PKMAP_ADDR(pos);
> +	pte_t *ptep = &pkmap_page_table[pos];
> +	pte_t entry = mk_pte(page, kmap_prot);
> +	atomic_t *counter = &pkmap_count[pos];
> +
> +	VM_BUG_ON(atomic_read(counter) != 0);
>  
> -	pkmap_count[last_pkmap_nr] = 1;
> -	set_page_address(page, (void *)vaddr);
> +	set_pte_at(&init_mm, vaddr, ptep, entry);
> +	if (unlikely(!__set_page_address(page, (void *)vaddr, pos))) {
> +		/*
> +		 * concurrent pkmap_inserts for this page -
> +		 * the other won the race, release this entry.
> +		 *
> +		 * we can still clear the pte without a tlb flush since
> +		 * it couldn't have been used yet.
> +		 */
> +		pte_clear(&init_mm, vaddr, ptep);
> +		VM_BUG_ON(atomic_read(counter) != 0);
> +		atomic_set(counter, 2);
> +		pkmap_put(counter);
> +		vaddr = 0;
> +	} else
> +		atomic_set(counter, 2);
>  
>  	return vaddr;
>  }
>  
> -void fastcall *kmap_high(struct page *page)
> +fastcall void *kmap_high(struct page *page)
>  {
>  	unsigned long vaddr;
> -
> -	/*
> -	 * For highmem pages, we can't trust "virtual" until
> -	 * after we have the lock.
> -	 *
> -	 * We cannot call this from interrupts, as it may block
> -	 */
> -	spin_lock(&kmap_lock);
> +again:
>  	vaddr = (unsigned long)page_address(page);
> +	if (vaddr) {
> +		atomic_t *counter = &pkmap_count[PKMAP_NR(vaddr)];
> +		if (atomic_inc_not_zero(counter)) {
> +			/*
> +			 * atomic_inc_not_zero implies a (memory) barrier on success
> +			 * so page address will be reloaded.
> +			 */
> +			unsigned long vaddr2 = (unsigned long)page_address(page);
> +			if (likely(vaddr == vaddr2))
> +				return (void *)vaddr;
> +
> +			/*
> +			 * Oops, we got someone else.
> +			 *
> +			 * This can happen if we get preempted after
> +			 * page_address() and before atomic_inc_not_zero()
> +			 * and during that preemption this slot is freed and
> +			 * reused.
> +			 */
> +			pkmap_put(counter);
> +			goto again;
> +		}
> +	}
> +
> +	vaddr = pkmap_insert(page);
>  	if (!vaddr)
> -		vaddr = map_new_virtual(page);
> -	pkmap_count[PKMAP_NR(vaddr)]++;
> -	BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2);
> -	spin_unlock(&kmap_lock);
> -	return (void*) vaddr;
> +		goto again;
> +
> +	return (void *)vaddr;
>  }
>  
>  EXPORT_SYMBOL(kmap_high);
>  
> -void fastcall kunmap_high(struct page *page)
> +fastcall void kunmap_high(struct page *page)
>  {
> -	unsigned long vaddr;
> -	unsigned long nr;
> -	int need_wakeup;
> -
> -	spin_lock(&kmap_lock);
> -	vaddr = (unsigned long)page_address(page);
> +	unsigned long vaddr = (unsigned long)page_address(page);
>  	BUG_ON(!vaddr);
> -	nr = PKMAP_NR(vaddr);
> -
> -	/*
> -	 * A count must never go down to zero
> -	 * without a TLB flush!
> -	 */
> -	need_wakeup = 0;
> -	switch (--pkmap_count[nr]) {
> -	case 0:
> -		BUG();
> -	case 1:
> -		/*
> -		 * Avoid an unnecessary wake_up() function call.
> -		 * The common case is pkmap_count[] == 1, but
> -		 * no waiters.
> -		 * The tasks queued in the wait-queue are guarded
> -		 * by both the lock in the wait-queue-head and by
> -		 * the kmap_lock.  As the kmap_lock is held here,
> -		 * no need for the wait-queue-head's lock.  Simply
> -		 * test if the queue is empty.
> -		 */
> -		need_wakeup = waitqueue_active(&pkmap_map_wait);
> -	}
> -	spin_unlock(&kmap_lock);
> -
> -	/* do wake-up, if needed, race-free outside of the spin lock */
> -	if (need_wakeup)
> -		wake_up(&pkmap_map_wait);
> +	pkmap_put(&pkmap_count[PKMAP_NR(vaddr)]);
>  }
>  
>  EXPORT_SYMBOL(kunmap_high);
> +
>  #endif
>  
>  #if defined(HASHED_PAGE_VIRTUAL)
> @@ -217,19 +262,13 @@ EXPORT_SYMBOL(kunmap_high);
>  #define PA_HASH_ORDER	7
>  
>  /*
> - * Describes one page->virtual association
> + * Describes one page->virtual address association.
>   */
> -struct page_address_map {
> +static struct page_address_map {
>  	struct page *page;
>  	void *virtual;
>  	struct list_head list;
> -};
> -
> -/*
> - * page_address_map freelist, allocated from page_address_maps.
> - */
> -static struct list_head page_address_pool;	/* freelist */
> -static spinlock_t pool_lock;			/* protects page_address_pool */
> +} page_address_maps[LAST_PKMAP];
>  
>  /*
>   * Hash table bucket
> @@ -244,91 +283,123 @@ static struct page_address_slot *page_sl
>  	return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)];
>  }
>  
> -void *page_address(struct page *page)
> +static void *__page_address(struct page_address_slot *pas, struct page *page)
>  {
> -	unsigned long flags;
> -	void *ret;
> -	struct page_address_slot *pas;
> -
> -	if (!PageHighMem(page))
> -		return lowmem_page_address(page);
> +	void *ret = NULL;
>  
> -	pas = page_slot(page);
> -	ret = NULL;
> -	spin_lock_irqsave(&pas->lock, flags);
>  	if (!list_empty(&pas->lh)) {
>  		struct page_address_map *pam;
>  
>  		list_for_each_entry(pam, &pas->lh, list) {
>  			if (pam->page == page) {
>  				ret = pam->virtual;
> -				goto done;
> +				break;
>  			}
>  		}
>  	}
> -done:
> +
> +	return ret;
> +}
> +
> +void *page_address(struct page *page)
> +{
> +	unsigned long flags;
> +	void *ret;
> +	struct page_address_slot *pas;
> +
> +	if (!PageHighMem(page))
> +		return lowmem_page_address(page);
> +
> +	pas = page_slot(page);
> +	spin_lock_irqsave(&pas->lock, flags);
> +	ret = __page_address(pas, page);
>  	spin_unlock_irqrestore(&pas->lock, flags);
>  	return ret;
>  }
>  
>  EXPORT_SYMBOL(page_address);
>  
> -void set_page_address(struct page *page, void *virtual)
> +static int __set_page_address(struct page *page, void *virtual, int pos)
>  {
> +	int ret = 0;
>  	unsigned long flags;
>  	struct page_address_slot *pas;
>  	struct page_address_map *pam;
>  
> -	BUG_ON(!PageHighMem(page));
> +	VM_BUG_ON(!PageHighMem(page));
> +	VM_BUG_ON(atomic_read(&pkmap_count[pos]) != 0);
> +	VM_BUG_ON(pos < 0 || pos >= LAST_PKMAP);
>  
>  	pas = page_slot(page);
> -	if (virtual) {		/* Add */
> -		BUG_ON(list_empty(&page_address_pool));
> +	pam = &page_address_maps[pos];
>  
> -		spin_lock_irqsave(&pool_lock, flags);
> -		pam = list_entry(page_address_pool.next,
> -				struct page_address_map, list);
> -		list_del(&pam->list);
> -		spin_unlock_irqrestore(&pool_lock, flags);
> -
> -		pam->page = page;
> -		pam->virtual = virtual;
> -
> -		spin_lock_irqsave(&pas->lock, flags);
> -		list_add_tail(&pam->list, &pas->lh);
> -		spin_unlock_irqrestore(&pas->lock, flags);
> -	} else {		/* Remove */
> -		spin_lock_irqsave(&pas->lock, flags);
> -		list_for_each_entry(pam, &pas->lh, list) {
> -			if (pam->page == page) {
> -				list_del(&pam->list);
> -				spin_unlock_irqrestore(&pas->lock, flags);
> -				spin_lock_irqsave(&pool_lock, flags);
> -				list_add_tail(&pam->list, &page_address_pool);
> -				spin_unlock_irqrestore(&pool_lock, flags);
> -				goto done;
> -			}
> +	spin_lock_irqsave(&pas->lock, flags);
> +	if (virtual) { /* add */
> +		VM_BUG_ON(!list_empty(&pam->list));
> +
> +		if (!__page_address(pas, page)) {
> +			pam->page = page;
> +			pam->virtual = virtual;
> +			list_add_tail(&pam->list, &pas->lh);
> +			ret = 1;
> +		}
> +	} else { /* remove */
> +		if (!list_empty(&pam->list)) {
> +			list_del_init(&pam->list);
> +			ret = 1;
>  		}
> -		spin_unlock_irqrestore(&pas->lock, flags);
>  	}
> -done:
> -	return;
> +	spin_unlock_irqrestore(&pas->lock, flags);
> +
> +	return ret;
>  }
>  
> -static struct page_address_map page_address_maps[LAST_PKMAP];
> +int set_page_address(struct page *page, void *virtual)
> +{
> +	/*
> +	 * set_page_address is not supposed to be called when using
> +	 * hashed virtual addresses.
> +	 */
> +	BUG();
> +	return 0;
> +}
>  
> -void __init page_address_init(void)
> +void __init __page_address_init(void)
>  {
>  	int i;
>  
> -	INIT_LIST_HEAD(&page_address_pool);
>  	for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
> -		list_add(&page_address_maps[i].list, &page_address_pool);
> +		INIT_LIST_HEAD(&page_address_maps[i].list);
> +
>  	for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
>  		INIT_LIST_HEAD(&page_address_htable[i].lh);
>  		spin_lock_init(&page_address_htable[i].lock);
>  	}
> -	spin_lock_init(&pool_lock);
> +}
> +
> +#elif defined (CONFIG_HIGHMEM) /* HASHED_PAGE_VIRTUAL */
> +
> +static int __set_page_address(struct page *page, void *virtual, int pos)
> +{
> +	return set_page_address(page, virtual);
>  }
>  
>  #endif	/* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
> +
> +#if defined(CONFIG_HIGHMEM) || defined(HASHED_PAGE_VIRTUAL)
> +
> +void __init page_address_init(void)
> +{
> +#ifdef CONFIG_HIGHMEM
> +	int i;
> +
> +	for (i = 0; i < ARRAY_SIZE(pkmap_count); i++)
> +		atomic_set(&pkmap_count[i], 1);
> +#endif
> +
> +#ifdef HASHED_PAGE_VIRTUAL
> +	__page_address_init();
> +#endif
> +}
> +
> +#endif
> 
> 
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
---end quoted text---

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm: remove global locks from mm/highmem.c
  2007-01-28 14:49 ` Christoph Hellwig
@ 2007-01-28 15:17   ` Ingo Molnar
  2007-01-28 15:28     ` Christoph Hellwig
  0 siblings, 1 reply; 26+ messages in thread
From: Ingo Molnar @ 2007-01-28 15:17 UTC (permalink / raw)
  To: Christoph Hellwig, Peter Zijlstra, linux-kernel, linux-mm, Andrew Morton


* Christoph Hellwig <hch@infradead.org> wrote:

> On Sun, Jan 28, 2007 at 03:11:34PM +0100, Peter Zijlstra wrote:
> > Eradicate global locks.
> > 
> >  - kmap_lock is removed by extensive use of atomic_t, a new flush
> >    scheme and modifying set_page_address to only allow NULL<->virt
> >    transitions.
> 
> What's the point for this? [...]

scalability. I did lock profiling on the -rt kernel, which exposes such 
things nicely. Half of the lock contention events during kernel compile 
were due to kmap(). (The system had 2 GB of RAM, so 40% lowmem, 60% 
highmem.)

> [...] In doubt we just need to convert that caller to kmap_atomic.

the pagecache ones cannot be converted to kmap_atomic, because we can 
block while holding them. Plus kmap_atomic is quite a bit slower than 
this scalable version of kmap().

	Ingo

ps. please fix your mailer to not emit Mail-Followup-To headers. In Mutt
    you can do this via "set followup_to=no" in your .muttrc.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm: remove global locks from mm/highmem.c
  2007-01-28 15:17   ` Ingo Molnar
@ 2007-01-28 15:28     ` Christoph Hellwig
  2007-01-28 15:48       ` Ingo Molnar
  0 siblings, 1 reply; 26+ messages in thread
From: Christoph Hellwig @ 2007-01-28 15:28 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Christoph Hellwig, Peter Zijlstra, linux-kernel, linux-mm, Andrew Morton

On Sun, Jan 28, 2007 at 04:17:00PM +0100, Ingo Molnar wrote:
> scalability. I did lock profiling on the -rt kernel, which exposes such 
> things nicely. Half of the lock contention events during kernel compile 
> were due to kmap(). (The system had 2 GB of RAM, so 40% lowmem, 60% 
> highmem.)

Numbers please, and not on -rt but on mainline.  Please show the profiles.

> ps. please fix your mailer to not emit Mail-Followup-To headers. In Mutt
>     you can do this via "set followup_to=no" in your .muttrc.

I have told you last time that this is absolutely intentional and I won't
change it.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm: remove global locks from mm/highmem.c
  2007-01-28 15:28     ` Christoph Hellwig
@ 2007-01-28 15:48       ` Ingo Molnar
  2007-01-28 15:54         ` Christoph Hellwig
  0 siblings, 1 reply; 26+ messages in thread
From: Ingo Molnar @ 2007-01-28 15:48 UTC (permalink / raw)
  To: Christoph Hellwig, Peter Zijlstra, linux-kernel, linux-mm,
	Linus Torvalds, Andrew Morton


* Christoph Hellwig <hch@infradead.org> wrote:

> On Sun, Jan 28, 2007 at 04:17:00PM +0100, Ingo Molnar wrote:
> > scalability. I did lock profiling on the -rt kernel, which exposes 
> > such things nicely. Half of the lock contention events during kernel 
> > compile were due to kmap(). (The system had 2 GB of RAM, so 40% 
> > lowmem, 60% highmem.)
> 
> Numbers please, and not on -rt but on mainline.  Please show the 
> profiles.

i'm sorry, but do you realize that files_lock is a global lock, 
triggered by /every single/ file close?

   " files_lock is a global lock and we touch it for every single
     sys_close() system call that the system does. "

You really dont need to be a rocket scientist to see that it's a 
globally bouncing cacheline that has a major effect on certain 
VFS-intense workloads. Peter has worked hard to eliminate its effects 
without having to couple this to an intrusive rewrite of the TTY layer.

( really, i personally find your dismissive style apalling and i think 
  such a reception of a nice patchset must be humiliating to Peter. I
  certainly try to avoid to be involved with any VFS internals, due to
  this unwelcoming tone of discussion. Had you been around when i
  started contributing to the Linux kernel i'd probably not be hacking
  the kernel today. You are a good hacker but the simultaneous
  collateral damage you are causing is significantly reducing the net
  benefit. )

> > ps. please fix your mailer to not emit Mail-Followup-To headers. In 
> > Mutt you can do this via "set followup_to=no" in your .muttrc.
> 
> I have told you last time that this is absolutely intentional and I 
> won't change it.

( You are messing up the reply headers, everyone is listed in the 'To:'
  field for any reply to your mail, instead of being added to the Cc:
  list. )

	Ingo

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm: remove global locks from mm/highmem.c
  2007-01-28 15:48       ` Ingo Molnar
@ 2007-01-28 15:54         ` Christoph Hellwig
  2007-01-28 18:19           ` Ingo Molnar
  0 siblings, 1 reply; 26+ messages in thread
From: Christoph Hellwig @ 2007-01-28 15:54 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Christoph Hellwig, Peter Zijlstra, linux-kernel, linux-mm,
	Linus Torvalds, Andrew Morton

On Sun, Jan 28, 2007 at 04:48:06PM +0100, Ingo Molnar wrote:
> i'm sorry, but do you realize that files_lock is a global lock, 
> triggered by /every single/ file close?

Please check which thread you're in before you start such lengthy rants.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm: remove global locks from mm/highmem.c
  2007-01-28 15:54         ` Christoph Hellwig
@ 2007-01-28 18:19           ` Ingo Molnar
  0 siblings, 0 replies; 26+ messages in thread
From: Ingo Molnar @ 2007-01-28 18:19 UTC (permalink / raw)
  To: Christoph Hellwig, Peter Zijlstra, linux-kernel, linux-mm,
	Linus Torvalds, Andrew Morton


* Christoph Hellwig <hch@infradead.org> wrote:

> On Sun, Jan 28, 2007 at 04:48:06PM +0100, Ingo Molnar wrote:
> > i'm sorry, but do you realize that files_lock is a global lock, 
> > triggered by /every single/ file close?
> 
> Please check which thread you're in before you start such lengthy 
> rants.

my reply applies to the other thread too, you made a similar comment 
there too:

* Christoph Hellwig <hch@infradead.org> wrote:

> On Sun, Jan 28, 2007 at 12:51:18PM +0100, Peter Zijlstra wrote:
> > This patch-set breaks up the global file_list_lock which was found 
> > to be a severe contention point under basically any filesystem 
> > intensive workload.
>
> Benchmarks, please.  Where exactly do you see contention for this?

	Ingo

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm: remove global locks from mm/highmem.c
  2007-01-28 14:11 [PATCH] mm: remove global locks from mm/highmem.c Peter Zijlstra
  2007-01-28 14:49 ` Christoph Hellwig
@ 2007-01-28 22:29 ` Andrew Morton
  2007-01-29  2:52   ` Nick Piggin
                     ` (2 more replies)
  1 sibling, 3 replies; 26+ messages in thread
From: Andrew Morton @ 2007-01-28 22:29 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: linux-kernel, linux-mm, Ingo Molnar

On Sun, 28 Jan 2007 15:11:34 +0100
Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> Eradicate global locks.
> 
>  - kmap_lock is removed by extensive use of atomic_t, a new flush
>    scheme and modifying set_page_address to only allow NULL<->virt
>    transitions.
> 
> A count of 0 is an exclusive state acting as an entry lock. This is done
> using inc_not_zero and cmpxchg. The restriction on changing the virtual
> address closes the gap with concurrent additions of the same entry.
> 
>  - pool_lock is removed by using the pkmap index for the
>    page_address_maps.
> 
> By using the pkmap index for the hash entries it is no longer needed to
> keep a free list.
> 
> This patch has been in -rt for a while but should also help regular
> highmem machines with multiple cores/cpus.

I really don't recall any performance problems being reported out of that
code in recent years.

As Christoph says, it's very much preferred that code be migrated over to
kmap_atomic().  Partly because kmap() is deadlockable in situations where a
large number of threads are trying to take two kmaps at the same time and
we run out.  This happened in the past, but incidences have gone away,
probably because of kmap->kmap_atomic conversions.

>From which callsite have you measured problems?

> Index: linux/include/linux/mm.h
> ===================================================================
> --- linux.orig/include/linux/mm.h
> +++ linux/include/linux/mm.h
> @@ -543,23 +543,39 @@ static __always_inline void *lowmem_page
>  #endif
>  
>  #if defined(WANT_PAGE_VIRTUAL)
> -#define page_address(page) ((page)->virtual)
> -#define set_page_address(page, address)			\
> -	do {						\
> -		(page)->virtual = (address);		\
> -	} while(0)
> -#define page_address_init()  do { } while(0)
> +/*
> + * wrap page->virtual so it is safe to set/read locklessly
> + */
> +#define page_address(page) \
> +	({ typeof((page)->virtual) v = (page)->virtual; \
> +	 smp_read_barrier_depends(); \
> +	 v; })
> +
> +static inline int set_page_address(struct page *page, void *address)
> +{
> +	if (address)
> +		return cmpxchg(&page->virtual, NULL, address) == NULL;
> +	else {
> +		/*
> +		 * cmpxchg is a bit abused because it is not guaranteed
> +		 * safe wrt direct assignment on all platforms.
> +		 */
> +		void *virt = page->virtual;
> +		return cmpxchg(&page->vitrual, virt, NULL) == virt;
> +	}
> +}

Have you verified that all architectures which can implement
WANT_PAGE_VIRTUAL also implement cmpxchg?

Have you verified that sufficient headers are included for this to compile
correctly on all WANT_PAGE_VIRTUAL-enabling architectures on all configs? 
I don't see asm/system.h being included in mm.h and if I get yet another
damned it-wont-compile patch I might do something irreversible.

> +static int pkmap_get_free(void)
>  {
> -	unsigned long vaddr;
> -	int count;
> +	int i, pos, flush;
> +	DECLARE_WAITQUEUE(wait, current);
>  
> -start:
> -	count = LAST_PKMAP;
> -	/* Find an empty entry */
> -	for (;;) {
> -		last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;

The old code used masking.

> -		if (!last_pkmap_nr) {
> -			flush_all_zero_pkmaps();
> -			count = LAST_PKMAP;
> -		}
> -		if (!pkmap_count[last_pkmap_nr])
> -			break;	/* Found a usable entry */
> -		if (--count)
> -			continue;
> +restart:
> +	for (i = 0; i < LAST_PKMAP; i++) {
> +		pos = atomic_inc_return(&pkmap_hand) % LAST_PKMAP;

The new code does more-expensive modulus.  Necessary?

> +		flush = pkmap_try_free(pos);
> +		if (flush >= 0)
> +			goto got_one;
> +	}
> +
> +	/*
> +	 * wait for somebody else to unmap their entries
> +	 */
> +	__set_current_state(TASK_UNINTERRUPTIBLE);
> +	add_wait_queue(&pkmap_map_wait, &wait);
> +	schedule();
> +	remove_wait_queue(&pkmap_map_wait, &wait);

This looks wrong.  What happens if everyone else does their unmap between
the __set_current_state() and the add_wait_queue()?



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm: remove global locks from mm/highmem.c
  2007-01-28 22:29 ` Andrew Morton
@ 2007-01-29  2:52   ` Nick Piggin
  2007-01-29  9:44   ` Peter Zijlstra
  2007-01-29 19:08   ` Ingo Molnar
  2 siblings, 0 replies; 26+ messages in thread
From: Nick Piggin @ 2007-01-29  2:52 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Peter Zijlstra, linux-kernel, linux-mm, Ingo Molnar

Andrew Morton wrote:
> On Sun, 28 Jan 2007 15:11:34 +0100
> Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

>>+static inline int set_page_address(struct page *page, void *address)
>>+{
>>+	if (address)
>>+		return cmpxchg(&page->virtual, NULL, address) == NULL;
>>+	else {
>>+		/*
>>+		 * cmpxchg is a bit abused because it is not guaranteed
>>+		 * safe wrt direct assignment on all platforms.
>>+		 */
>>+		void *virt = page->virtual;
>>+		return cmpxchg(&page->vitrual, virt, NULL) == virt;
>>+	}
>>+}
> 
> 
> Have you verified that all architectures which can implement
> WANT_PAGE_VIRTUAL also implement cmpxchg?

Simple: we should not implement cmpxchg in generic code. We should
be able to use atomic_long_cmpxchg for this -- it will work perfectly
regardless of what anybody else tells you.

cmpxchg is only required for when that memory location may get modified
by some other means than under your control (eg. userspace, in the case
of drm, or hardware MMU in the case of Christoph's old page fault
scalability patches).

-- 
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com 

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm: remove global locks from mm/highmem.c
  2007-01-28 22:29 ` Andrew Morton
  2007-01-29  2:52   ` Nick Piggin
@ 2007-01-29  9:44   ` Peter Zijlstra
  2007-01-30  1:31     ` Martin J. Bligh
  2007-01-29 19:08   ` Ingo Molnar
  2 siblings, 1 reply; 26+ messages in thread
From: Peter Zijlstra @ 2007-01-29  9:44 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, linux-mm, Ingo Molnar

On Sun, 2007-01-28 at 14:29 -0800, Andrew Morton wrote:

> As Christoph says, it's very much preferred that code be migrated over to
> kmap_atomic().  Partly because kmap() is deadlockable in situations where a
> large number of threads are trying to take two kmaps at the same time and
> we run out.  This happened in the past, but incidences have gone away,
> probably because of kmap->kmap_atomic conversions.

> From which callsite have you measured problems?

CONFIG_HIGHPTE code in -rt was horrid. I'll do some measurements on
mainline.

> > Index: linux/include/linux/mm.h
> > ===================================================================
> > --- linux.orig/include/linux/mm.h
> > +++ linux/include/linux/mm.h
> > @@ -543,23 +543,39 @@ static __always_inline void *lowmem_page
> >  #endif
> >  
> >  #if defined(WANT_PAGE_VIRTUAL)
> > -#define page_address(page) ((page)->virtual)
> > -#define set_page_address(page, address)			\
> > -	do {						\
> > -		(page)->virtual = (address);		\
> > -	} while(0)
> > -#define page_address_init()  do { } while(0)
> > +/*
> > + * wrap page->virtual so it is safe to set/read locklessly
> > + */
> > +#define page_address(page) \
> > +	({ typeof((page)->virtual) v = (page)->virtual; \
> > +	 smp_read_barrier_depends(); \
> > +	 v; })
> > +
> > +static inline int set_page_address(struct page *page, void *address)
> > +{
> > +	if (address)
> > +		return cmpxchg(&page->virtual, NULL, address) == NULL;
> > +	else {
> > +		/*
> > +		 * cmpxchg is a bit abused because it is not guaranteed
> > +		 * safe wrt direct assignment on all platforms.
> > +		 */
> > +		void *virt = page->virtual;
> > +		return cmpxchg(&page->vitrual, virt, NULL) == virt;
> > +	}
> > +}
> 
> Have you verified that all architectures which can implement
> WANT_PAGE_VIRTUAL also implement cmpxchg?

It might have been my mistaken in understanding the latest cmpxchg
thread. My understanding was that since LL/SC is not exposable as a low
level primitive all platforms should implement a cmpxchg where some
would not be save against direct assignment.

Anyway, I'll do as Nick says and replace it with atomic_long_cmpxchg.

> Have you verified that sufficient headers are included for this to compile
> correctly on all WANT_PAGE_VIRTUAL-enabling architectures on all configs? 
> I don't see asm/system.h being included in mm.h and if I get yet another
> damned it-wont-compile patch I might do something irreversible.

Point taken.

> > +static int pkmap_get_free(void)
> >  {
> > -	unsigned long vaddr;
> > -	int count;
> > +	int i, pos, flush;
> > +	DECLARE_WAITQUEUE(wait, current);
> >  
> > -start:
> > -	count = LAST_PKMAP;
> > -	/* Find an empty entry */
> > -	for (;;) {
> > -		last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
> 
> The old code used masking.
> 
> > -		if (!last_pkmap_nr) {
> > -			flush_all_zero_pkmaps();
> > -			count = LAST_PKMAP;
> > -		}
> > -		if (!pkmap_count[last_pkmap_nr])
> > -			break;	/* Found a usable entry */
> > -		if (--count)
> > -			continue;
> > +restart:
> > +	for (i = 0; i < LAST_PKMAP; i++) {
> > +		pos = atomic_inc_return(&pkmap_hand) % LAST_PKMAP;
> 
> The new code does more-expensive modulus.  Necessary?

I thought GCC would automagically use masking when presented with a
power-of-two constant. Can make it more explicit though.

> > +		flush = pkmap_try_free(pos);
> > +		if (flush >= 0)
> > +			goto got_one;
> > +	}
> > +
> > +	/*
> > +	 * wait for somebody else to unmap their entries
> > +	 */
> > +	__set_current_state(TASK_UNINTERRUPTIBLE);
> > +	add_wait_queue(&pkmap_map_wait, &wait);
> > +	schedule();
> > +	remove_wait_queue(&pkmap_map_wait, &wait);
> 
> This looks wrong.  What happens if everyone else does their unmap between
> the __set_current_state() and the add_wait_queue()?

Eek, you are quite right.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm: remove global locks from mm/highmem.c
  2007-01-28 22:29 ` Andrew Morton
  2007-01-29  2:52   ` Nick Piggin
  2007-01-29  9:44   ` Peter Zijlstra
@ 2007-01-29 19:08   ` Ingo Molnar
  2007-01-29 19:19     ` Hugh Dickins
                       ` (2 more replies)
  2 siblings, 3 replies; 26+ messages in thread
From: Ingo Molnar @ 2007-01-29 19:08 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Peter Zijlstra, linux-kernel, linux-mm


* Andrew Morton <akpm@osdl.org> wrote:

> > Eradicate global locks.
> > 
> >  - kmap_lock is removed by extensive use of atomic_t, a new flush
> >    scheme and modifying set_page_address to only allow NULL<->virt
> >    transitions.

> I really don't recall any performance problems being reported out of 
> that code in recent years.

well, almost nobody profiles 32-bit boxes. I personally always knew that 
kmap() sucks on certain 32-bit SMP workloads (and -rt's scheduling model 
makes such bottlenecks even more apparent) - but many people acted in 
the belief that 64-bit is all that matters and 32-bit scalability is 
obsolete. Here are the numbers that i think changes the picture:

 http://www.fedoraproject.org/awstats/stats/updates-released-fc6-i386.total
 http://www.fedoraproject.org/awstats/stats/updates-released-fc6-x86_64.total

For every 64-bit Fedora box there's more than seven 32-bit boxes. I 
think 32-bit is going to live with us far longer than many thought, so 
we might as well make it work better. Both HIGHMEM and HIGHPTE is the 
default on many distro kernels, which pushes the kmap infrastructure 
quite a bit.

> As Christoph says, it's very much preferred that code be migrated over 
> to kmap_atomic().  Partly because kmap() is deadlockable in situations 
> where a large number of threads are trying to take two kmaps at the 
> same time and we run out.  This happened in the past, but incidences 
> have gone away, probably because of kmap->kmap_atomic conversions.

the problem is that everything that was easy to migrate was migrated off 
kmap() already - and it's exactly those hard cases that cannot be 
converted (like the pagecache use) which is the most frequent kmap() 
users.

While "it would be nice" to eliminate kmap(), but reality is that it's 
here and the patches from Peter to make it (quite a bit) more scalable 
are here as well.

plus, with these fixes kmap() is actually faster than kmap_atomic(). 
(because kunmap_atomic() necessiates an INVLPG instruction which is 
quite slow.)

	Ingo

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm: remove global locks from mm/highmem.c
  2007-01-29 19:08   ` Ingo Molnar
@ 2007-01-29 19:19     ` Hugh Dickins
  2007-01-29 19:53       ` Ingo Molnar
  2007-01-29 20:06     ` Ingo Molnar
  2007-01-30  2:02     ` Nick Piggin
  2 siblings, 1 reply; 26+ messages in thread
From: Hugh Dickins @ 2007-01-29 19:19 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Andrew Morton, Peter Zijlstra, linux-kernel, linux-mm

On Mon, 29 Jan 2007, Ingo Molnar wrote:
> 
> For every 64-bit Fedora box there's more than seven 32-bit boxes. I 
> think 32-bit is going to live with us far longer than many thought, so 
> we might as well make it work better. Both HIGHMEM and HIGHPTE is the 
> default on many distro kernels, which pushes the kmap infrastructure 
> quite a bit.

But HIGHPTE uses kmap_atomic (in mainline: does -rt use kmap there?)

Hugh

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm: remove global locks from mm/highmem.c
  2007-01-29 19:19     ` Hugh Dickins
@ 2007-01-29 19:53       ` Ingo Molnar
  0 siblings, 0 replies; 26+ messages in thread
From: Ingo Molnar @ 2007-01-29 19:53 UTC (permalink / raw)
  To: Hugh Dickins; +Cc: Andrew Morton, Peter Zijlstra, linux-kernel, linux-mm


* Hugh Dickins <hugh@veritas.com> wrote:

> > For every 64-bit Fedora box there's more than seven 32-bit boxes. I 
> > think 32-bit is going to live with us far longer than many thought, 
> > so we might as well make it work better. Both HIGHMEM and HIGHPTE is 
> > the default on many distro kernels, which pushes the kmap 
> > infrastructure quite a bit.
> 
> But HIGHPTE uses kmap_atomic (in mainline: does -rt use kmap there?)

The contention i saw was on mainline and in the pagecache uses of 
kmap(). With HIGHPTE i only meant that typically every available highmem 
option is enabled on 32-bit distro kernel rpms, to make it work on as 
wide selection of hardware as possible. Sometimes PAE is split into a 
separate rpm, but mostly there's just one 32-bit kernel.

	Ingo

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm: remove global locks from mm/highmem.c
  2007-01-29 19:08   ` Ingo Molnar
  2007-01-29 19:19     ` Hugh Dickins
@ 2007-01-29 20:06     ` Ingo Molnar
  2007-01-30  2:02     ` Nick Piggin
  2 siblings, 0 replies; 26+ messages in thread
From: Ingo Molnar @ 2007-01-29 20:06 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Peter Zijlstra, linux-kernel, linux-mm


* Ingo Molnar <mingo@elte.hu> wrote:

> Here are the numbers that i think changes the picture:

i forgot to explain them:

current (estimated) total installed base of 32-bit (i686) Fedora:

>  http://www.fedoraproject.org/awstats/stats/updates-released-fc6-i386.total

current (estimated) total installed base of 64-bit (x86_64) Fedora:

>  http://www.fedoraproject.org/awstats/stats/updates-released-fc6-x86_64.total

current (estimated) total installed base of PPC(64) Fedora:

>  http://www.fedoraproject.org/awstats/stats/updates-released-fc6-ppc.total

these are updated daily i think. The counters started late October 2006, 
when FC6 was released.

	Ingo

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm: remove global locks from mm/highmem.c
  2007-01-29  9:44   ` Peter Zijlstra
@ 2007-01-30  1:31     ` Martin J. Bligh
  2007-01-30  1:41       ` Andrew Morton
  0 siblings, 1 reply; 26+ messages in thread
From: Martin J. Bligh @ 2007-01-30  1:31 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Andrew Morton, linux-kernel, linux-mm, Ingo Molnar

Peter Zijlstra wrote:
> On Sun, 2007-01-28 at 14:29 -0800, Andrew Morton wrote:
> 
>> As Christoph says, it's very much preferred that code be migrated over to
>> kmap_atomic().  Partly because kmap() is deadlockable in situations where a
>> large number of threads are trying to take two kmaps at the same time and
>> we run out.  This happened in the past, but incidences have gone away,
>> probably because of kmap->kmap_atomic conversions.
> 
>> From which callsite have you measured problems?
> 
> CONFIG_HIGHPTE code in -rt was horrid. I'll do some measurements on
> mainline.
> 

CONFIG_HIGHPTE is always horrid -we've known that for years.
Don't use it.

If that's all we're fixing here, I'd be highly suspect ...

M.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm: remove global locks from mm/highmem.c
  2007-01-30  1:31     ` Martin J. Bligh
@ 2007-01-30  1:41       ` Andrew Morton
  2007-01-30  1:49         ` Martin J. Bligh
  0 siblings, 1 reply; 26+ messages in thread
From: Andrew Morton @ 2007-01-30  1:41 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: Peter Zijlstra, linux-kernel, linux-mm, Ingo Molnar

On Mon, 29 Jan 2007 17:31:20 -0800
"Martin J. Bligh" <mbligh@mbligh.org> wrote:

> Peter Zijlstra wrote:
> > On Sun, 2007-01-28 at 14:29 -0800, Andrew Morton wrote:
> > 
> >> As Christoph says, it's very much preferred that code be migrated over to
> >> kmap_atomic().  Partly because kmap() is deadlockable in situations where a
> >> large number of threads are trying to take two kmaps at the same time and
> >> we run out.  This happened in the past, but incidences have gone away,
> >> probably because of kmap->kmap_atomic conversions.
> > 
> >> From which callsite have you measured problems?
> > 
> > CONFIG_HIGHPTE code in -rt was horrid. I'll do some measurements on
> > mainline.
> > 
> 
> CONFIG_HIGHPTE is always horrid -we've known that for years.

We have?  What's wrong with it?  <looks around for bug reports>

> Don't use it.
> 
> If that's all we're fixing here, I'd be highly suspect ...

highpte uses atomic kmaps - it is unrelated to this work.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm: remove global locks from mm/highmem.c
  2007-01-30  1:41       ` Andrew Morton
@ 2007-01-30  1:49         ` Martin J. Bligh
  2007-01-30  2:15           ` Andrew Morton
  0 siblings, 1 reply; 26+ messages in thread
From: Martin J. Bligh @ 2007-01-30  1:49 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Peter Zijlstra, linux-kernel, linux-mm, Ingo Molnar

Andrew Morton wrote:
> On Mon, 29 Jan 2007 17:31:20 -0800
> "Martin J. Bligh" <mbligh@mbligh.org> wrote:
> 
>> Peter Zijlstra wrote:
>>> On Sun, 2007-01-28 at 14:29 -0800, Andrew Morton wrote:
>>>
>>>> As Christoph says, it's very much preferred that code be migrated over to
>>>> kmap_atomic().  Partly because kmap() is deadlockable in situations where a
>>>> large number of threads are trying to take two kmaps at the same time and
>>>> we run out.  This happened in the past, but incidences have gone away,
>>>> probably because of kmap->kmap_atomic conversions.
>>>> From which callsite have you measured problems?
>>> CONFIG_HIGHPTE code in -rt was horrid. I'll do some measurements on
>>> mainline.
>>>
>> CONFIG_HIGHPTE is always horrid -we've known that for years.
> 
> We have?  What's wrong with it?  <looks around for bug reports>

http://www.ussg.iu.edu/hypermail/linux/kernel/0307.0/0463.html

July 2003.


M.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm: remove global locks from mm/highmem.c
  2007-01-29 19:08   ` Ingo Molnar
  2007-01-29 19:19     ` Hugh Dickins
  2007-01-29 20:06     ` Ingo Molnar
@ 2007-01-30  2:02     ` Nick Piggin
  2 siblings, 0 replies; 26+ messages in thread
From: Nick Piggin @ 2007-01-30  2:02 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Andrew Morton, Peter Zijlstra, linux-kernel, linux-mm

Ingo Molnar wrote:

> For every 64-bit Fedora box there's more than seven 32-bit boxes. I 
> think 32-bit is going to live with us far longer than many thought, so 
> we might as well make it work better. Both HIGHMEM and HIGHPTE is the 
> default on many distro kernels, which pushes the kmap infrastructure 
> quite a bit.

I don't think anybody would argue against numbers, but just that there
are not many big 32-bit SMPs anymore. And if Bill Irwin didn't fix the
kmap problem back then, it would be interesting to see a system and
workload where it actually is a bottleneck.

Not that I'm against any patch to improve scalability, if it doesn't
hurt single-threaded performance ;)

> the problem is that everything that was easy to migrate was migrated off 
> kmap() already - and it's exactly those hard cases that cannot be 
> converted (like the pagecache use) which is the most frequent kmap() 
> users.

Which pagecache use? file_read_actor()?

-- 
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com 

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm: remove global locks from mm/highmem.c
  2007-01-30  1:49         ` Martin J. Bligh
@ 2007-01-30  2:15           ` Andrew Morton
  2007-01-31  0:44             ` David Chinner
  0 siblings, 1 reply; 26+ messages in thread
From: Andrew Morton @ 2007-01-30  2:15 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Peter Zijlstra, linux-kernel, linux-mm, Ingo Molnar, David Chinner

On Mon, 29 Jan 2007 17:49:14 -0800
"Martin J. Bligh" <mbligh@mbligh.org> wrote:

> Andrew Morton wrote:
> > On Mon, 29 Jan 2007 17:31:20 -0800
> > "Martin J. Bligh" <mbligh@mbligh.org> wrote:
> > 
> >> Peter Zijlstra wrote:
> >>> On Sun, 2007-01-28 at 14:29 -0800, Andrew Morton wrote:
> >>>
> >>>> As Christoph says, it's very much preferred that code be migrated over to
> >>>> kmap_atomic().  Partly because kmap() is deadlockable in situations where a
> >>>> large number of threads are trying to take two kmaps at the same time and
> >>>> we run out.  This happened in the past, but incidences have gone away,
> >>>> probably because of kmap->kmap_atomic conversions.
> >>>> From which callsite have you measured problems?
> >>> CONFIG_HIGHPTE code in -rt was horrid. I'll do some measurements on
> >>> mainline.
> >>>
> >> CONFIG_HIGHPTE is always horrid -we've known that for years.
> > 
> > We have?  What's wrong with it?  <looks around for bug reports>
> 
> http://www.ussg.iu.edu/hypermail/linux/kernel/0307.0/0463.html

2% overhead for a pte-intensive workload for unknown reasons four years
ago.  Sort of a mini-horrid, no?

We still don't know what is the source of kmap() activity which
necessitated this patch btw.  AFAIK the busiest source is ext2 directories,
but perhaps NFS under certain conditions?

<looks at xfs_iozero>

->prepare_write no longer requires that the caller kmap the page.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm: remove global locks from mm/highmem.c
  2007-01-30  2:15           ` Andrew Morton
@ 2007-01-31  0:44             ` David Chinner
  2007-01-31  1:11               ` Andrew Morton
  0 siblings, 1 reply; 26+ messages in thread
From: David Chinner @ 2007-01-31  0:44 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Martin J. Bligh, Peter Zijlstra, linux-kernel, linux-mm,
	Ingo Molnar, David Chinner

On Mon, Jan 29, 2007 at 06:15:57PM -0800, Andrew Morton wrote:
> We still don't know what is the source of kmap() activity which
> necessitated this patch btw.  AFAIK the busiest source is ext2 directories,
> but perhaps NFS under certain conditions?
> 
> <looks at xfs_iozero>
> 
> ->prepare_write no longer requires that the caller kmap the page.

Agreed, but don't we (xfs_iozero) have to map it first to zero it?

I think what you are saying here, Andrew, is that we can
do something like:

	page = grab_cache_page
	->prepare_write(page)
	kaddr = kmap_atomic(page, KM_USER0)
	memset(kaddr+offset, 0, bytes)
	flush_dcache_page(page)
	kunmap_atomic(kaddr, KM_USER0)
	->commit_write(page)

to avoid using kmap() altogether?

Cheers,

Dave.
-- 
Dave Chinner
Principal Engineer
SGI Australian Software Group

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm: remove global locks from mm/highmem.c
  2007-01-31  0:44             ` David Chinner
@ 2007-01-31  1:11               ` Andrew Morton
  2007-01-31  3:22                 ` David Chinner
  0 siblings, 1 reply; 26+ messages in thread
From: Andrew Morton @ 2007-01-31  1:11 UTC (permalink / raw)
  To: David Chinner
  Cc: Martin J. Bligh, Peter Zijlstra, linux-kernel, linux-mm, Ingo Molnar

On Wed, 31 Jan 2007 11:44:36 +1100
David Chinner <dgc@sgi.com> wrote:

> On Mon, Jan 29, 2007 at 06:15:57PM -0800, Andrew Morton wrote:
> > We still don't know what is the source of kmap() activity which
> > necessitated this patch btw.  AFAIK the busiest source is ext2 directories,
> > but perhaps NFS under certain conditions?
> > 
> > <looks at xfs_iozero>
> > 
> > ->prepare_write no longer requires that the caller kmap the page.
> 
> Agreed, but don't we (xfs_iozero) have to map it first to zero it?
> 
> I think what you are saying here, Andrew, is that we can
> do something like:
> 
> 	page = grab_cache_page
> 	->prepare_write(page)
> 	kaddr = kmap_atomic(page, KM_USER0)
> 	memset(kaddr+offset, 0, bytes)
> 	flush_dcache_page(page)
> 	kunmap_atomic(kaddr, KM_USER0)
> 	->commit_write(page)
> 
> to avoid using kmap() altogether?
> 

Yup.  Even better, use clear_highpage().

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm: remove global locks from mm/highmem.c
  2007-01-31  1:11               ` Andrew Morton
@ 2007-01-31  3:22                 ` David Chinner
  2007-02-02 12:05                   ` Christoph Hellwig
  0 siblings, 1 reply; 26+ messages in thread
From: David Chinner @ 2007-01-31  3:22 UTC (permalink / raw)
  To: Andrew Morton
  Cc: David Chinner, Martin J. Bligh, Peter Zijlstra, linux-kernel,
	linux-mm, Ingo Molnar

On Tue, Jan 30, 2007 at 05:11:32PM -0800, Andrew Morton wrote:
> On Wed, 31 Jan 2007 11:44:36 +1100
> David Chinner <dgc@sgi.com> wrote:
> 
> > On Mon, Jan 29, 2007 at 06:15:57PM -0800, Andrew Morton wrote:
> > > We still don't know what is the source of kmap() activity which
> > > necessitated this patch btw.  AFAIK the busiest source is ext2 directories,
> > > but perhaps NFS under certain conditions?
> > > 
> > > <looks at xfs_iozero>
> > > 
> > > ->prepare_write no longer requires that the caller kmap the page.
> > 
> > Agreed, but don't we (xfs_iozero) have to map it first to zero it?
> > 
> > I think what you are saying here, Andrew, is that we can
> > do something like:
> > 
> > 	page = grab_cache_page
> > 	->prepare_write(page)
> > 	kaddr = kmap_atomic(page, KM_USER0)
> > 	memset(kaddr+offset, 0, bytes)
> > 	flush_dcache_page(page)
> > 	kunmap_atomic(kaddr, KM_USER0)
> > 	->commit_write(page)
> > 
> > to avoid using kmap() altogether?
> 
> Yup.  Even better, use clear_highpage().

For even more goodness, clearmem_highpage_flush() does exactly
the right thing for partial page zeroing ;)

Thanks, Andrew, I've added a patch to my QA tree with this mod.

Cheers,

Dave.
-- 
Dave Chinner
Principal Engineer
SGI Australian Software Group

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm: remove global locks from mm/highmem.c
  2007-01-31  3:22                 ` David Chinner
@ 2007-02-02 12:05                   ` Christoph Hellwig
  2007-02-02 19:24                     ` Matt Mackall
  2007-02-02 23:14                     ` David Chinner
  0 siblings, 2 replies; 26+ messages in thread
From: Christoph Hellwig @ 2007-02-02 12:05 UTC (permalink / raw)
  To: David Chinner
  Cc: Andrew Morton, Martin J. Bligh, Peter Zijlstra, linux-kernel,
	linux-mm, Ingo Molnar

On Wed, Jan 31, 2007 at 02:22:24PM +1100, David Chinner wrote:
> > Yup.  Even better, use clear_highpage().
> 
> For even more goodness, clearmem_highpage_flush() does exactly
> the right thing for partial page zeroing ;)

Note that there are tons of places in buffer.c that could use
clearmem_highpage_flush().  See the so far untested patch below:


Index: linux-2.6/fs/buffer.c
===================================================================
--- linux-2.6.orig/fs/buffer.c	2007-02-02 12:53:51.000000000 +0100
+++ linux-2.6/fs/buffer.c	2007-02-02 12:59:42.000000000 +0100
@@ -1858,13 +1858,8 @@
 		if (block_start >= to)
 			break;
 		if (buffer_new(bh)) {
-			void *kaddr;
-
 			clear_buffer_new(bh);
-			kaddr = kmap_atomic(page, KM_USER0);
-			memset(kaddr+block_start, 0, bh->b_size);
-			flush_dcache_page(page);
-			kunmap_atomic(kaddr, KM_USER0);
+			memclear_highpage_flush(page, block_start, bh->b_size);
 			set_buffer_uptodate(bh);
 			mark_buffer_dirty(bh);
 		}
@@ -1952,10 +1947,8 @@
 					SetPageError(page);
 			}
 			if (!buffer_mapped(bh)) {
-				void *kaddr = kmap_atomic(page, KM_USER0);
-				memset(kaddr + i * blocksize, 0, blocksize);
-				flush_dcache_page(page);
-				kunmap_atomic(kaddr, KM_USER0);
+				memclear_highpage_flush(page, i * blocksize,
+							blocksize);
 				if (!err)
 					set_buffer_uptodate(bh);
 				continue;
@@ -2098,7 +2091,6 @@
 	long status;
 	unsigned zerofrom;
 	unsigned blocksize = 1 << inode->i_blkbits;
-	void *kaddr;
 
 	while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
 		status = -ENOMEM;
@@ -2120,10 +2112,8 @@
 						PAGE_CACHE_SIZE, get_block);
 		if (status)
 			goto out_unmap;
-		kaddr = kmap_atomic(new_page, KM_USER0);
-		memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
-		flush_dcache_page(new_page);
-		kunmap_atomic(kaddr, KM_USER0);
+		memclear_highpage_flush(page, zerofrom,
+					PAGE_CACHE_SIZE - zerofrom);
 		generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
 		unlock_page(new_page);
 		page_cache_release(new_page);
@@ -2150,10 +2140,7 @@
 	if (status)
 		goto out1;
 	if (zerofrom < offset) {
-		kaddr = kmap_atomic(page, KM_USER0);
-		memset(kaddr+zerofrom, 0, offset-zerofrom);
-		flush_dcache_page(page);
-		kunmap_atomic(kaddr, KM_USER0);
+		memclear_highpage_flush(page, zerofrom, offset - zerofrom);
 		__block_commit_write(inode, page, zerofrom, offset);
 	}
 	return 0;
@@ -2368,10 +2355,7 @@
 	 * Error recovery is pretty slack.  Clear the page and mark it dirty
 	 * so we'll later zero out any blocks which _were_ allocated.
 	 */
-	kaddr = kmap_atomic(page, KM_USER0);
-	memset(kaddr, 0, PAGE_CACHE_SIZE);
-	flush_dcache_page(page);
-	kunmap_atomic(kaddr, KM_USER0);
+	memclear_highpage_flush(page, 0, PAGE_CACHE_SIZE);
 	SetPageUptodate(page);
 	set_page_dirty(page);
 	return ret;
@@ -2405,7 +2389,6 @@
 	loff_t i_size = i_size_read(inode);
 	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
 	unsigned offset;
-	void *kaddr;
 	int ret;
 
 	/* Is the page fully inside i_size? */
@@ -2436,10 +2419,7 @@
 	 * the  page size, the remaining memory is zeroed when mapped, and
 	 * writes to that region are not written out to the file."
 	 */
-	kaddr = kmap_atomic(page, KM_USER0);
-	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
-	flush_dcache_page(page);
-	kunmap_atomic(kaddr, KM_USER0);
+	memclear_highpage_flush(page, offset, PAGE_CACHE_SIZE - offset);
 out:
 	ret = mpage_writepage(page, get_block, wbc);
 	if (ret == -EAGAIN)
@@ -2460,7 +2440,6 @@
 	unsigned to;
 	struct page *page;
 	const struct address_space_operations *a_ops = mapping->a_ops;
-	char *kaddr;
 	int ret = 0;
 
 	if ((offset & (blocksize - 1)) == 0)
@@ -2474,10 +2453,7 @@
 	to = (offset + blocksize) & ~(blocksize - 1);
 	ret = a_ops->prepare_write(NULL, page, offset, to);
 	if (ret == 0) {
-		kaddr = kmap_atomic(page, KM_USER0);
-		memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
-		flush_dcache_page(page);
-		kunmap_atomic(kaddr, KM_USER0);
+		memclear_highpage_flush(page, offset, PAGE_CACHE_SIZE - offset);
 		set_page_dirty(page);
 	}
 	unlock_page(page);
@@ -2498,7 +2474,6 @@
 	struct inode *inode = mapping->host;
 	struct page *page;
 	struct buffer_head *bh;
-	void *kaddr;
 	int err;
 
 	blocksize = 1 << inode->i_blkbits;
@@ -2552,11 +2527,7 @@
 			goto unlock;
 	}
 
-	kaddr = kmap_atomic(page, KM_USER0);
-	memset(kaddr + offset, 0, length);
-	flush_dcache_page(page);
-	kunmap_atomic(kaddr, KM_USER0);
-
+	memclear_highpage_flush(page, offset, length);
 	mark_buffer_dirty(bh);
 	err = 0;
 
@@ -2577,7 +2548,6 @@
 	loff_t i_size = i_size_read(inode);
 	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
 	unsigned offset;
-	void *kaddr;
 
 	/* Is the page fully inside i_size? */
 	if (page->index < end_index)
@@ -2603,10 +2573,7 @@
 	 * the  page size, the remaining memory is zeroed when mapped, and
 	 * writes to that region are not written out to the file."
 	 */
-	kaddr = kmap_atomic(page, KM_USER0);
-	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
-	flush_dcache_page(page);
-	kunmap_atomic(kaddr, KM_USER0);
+	memclear_highpage_flush(page, offset, PAGE_CACHE_SIZE - offset);
 	return __block_write_full_page(inode, page, get_block, wbc);
 }
 

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm: remove global locks from mm/highmem.c
  2007-02-02 12:05                   ` Christoph Hellwig
@ 2007-02-02 19:24                     ` Matt Mackall
  2007-02-02 23:16                       ` David Chinner
  2007-02-02 23:14                     ` David Chinner
  1 sibling, 1 reply; 26+ messages in thread
From: Matt Mackall @ 2007-02-02 19:24 UTC (permalink / raw)
  To: Christoph Hellwig, David Chinner, Andrew Morton, Martin J. Bligh,
	Peter Zijlstra, linux-kernel, linux-mm, Ingo Molnar

On Fri, Feb 02, 2007 at 12:05:11PM +0000, Christoph Hellwig wrote:
> On Wed, Jan 31, 2007 at 02:22:24PM +1100, David Chinner wrote:
> > > Yup.  Even better, use clear_highpage().
> > 
> > For even more goodness, clearmem_highpage_flush() does exactly
> > the right thing for partial page zeroing ;)
> 
> Note that there are tons of places in buffer.c that could use
> clearmem_highpage_flush().  See the so far untested patch below:
> 

You probably need s/memclear/clearmem/g..
 
> Index: linux-2.6/fs/buffer.c
> ===================================================================
> --- linux-2.6.orig/fs/buffer.c	2007-02-02 12:53:51.000000000 +0100
> +++ linux-2.6/fs/buffer.c	2007-02-02 12:59:42.000000000 +0100
> @@ -1858,13 +1858,8 @@
>  		if (block_start >= to)
>  			break;
>  		if (buffer_new(bh)) {
> -			void *kaddr;
> -
>  			clear_buffer_new(bh);
> -			kaddr = kmap_atomic(page, KM_USER0);
> -			memset(kaddr+block_start, 0, bh->b_size);
> -			flush_dcache_page(page);
> -			kunmap_atomic(kaddr, KM_USER0);
> +			memclear_highpage_flush(page, block_start, bh->b_size);
>  			set_buffer_uptodate(bh);
>  			mark_buffer_dirty(bh);
>  		}
> @@ -1952,10 +1947,8 @@
>  					SetPageError(page);
>  			}
>  			if (!buffer_mapped(bh)) {
> -				void *kaddr = kmap_atomic(page, KM_USER0);
> -				memset(kaddr + i * blocksize, 0, blocksize);
> -				flush_dcache_page(page);
> -				kunmap_atomic(kaddr, KM_USER0);
> +				memclear_highpage_flush(page, i * blocksize,
> +							blocksize);
>  				if (!err)
>  					set_buffer_uptodate(bh);
>  				continue;
> @@ -2098,7 +2091,6 @@
>  	long status;
>  	unsigned zerofrom;
>  	unsigned blocksize = 1 << inode->i_blkbits;
> -	void *kaddr;
>  
>  	while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
>  		status = -ENOMEM;
> @@ -2120,10 +2112,8 @@
>  						PAGE_CACHE_SIZE, get_block);
>  		if (status)
>  			goto out_unmap;
> -		kaddr = kmap_atomic(new_page, KM_USER0);
> -		memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
> -		flush_dcache_page(new_page);
> -		kunmap_atomic(kaddr, KM_USER0);
> +		memclear_highpage_flush(page, zerofrom,
> +					PAGE_CACHE_SIZE - zerofrom);
>  		generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
>  		unlock_page(new_page);
>  		page_cache_release(new_page);
> @@ -2150,10 +2140,7 @@
>  	if (status)
>  		goto out1;
>  	if (zerofrom < offset) {
> -		kaddr = kmap_atomic(page, KM_USER0);
> -		memset(kaddr+zerofrom, 0, offset-zerofrom);
> -		flush_dcache_page(page);
> -		kunmap_atomic(kaddr, KM_USER0);
> +		memclear_highpage_flush(page, zerofrom, offset - zerofrom);
>  		__block_commit_write(inode, page, zerofrom, offset);
>  	}
>  	return 0;
> @@ -2368,10 +2355,7 @@
>  	 * Error recovery is pretty slack.  Clear the page and mark it dirty
>  	 * so we'll later zero out any blocks which _were_ allocated.
>  	 */
> -	kaddr = kmap_atomic(page, KM_USER0);
> -	memset(kaddr, 0, PAGE_CACHE_SIZE);
> -	flush_dcache_page(page);
> -	kunmap_atomic(kaddr, KM_USER0);
> +	memclear_highpage_flush(page, 0, PAGE_CACHE_SIZE);
>  	SetPageUptodate(page);
>  	set_page_dirty(page);
>  	return ret;
> @@ -2405,7 +2389,6 @@
>  	loff_t i_size = i_size_read(inode);
>  	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
>  	unsigned offset;
> -	void *kaddr;
>  	int ret;
>  
>  	/* Is the page fully inside i_size? */
> @@ -2436,10 +2419,7 @@
>  	 * the  page size, the remaining memory is zeroed when mapped, and
>  	 * writes to that region are not written out to the file."
>  	 */
> -	kaddr = kmap_atomic(page, KM_USER0);
> -	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
> -	flush_dcache_page(page);
> -	kunmap_atomic(kaddr, KM_USER0);
> +	memclear_highpage_flush(page, offset, PAGE_CACHE_SIZE - offset);
>  out:
>  	ret = mpage_writepage(page, get_block, wbc);
>  	if (ret == -EAGAIN)
> @@ -2460,7 +2440,6 @@
>  	unsigned to;
>  	struct page *page;
>  	const struct address_space_operations *a_ops = mapping->a_ops;
> -	char *kaddr;
>  	int ret = 0;
>  
>  	if ((offset & (blocksize - 1)) == 0)
> @@ -2474,10 +2453,7 @@
>  	to = (offset + blocksize) & ~(blocksize - 1);
>  	ret = a_ops->prepare_write(NULL, page, offset, to);
>  	if (ret == 0) {
> -		kaddr = kmap_atomic(page, KM_USER0);
> -		memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
> -		flush_dcache_page(page);
> -		kunmap_atomic(kaddr, KM_USER0);
> +		memclear_highpage_flush(page, offset, PAGE_CACHE_SIZE - offset);
>  		set_page_dirty(page);
>  	}
>  	unlock_page(page);
> @@ -2498,7 +2474,6 @@
>  	struct inode *inode = mapping->host;
>  	struct page *page;
>  	struct buffer_head *bh;
> -	void *kaddr;
>  	int err;
>  
>  	blocksize = 1 << inode->i_blkbits;
> @@ -2552,11 +2527,7 @@
>  			goto unlock;
>  	}
>  
> -	kaddr = kmap_atomic(page, KM_USER0);
> -	memset(kaddr + offset, 0, length);
> -	flush_dcache_page(page);
> -	kunmap_atomic(kaddr, KM_USER0);
> -
> +	memclear_highpage_flush(page, offset, length);
>  	mark_buffer_dirty(bh);
>  	err = 0;
>  
> @@ -2577,7 +2548,6 @@
>  	loff_t i_size = i_size_read(inode);
>  	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
>  	unsigned offset;
> -	void *kaddr;
>  
>  	/* Is the page fully inside i_size? */
>  	if (page->index < end_index)
> @@ -2603,10 +2573,7 @@
>  	 * the  page size, the remaining memory is zeroed when mapped, and
>  	 * writes to that region are not written out to the file."
>  	 */
> -	kaddr = kmap_atomic(page, KM_USER0);
> -	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
> -	flush_dcache_page(page);
> -	kunmap_atomic(kaddr, KM_USER0);
> +	memclear_highpage_flush(page, offset, PAGE_CACHE_SIZE - offset);
>  	return __block_write_full_page(inode, page, get_block, wbc);
>  }
>  
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

-- 
Mathematics is the supreme nostalgia of our time.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm: remove global locks from mm/highmem.c
  2007-02-02 12:05                   ` Christoph Hellwig
  2007-02-02 19:24                     ` Matt Mackall
@ 2007-02-02 23:14                     ` David Chinner
  1 sibling, 0 replies; 26+ messages in thread
From: David Chinner @ 2007-02-02 23:14 UTC (permalink / raw)
  To: Christoph Hellwig, David Chinner, Andrew Morton, Martin J. Bligh,
	Peter Zijlstra, linux-kernel, linux-mm, Ingo Molnar

On Fri, Feb 02, 2007 at 12:05:11PM +0000, Christoph Hellwig wrote:
> On Wed, Jan 31, 2007 at 02:22:24PM +1100, David Chinner wrote:
> > > Yup.  Even better, use clear_highpage().
> > 
> > For even more goodness, clearmem_highpage_flush() does exactly
> > the right thing for partial page zeroing ;)
> 
> Note that there are tons of places in buffer.c that could use
> clearmem_highpage_flush().  See the so far untested patch below:

Runs through XFSQA just fine. Looks good to me, Christoph.

Cheers,

Dave.
-- 
Dave Chinner
Principal Engineer
SGI Australian Software Group

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm: remove global locks from mm/highmem.c
  2007-02-02 19:24                     ` Matt Mackall
@ 2007-02-02 23:16                       ` David Chinner
  0 siblings, 0 replies; 26+ messages in thread
From: David Chinner @ 2007-02-02 23:16 UTC (permalink / raw)
  To: Matt Mackall
  Cc: Christoph Hellwig, David Chinner, Andrew Morton, Martin J. Bligh,
	Peter Zijlstra, linux-kernel, linux-mm, Ingo Molnar

On Fri, Feb 02, 2007 at 01:24:40PM -0600, Matt Mackall wrote:
> On Fri, Feb 02, 2007 at 12:05:11PM +0000, Christoph Hellwig wrote:
> > On Wed, Jan 31, 2007 at 02:22:24PM +1100, David Chinner wrote:
> > > > Yup.  Even better, use clear_highpage().
> > > 
> > > For even more goodness, clearmem_highpage_flush() does exactly
> > > the right thing for partial page zeroing ;)
> > 
> > Note that there are tons of places in buffer.c that could use
> > clearmem_highpage_flush().  See the so far untested patch below:
> > 
> 
> You probably need s/memclear/clearmem/g..

Not needed - as usual, the code is right and the comments
are wrong. ;)

Cheers,

Dave.
-- 
Dave Chinner
Principal Engineer
SGI Australian Software Group

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2007-02-02 23:17 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-01-28 14:11 [PATCH] mm: remove global locks from mm/highmem.c Peter Zijlstra
2007-01-28 14:49 ` Christoph Hellwig
2007-01-28 15:17   ` Ingo Molnar
2007-01-28 15:28     ` Christoph Hellwig
2007-01-28 15:48       ` Ingo Molnar
2007-01-28 15:54         ` Christoph Hellwig
2007-01-28 18:19           ` Ingo Molnar
2007-01-28 22:29 ` Andrew Morton
2007-01-29  2:52   ` Nick Piggin
2007-01-29  9:44   ` Peter Zijlstra
2007-01-30  1:31     ` Martin J. Bligh
2007-01-30  1:41       ` Andrew Morton
2007-01-30  1:49         ` Martin J. Bligh
2007-01-30  2:15           ` Andrew Morton
2007-01-31  0:44             ` David Chinner
2007-01-31  1:11               ` Andrew Morton
2007-01-31  3:22                 ` David Chinner
2007-02-02 12:05                   ` Christoph Hellwig
2007-02-02 19:24                     ` Matt Mackall
2007-02-02 23:16                       ` David Chinner
2007-02-02 23:14                     ` David Chinner
2007-01-29 19:08   ` Ingo Molnar
2007-01-29 19:19     ` Hugh Dickins
2007-01-29 19:53       ` Ingo Molnar
2007-01-29 20:06     ` Ingo Molnar
2007-01-30  2:02     ` Nick Piggin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).