LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [patch 0/5] some mm improvements + s390 tlb flush.
@ 2007-07-03 11:18 Martin Schwidefsky
  2007-07-03 11:18 ` [patch 1/5] avoid tlb gather restarts Martin Schwidefsky
                   ` (4 more replies)
  0 siblings, 5 replies; 15+ messages in thread
From: Martin Schwidefsky @ 2007-07-03 11:18 UTC (permalink / raw)
  To: akpm, hugh, peterz; +Cc: linux-kernel, linux-mm

I have updated my mm patch set. Changes since the previous version:

 - The tlb gather restart patch has been changed as discussed with Hugh.
 - The ptep_establish patch now really removes all traces of the call.
 - The mm_struct / vm_area_struct move to mm_types.h has been test
   compiled on a number of architectures.
 - The page_mkclean_one patch has been dropped because it doesn't seem
   to have any effect.
 - There is a new patch to fix a theoretical architectural problem on
   s390. The patch is required for the 1K/2K page tables for KVM as well.

-- 
blue skies,
   Martin.

"Reality continues to ruin my life." - Calvin.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* [patch 1/5] avoid tlb gather restarts.
  2007-07-03 11:18 [patch 0/5] some mm improvements + s390 tlb flush Martin Schwidefsky
@ 2007-07-03 11:18 ` Martin Schwidefsky
  2007-07-03 17:42   ` Hugh Dickins
  2007-07-16  6:20   ` Andrew Morton
  2007-07-03 11:18 ` [patch 2/5] remove ptep_establish Martin Schwidefsky
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 15+ messages in thread
From: Martin Schwidefsky @ 2007-07-03 11:18 UTC (permalink / raw)
  To: akpm, hugh, peterz; +Cc: linux-kernel, linux-mm, Martin Schwidefsky

[-- Attachment #1: 001-flush-restarts.diff --]
[-- Type: text/plain, Size: 4138 bytes --]

From: Martin Schwidefsky <schwidefsky@de.ibm.com>

If need_resched() is false in the inner loop of unmap_vmas it is
unnecessary to do a full blown tlb_finish_mmu / tlb_gather_mmu for
each ZAP_BLOCK_SIZE ptes. Do a tlb_flush_mmu() instead. That gives
architectures with a non-generic tlb flush implementation room for
optimization. The tlb_flush_mmu primitive is available with the
generic tlb flush code; on ia64 the ia64_tlb_flush_mmu function needs
to be renamed, and a dummy function is added to arm and arm26.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---

 include/asm-arm/tlb.h   |    5 +++++
 include/asm-arm26/tlb.h |    5 +++++
 include/asm-ia64/tlb.h  |    6 +++---
 mm/memory.c             |   16 ++++++----------
 4 files changed, 19 insertions(+), 13 deletions(-)

diff -urpN linux-2.6/include/asm-arm/tlb.h linux-2.6-patched/include/asm-arm/tlb.h
--- linux-2.6/include/asm-arm/tlb.h	2006-11-08 10:45:43.000000000 +0100
+++ linux-2.6-patched/include/asm-arm/tlb.h	2007-07-03 12:56:46.000000000 +0200
@@ -52,6 +52,11 @@ tlb_gather_mmu(struct mm_struct *mm, uns
 }
 
 static inline void
+tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
+{
+}
+
+static inline void
 tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 {
 	if (tlb->fullmm)
diff -urpN linux-2.6/include/asm-arm26/tlb.h linux-2.6-patched/include/asm-arm26/tlb.h
--- linux-2.6/include/asm-arm26/tlb.h	2006-11-08 10:45:43.000000000 +0100
+++ linux-2.6-patched/include/asm-arm26/tlb.h	2007-07-03 12:56:46.000000000 +0200
@@ -29,6 +29,11 @@ tlb_gather_mmu(struct mm_struct *mm, uns
 }
 
 static inline void
+tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
+{
+}
+
+static inline void
 tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 {
         if (tlb->need_flush)
diff -urpN linux-2.6/include/asm-ia64/tlb.h linux-2.6-patched/include/asm-ia64/tlb.h
--- linux-2.6/include/asm-ia64/tlb.h	2006-11-08 10:45:45.000000000 +0100
+++ linux-2.6-patched/include/asm-ia64/tlb.h	2007-07-03 12:56:46.000000000 +0200
@@ -72,7 +72,7 @@ DECLARE_PER_CPU(struct mmu_gather, mmu_g
  * freed pages that where gathered up to this point.
  */
 static inline void
-ia64_tlb_flush_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end)
+tlb_flush_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end)
 {
 	unsigned int nr;
 
@@ -160,7 +160,7 @@ tlb_finish_mmu (struct mmu_gather *tlb, 
 	 * Note: tlb->nr may be 0 at this point, so we can't rely on tlb->start_addr and
 	 * tlb->end_addr.
 	 */
-	ia64_tlb_flush_mmu(tlb, start, end);
+	tlb_flush_mmu(tlb, start, end);
 
 	/* keep the page table cache within bounds */
 	check_pgt_cache();
@@ -184,7 +184,7 @@ tlb_remove_page (struct mmu_gather *tlb,
 	}
 	tlb->pages[tlb->nr++] = page;
 	if (tlb->nr >= FREE_PTE_NR)
-		ia64_tlb_flush_mmu(tlb, tlb->start_addr, tlb->end_addr);
+		tlb_flush_mmu(tlb, tlb->start_addr, tlb->end_addr);
 }
 
 /*
diff -urpN linux-2.6/mm/memory.c linux-2.6-patched/mm/memory.c
--- linux-2.6/mm/memory.c	2007-06-18 09:43:22.000000000 +0200
+++ linux-2.6-patched/mm/memory.c	2007-07-03 12:56:46.000000000 +0200
@@ -853,18 +853,15 @@ unsigned long unmap_vmas(struct mmu_gath
 				break;
 			}
 
-			tlb_finish_mmu(*tlbp, tlb_start, start);
-
 			if (need_resched() ||
 				(i_mmap_lock && need_lockbreak(i_mmap_lock))) {
-				if (i_mmap_lock) {
-					*tlbp = NULL;
+				if (i_mmap_lock)
 					goto out;
-				}
+				tlb_finish_mmu(*tlbp, tlb_start, start);
 				cond_resched();
-			}
-
-			*tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
+				*tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
+			} else
+				tlb_flush_mmu(*tlbp, tlb_start, start);
 			tlb_start_valid = 0;
 			zap_work = ZAP_BLOCK_SIZE;
 		}
@@ -892,8 +889,7 @@ unsigned long zap_page_range(struct vm_a
 	tlb = tlb_gather_mmu(mm, 0);
 	update_hiwater_rss(mm);
 	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
-	if (tlb)
-		tlb_finish_mmu(tlb, address, end);
+	tlb_finish_mmu(tlb, address, end);
 	return end;
 }
 

-- 
blue skies,
   Martin.

"Reality continues to ruin my life." - Calvin.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* [patch 2/5] remove ptep_establish.
  2007-07-03 11:18 [patch 0/5] some mm improvements + s390 tlb flush Martin Schwidefsky
  2007-07-03 11:18 ` [patch 1/5] avoid tlb gather restarts Martin Schwidefsky
@ 2007-07-03 11:18 ` Martin Schwidefsky
  2007-07-03 11:18 ` [patch 3/5] remove ptep_test_and_clear_dirty and ptep_clear_flush_dirty Martin Schwidefsky
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 15+ messages in thread
From: Martin Schwidefsky @ 2007-07-03 11:18 UTC (permalink / raw)
  To: akpm, hugh, peterz; +Cc: linux-kernel, linux-mm, Martin Schwidefsky

[-- Attachment #1: 002-ptep-establish.diff --]
[-- Type: text/plain, Size: 6546 bytes --]

From: Martin Schwidefsky <schwidefsky@de.ibm.com>

The last user of ptep_establish in mm/ is long gone. Remove the
architecture primitive as well.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---

 include/asm-arm/pgtable.h     |    6 ++---
 include/asm-generic/pgtable.h |   19 ------------------
 include/asm-i386/pgtable.h    |   11 ----------
 include/asm-ia64/pgtable.h    |    6 +++--
 include/asm-s390/pgtable.h    |   43 ++++++++++++++++++------------------------
 5 files changed, 26 insertions(+), 59 deletions(-)

diff -urpN linux-2.6/include/asm-arm/pgtable.h linux-2.6-patched/include/asm-arm/pgtable.h
--- linux-2.6/include/asm-arm/pgtable.h	2007-05-09 09:58:15.000000000 +0200
+++ linux-2.6-patched/include/asm-arm/pgtable.h	2007-07-03 12:56:47.000000000 +0200
@@ -83,14 +83,14 @@
  * means that a write to a clean page will cause a permission fault, and
  * the Linux MM layer will mark the page dirty via handle_pte_fault().
  * For the hardware to notice the permission change, the TLB entry must
- * be flushed, and ptep_establish() does that for us.
+ * be flushed, and ptep_set_access_flags() does that for us.
  *
  * The "accessed" or "young" bit is emulated by a similar method; we only
  * allow accesses to the page if the "young" bit is set.  Accesses to the
  * page will cause a fault, and handle_pte_fault() will set the young bit
  * for us as long as the page is marked present in the corresponding Linux
- * PTE entry.  Again, ptep_establish() will ensure that the TLB is up to
- * date.
+ * PTE entry.  Again, ptep_set_access_flags() will ensure that the TLB is
+ * up to date.
  *
  * However, when the "young" bit is cleared, we deny access to the page
  * by clearing the hardware PTE.  Currently Linux does not flush the TLB
diff -urpN linux-2.6/include/asm-generic/pgtable.h linux-2.6-patched/include/asm-generic/pgtable.h
--- linux-2.6/include/asm-generic/pgtable.h	2007-06-18 09:43:22.000000000 +0200
+++ linux-2.6-patched/include/asm-generic/pgtable.h	2007-07-03 12:56:47.000000000 +0200
@@ -3,25 +3,6 @@
 
 #ifndef __ASSEMBLY__
 
-#ifndef __HAVE_ARCH_PTEP_ESTABLISH
-/*
- * Establish a new mapping:
- *  - flush the old one
- *  - update the page tables
- *  - inform the TLB about the new one
- *
- * We hold the mm semaphore for reading, and the pte lock.
- *
- * Note: the old pte is known to not be writable, so we don't need to
- * worry about dirty bits etc getting lost.
- */
-#define ptep_establish(__vma, __address, __ptep, __entry)		\
-do {				  					\
-	set_pte_at((__vma)->vm_mm, (__address), __ptep, __entry);	\
-	flush_tlb_page(__vma, __address);				\
-} while (0)
-#endif
-
 #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
 /*
  * Largely same as above, but only sets the access flags (dirty,
diff -urpN linux-2.6/include/asm-i386/pgtable.h linux-2.6-patched/include/asm-i386/pgtable.h
--- linux-2.6/include/asm-i386/pgtable.h	2007-06-18 09:43:22.000000000 +0200
+++ linux-2.6-patched/include/asm-i386/pgtable.h	2007-07-03 12:56:47.000000000 +0200
@@ -317,17 +317,6 @@ static inline pte_t native_local_ptep_ge
 	__ret;								\
 })
 
-/*
- * Rules for using ptep_establish: the pte MUST be a user pte, and
- * must be a present->present transition.
- */
-#define __HAVE_ARCH_PTEP_ESTABLISH
-#define ptep_establish(vma, address, ptep, pteval)			\
-do {									\
-	set_pte_present((vma)->vm_mm, address, ptep, pteval);		\
-	flush_tlb_page(vma, address);					\
-} while (0)
-
 #define __HAVE_ARCH_PTEP_CLEAR_DIRTY_FLUSH
 #define ptep_clear_flush_dirty(vma, address, ptep)			\
 ({									\
diff -urpN linux-2.6/include/asm-ia64/pgtable.h linux-2.6-patched/include/asm-ia64/pgtable.h
--- linux-2.6/include/asm-ia64/pgtable.h	2007-06-18 09:43:22.000000000 +0200
+++ linux-2.6-patched/include/asm-ia64/pgtable.h	2007-07-03 12:56:47.000000000 +0200
@@ -546,8 +546,10 @@ extern void lazy_mmu_prot_update (pte_t 
 # define ptep_set_access_flags(__vma, __addr, __ptep, __entry, __safely_writable) \
 ({									\
 	int __changed = !pte_same(*(__ptep), __entry);			\
-	if (__changed)							\
-		ptep_establish(__vma, __addr, __ptep, __entry);		\
+	if (__changed) {						\
+		set_pte_at((__vma)->vm_mm, (__addr), __ptep, __entry);	\
+		flush_tlb_page(__vma, __addr);				\
+	}								\
 	__changed;							\
 })
 #endif
diff -urpN linux-2.6/include/asm-s390/pgtable.h linux-2.6-patched/include/asm-s390/pgtable.h
--- linux-2.6/include/asm-s390/pgtable.h	2007-06-18 09:43:22.000000000 +0200
+++ linux-2.6-patched/include/asm-s390/pgtable.h	2007-07-03 12:56:47.000000000 +0200
@@ -715,16 +715,19 @@ static inline void __ptep_ipte(unsigned 
 	pte_val(*ptep) = _PAGE_TYPE_EMPTY;
 }
 
-static inline pte_t
-ptep_clear_flush(struct vm_area_struct *vma,
-		 unsigned long address, pte_t *ptep)
+static inline void ptep_invalidate(unsigned long address, pte_t *ptep)
 {
-	pte_t pte = *ptep;
-	pte_t *shadow_pte = get_shadow_pte(ptep);
-
 	__ptep_ipte(address, ptep);
-	if (shadow_pte)
-		__ptep_ipte(address, shadow_pte);
+	ptep = get_shadow_pte(ptep);
+	if (ptep)
+		__ptep_ipte(address, ptep);
+}
+
+static inline pte_t ptep_clear_flush(struct vm_area_struct *vma,
+				     unsigned long address, pte_t *ptep)
+{
+	pte_t pte = *ptep;
+	ptep_invalidate(address, ptep);
 	return pte;
 }
 
@@ -734,21 +737,14 @@ static inline void ptep_set_wrprotect(st
 	set_pte_at(mm, addr, ptep, pte_wrprotect(old_pte));
 }
 
-static inline void
-ptep_establish(struct vm_area_struct *vma, 
-	       unsigned long address, pte_t *ptep,
-	       pte_t entry)
-{
-	ptep_clear_flush(vma, address, ptep);
-	set_pte(ptep, entry);
-}
-
-#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
-({									  \
-	int __changed = !pte_same(*(__ptep), __entry);			  \
-	if (__changed)							  \
-		ptep_establish(__vma, __address, __ptep, __entry);	  \
-	__changed;							  \
+#define ptep_set_access_flags(__vma, __addr, __ptep, __entry, __dirty)	\
+({									\
+	int __changed = !pte_same(*(__ptep), __entry);			\
+	if (__changed) {						\
+		ptep_invalidate(__addr, __ptep);			\
+		set_pte_at((__vma)->vm_mm, __addr, __ptep, __entry);	\
+	}								\
+	__changed;							\
 })
 
 /*
@@ -948,7 +944,6 @@ extern int remove_shared_memory(unsigned
 #define __HAVE_ARCH_MEMMAP_INIT
 extern void memmap_init(unsigned long, int, unsigned long, unsigned long);
 
-#define __HAVE_ARCH_PTEP_ESTABLISH
 #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
 #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH

-- 
blue skies,
   Martin.

"Reality continues to ruin my life." - Calvin.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* [patch 3/5] remove ptep_test_and_clear_dirty and ptep_clear_flush_dirty.
  2007-07-03 11:18 [patch 0/5] some mm improvements + s390 tlb flush Martin Schwidefsky
  2007-07-03 11:18 ` [patch 1/5] avoid tlb gather restarts Martin Schwidefsky
  2007-07-03 11:18 ` [patch 2/5] remove ptep_establish Martin Schwidefsky
@ 2007-07-03 11:18 ` Martin Schwidefsky
  2007-07-03 11:18 ` [patch 4/5] move mm_struct and vm_area_struct Martin Schwidefsky
  2007-07-03 11:18 ` [patch 5/5] s390 tlb flush fix Martin Schwidefsky
  4 siblings, 0 replies; 15+ messages in thread
From: Martin Schwidefsky @ 2007-07-03 11:18 UTC (permalink / raw)
  To: akpm, hugh, peterz; +Cc: linux-kernel, linux-mm, Martin Schwidefsky

[-- Attachment #1: 003-ptep-clear-dirty.diff --]
[-- Type: text/plain, Size: 14486 bytes --]

From: Martin Schwidefsky <schwidefsky@de.ibm.com>

Nobody is using ptep_test_and_clear_dirty and ptep_clear_flush_dirty.
Remove the functions from all architectures.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---

 include/asm-frv/pgtable.h           |    8 --------
 include/asm-generic/pgtable.h       |   25 -------------------------
 include/asm-i386/pgtable.h          |   21 ---------------------
 include/asm-ia64/pgtable.h          |   17 -----------------
 include/asm-m32r/pgtable.h          |    6 ------
 include/asm-parisc/pgtable.h        |   16 ----------------
 include/asm-powerpc/pgtable-ppc32.h |    7 -------
 include/asm-powerpc/pgtable-ppc64.h |   31 -------------------------------
 include/asm-ppc/pgtable.h           |    7 -------
 include/asm-s390/pgtable.h          |   15 ---------------
 include/asm-x86_64/pgtable.h        |    8 --------
 include/asm-xtensa/pgtable.h        |   12 ------------
 12 files changed, 173 deletions(-)

diff -urpN linux-2.6/include/asm-frv/pgtable.h linux-2.6-patched/include/asm-frv/pgtable.h
--- linux-2.6/include/asm-frv/pgtable.h	2007-07-02 08:45:46.000000000 +0200
+++ linux-2.6-patched/include/asm-frv/pgtable.h	2007-07-03 12:56:49.000000000 +0200
@@ -394,13 +394,6 @@ static inline pte_t pte_mkdirty(pte_t pt
 static inline pte_t pte_mkyoung(pte_t pte)	{ (pte).pte |= _PAGE_ACCESSED; return pte; }
 static inline pte_t pte_mkwrite(pte_t pte)	{ (pte).pte &= ~_PAGE_WP; return pte; }
 
-static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
-{
-	int i = test_and_clear_bit(_PAGE_BIT_DIRTY, ptep);
-	asm volatile("dcf %M0" :: "U"(*ptep));
-	return i;
-}
-
 static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
 {
 	int i = test_and_clear_bit(_PAGE_BIT_ACCESSED, ptep);
@@ -510,7 +503,6 @@ static inline int pte_file(pte_t pte)
 		remap_pfn_range(vma, vaddr, pfn, size, prot)
 
 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
 #define __HAVE_ARCH_PTEP_SET_WRPROTECT
 #define __HAVE_ARCH_PTE_SAME
diff -urpN linux-2.6/include/asm-generic/pgtable.h linux-2.6-patched/include/asm-generic/pgtable.h
--- linux-2.6/include/asm-generic/pgtable.h	2007-07-03 12:56:49.000000000 +0200
+++ linux-2.6-patched/include/asm-generic/pgtable.h	2007-07-03 12:56:49.000000000 +0200
@@ -49,31 +49,6 @@
 })
 #endif
 
-#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
-#define ptep_test_and_clear_dirty(__vma, __address, __ptep)		\
-({									\
-	pte_t __pte = *__ptep;						\
-	int r = 1;							\
-	if (!pte_dirty(__pte))						\
-		r = 0;							\
-	else								\
-		set_pte_at((__vma)->vm_mm, (__address), (__ptep),	\
-			   pte_mkclean(__pte));				\
-	r;								\
-})
-#endif
-
-#ifndef __HAVE_ARCH_PTEP_CLEAR_DIRTY_FLUSH
-#define ptep_clear_flush_dirty(__vma, __address, __ptep)		\
-({									\
-	int __dirty;							\
-	__dirty = ptep_test_and_clear_dirty(__vma, __address, __ptep);	\
-	if (__dirty)							\
-		flush_tlb_page(__vma, __address);			\
-	__dirty;							\
-})
-#endif
-
 #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
 #define ptep_get_and_clear(__mm, __address, __ptep)			\
 ({									\
diff -urpN linux-2.6/include/asm-i386/pgtable.h linux-2.6-patched/include/asm-i386/pgtable.h
--- linux-2.6/include/asm-i386/pgtable.h	2007-07-03 12:56:49.000000000 +0200
+++ linux-2.6-patched/include/asm-i386/pgtable.h	2007-07-03 12:56:49.000000000 +0200
@@ -295,17 +295,6 @@ static inline pte_t native_local_ptep_ge
 	__changed;							\
 })
 
-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
-#define ptep_test_and_clear_dirty(vma, addr, ptep) ({			\
-	int __ret = 0;							\
-	if (pte_dirty(*(ptep)))						\
-		__ret = test_and_clear_bit(_PAGE_BIT_DIRTY,		\
-						&(ptep)->pte_low);	\
-	if (__ret)							\
-		pte_update((vma)->vm_mm, addr, ptep);			\
-	__ret;								\
-})
-
 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
 #define ptep_test_and_clear_young(vma, addr, ptep) ({			\
 	int __ret = 0;							\
@@ -317,16 +306,6 @@ static inline pte_t native_local_ptep_ge
 	__ret;								\
 })
 
-#define __HAVE_ARCH_PTEP_CLEAR_DIRTY_FLUSH
-#define ptep_clear_flush_dirty(vma, address, ptep)			\
-({									\
-	int __dirty;							\
-	__dirty = ptep_test_and_clear_dirty((vma), (address), (ptep));	\
-	if (__dirty)							\
-		flush_tlb_page(vma, address);				\
-	__dirty;							\
-})
-
 #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
 #define ptep_clear_flush_young(vma, address, ptep)			\
 ({									\
diff -urpN linux-2.6/include/asm-ia64/pgtable.h linux-2.6-patched/include/asm-ia64/pgtable.h
--- linux-2.6/include/asm-ia64/pgtable.h	2007-07-03 12:56:49.000000000 +0200
+++ linux-2.6-patched/include/asm-ia64/pgtable.h	2007-07-03 12:56:49.000000000 +0200
@@ -398,22 +398,6 @@ ptep_test_and_clear_young (struct vm_are
 #endif
 }
 
-static inline int
-ptep_test_and_clear_dirty (struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
-{
-#ifdef CONFIG_SMP
-	if (!pte_dirty(*ptep))
-		return 0;
-	return test_and_clear_bit(_PAGE_D_BIT, ptep);
-#else
-	pte_t pte = *ptep;
-	if (!pte_dirty(pte))
-		return 0;
-	set_pte_at(vma->vm_mm, addr, ptep, pte_mkclean(pte));
-	return 1;
-#endif
-}
-
 static inline pte_t
 ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
@@ -593,7 +577,6 @@ extern void lazy_mmu_prot_update (pte_t 
 #endif
 
 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
 #define __HAVE_ARCH_PTEP_SET_WRPROTECT
 #define __HAVE_ARCH_PTE_SAME
diff -urpN linux-2.6/include/asm-m32r/pgtable.h linux-2.6-patched/include/asm-m32r/pgtable.h
--- linux-2.6/include/asm-m32r/pgtable.h	2007-05-12 20:16:10.000000000 +0200
+++ linux-2.6-patched/include/asm-m32r/pgtable.h	2007-07-03 12:56:49.000000000 +0200
@@ -284,11 +284,6 @@ static inline pte_t pte_mkwrite(pte_t pt
 	return pte;
 }
 
-static inline  int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
-{
-	return test_and_clear_bit(_PAGE_BIT_DIRTY, ptep);
-}
-
 static inline  int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
 {
 	return test_and_clear_bit(_PAGE_BIT_ACCESSED, ptep);
@@ -382,7 +377,6 @@ static inline void pmd_set(pmd_t * pmdp,
 		remap_pfn_range(vma, vaddr, pfn, size, prot)
 
 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
 #define __HAVE_ARCH_PTEP_SET_WRPROTECT
 #define __HAVE_ARCH_PTE_SAME
diff -urpN linux-2.6/include/asm-parisc/pgtable.h linux-2.6-patched/include/asm-parisc/pgtable.h
--- linux-2.6/include/asm-parisc/pgtable.h	2007-05-09 09:58:15.000000000 +0200
+++ linux-2.6-patched/include/asm-parisc/pgtable.h	2007-07-03 12:56:49.000000000 +0200
@@ -451,21 +451,6 @@ static inline int ptep_test_and_clear_yo
 #endif
 }
 
-static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
-{
-#ifdef CONFIG_SMP
-	if (!pte_dirty(*ptep))
-		return 0;
-	return test_and_clear_bit(xlate_pabit(_PAGE_DIRTY_BIT), &pte_val(*ptep));
-#else
-	pte_t pte = *ptep;
-	if (!pte_dirty(pte))
-		return 0;
-	set_pte_at(vma->vm_mm, addr, ptep, pte_mkclean(pte));
-	return 1;
-#endif
-}
-
 extern spinlock_t pa_dbit_lock;
 
 struct mm_struct;
@@ -533,7 +518,6 @@ static inline void ptep_set_wrprotect(st
 #define HAVE_ARCH_UNMAPPED_AREA
 
 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
 #define __HAVE_ARCH_PTEP_SET_WRPROTECT
 #define __HAVE_ARCH_PTE_SAME
diff -urpN linux-2.6/include/asm-powerpc/pgtable-ppc32.h linux-2.6-patched/include/asm-powerpc/pgtable-ppc32.h
--- linux-2.6/include/asm-powerpc/pgtable-ppc32.h	2007-06-18 09:43:22.000000000 +0200
+++ linux-2.6-patched/include/asm-powerpc/pgtable-ppc32.h	2007-07-03 12:56:49.000000000 +0200
@@ -643,13 +643,6 @@ static inline int __ptep_test_and_clear_
 #define ptep_test_and_clear_young(__vma, __addr, __ptep) \
 	__ptep_test_and_clear_young((__vma)->vm_mm->context.id, __addr, __ptep)
 
-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
-static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma,
-					    unsigned long addr, pte_t *ptep)
-{
-	return (pte_update(ptep, (_PAGE_DIRTY | _PAGE_HWWRITE), 0) & _PAGE_DIRTY) != 0;
-}
-
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
 				       pte_t *ptep)
diff -urpN linux-2.6/include/asm-powerpc/pgtable-ppc64.h linux-2.6-patched/include/asm-powerpc/pgtable-ppc64.h
--- linux-2.6/include/asm-powerpc/pgtable-ppc64.h	2007-06-18 09:43:22.000000000 +0200
+++ linux-2.6-patched/include/asm-powerpc/pgtable-ppc64.h	2007-07-03 12:56:49.000000000 +0200
@@ -307,29 +307,6 @@ static inline int __ptep_test_and_clear_
 	__r;								   \
 })
 
-/*
- * On RW/DIRTY bit transitions we can avoid flushing the hpte. For the
- * moment we always flush but we need to fix hpte_update and test if the
- * optimisation is worth it.
- */
-static inline int __ptep_test_and_clear_dirty(struct mm_struct *mm,
-					      unsigned long addr, pte_t *ptep)
-{
-	unsigned long old;
-
-       	if ((pte_val(*ptep) & _PAGE_DIRTY) == 0)
-		return 0;
-	old = pte_update(mm, addr, ptep, _PAGE_DIRTY, 0);
-	return (old & _PAGE_DIRTY) != 0;
-}
-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
-#define ptep_test_and_clear_dirty(__vma, __addr, __ptep)		   \
-({									   \
-	int __r;							   \
-	__r = __ptep_test_and_clear_dirty((__vma)->vm_mm, __addr, __ptep); \
-	__r;								   \
-})
-
 #define __HAVE_ARCH_PTEP_SET_WRPROTECT
 static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
 				      pte_t *ptep)
@@ -357,14 +334,6 @@ static inline void ptep_set_wrprotect(st
 	__young;							\
 })
 
-#define __HAVE_ARCH_PTEP_CLEAR_DIRTY_FLUSH
-#define ptep_clear_flush_dirty(__vma, __address, __ptep)		\
-({									\
-	int __dirty = __ptep_test_and_clear_dirty((__vma)->vm_mm, __address, \
-						  __ptep); 		\
-	__dirty;							\
-})
-
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 				       unsigned long addr, pte_t *ptep)
diff -urpN linux-2.6/include/asm-ppc/pgtable.h linux-2.6-patched/include/asm-ppc/pgtable.h
--- linux-2.6/include/asm-ppc/pgtable.h	2007-06-18 09:43:22.000000000 +0200
+++ linux-2.6-patched/include/asm-ppc/pgtable.h	2007-07-03 12:56:49.000000000 +0200
@@ -664,13 +664,6 @@ static inline int __ptep_test_and_clear_
 #define ptep_test_and_clear_young(__vma, __addr, __ptep) \
 	__ptep_test_and_clear_young((__vma)->vm_mm->context.id, __addr, __ptep)
 
-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
-static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma,
-					    unsigned long addr, pte_t *ptep)
-{
-	return (pte_update(ptep, (_PAGE_DIRTY | _PAGE_HWWRITE), 0) & _PAGE_DIRTY) != 0;
-}
-
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
 				       pte_t *ptep)
diff -urpN linux-2.6/include/asm-s390/pgtable.h linux-2.6-patched/include/asm-s390/pgtable.h
--- linux-2.6/include/asm-s390/pgtable.h	2007-07-03 12:56:49.000000000 +0200
+++ linux-2.6-patched/include/asm-s390/pgtable.h	2007-07-03 12:56:49.000000000 +0200
@@ -677,19 +677,6 @@ ptep_clear_flush_young(struct vm_area_st
 	return ptep_test_and_clear_young(vma, address, ptep);
 }
 
-static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
-{
-	return 0;
-}
-
-static inline int
-ptep_clear_flush_dirty(struct vm_area_struct *vma,
-			unsigned long address, pte_t *ptep)
-{
-	/* No need to flush TLB; bits are in storage key */
-	return ptep_test_and_clear_dirty(vma, address, ptep);
-}
-
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
 	pte_t pte = *ptep;
@@ -947,8 +934,6 @@ extern void memmap_init(unsigned long, i
 #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
 #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
-#define __HAVE_ARCH_PTEP_CLEAR_DIRTY_FLUSH
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
 #define __HAVE_ARCH_PTEP_CLEAR_FLUSH
 #define __HAVE_ARCH_PTEP_SET_WRPROTECT
diff -urpN linux-2.6/include/asm-x86_64/pgtable.h linux-2.6-patched/include/asm-x86_64/pgtable.h
--- linux-2.6/include/asm-x86_64/pgtable.h	2007-06-18 09:43:22.000000000 +0200
+++ linux-2.6-patched/include/asm-x86_64/pgtable.h	2007-07-03 12:56:49.000000000 +0200
@@ -290,13 +290,6 @@ static inline pte_t pte_clrhuge(pte_t pt
 
 struct vm_area_struct;
 
-static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
-{
-	if (!pte_dirty(*ptep))
-		return 0;
-	return test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte);
-}
-
 static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
 {
 	if (!pte_young(*ptep))
@@ -433,7 +426,6 @@ extern int kern_addr_valid(unsigned long
    (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
 
 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
 #define __HAVE_ARCH_PTEP_SET_WRPROTECT
diff -urpN linux-2.6/include/asm-xtensa/pgtable.h linux-2.6-patched/include/asm-xtensa/pgtable.h
--- linux-2.6/include/asm-xtensa/pgtable.h	2006-12-11 10:25:32.000000000 +0100
+++ linux-2.6-patched/include/asm-xtensa/pgtable.h	2007-07-03 12:56:49.000000000 +0200
@@ -270,17 +270,6 @@ ptep_test_and_clear_young(struct vm_area
 	return 1;
 }
 
-static inline int
-ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned long addr,
-   			  pte_t *ptep)
-{
-	pte_t pte = *ptep;
-	if (!pte_dirty(pte))
-		return 0;
-	update_pte(ptep, pte_mkclean(pte));
-	return 1;
-}
-
 static inline pte_t
 ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
@@ -421,7 +410,6 @@ typedef pte_t *pte_addr_t;
 #endif /* !defined (__ASSEMBLY__) */
 
 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
 #define __HAVE_ARCH_PTEP_SET_WRPROTECT
 #define __HAVE_ARCH_PTEP_MKDIRTY

-- 
blue skies,
   Martin.

"Reality continues to ruin my life." - Calvin.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* [patch 4/5] move mm_struct and vm_area_struct.
  2007-07-03 11:18 [patch 0/5] some mm improvements + s390 tlb flush Martin Schwidefsky
                   ` (2 preceding siblings ...)
  2007-07-03 11:18 ` [patch 3/5] remove ptep_test_and_clear_dirty and ptep_clear_flush_dirty Martin Schwidefsky
@ 2007-07-03 11:18 ` Martin Schwidefsky
  2007-07-03 11:18 ` [patch 5/5] s390 tlb flush fix Martin Schwidefsky
  4 siblings, 0 replies; 15+ messages in thread
From: Martin Schwidefsky @ 2007-07-03 11:18 UTC (permalink / raw)
  To: akpm, hugh, peterz; +Cc: linux-kernel, linux-mm, Martin Schwidefsky

[-- Attachment #1: 004-mm-types.diff --]
[-- Type: text/plain, Size: 13254 bytes --]

From: Martin Schwidefsky <schwidefsky@de.ibm.com>

Move the definitions of struct mm_struct and struct vm_area_struct
to include/linux/mm_types.h. This allows more functions in
asm/pgtable.h and friends to be defined as inline functions instead
of macros.
Compile tested on i386, ia64, powerpc, powerpc64, s390-32, s390-64
and x86_64.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---

 include/linux/mm.h       |   63 --------------------
 include/linux/mm_types.h |  143 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/sched.h    |   74 ------------------------
 3 files changed, 144 insertions(+), 136 deletions(-)

diff -urpN linux-2.6/include/linux/mm.h linux-2.6-patched/include/linux/mm.h
--- linux-2.6/include/linux/mm.h	2007-06-22 14:11:55.000000000 +0200
+++ linux-2.6-patched/include/linux/mm.h	2007-07-03 12:56:50.000000000 +0200
@@ -51,69 +51,6 @@ extern int sysctl_legacy_va_layout;
  * mmap() functions).
  */
 
-/*
- * This struct defines a memory VMM memory area. There is one of these
- * per VM-area/task.  A VM area is any part of the process virtual memory
- * space that has a special rule for the page-fault handlers (ie a shared
- * library, the executable area etc).
- */
-struct vm_area_struct {
-	struct mm_struct * vm_mm;	/* The address space we belong to. */
-	unsigned long vm_start;		/* Our start address within vm_mm. */
-	unsigned long vm_end;		/* The first byte after our end address
-					   within vm_mm. */
-
-	/* linked list of VM areas per task, sorted by address */
-	struct vm_area_struct *vm_next;
-
-	pgprot_t vm_page_prot;		/* Access permissions of this VMA. */
-	unsigned long vm_flags;		/* Flags, listed below. */
-
-	struct rb_node vm_rb;
-
-	/*
-	 * For areas with an address space and backing store,
-	 * linkage into the address_space->i_mmap prio tree, or
-	 * linkage to the list of like vmas hanging off its node, or
-	 * linkage of vma in the address_space->i_mmap_nonlinear list.
-	 */
-	union {
-		struct {
-			struct list_head list;
-			void *parent;	/* aligns with prio_tree_node parent */
-			struct vm_area_struct *head;
-		} vm_set;
-
-		struct raw_prio_tree_node prio_tree_node;
-	} shared;
-
-	/*
-	 * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
-	 * list, after a COW of one of the file pages.  A MAP_SHARED vma
-	 * can only be in the i_mmap tree.  An anonymous MAP_PRIVATE, stack
-	 * or brk vma (with NULL file) can only be in an anon_vma list.
-	 */
-	struct list_head anon_vma_node;	/* Serialized by anon_vma->lock */
-	struct anon_vma *anon_vma;	/* Serialized by page_table_lock */
-
-	/* Function pointers to deal with this struct. */
-	struct vm_operations_struct * vm_ops;
-
-	/* Information about our backing store: */
-	unsigned long vm_pgoff;		/* Offset (within vm_file) in PAGE_SIZE
-					   units, *not* PAGE_CACHE_SIZE */
-	struct file * vm_file;		/* File we map to (can be NULL). */
-	void * vm_private_data;		/* was vm_pte (shared mem) */
-	unsigned long vm_truncate_count;/* truncate_count or restart_addr */
-
-#ifndef CONFIG_MMU
-	atomic_t vm_usage;		/* refcount (VMAs shared if !MMU) */
-#endif
-#ifdef CONFIG_NUMA
-	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
-#endif
-};
-
 extern struct kmem_cache *vm_area_cachep;
 
 /*
diff -urpN linux-2.6/include/linux/mm_types.h linux-2.6-patched/include/linux/mm_types.h
--- linux-2.6/include/linux/mm_types.h	2007-05-11 09:19:04.000000000 +0200
+++ linux-2.6-patched/include/linux/mm_types.h	2007-07-03 12:56:50.000000000 +0200
@@ -1,13 +1,25 @@
 #ifndef _LINUX_MM_TYPES_H
 #define _LINUX_MM_TYPES_H
 
+#include <linux/auxvec.h>	/* For AT_VECTOR_SIZE */
 #include <linux/types.h>
 #include <linux/threads.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
+#include <linux/prio_tree.h>
+#include <linux/rbtree.h>
+#include <linux/rwsem.h>
+#include <linux/completion.h>
+#include <asm/mmu.h>
 
 struct address_space;
 
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+typedef atomic_long_t mm_counter_t;
+#else  /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
+typedef unsigned long mm_counter_t;
+#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
+
 /*
  * Each physical page in the system has a struct page associated with
  * it to keep track of whatever it is we are using the page for at the
@@ -80,4 +92,135 @@ struct page {
 #endif /* WANT_PAGE_VIRTUAL */
 };
 
+/*
+ * This struct defines a memory VMM memory area. There is one of these
+ * per VM-area/task.  A VM area is any part of the process virtual memory
+ * space that has a special rule for the page-fault handlers (ie a shared
+ * library, the executable area etc).
+ */
+struct vm_area_struct {
+	struct mm_struct * vm_mm;	/* The address space we belong to. */
+	unsigned long vm_start;		/* Our start address within vm_mm. */
+	unsigned long vm_end;		/* The first byte after our end address
+					   within vm_mm. */
+
+	/* linked list of VM areas per task, sorted by address */
+	struct vm_area_struct *vm_next;
+
+	pgprot_t vm_page_prot;		/* Access permissions of this VMA. */
+	unsigned long vm_flags;		/* Flags, listed below. */
+
+	struct rb_node vm_rb;
+
+	/*
+	 * For areas with an address space and backing store,
+	 * linkage into the address_space->i_mmap prio tree, or
+	 * linkage to the list of like vmas hanging off its node, or
+	 * linkage of vma in the address_space->i_mmap_nonlinear list.
+	 */
+	union {
+		struct {
+			struct list_head list;
+			void *parent;	/* aligns with prio_tree_node parent */
+			struct vm_area_struct *head;
+		} vm_set;
+
+		struct raw_prio_tree_node prio_tree_node;
+	} shared;
+
+	/*
+	 * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
+	 * list, after a COW of one of the file pages.	A MAP_SHARED vma
+	 * can only be in the i_mmap tree.  An anonymous MAP_PRIVATE, stack
+	 * or brk vma (with NULL file) can only be in an anon_vma list.
+	 */
+	struct list_head anon_vma_node;	/* Serialized by anon_vma->lock */
+	struct anon_vma *anon_vma;	/* Serialized by page_table_lock */
+
+	/* Function pointers to deal with this struct. */
+	struct vm_operations_struct * vm_ops;
+
+	/* Information about our backing store: */
+	unsigned long vm_pgoff;		/* Offset (within vm_file) in PAGE_SIZE
+					   units, *not* PAGE_CACHE_SIZE */
+	struct file * vm_file;		/* File we map to (can be NULL). */
+	void * vm_private_data;		/* was vm_pte (shared mem) */
+	unsigned long vm_truncate_count;/* truncate_count or restart_addr */
+
+#ifndef CONFIG_MMU
+	atomic_t vm_usage;		/* refcount (VMAs shared if !MMU) */
+#endif
+#ifdef CONFIG_NUMA
+	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
+#endif
+};
+
+struct mm_struct {
+	struct vm_area_struct * mmap;		/* list of VMAs */
+	struct rb_root mm_rb;
+	struct vm_area_struct * mmap_cache;	/* last find_vma result */
+	unsigned long (*get_unmapped_area) (struct file *filp,
+				unsigned long addr, unsigned long len,
+				unsigned long pgoff, unsigned long flags);
+	void (*unmap_area) (struct mm_struct *mm, unsigned long addr);
+	unsigned long mmap_base;		/* base of mmap area */
+	unsigned long task_size;		/* size of task vm space */
+	unsigned long cached_hole_size; 	/* if non-zero, the largest hole below free_area_cache */
+	unsigned long free_area_cache;		/* first hole of size cached_hole_size or larger */
+	pgd_t * pgd;
+	atomic_t mm_users;			/* How many users with user space? */
+	atomic_t mm_count;			/* How many references to "struct mm_struct" (users count as 1) */
+	int map_count;				/* number of VMAs */
+	struct rw_semaphore mmap_sem;
+	spinlock_t page_table_lock;		/* Protects page tables and some counters */
+
+	struct list_head mmlist;		/* List of maybe swapped mm's.	These are globally strung
+						 * together off init_mm.mmlist, and are protected
+						 * by mmlist_lock
+						 */
+
+	/* Special counters, in some configurations protected by the
+	 * page_table_lock, in other configurations by being atomic.
+	 */
+	mm_counter_t _file_rss;
+	mm_counter_t _anon_rss;
+
+	unsigned long hiwater_rss;	/* High-watermark of RSS usage */
+	unsigned long hiwater_vm;	/* High-water virtual memory usage */
+
+	unsigned long total_vm, locked_vm, shared_vm, exec_vm;
+	unsigned long stack_vm, reserved_vm, def_flags, nr_ptes;
+	unsigned long start_code, end_code, start_data, end_data;
+	unsigned long start_brk, brk, start_stack;
+	unsigned long arg_start, arg_end, env_start, env_end;
+
+	unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
+
+	cpumask_t cpu_vm_mask;
+
+	/* Architecture-specific MM context */
+	mm_context_t context;
+
+	/* Swap token stuff */
+	/*
+	 * Last value of global fault stamp as seen by this process.
+	 * In other words, this value gives an indication of how long
+	 * it has been since this task got the token.
+	 * Look at mm/thrash.c
+	 */
+	unsigned int faultstamp;
+	unsigned int token_priority;
+	unsigned int last_interval;
+
+	unsigned char dumpable:2;
+
+	/* coredumping support */
+	int core_waiters;
+	struct completion *core_startup_done, core_done;
+
+	/* aio bits */
+	rwlock_t		ioctx_list_lock;
+	struct kioctx		*ioctx_list;
+};
+
 #endif /* _LINUX_MM_TYPES_H */
diff -urpN linux-2.6/include/linux/sched.h linux-2.6-patched/include/linux/sched.h
--- linux-2.6/include/linux/sched.h	2007-06-09 12:24:04.000000000 +0200
+++ linux-2.6-patched/include/linux/sched.h	2007-07-03 12:56:50.000000000 +0200
@@ -1,8 +1,6 @@
 #ifndef _LINUX_SCHED_H
 #define _LINUX_SCHED_H
 
-#include <linux/auxvec.h>	/* For AT_VECTOR_SIZE */
-
 /*
  * cloning flags:
  */
@@ -54,12 +52,12 @@ struct sched_param {
 #include <linux/cpumask.h>
 #include <linux/errno.h>
 #include <linux/nodemask.h>
+#include <linux/mm_types.h>
 
 #include <asm/system.h>
 #include <asm/semaphore.h>
 #include <asm/page.h>
 #include <asm/ptrace.h>
-#include <asm/mmu.h>
 #include <asm/cputime.h>
 
 #include <linux/smp.h>
@@ -292,7 +290,6 @@ extern void arch_unmap_area_topdown(stru
 #define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member)
 #define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member)
 #define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member)
-typedef atomic_long_t mm_counter_t;
 
 #else  /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
 /*
@@ -304,7 +301,6 @@ typedef atomic_long_t mm_counter_t;
 #define add_mm_counter(mm, member, value) (mm)->_##member += (value)
 #define inc_mm_counter(mm, member) (mm)->_##member++
 #define dec_mm_counter(mm, member) (mm)->_##member--
-typedef unsigned long mm_counter_t;
 
 #endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
 
@@ -320,74 +316,6 @@ typedef unsigned long mm_counter_t;
 		(mm)->hiwater_vm = (mm)->total_vm;	\
 } while (0)
 
-struct mm_struct {
-	struct vm_area_struct * mmap;		/* list of VMAs */
-	struct rb_root mm_rb;
-	struct vm_area_struct * mmap_cache;	/* last find_vma result */
-	unsigned long (*get_unmapped_area) (struct file *filp,
-				unsigned long addr, unsigned long len,
-				unsigned long pgoff, unsigned long flags);
-	void (*unmap_area) (struct mm_struct *mm, unsigned long addr);
-	unsigned long mmap_base;		/* base of mmap area */
-	unsigned long task_size;		/* size of task vm space */
-	unsigned long cached_hole_size;         /* if non-zero, the largest hole below free_area_cache */
-	unsigned long free_area_cache;		/* first hole of size cached_hole_size or larger */
-	pgd_t * pgd;
-	atomic_t mm_users;			/* How many users with user space? */
-	atomic_t mm_count;			/* How many references to "struct mm_struct" (users count as 1) */
-	int map_count;				/* number of VMAs */
-	struct rw_semaphore mmap_sem;
-	spinlock_t page_table_lock;		/* Protects page tables and some counters */
-
-	struct list_head mmlist;		/* List of maybe swapped mm's.  These are globally strung
-						 * together off init_mm.mmlist, and are protected
-						 * by mmlist_lock
-						 */
-
-	/* Special counters, in some configurations protected by the
-	 * page_table_lock, in other configurations by being atomic.
-	 */
-	mm_counter_t _file_rss;
-	mm_counter_t _anon_rss;
-
-	unsigned long hiwater_rss;	/* High-watermark of RSS usage */
-	unsigned long hiwater_vm;	/* High-water virtual memory usage */
-
-	unsigned long total_vm, locked_vm, shared_vm, exec_vm;
-	unsigned long stack_vm, reserved_vm, def_flags, nr_ptes;
-	unsigned long start_code, end_code, start_data, end_data;
-	unsigned long start_brk, brk, start_stack;
-	unsigned long arg_start, arg_end, env_start, env_end;
-
-	unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
-
-	cpumask_t cpu_vm_mask;
-
-	/* Architecture-specific MM context */
-	mm_context_t context;
-
-	/* Swap token stuff */
-	/*
-	 * Last value of global fault stamp as seen by this process.
-	 * In other words, this value gives an indication of how long
-	 * it has been since this task got the token.
-	 * Look at mm/thrash.c
-	 */
-	unsigned int faultstamp;
-	unsigned int token_priority;
-	unsigned int last_interval;
-
-	unsigned char dumpable:2;
-
-	/* coredumping support */
-	int core_waiters;
-	struct completion *core_startup_done, core_done;
-
-	/* aio bits */
-	rwlock_t		ioctx_list_lock;
-	struct kioctx		*ioctx_list;
-};
-
 struct sighand_struct {
 	atomic_t		count;
 	struct k_sigaction	action[_NSIG];

-- 
blue skies,
   Martin.

"Reality continues to ruin my life." - Calvin.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* [patch 5/5] s390 tlb flush fix.
  2007-07-03 11:18 [patch 0/5] some mm improvements + s390 tlb flush Martin Schwidefsky
                   ` (3 preceding siblings ...)
  2007-07-03 11:18 ` [patch 4/5] move mm_struct and vm_area_struct Martin Schwidefsky
@ 2007-07-03 11:18 ` Martin Schwidefsky
  2007-07-03 18:58   ` Hugh Dickins
  4 siblings, 1 reply; 15+ messages in thread
From: Martin Schwidefsky @ 2007-07-03 11:18 UTC (permalink / raw)
  To: akpm, hugh, peterz; +Cc: linux-kernel, linux-mm, Martin Schwidefsky

[-- Attachment #1: 005-s390-tlbflush.diff --]
[-- Type: text/plain, Size: 21942 bytes --]

From: Martin Schwidefsky <schwidefsky@de.ibm.com>

The current tlb flushing code for page table entries violates the
s390 architecture in a small detail. The relevant section from the
principles of operation (SA22-7832-02 page 3-47):

   "A valid table entry must not be changed while it is attached
   to any CPU and may be used for translation by that CPU except to
   (1) invalidate the entry by using INVALIDATE PAGE TABLE ENTRY or
   INVALIDATE DAT TABLE ENTRY, (2) alter bits 56-63 of a page-table
   entry, or (3) make a change by means of a COMPARE AND SWAP AND
   PURGE instruction that purges the TLB."

That means if one thread of a multithreaded application uses a vma
while another thread does an unmap on it, the page table entries of
that vma need to get removed with IPTE, IDTE or CSP. In some strange
and rare situations a cpu could check-stop (die) because an entry has
been pushed out of the TLB that is still needed to complete a
(milli-coded) instruction. I've never seen it happen with the current
code on any of the supported machines, so right now this is a
theoretical problem. But I want to fix it nevertheless, to avoid
headaches in the future.

To get this implemented correctly without changing common code the
primitives ptep_get_and_clear, ptep_get_and_clear_full and
ptep_set_wrprotect need to use the IPTE instruction to invalidate the
pte before the new pte value gets stored. If IPTE is always used for
the three primitives, three important operations will have a performance
hit: fork, mprotect and exit_mmap. Time for some workarounds:

* 1: ptep_get_and_clear_full is used in unmap_vmas to remove page
table entries in a batched tlb gather operation. If the mmu_gather
context passed to unmap_vmas has been started with full_mm_flush==1
or if only one cpu is online or if the mm_struct doesn't have more
than one user the fullmm indication in the mmu_gather context is
set to one. All TLBs for mm_struct are flushed by the tlb_gather_mmu
call. No new TLBs can be created while the unmap is in progress. In
this case ptep_get_and_clear_full clears the ptes with a simple store.

* 2: ptep_get_and_clear is used in change_protection to clear the
ptes from the page tables before they are reentered with the new
access flags. At the end of the update flush_tlb_range clears the
remaining TLBs. In general the ptep_get_and_clear has to issue IPTE
for each pte and flush_tlb_range is a nop. But if there is only one
user of the mm_struct then ptep_get_and_clear uses simple stores
to do the update and flush_tlb_range will flush the TLBs.

* 3: Similar to 2, ptep_set_wrprotect is used in copy_page_range
for a fork to make all ptes of a cow mapping read-only. At the end of
copy_page_range dup_mmap will flush the TLBs with a call to
flush_tlb_mm.  Check for mm->mm_users and if there is only one user
avoid using IPTE in ptep_set_wrprotect and let flush_tlb_mm clear the
TLBs.

Overall for single threaded programs the tlb flush code now performs
better, for multi threaded programs it is slightly worse. In particular
exit_mmap() now does a single IDTE for the mm and then just frees every
page cache reference and every page table page directly without a delay
over the mmu_gather structure.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---

 arch/s390/kernel/smp.c      |    2 
 include/asm-s390/pgalloc.h  |   17 ----
 include/asm-s390/pgtable.h  |   90 +++++++++++++++++++-----
 include/asm-s390/tlb.h      |  127 ++++++++++++++++++++++++++++++++---
 include/asm-s390/tlbflush.h |  159 +++++++++++++++-----------------------------
 5 files changed, 245 insertions(+), 150 deletions(-)

diff -urpN linux-2.6/arch/s390/kernel/smp.c linux-2.6-patched/arch/s390/kernel/smp.c
--- linux-2.6/arch/s390/kernel/smp.c	2007-06-01 10:06:01.000000000 +0200
+++ linux-2.6-patched/arch/s390/kernel/smp.c	2007-07-03 12:56:51.000000000 +0200
@@ -328,7 +328,7 @@ static void smp_ext_bitcall(int cpu, ec_
  */
 void smp_ptlb_callback(void *info)
 {
-	local_flush_tlb();
+	__tlb_flush_local();
 }
 
 void smp_ptlb_all(void)
diff -urpN linux-2.6/include/asm-s390/pgalloc.h linux-2.6-patched/include/asm-s390/pgalloc.h
--- linux-2.6/include/asm-s390/pgalloc.h	2007-02-07 15:42:46.000000000 +0100
+++ linux-2.6-patched/include/asm-s390/pgalloc.h	2007-07-03 12:56:51.000000000 +0200
@@ -84,7 +84,6 @@ static inline void pgd_free(pgd_t *pgd)
  */
 #define pmd_alloc_one(mm,address)       ({ BUG(); ((pmd_t *)2); })
 #define pmd_free(x)                     do { } while (0)
-#define __pmd_free_tlb(tlb,x)		do { } while (0)
 #define pgd_populate(mm, pmd, pte)      BUG()
 #define pgd_populate_kernel(mm, pmd, pte)	BUG()
 #else /* __s390x__ */
@@ -120,12 +119,6 @@ static inline void pmd_free (pmd_t *pmd)
 	free_pages((unsigned long) pmd, PMD_ALLOC_ORDER);
 }
 
-#define __pmd_free_tlb(tlb,pmd)			\
-	do {					\
-		tlb_flush_mmu(tlb, 0, 0);	\
-		pmd_free(pmd);			\
-	 } while (0)
-
 static inline void
 pgd_populate_kernel(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmd)
 {
@@ -226,14 +219,4 @@ static inline void pte_free(struct page 
 	__free_page(pte);
 }
 
-#define __pte_free_tlb(tlb, pte)					\
-({									\
-	struct mmu_gather *__tlb = (tlb);				\
-	struct page *__pte = (pte);					\
-	struct page *shadow_page = get_shadow_page(__pte);		\
-	if (shadow_page)						\
-		tlb_remove_page(__tlb, shadow_page);			\
-	tlb_remove_page(__tlb, __pte);					\
-})
-
 #endif /* _S390_PGALLOC_H */
diff -urpN linux-2.6/include/asm-s390/pgtable.h linux-2.6-patched/include/asm-s390/pgtable.h
--- linux-2.6/include/asm-s390/pgtable.h	2007-07-03 12:56:50.000000000 +0200
+++ linux-2.6-patched/include/asm-s390/pgtable.h	2007-07-03 12:56:51.000000000 +0200
@@ -417,7 +417,8 @@ static inline pgd_t *get_shadow_pgd(pgd_
  * within a page table are directly modified.  Thus, the following
  * hook is made available.
  */
-static inline void set_pte(pte_t *pteptr, pte_t pteval)
+static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
+			      pte_t *pteptr, pte_t pteval)
 {
 	pte_t *shadow_pte = get_shadow_pte(pteptr);
 
@@ -430,7 +431,6 @@ static inline void set_pte(pte_t *pteptr
 			pte_val(*shadow_pte) = _PAGE_TYPE_EMPTY;
 	}
 }
-#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval)
 
 /*
  * pgd/pmd/pte query functions
@@ -501,7 +501,8 @@ static inline int pte_file(pte_t pte)
 	return (pte_val(pte) & mask) == _PAGE_TYPE_FILE;
 }
 
-#define pte_same(a,b)	(pte_val(a) == pte_val(b))
+#define __HAVE_ARCH_PTE_SAME
+#define pte_same(a,b)  (pte_val(a) == pte_val(b))
 
 /*
  * query functions pte_write/pte_dirty/pte_young only work if
@@ -664,17 +665,19 @@ static inline pte_t pte_mkyoung(pte_t pt
 	return pte;
 }
 
-static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
+					    unsigned long addr, pte_t *ptep)
 {
 	return 0;
 }
 
-static inline int
-ptep_clear_flush_young(struct vm_area_struct *vma,
-			unsigned long address, pte_t *ptep)
+#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
+					 unsigned long address, pte_t *ptep)
 {
 	/* No need to flush TLB; bits are in storage key */
-	return ptep_test_and_clear_young(vma, address, ptep);
+	return 0;
 }
 
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
@@ -710,6 +713,31 @@ static inline void ptep_invalidate(unsig
 		__ptep_ipte(address, ptep);
 }
 
+/*
+ * This is hard to understand. ptep_get_and_clear and ptep_clear_flush
+ * both clear the TLB for the unmapped pte. The reason is that
+ * ptep_get_and_clear is used in common code (e.g. change_pte_range)
+ * to modify an active pte. The sequence is
+ *   1) ptep_get_and_clear
+ *   2) set_pte_at
+ *   3) flush_tlb_range
+ * On s390 the tlb needs to get flushed with the modification of the pte
+ * if the pte is active. The only way how this can be implemented is to
+ * have ptep_get_and_clear do the tlb flush. In exchange flush_tlb_range
+ * is a nop.
+ */
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+#define ptep_get_and_clear(__mm, __address, __ptep)			\
+({									\
+	pte_t __pte = *(__ptep);					\
+	if (atomic_read(&(__mm)->mm_users) > 1)				\
+		ptep_invalidate(__address, __ptep);			\
+	else								\
+		pte_clear((__mm), (__address), (__ptep));		\
+	__pte;								\
+})
+
+#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
 static inline pte_t ptep_clear_flush(struct vm_area_struct *vma,
 				     unsigned long address, pte_t *ptep)
 {
@@ -718,12 +746,39 @@ static inline pte_t ptep_clear_flush(str
 	return pte;
 }
 
-static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+/*
+ * The batched pte unmap code uses ptep_get_and_clear_full to clear the
+ * ptes. Here an optimization is possible. tlb_gather_mmu flushes all
+ * tlbs of an mm if it can guarantee that the ptes of the mm_struct
+ * cannot be accessed while the batched unmap is running. In this case
+ * full==1 and a simple pte_clear is enough. See tlb.h.
+ */
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
+static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
+					    unsigned long addr,
+					    pte_t *ptep, int full)
 {
-	pte_t old_pte = *ptep;
-	set_pte_at(mm, addr, ptep, pte_wrprotect(old_pte));
+	pte_t pte = *ptep;
+
+	if (full)
+		pte_clear(mm, addr, ptep);
+	else
+		ptep_invalidate(addr, ptep);
+	return pte;
 }
 
+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
+#define ptep_set_wrprotect(__mm, __addr, __ptep)			\
+({									\
+	pte_t __pte = *(__ptep);					\
+	if (pte_write(__pte)) {						\
+		if (atomic_read(&(__mm)->mm_users) > 1)			\
+			ptep_invalidate(__addr, __ptep);		\
+		set_pte_at(__mm, __addr, __ptep, pte_wrprotect(__pte));	\
+	}								\
+})
+
+#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
 #define ptep_set_access_flags(__vma, __addr, __ptep, __entry, __dirty)	\
 ({									\
 	int __changed = !pte_same(*(__ptep), __entry);			\
@@ -741,11 +796,13 @@ static inline void ptep_set_wrprotect(st
  * should therefore only be called if it is not mapped in any
  * address space.
  */
+#define __HAVE_ARCH_PAGE_TEST_DIRTY
 static inline int page_test_dirty(struct page *page)
 {
 	return (page_get_storage_key(page_to_phys(page)) & _PAGE_CHANGED) != 0;
 }
 
+#define __HAVE_ARCH_PAGE_CLEAR_DIRTY
 static inline void page_clear_dirty(struct page *page)
 {
 	page_set_storage_key(page_to_phys(page), PAGE_DEFAULT_KEY);
@@ -754,6 +811,7 @@ static inline void page_clear_dirty(stru
 /*
  * Test and clear referenced bit in storage key.
  */
+#define __HAVE_ARCH_PAGE_TEST_AND_CLEAR_YOUNG
 static inline int page_test_and_clear_young(struct page *page)
 {
 	unsigned long physpage = page_to_phys(page);
@@ -931,16 +989,6 @@ extern int remove_shared_memory(unsigned
 #define __HAVE_ARCH_MEMMAP_INIT
 extern void memmap_init(unsigned long, int, unsigned long, unsigned long);
 
-#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
-#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
-#define __HAVE_ARCH_PTEP_SET_WRPROTECT
-#define __HAVE_ARCH_PTE_SAME
-#define __HAVE_ARCH_PAGE_TEST_DIRTY
-#define __HAVE_ARCH_PAGE_CLEAR_DIRTY
-#define __HAVE_ARCH_PAGE_TEST_AND_CLEAR_YOUNG
 #include <asm-generic/pgtable.h>
 
 #endif /* _S390_PAGE_H */
diff -urpN linux-2.6/include/asm-s390/tlbflush.h linux-2.6-patched/include/asm-s390/tlbflush.h
--- linux-2.6/include/asm-s390/tlbflush.h	2007-02-07 15:42:46.000000000 +0100
+++ linux-2.6-patched/include/asm-s390/tlbflush.h	2007-07-03 12:56:51.000000000 +0200
@@ -6,69 +6,19 @@
 #include <asm/pgalloc.h>
 
 /*
- * TLB flushing:
- *
- *  - flush_tlb() flushes the current mm struct TLBs
- *  - flush_tlb_all() flushes all processes TLBs 
- *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
- *  - flush_tlb_page(vma, vmaddr) flushes one page
- *  - flush_tlb_range(vma, start, end) flushes a range of pages
- *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
- *  - flush_tlb_pgtables(mm, start, end) flushes a range of page tables
- */
-
-/*
- * S/390 has three ways of flushing TLBs
- * 'ptlb' does a flush of the local processor
- * 'csp' flushes the TLBs on all PUs of a SMP
- * 'ipte' invalidates a pte in a page table and flushes that out of
- * the TLBs of all PUs of a SMP
+ * Flush all tlb entries on the local cpu.
  */
-
-#define local_flush_tlb() \
-do {  asm volatile("ptlb": : :"memory"); } while (0)
-
-#ifndef CONFIG_SMP
-
-/*
- * We always need to flush, since s390 does not flush tlb
- * on each context switch
- */
-
-static inline void flush_tlb(void)
-{
-	local_flush_tlb();
-}
-static inline void flush_tlb_all(void)
-{
-	local_flush_tlb();
-}
-static inline void flush_tlb_mm(struct mm_struct *mm) 
+static inline void __tlb_flush_local(void)
 {
-	local_flush_tlb();
+	asm volatile("ptlb" : : : "memory");
 }
-static inline void flush_tlb_page(struct vm_area_struct *vma,
-				  unsigned long addr)
-{
-	local_flush_tlb();
-}
-static inline void flush_tlb_range(struct vm_area_struct *vma,
-				   unsigned long start, unsigned long end)
-{
-	local_flush_tlb();
-}
-
-#define flush_tlb_kernel_range(start, end) \
-	local_flush_tlb();
-
-#else
-
-#include <asm/smp.h>
-
-extern void smp_ptlb_all(void);
 
-static inline void global_flush_tlb(void)
+/*
+ * Flush all tlb entries on all cpus.
+ */
+static inline void __tlb_flush_global(void)
 {
+	extern void smp_ptlb_all(void);
 	register unsigned long reg2 asm("2");
 	register unsigned long reg3 asm("3");
 	register unsigned long reg4 asm("4");
@@ -90,72 +40,77 @@ static inline void global_flush_tlb(void
 }
 
 /*
- * We only have to do global flush of tlb if process run since last
- * flush on any other pu than current. 
- * If we have threads (mm->count > 1) we always do a global flush, 
- * since the process runs on more than one processor at the same time.
+ * Flush all tlb entries of a page table on all cpus.
  */
+static inline void __tlb_flush_idte(pgd_t *pgd)
+{
+	asm volatile(
+		"	.insn	rrf,0xb98e0000,0,%0,%1,0"
+		: : "a" (2048), "a" (__pa(pgd) & PAGE_MASK) : "cc" );
+}
 
-static inline void __flush_tlb_mm(struct mm_struct * mm)
+static inline void __tlb_flush_mm(struct mm_struct * mm)
 {
 	cpumask_t local_cpumask;
 
 	if (unlikely(cpus_empty(mm->cpu_vm_mask)))
 		return;
+	/*
+	 * If the machine has IDTE we prefer to do a per mm flush
+	 * on all cpus instead of doing a local flush if the mm
+	 * only ran on the local cpu.
+	 */
 	if (MACHINE_HAS_IDTE) {
 		pgd_t *shadow_pgd = get_shadow_pgd(mm->pgd);
 
-		if (shadow_pgd) {
-			asm volatile(
-				"	.insn	rrf,0xb98e0000,0,%0,%1,0"
-				: : "a" (2048),
-				"a" (__pa(shadow_pgd) & PAGE_MASK) : "cc" );
-		}
-		asm volatile(
-			"	.insn	rrf,0xb98e0000,0,%0,%1,0"
-			: : "a" (2048), "a" (__pa(mm->pgd)&PAGE_MASK) : "cc");
+		if (shadow_pgd)
+			__tlb_flush_idte(shadow_pgd);
+		__tlb_flush_idte(mm->pgd);
 		return;
 	}
 	preempt_disable();
+	/*
+	 * If the process only ran on the local cpu, do a local flush.
+	 */
 	local_cpumask = cpumask_of_cpu(smp_processor_id());
 	if (cpus_equal(mm->cpu_vm_mask, local_cpumask))
-		local_flush_tlb();
+		__tlb_flush_local();
 	else
-		global_flush_tlb();
+		__tlb_flush_global();
 	preempt_enable();
 }
 
-static inline void flush_tlb(void)
-{
-	__flush_tlb_mm(current->mm);
-}
-static inline void flush_tlb_all(void)
-{
-	global_flush_tlb();
-}
-static inline void flush_tlb_mm(struct mm_struct *mm) 
-{
-	__flush_tlb_mm(mm); 
-}
-static inline void flush_tlb_page(struct vm_area_struct *vma,
-				  unsigned long addr)
+static inline void __tlb_flush_mm_cond(struct mm_struct * mm)
 {
-	__flush_tlb_mm(vma->vm_mm);
+	if (atomic_read(&mm->mm_users) <= 1)
+		__tlb_flush_mm(mm);
 }
-static inline void flush_tlb_range(struct vm_area_struct *vma,
-				   unsigned long start, unsigned long end)
-{
-	__flush_tlb_mm(vma->vm_mm); 
-}
-
-#define flush_tlb_kernel_range(start, end) global_flush_tlb()
 
-#endif
+/*
+ * TLB flushing:
+ *  flush_tlb() - flushes the current mm struct TLBs
+ *  flush_tlb_all() - flushes all processes TLBs
+ *  flush_tlb_mm(mm) - flushes the specified mm context TLB's
+ *  flush_tlb_page(vma, vmaddr) - flushes one page
+ *  flush_tlb_range(vma, start, end) - flushes a range of pages
+ *  flush_tlb_kernel_range(start, end) - flushes a range of kernel pages
+ *  flush_tlb_pgtables(mm, start, end) - flushes a range of page tables
+ */
 
-static inline void flush_tlb_pgtables(struct mm_struct *mm,
-                                      unsigned long start, unsigned long end)
-{
-        /* S/390 does not keep any page table caches in TLB */
-}
+/*
+ * flush_tlb_mm goes together with ptep_set_wrprotect for the
+ * copy_page_range operation and flush_tlb_range is related to
+ * ptep_get_and_clear for change_protection. ptep_set_wrprotect and
+ * ptep_get_and_clear do not flush the TLBs directly if the mm has
+ * only one user. At the end of the update the flush_tlb_mm and
+ * flush_tlb_range functions need to do the flush.
+ */
+#define flush_tlb()				do { } while (0)
+#define flush_tlb_all()				do { } while (0)
+#define flush_tlb_mm(mm)			__tlb_flush_mm_cond(mm)
+#define flush_tlb_page(vma, addr)		do { } while (0)
+#define flush_tlb_range(vma, start, end)	__tlb_flush_mm_cond(mm)
+#define flush_tlb_kernel_range(start, end)	__tlb_flush_mm(&init_mm)
+#define flush_tlb_pgtables(mm, start, end)	do { } while (0)
 
 #endif /* _S390_TLBFLUSH_H */
diff -urpN linux-2.6/include/asm-s390/tlb.h linux-2.6-patched/include/asm-s390/tlb.h
--- linux-2.6/include/asm-s390/tlb.h	2006-11-08 10:45:47.000000000 +0100
+++ linux-2.6-patched/include/asm-s390/tlb.h	2007-07-03 12:56:51.000000000 +0200
@@ -2,19 +2,128 @@
 #define _S390_TLB_H
 
 /*
- * s390 doesn't need any special per-pte or
- * per-vma handling..
+ * TLB flushing on s390 is complicated. The following requirement
+ * from the principles of operation is the most arduous:
+ *
+ * "A valid table entry must not be changed while it is attached
+ * to any CPU and may be used for translation by that CPU except to
+ * (1) invalidate the entry by using INVALIDATE PAGE TABLE ENTRY,
+ * or INVALIDATE DAT TABLE ENTRY, (2) alter bits 56-63 of a page
+ * table entry, or (3) make a change by means of a COMPARE AND SWAP
+ * AND PURGE instruction that purges the TLB."
+ *
+ * The modification of a pte of an active mm struct therefore is
+ * a two step process: i) invalidate the pte, ii) store the new pte.
+ * This is true for the page protection bit as well.
+ * The only possible optimization is to flush at the beginning of
+ * a tlb_gather_mmu cycle if the mm_struct is currently not in use.
+ *
+ * Pages used for the page tables is a different story. FIXME: more
  */
-#define tlb_start_vma(tlb, vma) do { } while (0)
-#define tlb_end_vma(tlb, vma) do { } while (0)
-#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
+
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <asm/processor.h>
+#include <asm/pgalloc.h>
+#include <asm/smp.h>
+#include <asm/tlbflush.h>
+
+#ifndef CONFIG_SMP
+#define TLB_NR_PTRS	1
+#else
+#define TLB_NR_PTRS	508
+#endif
+
+struct mmu_gather {
+	struct mm_struct *mm;
+	unsigned int fullmm;
+	unsigned int nr_ptes;
+	unsigned int nr_pmds;
+	void *array[TLB_NR_PTRS];
+};
+
+DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
+
+static inline struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm,
+						unsigned int full_mm_flush)
+{
+	struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
+
+	tlb->mm = mm;
+	tlb->fullmm = full_mm_flush || (num_online_cpus() == 1) ||
+		(atomic_read(&mm->mm_users) <= 1);
+	tlb->nr_ptes = 0;
+	tlb->nr_pmds = TLB_NR_PTRS;
+	if (tlb->fullmm)
+		__tlb_flush_mm(mm);
+	return tlb;
+}
+
+static inline void tlb_flush_mmu(struct mmu_gather *tlb,
+				 unsigned long start, unsigned long end)
+{
+	if (!tlb->fullmm && (tlb->nr_ptes > 0 || tlb->nr_pmds < TLB_NR_PTRS))
+		__tlb_flush_mm(tlb->mm);
+	while (tlb->nr_ptes > 0)
+		pte_free(tlb->array[--tlb->nr_ptes]);
+	while (tlb->nr_pmds < TLB_NR_PTRS)
+		pmd_free((pmd_t *) tlb->array[tlb->nr_pmds++]);
+}
+
+static inline void tlb_finish_mmu(struct mmu_gather *tlb,
+				  unsigned long start, unsigned long end)
+{
+	tlb_flush_mmu(tlb, start, end);
+
+	/* keep the page table cache within bounds */
+	check_pgt_cache();
+
+	put_cpu_var(mmu_gathers);
+}
 
 /*
- * .. because we flush the whole mm when it
- * fills up.
+ * Release the page cache reference for a pte removed by
+ * tlb_ptep_clear_flush. In both flush modes the tlb for a page cache page
+ * has already been freed, so just do free_page_and_swap_cache.
  */
-#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
+static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+{
+	free_page_and_swap_cache(page);
+}
 
-#include <asm-generic/tlb.h>
+/*
+ * pte_free_tlb frees a pte table and clears the CRSTE for the
+ * page table from the tlb.
+ */
+static inline void pte_free_tlb(struct mmu_gather *tlb, struct page *page)
+{
+	if (!tlb->fullmm) {
+		tlb->array[tlb->nr_ptes++] = page;
+		if (tlb->nr_ptes >= tlb->nr_pmds)
+			tlb_flush_mmu(tlb, 0, 0);
+	} else
+		pte_free(page);
+}
 
+/*
+ * pmd_free_tlb frees a pmd table and clears the CRSTE for the
+ * segment table entry from the tlb.
+ */
+static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
+{
+#ifdef __s390x__
+	if (!tlb->fullmm) {
+		tlb->array[--tlb->nr_pmds] = (struct page *) pmd;
+		if (tlb->nr_ptes >= tlb->nr_pmds)
+			tlb_flush_mmu(tlb, 0, 0);
+	} else
+		pmd_free(pmd);
 #endif
+}
+
+#define tlb_start_vma(tlb, vma)			do { } while (0)
+#define tlb_end_vma(tlb, vma)			do { } while (0)
+#define tlb_remove_tlb_entry(tlb, ptep, addr)	do { } while (0)
+#define tlb_migrate_finish(mm)			do { } while (0)
+
+#endif /* _S390_TLB_H */

-- 
blue skies,
   Martin.

"Reality continues to ruin my life." - Calvin.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [patch 1/5] avoid tlb gather restarts.
  2007-07-03 11:18 ` [patch 1/5] avoid tlb gather restarts Martin Schwidefsky
@ 2007-07-03 17:42   ` Hugh Dickins
  2007-07-04  7:37     ` Martin Schwidefsky
  2007-07-16  6:20   ` Andrew Morton
  1 sibling, 1 reply; 15+ messages in thread
From: Hugh Dickins @ 2007-07-03 17:42 UTC (permalink / raw)
  To: Martin Schwidefsky; +Cc: akpm, peterz, linux-kernel, linux-mm

On Tue, 3 Jul 2007, Martin Schwidefsky wrote:
> From: Martin Schwidefsky <schwidefsky@de.ibm.com>
> 
> If need_resched() is false in the inner loop of unmap_vmas it is
> unnecessary to do a full blown tlb_finish_mmu / tlb_gather_mmu for
> each ZAP_BLOCK_SIZE ptes. Do a tlb_flush_mmu() instead. That gives
> architectures with a non-generic tlb flush implementation room for
> optimization. The tlb_flush_mmu primitive is available with the
> generic tlb flush code, the ia64_tlb_flush_mmu needs to be renamed
> and a dummy function is added to arm and arm26.
> 
> Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>

Acked-by: Hugh Dickins <hugh@veritas.com>

(Looking at it, I see that we could argue that there ought to be a
need_resched() etc. check after your tlb_flush_mmu() in unmap_vmas,
in case it's spent a long while in there on some arches; but I don't
think we have the ZAP_BLOCK_SIZE tuned with any great precision, and
you'd at worst be doubling the latency there, so let's not worry
about it.  I write this merely in order to reserve myself an
"I told you so" if anyone ever notices increased latency ;)

> ---
> 
>  include/asm-arm/tlb.h   |    5 +++++
>  include/asm-arm26/tlb.h |    5 +++++
>  include/asm-ia64/tlb.h  |    6 +++---
>  mm/memory.c             |   16 ++++++----------
>  4 files changed, 19 insertions(+), 13 deletions(-)
> 
> diff -urpN linux-2.6/include/asm-arm/tlb.h linux-2.6-patched/include/asm-arm/tlb.h
> --- linux-2.6/include/asm-arm/tlb.h	2006-11-08 10:45:43.000000000 +0100
> +++ linux-2.6-patched/include/asm-arm/tlb.h	2007-07-03 12:56:46.000000000 +0200
> @@ -52,6 +52,11 @@ tlb_gather_mmu(struct mm_struct *mm, uns
>  }
>  
>  static inline void
> +tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
> +{
> +}
> +
> +static inline void
>  tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
>  {
>  	if (tlb->fullmm)
> diff -urpN linux-2.6/include/asm-arm26/tlb.h linux-2.6-patched/include/asm-arm26/tlb.h
> --- linux-2.6/include/asm-arm26/tlb.h	2006-11-08 10:45:43.000000000 +0100
> +++ linux-2.6-patched/include/asm-arm26/tlb.h	2007-07-03 12:56:46.000000000 +0200
> @@ -29,6 +29,11 @@ tlb_gather_mmu(struct mm_struct *mm, uns
>  }
>  
>  static inline void
> +tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
> +{
> +}
> +
> +static inline void
>  tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
>  {
>          if (tlb->need_flush)
> diff -urpN linux-2.6/include/asm-ia64/tlb.h linux-2.6-patched/include/asm-ia64/tlb.h
> --- linux-2.6/include/asm-ia64/tlb.h	2006-11-08 10:45:45.000000000 +0100
> +++ linux-2.6-patched/include/asm-ia64/tlb.h	2007-07-03 12:56:46.000000000 +0200
> @@ -72,7 +72,7 @@ DECLARE_PER_CPU(struct mmu_gather, mmu_g
>   * freed pages that where gathered up to this point.
>   */
>  static inline void
> -ia64_tlb_flush_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end)
> +tlb_flush_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end)
>  {
>  	unsigned int nr;
>  
> @@ -160,7 +160,7 @@ tlb_finish_mmu (struct mmu_gather *tlb, 
>  	 * Note: tlb->nr may be 0 at this point, so we can't rely on tlb->start_addr and
>  	 * tlb->end_addr.
>  	 */
> -	ia64_tlb_flush_mmu(tlb, start, end);
> +	tlb_flush_mmu(tlb, start, end);
>  
>  	/* keep the page table cache within bounds */
>  	check_pgt_cache();
> @@ -184,7 +184,7 @@ tlb_remove_page (struct mmu_gather *tlb,
>  	}
>  	tlb->pages[tlb->nr++] = page;
>  	if (tlb->nr >= FREE_PTE_NR)
> -		ia64_tlb_flush_mmu(tlb, tlb->start_addr, tlb->end_addr);
> +		tlb_flush_mmu(tlb, tlb->start_addr, tlb->end_addr);
>  }
>  
>  /*
> diff -urpN linux-2.6/mm/memory.c linux-2.6-patched/mm/memory.c
> --- linux-2.6/mm/memory.c	2007-06-18 09:43:22.000000000 +0200
> +++ linux-2.6-patched/mm/memory.c	2007-07-03 12:56:46.000000000 +0200
> @@ -853,18 +853,15 @@ unsigned long unmap_vmas(struct mmu_gath
>  				break;
>  			}
>  
> -			tlb_finish_mmu(*tlbp, tlb_start, start);
> -
>  			if (need_resched() ||
>  				(i_mmap_lock && need_lockbreak(i_mmap_lock))) {
> -				if (i_mmap_lock) {
> -					*tlbp = NULL;
> +				if (i_mmap_lock)
>  					goto out;
> -				}
> +				tlb_finish_mmu(*tlbp, tlb_start, start);
>  				cond_resched();
> -			}
> -
> -			*tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
> +				*tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
> +			} else
> +				tlb_flush_mmu(*tlbp, tlb_start, start);
>  			tlb_start_valid = 0;
>  			zap_work = ZAP_BLOCK_SIZE;
>  		}
> @@ -892,8 +889,7 @@ unsigned long zap_page_range(struct vm_a
>  	tlb = tlb_gather_mmu(mm, 0);
>  	update_hiwater_rss(mm);
>  	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
> -	if (tlb)
> -		tlb_finish_mmu(tlb, address, end);
> +	tlb_finish_mmu(tlb, address, end);
>  	return end;
>  }
>  

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [patch 5/5] s390 tlb flush fix.
  2007-07-03 11:18 ` [patch 5/5] s390 tlb flush fix Martin Schwidefsky
@ 2007-07-03 18:58   ` Hugh Dickins
  2007-07-04  7:34     ` Martin Schwidefsky
  0 siblings, 1 reply; 15+ messages in thread
From: Hugh Dickins @ 2007-07-03 18:58 UTC (permalink / raw)
  To: Martin Schwidefsky; +Cc: akpm, peterz, linux-kernel, linux-mm

On Tue, 3 Jul 2007, Martin Schwidefsky wrote:
> +
> +static inline struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm,
> +						unsigned int full_mm_flush)
> +{
> +	struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
> +
> +	tlb->mm = mm;
> +	tlb->fullmm = full_mm_flush || (num_online_cpus() == 1) ||
> +		(atomic_read(&mm->mm_users) <= 1);
> +	tlb->nr_ptes = 0;
> +	tlb->nr_pmds = TLB_NR_PTRS;
> +	if (tlb->fullmm)
> +		__tlb_flush_mm(mm);
> +	return tlb;
> +}

I'm afraid that mm_users test (and probably some of your other
mm_users tests) is not good: because this also gets called when
a file is truncated while it is mapped - the active mm at that
time is likely not to be one of the mm_users.  (Do any other
arches use mm_users in that way?  No: that should be a warning.)

You might do better to make more use of cpu_vm_mask (though I
didn't see where any bits get cleared from it on s390 at present).

Though it seems sensible to aim for one TLB flush at the beginning
as you're doing, that's not what other arches do (some have to
worry about speculative execution, but you don't?), and it
worries me that you're taking s390 further away into its own
implementation: which you're surely entitled to do, but then
we're more likely to screw you over by mistake in future.

Is there perhaps another architecture whose procedures you
can copy?  Changing a pte while another cpu is accessing it
is not a problem unique to s390.

Patches 1-4 looked fine to me, but I believe this 5/5
is the rationale behind all of them.

Hugh

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [patch 5/5] s390 tlb flush fix.
  2007-07-03 18:58   ` Hugh Dickins
@ 2007-07-04  7:34     ` Martin Schwidefsky
  0 siblings, 0 replies; 15+ messages in thread
From: Martin Schwidefsky @ 2007-07-04  7:34 UTC (permalink / raw)
  To: Hugh Dickins; +Cc: akpm, peterz, linux-kernel, linux-mm

On Tue, 2007-07-03 at 19:58 +0100, Hugh Dickins wrote:
> On Tue, 3 Jul 2007, Martin Schwidefsky wrote:
> > +
> > +static inline struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm,
> > +						unsigned int full_mm_flush)
> > +{
> > +	struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
> > +
> > +	tlb->mm = mm;
> > +	tlb->fullmm = full_mm_flush || (num_online_cpus() == 1) ||
> > +		(atomic_read(&mm->mm_users) <= 1);
> > +	tlb->nr_ptes = 0;
> > +	tlb->nr_pmds = TLB_NR_PTRS;
> > +	if (tlb->fullmm)
> > +		__tlb_flush_mm(mm);
> > +	return tlb;
> > +}
> 
> I'm afraid that mm_users test (and probably some of your other
> mm_users tests) is not good: because this also gets called when
> a file is truncated while it is mapped - the active mm at that
> time is likely not to be one of the mm_users.  (Do any other
> arches use mm_users in that way?  No: that should be a warning.)

Good catch, that would have caused me some headache. So I need to add a
current->active_mm==mm check if mm_users==1.

> You might do better to make more use of cpu_vm_mask (though I
> didn't see where any bits get cleared from it on s390 at present).

We don't clear any of the bits in cpu_vm_mask. I thought about it for a
while and got tangled in race conditions. The cpu_vm_mask optimization
works for short-lived processes which always executed on the same cpu.

> Though it seems sensible to aim for one TLB flush at the beginning
> as you're doing, that's not what other arches do (some have to
> worry about speculative execution, but you don't?), and it
> worries me that you're taking s390 further away into its own
> implementation: which you're surely entitled to do, but then
> we're more likely to screw you over by mistake in future.

We do not have to worry about speculative execution because s390 uses
special instructions to do user access (mvcs, mvcp and mvcos) and the
kernel has its own address space. The compiler doesn't know about these
instructions and cannot "accidentally" access a user space address over
the user page table when it shouldn't.

> Is there perhaps another architecture whose procedures you
> can copy?  Changing a pte while another cpu is accessing it
> is not a problem unique to s390.

No, I don't think so. s390 is quite unique with the restriction that you
may not do a set_pte_at .. flush_tlb_xxx while a pte might get accessed
by a different cpu.

> Patches 1-4 looked fine to me, but I believe this 5/5
> is the rationale behind all of them.

Yes, indeed the tlb flush fix and the 1K/2K page tables are my reasons
for all these patches.

-- 
blue skies,
  Martin.

"Reality continues to ruin my life." - Calvin.



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [patch 1/5] avoid tlb gather restarts.
  2007-07-03 17:42   ` Hugh Dickins
@ 2007-07-04  7:37     ` Martin Schwidefsky
  0 siblings, 0 replies; 15+ messages in thread
From: Martin Schwidefsky @ 2007-07-04  7:37 UTC (permalink / raw)
  To: Hugh Dickins; +Cc: akpm, peterz, linux-kernel, linux-mm

On Tue, 2007-07-03 at 18:42 +0100, Hugh Dickins wrote:
> > If need_resched() is false in the inner loop of unmap_vmas it is
> > unnecessary to do a full blown tlb_finish_mmu / tlb_gather_mmu for
> > each ZAP_BLOCK_SIZE ptes. Do a tlb_flush_mmu() instead. That gives
> > architectures with a non-generic tlb flush implementation room for
> > optimization. The tlb_flush_mmu primitive is available with the
> > generic tlb flush code, the ia64_tlb_flush_mmu needs to be renamed
> > and a dummy function is added to arm and arm26.
> > 
> > Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
> 
> Acked-by: Hugh Dickins <hugh@veritas.com>
> 
> (Looking at it, I see that we could argue that there ought to be a
> need_resched() etc. check after your tlb_flush_mmu() in unmap_vmas,
> in case it's spent a long while in there on some arches; but I don't
> think we have the ZAP_BLOCK_SIZE tuned with any great precision, and
> you'd at worst be doubling the latency there, so let's not worry
> about it.  I write this merely in order to reserve myself an
> "I told you so" if anyone ever notices increased latency ;)

Hmm, we'd have to repeat the longish if statement to make sure we don't
miss a cond_resched after tlb_flush_mmu. I'd rather not do that.

-- 
blue skies,
  Martin.

"Reality continues to ruin my life." - Calvin.



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [patch 1/5] avoid tlb gather restarts.
  2007-07-03 11:18 ` [patch 1/5] avoid tlb gather restarts Martin Schwidefsky
  2007-07-03 17:42   ` Hugh Dickins
@ 2007-07-16  6:20   ` Andrew Morton
  1 sibling, 0 replies; 15+ messages in thread
From: Andrew Morton @ 2007-07-16  6:20 UTC (permalink / raw)
  To: Martin Schwidefsky; +Cc: hugh, peterz, linux-kernel, linux-mm

On Tue, 03 Jul 2007 13:18:23 +0200 Martin Schwidefsky <schwidefsky@de.ibm.com> wrote:

> From: Martin Schwidefsky <schwidefsky@de.ibm.com>
> 
> If need_resched() is false in the inner loop of unmap_vmas it is
> unnecessary to do a full blown tlb_finish_mmu / tlb_gather_mmu for
> each ZAP_BLOCK_SIZE ptes. Do a tlb_flush_mmu() instead. That gives
> architectures with a non-generic tlb flush implementation room for
> optimization. The tlb_flush_mmu primitive is available with the
> generic tlb flush code, the ia64_tlb_flush_mmu needs to be renamed
> and a dummy function is added to arm and arm26.
> 
> Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
> ---
> 
>  include/asm-arm/tlb.h   |    5 +++++
>  include/asm-arm26/tlb.h |    5 +++++
>  include/asm-ia64/tlb.h  |    6 +++---
>  mm/memory.c             |   16 ++++++----------
>  4 files changed, 19 insertions(+), 13 deletions(-)

sparc64 broke:

mm/memory.c: In function `unmap_vmas':
mm/memory.c:862: error: too many arguments to function `tlb_flush_mmu'

grep, please.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [patch 1/5] avoid tlb gather restarts.
  2007-06-29 21:19     ` Martin Schwidefsky
@ 2007-06-30 13:16       ` Hugh Dickins
  0 siblings, 0 replies; 15+ messages in thread
From: Hugh Dickins @ 2007-06-30 13:16 UTC (permalink / raw)
  To: Martin Schwidefsky; +Cc: linux-kernel, linux-mm

On Fri, 29 Jun 2007, Martin Schwidefsky wrote:
> On Fri, 2007-06-29 at 19:56 +0100, Hugh Dickins wrote:
> > I don't dare comment on your page_mkclean_one patch (5/5),
> > that dirty page business has grown too subtle for me.
> 
> Oh yes, the dirty handling is tricky....

I'll move that discussion over to 5/5 and Cc Peter
(sorry I was too lazy to do so in the first place).

> > On Fri, 29 Jun 2007, Martin Schwidefsky wrote:
> > You think you're just moving the finish/gather to where they're
> > actually necessary; but the thing is, that per-cpu struct mmu_gather
> > is liable to accumulate a lot of unpreemptible work for the future
> > tlb_finish_mmu, particularly when anon pages are associated with swap.
> 
> Hmm, ok, so you are saying that we should do a flush at the end of each
> vma.

I think of it as doing a flush every ZAP_BLOCK_SIZE, with the imperfect
structure of the loop forcing perhaps an early flush at the end of each
vma: I seem to assume large vmas, and you to assume small ones.

IIRC, the common case for doing multiple vmas here is exit, when it
ends up that the TLB flush can often be skipped because already done
by the switch from exiting task; so the premature flush per vma doesn't
matter much.  But treat that claim with maximum scepticism: I've not
rechecked it, several aspects may be wrong.  What I do remember is
that (at least on i386) there's a lot less actual TLB flushing done
here than it appears from the outside.

> > So although there may be no need to resched right now, if we keep on
> > gathering more and more without flushing, we'll be very unresponsive
> > when a resched is needed later on.  Hence Ingo's ZAP_BLOCK_SIZE to
> > split it up, small when CONFIG_PREEMPT, more reasonable but still
> > limited when not.
> 
> Would it be acceptable to call tlb_flush_mmu instead of the
> tlb_finish_mmu / tlb_gather_mmu pair if the condition around
> cond_resched evaluates to false?

That sounds a good idea, yes, that should be fine.  But beware,
tlb_flush_mmu is an internal detail of the asm-generic/tlb.h method
and perhaps some others, it currently doesn't exist on some arches.

I think you just need to add a simple one to arm & arm26, and take
the "ia64_" off the ia64 one.  powerpc and sparc64 go about it all 
a bit differently, but it should be easy to give them one too.
There may be some others missing.

> The background for this change is that I'm working on another patch that
> will change the tlb flushing for s390 quite a bit. We won't have
> anything to flush with tlb_finish_mmu because we will either flush all
> tlbs with tlb_gather_mmu or each pte separately. The pages will always be
> freed immediately. If we are forced to restart the tlb gather then we'll
> do multiple flush_tlb_mm because the information that we already flushed
> everything is lost with tlb_finish_mmu.

Thanks for the info.  Sounds like we may have trouble ahead when
rearranging this stuff, easy to forget s390 from our assumptions:
keep watch!

Hugh

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [patch 1/5] avoid tlb gather restarts.
  2007-06-29 18:56   ` Hugh Dickins
@ 2007-06-29 21:19     ` Martin Schwidefsky
  2007-06-30 13:16       ` Hugh Dickins
  0 siblings, 1 reply; 15+ messages in thread
From: Martin Schwidefsky @ 2007-06-29 21:19 UTC (permalink / raw)
  To: Hugh Dickins; +Cc: linux-kernel, linux-mm

On Fri, 2007-06-29 at 19:56 +0100, Hugh Dickins wrote:
> I don't dare comment on your page_mkclean_one patch (5/5),
> that dirty page business has grown too subtle for me.

Oh yes, the dirty handling is tricky. I had to fix a really nasty bug
with it lately. As for page_mkclean_one the difference is that it
doesn't claim a page is dirty if only the write protect bit has not been
set. If we manage to lose dirty bits from ptes and have to rely on the
write protect bit to take over the job, then we have a different problem
altogether, no ?

> Your cleanups 2-4 look good, especially the mm_types.h one (how
> confident are you that everything builds?), and I'm glad we can
> now lay ptep_establish to rest.  Though I think you may have 
> missed removing a __HAVE_ARCH_PTEP... from frv at least?

Ok, thanks for the review. I'll take a look at frv to see if I missed
something.

> But this one...
> 
> On Fri, 29 Jun 2007, Martin Schwidefsky wrote:
> 
> > If need_resched() is false it is unnecessary to call tlb_finish_mmu()
> > and tlb_gather_mmu() for each vma in unmap_vmas(). Moving the tlb gather
> > restart under the if that contains the cond_resched() will avoid
> > unnecessary tlb flush operations that are triggered by tlb_finish_mmu() 
> > and tlb_gather_mmu().
> > 
> > Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
> 
> Sorry, no.  It looks reasonable, but unmap_vmas is treading a delicate
> and uncomfortable line between hi-performance and lo-latency: you've
> chosen to improve performance at the expense of latency.

That is true, my only concern had been performance. You likely have a
point here.

> You think you're just moving the finish/gather to where they're
> actually necessary; but the thing is, that per-cpu struct mmu_gather
> is liable to accumulate a lot of unpreemptible work for the future
> tlb_finish_mmu, particularly when anon pages are associated with swap.

Hmm, ok, so you are saying that we should do a flush at the end of each
vma.

> So although there may be no need to resched right now, if we keep on
> gathering more and more without flushing, we'll be very unresponsive
> when a resched is needed later on.  Hence Ingo's ZAP_BLOCK_SIZE to
> split it up, small when CONFIG_PREEMPT, more reasonable but still
> limited when not.

Would it be acceptable to call tlb_flush_mmu instead of the
tlb_finish_mmu / tlb_gather_mmu pair if the condition around
cond_resched evaluates to false?
The background for this change is that I'm working on another patch that
will change the tlb flushing for s390 quite a bit. We won't have
anything to flush with tlb_finish_mmu because we will either flush all
tlbs with tlb_gather_mmu or each pte separately. The pages will always be
freed immediately. If we are forced to restart the tlb gather then we'll
do multiple flush_tlb_mm because the information that we already flushed
everything is lost with tlb_finish_mmu.

-- 
blue skies,
  Martin.

"Reality continues to ruin my life." - Calvin.



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [patch 1/5] avoid tlb gather restarts.
  2007-06-29 13:55 ` [patch 1/5] avoid tlb gather restarts Martin Schwidefsky
@ 2007-06-29 18:56   ` Hugh Dickins
  2007-06-29 21:19     ` Martin Schwidefsky
  0 siblings, 1 reply; 15+ messages in thread
From: Hugh Dickins @ 2007-06-29 18:56 UTC (permalink / raw)
  To: Martin Schwidefsky; +Cc: linux-kernel, linux-mm

I don't dare comment on your page_mkclean_one patch (5/5),
that dirty page business has grown too subtle for me.

Your cleanups 2-4 look good, especially the mm_types.h one (how
confident are you that everything builds?), and I'm glad we can
now lay ptep_establish to rest.  Though I think you may have 
missed removing a __HAVE_ARCH_PTEP... from frv at least?

But this one...

On Fri, 29 Jun 2007, Martin Schwidefsky wrote:

> If need_resched() is false it is unnecessary to call tlb_finish_mmu()
> and tlb_gather_mmu() for each vma in unmap_vmas(). Moving the tlb gather
> restart under the if that contains the cond_resched() will avoid
> unnecessary tlb flush operations that are triggered by tlb_finish_mmu() 
> and tlb_gather_mmu().
> 
> Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>

Sorry, no.  It looks reasonable, but unmap_vmas is treading a delicate
and uncomfortable line between hi-performance and lo-latency: you've
chosen to improve performance at the expense of latency.

You think you're just moving the finish/gather to where they're
actually necessary; but the thing is, that per-cpu struct mmu_gather
is liable to accumulate a lot of unpreemptible work for the future
tlb_finish_mmu, particularly when anon pages are associated with swap.

So although there may be no need to resched right now, if we keep on
gathering more and more without flushing, we'll be very unresponsive
when a resched is needed later on.  Hence Ingo's ZAP_BLOCK_SIZE to
split it up, small when CONFIG_PREEMPT, more reasonable but still
limited when not.

I expect there is some tinkering which could be done to improve it a
little; but my ambition has always been to eliminate ZAP_BLOCK_SIZE,
get away from the per-cpu'ness of the mmu_gather, and make unmap_vmas
preemptible.  But the i_mmap_lock case, and the per-arch variations
in TLB flushing, have forever stalled me.

Hugh

> ---
> 
>  mm/memory.c |    7 +++----
>  1 files changed, 3 insertions(+), 4 deletions(-)
> 
> diff -urpN linux-2.6/mm/memory.c linux-2.6-patched/mm/memory.c
> --- linux-2.6/mm/memory.c	2007-06-29 15:44:08.000000000 +0200
> +++ linux-2.6-patched/mm/memory.c	2007-06-29 15:44:08.000000000 +0200
> @@ -851,19 +851,18 @@ unsigned long unmap_vmas(struct mmu_gath
>  				break;
>  			}
>  
> -			tlb_finish_mmu(*tlbp, tlb_start, start);
> -
>  			if (need_resched() ||
>  				(i_mmap_lock && need_lockbreak(i_mmap_lock))) {
> +				tlb_finish_mmu(*tlbp, tlb_start, start);
>  				if (i_mmap_lock) {
>  					*tlbp = NULL;
>  					goto out;
>  				}
>  				cond_resched();
> +				*tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
> +				tlb_start_valid = 0;
>  			}
>  
> -			*tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
> -			tlb_start_valid = 0;
>  			zap_work = ZAP_BLOCK_SIZE;
>  		}
>  	}

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [patch 1/5] avoid tlb gather restarts.
  2007-06-29 13:55 [patch 0/5] Various mm improvements Martin Schwidefsky
@ 2007-06-29 13:55 ` Martin Schwidefsky
  2007-06-29 18:56   ` Hugh Dickins
  0 siblings, 1 reply; 15+ messages in thread
From: Martin Schwidefsky @ 2007-06-29 13:55 UTC (permalink / raw)
  To: linux-kernel, linux-mm; +Cc: Martin Schwidefsky

[-- Attachment #1: 002-flush-restarts.diff --]
[-- Type: text/plain, Size: 1309 bytes --]

From: Martin Schwidefsky <schwidefsky@de.ibm.com>

If need_resched() is false it is unnecessary to call tlb_finish_mmu()
and tlb_gather_mmu() for each vma in unmap_vmas(). Moving the tlb gather
restart under the if that contains the cond_resched() will avoid
unnecessary tlb flush operations that are triggered by tlb_finish_mmu() 
and tlb_gather_mmu().

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---

 mm/memory.c |    7 +++----
 1 files changed, 3 insertions(+), 4 deletions(-)

diff -urpN linux-2.6/mm/memory.c linux-2.6-patched/mm/memory.c
--- linux-2.6/mm/memory.c	2007-06-29 15:44:08.000000000 +0200
+++ linux-2.6-patched/mm/memory.c	2007-06-29 15:44:08.000000000 +0200
@@ -851,19 +851,18 @@ unsigned long unmap_vmas(struct mmu_gath
 				break;
 			}
 
-			tlb_finish_mmu(*tlbp, tlb_start, start);
-
 			if (need_resched() ||
 				(i_mmap_lock && need_lockbreak(i_mmap_lock))) {
+				tlb_finish_mmu(*tlbp, tlb_start, start);
 				if (i_mmap_lock) {
 					*tlbp = NULL;
 					goto out;
 				}
 				cond_resched();
+				*tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
+				tlb_start_valid = 0;
 			}
 
-			*tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
-			tlb_start_valid = 0;
 			zap_work = ZAP_BLOCK_SIZE;
 		}
 	}

-- 
blue skies,
   Martin.

"Reality continues to ruin my life." - Calvin.


^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2007-07-16  6:21 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-07-03 11:18 [patch 0/5] some mm improvements + s390 tlb flush Martin Schwidefsky
2007-07-03 11:18 ` [patch 1/5] avoid tlb gather restarts Martin Schwidefsky
2007-07-03 17:42   ` Hugh Dickins
2007-07-04  7:37     ` Martin Schwidefsky
2007-07-16  6:20   ` Andrew Morton
2007-07-03 11:18 ` [patch 2/5] remove ptep_establish Martin Schwidefsky
2007-07-03 11:18 ` [patch 3/5] remove ptep_test_and_clear_dirty and ptep_clear_flush_dirty Martin Schwidefsky
2007-07-03 11:18 ` [patch 4/5] move mm_struct and vm_area_struct Martin Schwidefsky
2007-07-03 11:18 ` [patch 5/5] s390 tlb flush fix Martin Schwidefsky
2007-07-03 18:58   ` Hugh Dickins
2007-07-04  7:34     ` Martin Schwidefsky
  -- strict thread matches above, loose matches on Subject: below --
2007-06-29 13:55 [patch 0/5] Various mm improvements Martin Schwidefsky
2007-06-29 13:55 ` [patch 1/5] avoid tlb gather restarts Martin Schwidefsky
2007-06-29 18:56   ` Hugh Dickins
2007-06-29 21:19     ` Martin Schwidefsky
2007-06-30 13:16       ` Hugh Dickins

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).