LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
From: Andi Kleen <ak@suse.de>
To: patches@x86-64.org, linux-kernel@vger.kernel.org
Subject: [PATCH 2.6.21 review I] [1/25] x86_64: Add __copy_from_user_nocache
Date: Sat, 10 Feb 2007 12:50:13 +0100 (CET)	[thread overview]
Message-ID: <200702101250.142420000@suse.de> (raw)


This does user copies in fs write() into the page cache with write combining.
This pushes the destination out of the CPU's cache, but allows higher bandwidth
in some case.

The theory is that the page cache data is usually not touched by the 
CPU again and it's better to not pollute the cache with it. Also it is a little
faster.

Signed-off-by: Andi Kleen <ak@suse.de>

---
 arch/x86_64/kernel/x8664_ksyms.c    |    1 
 arch/x86_64/lib/Makefile            |    2 
 arch/x86_64/lib/copy_user_nocache.S |  217 ++++++++++++++++++++++++++++++++++++
 include/asm-x86_64/uaccess.h        |   14 ++
 4 files changed, 233 insertions(+), 1 deletion(-)

Index: linux/arch/x86_64/lib/Makefile
===================================================================
--- linux.orig/arch/x86_64/lib/Makefile
+++ linux/arch/x86_64/lib/Makefile
@@ -9,4 +9,4 @@ obj-y := io.o iomap_copy.o
 lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \
 	usercopy.o getuser.o putuser.o  \
 	thunk.o clear_page.o copy_page.o bitstr.o bitops.o
-lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o
+lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o copy_user_nocache.o
Index: linux/arch/x86_64/lib/copy_user_nocache.S
===================================================================
--- /dev/null
+++ linux/arch/x86_64/lib/copy_user_nocache.S
@@ -0,0 +1,217 @@
+/* Copyright 2002 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License v2.
+ *
+ * Functions to copy from and to user space.
+ */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+#define FIX_ALIGNMENT 1
+
+#include <asm/current.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/cpufeature.h>
+
+/*
+ * copy_user_nocache - Uncached memory copy with exception handling
+ * This will force destination/source out of cache for more performance.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ * rcx zero flag	when 1 zero on exception
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+ENTRY(__copy_user_nocache)
+	CFI_STARTPROC
+	pushq %rbx
+	CFI_ADJUST_CFA_OFFSET 8
+	CFI_REL_OFFSET rbx, 0
+	pushq %rcx		/* save zero flag */
+	CFI_ADJUST_CFA_OFFSET 8
+	CFI_REL_OFFSET rcx, 0
+
+	xorl %eax,%eax		/* zero for the exception handler */
+
+#ifdef FIX_ALIGNMENT
+	/* check for bad alignment of destination */
+	movl %edi,%ecx
+	andl $7,%ecx
+	jnz  .Lbad_alignment
+.Lafter_bad_alignment:
+#endif
+
+	movq %rdx,%rcx
+
+	movl $64,%ebx
+	shrq $6,%rdx
+	decq %rdx
+	js   .Lhandle_tail
+
+	.p2align 4
+.Lloop:
+.Ls1:	movq (%rsi),%r11
+.Ls2:	movq 1*8(%rsi),%r8
+.Ls3:	movq 2*8(%rsi),%r9
+.Ls4:	movq 3*8(%rsi),%r10
+.Ld1:	movnti %r11,(%rdi)
+.Ld2:	movnti %r8,1*8(%rdi)
+.Ld3:	movnti %r9,2*8(%rdi)
+.Ld4:	movnti %r10,3*8(%rdi)
+
+.Ls5:	movq 4*8(%rsi),%r11
+.Ls6:	movq 5*8(%rsi),%r8
+.Ls7:	movq 6*8(%rsi),%r9
+.Ls8:	movq 7*8(%rsi),%r10
+.Ld5:	movnti %r11,4*8(%rdi)
+.Ld6:	movnti %r8,5*8(%rdi)
+.Ld7:	movnti %r9,6*8(%rdi)
+.Ld8:	movnti %r10,7*8(%rdi)
+
+	dec  %rdx
+
+	leaq 64(%rsi),%rsi
+	leaq 64(%rdi),%rdi
+
+	jns  .Lloop
+
+	.p2align 4
+.Lhandle_tail:
+	movl %ecx,%edx
+	andl $63,%ecx
+	shrl $3,%ecx
+	jz   .Lhandle_7
+	movl $8,%ebx
+	.p2align 4
+.Lloop_8:
+.Ls9:	movq (%rsi),%r8
+.Ld9:	movnti %r8,(%rdi)
+	decl %ecx
+	leaq 8(%rdi),%rdi
+	leaq 8(%rsi),%rsi
+	jnz .Lloop_8
+
+.Lhandle_7:
+	movl %edx,%ecx
+	andl $7,%ecx
+	jz   .Lende
+	.p2align 4
+.Lloop_1:
+.Ls10:	movb (%rsi),%bl
+.Ld10:	movb %bl,(%rdi)
+	incq %rdi
+	incq %rsi
+	decl %ecx
+	jnz .Lloop_1
+
+	CFI_REMEMBER_STATE
+.Lende:
+	popq %rcx
+	CFI_ADJUST_CFA_OFFSET -8
+	CFI_RESTORE %rcx
+	popq %rbx
+	CFI_ADJUST_CFA_OFFSET -8
+	CFI_RESTORE rbx
+	ret
+	CFI_RESTORE_STATE
+
+#ifdef FIX_ALIGNMENT
+	/* align destination */
+	.p2align 4
+.Lbad_alignment:
+	movl $8,%r9d
+	subl %ecx,%r9d
+	movl %r9d,%ecx
+	cmpq %r9,%rdx
+	jz   .Lhandle_7
+	js   .Lhandle_7
+.Lalign_1:
+.Ls11:	movb (%rsi),%bl
+.Ld11:	movb %bl,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz .Lalign_1
+	subq %r9,%rdx
+	jmp .Lafter_bad_alignment
+#endif
+
+	/* table sorted by exception address */
+	.section __ex_table,"a"
+	.align 8
+	.quad .Ls1,.Ls1e
+	.quad .Ls2,.Ls2e
+	.quad .Ls3,.Ls3e
+	.quad .Ls4,.Ls4e
+	.quad .Ld1,.Ls1e
+	.quad .Ld2,.Ls2e
+	.quad .Ld3,.Ls3e
+	.quad .Ld4,.Ls4e
+	.quad .Ls5,.Ls5e
+	.quad .Ls6,.Ls6e
+	.quad .Ls7,.Ls7e
+	.quad .Ls8,.Ls8e
+	.quad .Ld5,.Ls5e
+	.quad .Ld6,.Ls6e
+	.quad .Ld7,.Ls7e
+	.quad .Ld8,.Ls8e
+	.quad .Ls9,.Le_quad
+	.quad .Ld9,.Le_quad
+	.quad .Ls10,.Le_byte
+	.quad .Ld10,.Le_byte
+#ifdef FIX_ALIGNMENT
+	.quad .Ls11,.Lzero_rest
+	.quad .Ld11,.Lzero_rest
+#endif
+	.quad .Le5,.Le_zero
+	.previous
+
+	/* compute 64-offset for main loop. 8 bytes accuracy with error on the
+	   pessimistic side. this is gross. it would be better to fix the
+	   interface. */
+	/* eax: zero, ebx: 64 */
+.Ls1e: 	addl $8,%eax
+.Ls2e: 	addl $8,%eax
+.Ls3e: 	addl $8,%eax
+.Ls4e: 	addl $8,%eax
+.Ls5e: 	addl $8,%eax
+.Ls6e: 	addl $8,%eax
+.Ls7e: 	addl $8,%eax
+.Ls8e: 	addl $8,%eax
+	addq %rbx,%rdi	/* +64 */
+	subq %rax,%rdi  /* correct destination with computed offset */
+
+	shlq $6,%rdx	/* loop counter * 64 (stride length) */
+	addq %rax,%rdx	/* add offset to loopcnt */
+	andl $63,%ecx	/* remaining bytes */
+	addq %rcx,%rdx	/* add them */
+	jmp .Lzero_rest
+
+	/* exception on quad word loop in tail handling */
+	/* ecx:	loopcnt/8, %edx: length, rdi: correct */
+.Le_quad:
+	shll $3,%ecx
+	andl $7,%edx
+	addl %ecx,%edx
+	/* edx: bytes to zero, rdi: dest, eax:zero */
+.Lzero_rest:
+	cmpl $0,(%rsp)	/* zero flag set? */
+	jz   .Le_zero
+	movq %rdx,%rcx
+.Le_byte:
+	xorl %eax,%eax
+.Le5:	rep
+	stosb
+	/* when there is another exception while zeroing the rest just return */
+.Le_zero:
+	movq %rdx,%rax
+	jmp .Lende
+	CFI_ENDPROC
+ENDPROC(__copy_user_nocache)
+
+
Index: linux/include/asm-x86_64/uaccess.h
===================================================================
--- linux.orig/include/asm-x86_64/uaccess.h
+++ linux/include/asm-x86_64/uaccess.h
@@ -367,4 +367,18 @@ __copy_to_user_inatomic(void __user *dst
 	return copy_user_generic((__force void *)dst, src, size);
 }
 
+#define ARCH_HAS_NOCACHE_UACCESS 1
+extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size, int zerorest);
+
+static inline int __copy_from_user_nocache(void *dst, const void __user *src, unsigned size)
+{
+	might_sleep();
+	return __copy_user_nocache(dst, (__force void *)src, size, 1);
+}
+
+static inline int __copy_from_user_inatomic_nocache(void *dst, const void __user *src, unsigned size)
+{
+	return __copy_user_nocache(dst, (__force void *)src, size, 0);
+}
+
 #endif /* __X86_64_UACCESS_H */
Index: linux/arch/x86_64/kernel/x8664_ksyms.c
===================================================================
--- linux.orig/arch/x86_64/kernel/x8664_ksyms.c
+++ linux/arch/x86_64/kernel/x8664_ksyms.c
@@ -26,6 +26,7 @@ EXPORT_SYMBOL(__put_user_4);
 EXPORT_SYMBOL(__put_user_8);
 
 EXPORT_SYMBOL(copy_user_generic);
+EXPORT_SYMBOL(__copy_user_nocache);
 EXPORT_SYMBOL(copy_from_user);
 EXPORT_SYMBOL(copy_to_user);
 EXPORT_SYMBOL(__copy_from_user_inatomic);

             reply	other threads:[~2007-02-10 11:50 UTC|newest]

Thread overview: 38+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-02-10 11:50 Andi Kleen [this message]
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [2/25] x86_64: Make the NUMA hash function nodemap allocation Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [3/25] i386: Convert i386 PDA code to use %fs Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [4/25] x86: kernel-mode faults pollute current->thead Andi Kleen
2007-02-12  9:32   ` [patches] " Jan Beulich
2007-02-12 16:42     ` Jeff Dike
2007-02-12 17:01       ` Jan Beulich
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [5/25] i386: revert i386-fix-the-verify_quirk_intel_irqbalance Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [6/25] x86_64: revert x86_64-mm-add-genapic_force Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [7/25] x86: revert x86_64-mm-fix-the-irqbalance-quirk-for-e7320-e7520-e7525 Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [8/25] x86_64: optimize & fix APIC mode setup Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [9/25] x86_64: always use physical delivery mode on > 8 CPUs Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [10/25] x86_64: remove clustered APIC mode Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [11/25] x86: default to physical mode on hotplug CPU kernels Andi Kleen
2007-02-11 11:13   ` Eric W. Biederman
2007-02-12 22:36     ` Andi Kleen
2007-02-12 23:10       ` Eric W. Biederman
2007-02-12 23:51         ` Siddha, Suresh B
2007-02-12 23:43       ` Siddha, Suresh B
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [12/25] x86_64: x86_64-make-the-numa-hash-function-nodemap-allocation fix fix Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [13/25] i386: Fix a typo in an IRQ handler name Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [14/25] x86: Share what's shareable Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [15/25] i386: Only call unreachable_devices() when type 1 is available Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [16/25] i386: Detect and support the E7520 and the 945G/GZ/P/PL Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [17/25] i386: Reserve resources but only when we're sure about them Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [18/25] x86_64: Fix x86_64 ioremap base_address Andi Kleen
2007-02-10 11:58   ` Arjan van de Ven
2007-02-10 12:07     ` Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [19/25] x86: Reject a broken MCFG tables on Asus etc Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [20/25] x86_64: get rid of ARCH_HAVE_XTIME_LOCK Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [21/25] x86_64: a memcpy that tries to reduce cache pressure Andi Kleen
2007-02-12  9:57   ` [patches] " Jan Beulich
2007-02-12 10:25     ` Andi Kleen
2007-02-13 11:27   ` Eric Dumazet
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [22/25] x86_64: use memcpy_uncached_read() in RDMA interrupt handler to reduce packet loss Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [23/25] x86_64: improved iommu documentation Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [24/25] x86_64: do not always end the stack trace with ULONG_MAX Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [25/25] i386: arch/i386/kernel/e820.c should #include <asm/setup.h Andi Kleen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=200702101250.142420000@suse.de \
    --to=ak@suse.de \
    --cc=linux-kernel@vger.kernel.org \
    --cc=patches@x86-64.org \
    --subject='Re: [PATCH 2.6.21 review I] [1/25] x86_64: Add __copy_from_user_nocache' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).