LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
From: Andi Kleen <ak@suse.de>
To: "Bryan O'Sullivan" <bos@pathscale.com>, Andi Kleen <ak@suse.de>,
	Roland Dreier <rolandd@cisco.com>,
	patches@x86-64.org, linux-kernel@vger.kernel.org
Subject: [PATCH 2.6.21 review I] [21/25] x86_64: a memcpy that tries to reduce cache pressure
Date: Sat, 10 Feb 2007 12:50:34 +0100 (CET)	[thread overview]
Message-ID: <20070210115034.694B013DBF@wotan.suse.de> (raw)
In-Reply-To: <200702101250.142420000@suse.de>


From: "Bryan O'Sullivan" <bos@pathscale.com>

This copy routine is memcpy-compatible, but on some architectures will use
cache-bypassing loads to avoid bringing the source data into the cache.

One case where this is useful is when a device issues a DMA to a memory
region, and the CPU must copy the DMAed data elsewhere before doing any work
with it.  Since the source data is read-once, write-never from the CPU's
perspective, caching the data at those addresses can only evict potentially
useful data.

We provide an x86_64 implementation that uses SSE non-temporal loads, and a
generic version that falls back to plain memcpy.

Implementors for other arches should not use cache-bypassing stores to the
destination, as in most cases, the destination is accessed almost immediately
after a copy finishes.

[akpm@osdl.org: add module export]
[akpm@osdl.org: remove an ARCH_HAS_foo]
Signed-off-by: Bryan O'Sullivan <bryan.osullivan@qlogic.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: Roland Dreier <rolandd@cisco.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 arch/x86_64/kernel/x8664_ksyms.c       |    2 
 arch/x86_64/lib/Makefile               |    1 
 arch/x86_64/lib/memcpy_uncached_read.S |  142 +++++++++++++++++++++++++++++++++
 include/asm-x86_64/string.h            |    2 
 include/linux/string.h                 |    3 
 5 files changed, 150 insertions(+)

Index: linux/arch/x86_64/kernel/x8664_ksyms.c
===================================================================
--- linux.orig/arch/x86_64/kernel/x8664_ksyms.c
+++ linux/arch/x86_64/kernel/x8664_ksyms.c
@@ -8,6 +8,7 @@
 #include <asm/processor.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
+#include <asm/string.h>
 
 EXPORT_SYMBOL(kernel_thread);
 
@@ -54,6 +55,7 @@ extern void * __memcpy(void *,const void
 EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(__memcpy);
+EXPORT_SYMBOL(memcpy_uncached_read);
 
 EXPORT_SYMBOL(empty_zero_page);
 EXPORT_SYMBOL(init_level4_pgt);
Index: linux/arch/x86_64/lib/Makefile
===================================================================
--- linux.orig/arch/x86_64/lib/Makefile
+++ linux/arch/x86_64/lib/Makefile
@@ -10,3 +10,4 @@ lib-y := csum-partial.o csum-copy.o csum
 	usercopy.o getuser.o putuser.o  \
 	thunk.o clear_page.o copy_page.o bitstr.o bitops.o
 lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o copy_user_nocache.o
+lib-y += memcpy_uncached_read.o
Index: linux/arch/x86_64/lib/memcpy_uncached_read.S
===================================================================
--- /dev/null
+++ linux/arch/x86_64/lib/memcpy_uncached_read.S
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2006 QLogic Corporation.  All Rights Reserved.
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+/*
+ * memcpy_uncached_read - memcpy-compatible copy routine, using streaming loads
+ * @dest: destination address
+ * @src: source address (will not be cached)
+ * @count: number of bytes to copy
+ *
+ * Use streaming loads and normal stores for a special-case copy where
+ * we know we won't be reading the source again, but will be reading the
+ * destination again soon.
+ */
+	.text
+	.p2align 4,,15
+	/* rdi  destination, rsi source, rdx count */
+	.globl	memcpy_uncached_read
+	.type	memcpy_uncached_read, @function
+memcpy_uncached_read:
+	movq	%rdi, %rax
+.L5:
+	cmpq	$15, %rdx
+	ja	.L34
+.L3:
+	cmpl	$8, %edx	/* rdx is 0..15 */
+	jbe	.L9
+.L6:
+	testb	$8, %dxl	/* rdx is 3,5,6,7,9..15 */
+	je	.L13
+	movq	(%rsi), %rcx
+	addq	$8, %rsi
+	movq	%rcx, (%rdi)
+	addq	$8, %rdi
+.L13:
+	testb	$4, %dxl
+	je	.L15
+	movl	(%rsi), %ecx
+	addq	$4, %rsi
+	movl	%ecx, (%rdi)
+	addq	$4, %rdi
+.L15:
+	testb	$2, %dxl
+	je	.L17
+	movzwl	(%rsi), %ecx
+	addq	$2, %rsi
+	movw	%cx, (%rdi)
+	addq	$2, %rdi
+.L17:
+	testb	$1, %dxl
+	je	.L33
+.L1:
+	movzbl	(%rsi), %ecx
+	movb	%cl, (%rdi)
+.L33:
+	ret
+.L34:
+	cmpq	$63, %rdx	/* rdx is > 15 */
+	ja	.L64
+	movl	$16, %ecx	/* rdx is 16..63 */
+.L25:
+	movq	8(%rsi), %r8
+	movq	(%rsi), %r9
+	addq	%rcx, %rsi
+	movq	%r8, 8(%rdi)
+	movq	%r9, (%rdi)
+	addq	%rcx, %rdi
+	subq	%rcx, %rdx
+	cmpl	%edx, %ecx	/* is rdx >= 16? */
+	jbe	.L25
+	jmp	.L3		/* rdx is 0..15 */
+	.p2align 4,,7
+.L64:
+	movl	$64, %ecx
+.L42:
+	prefetchnta	128(%rsi)
+	movq	(%rsi), %r8
+	movq	8(%rsi), %r9
+	movq	16(%rsi), %r10
+	movq	24(%rsi), %r11
+	subq	%rcx, %rdx
+	movq	%r8, (%rdi)
+	movq	32(%rsi), %r8
+	movq	%r9, 8(%rdi)
+	movq	40(%rsi), %r9
+	movq	%r10, 16(%rdi)
+	movq	48(%rsi), %r10
+	movq	%r11, 24(%rdi)
+	movq	56(%rsi), %r11
+	addq	%rcx, %rsi
+	movq	%r8, 32(%rdi)
+	movq	%r9, 40(%rdi)
+	movq	%r10, 48(%rdi)
+	movq	%r11, 56(%rdi)
+	addq	%rcx, %rdi
+	cmpq	%rdx, %rcx	/* is rdx >= 64? */
+	jbe	.L42
+	sfence
+	orl	%edx, %edx
+	je	.L33
+	jmp	.L5
+.L9:
+	jmp	*.L12(,%rdx,8)	/* rdx is 0..8 */
+	.section	.rodata
+	.align 8
+	.align 4
+.L12:
+	.quad	.L33
+	.quad	.L1
+	.quad	.L2
+	.quad	.L6
+	.quad	.L4
+	.quad	.L6
+	.quad	.L6
+	.quad	.L6
+	.quad	.L8
+	.text
+.L2:
+	movzwl	(%rsi), %ecx
+	movw	%cx, (%rdi)
+	ret
+.L4:
+	movl	(%rsi), %ecx
+	movl	%ecx, (%rdi)
+	ret
+.L8:
+	movq	(%rsi), %rcx
+	movq	%rcx, (%rdi)
+	ret
Index: linux/include/asm-x86_64/string.h
===================================================================
--- linux.orig/include/asm-x86_64/string.h
+++ linux/include/asm-x86_64/string.h
@@ -39,6 +39,8 @@ extern void *__memcpy(void *to, const vo
 		 __ret = __builtin_memcpy((dst),(src),__len);	\
 	   __ret; }) 
 
+extern void *memcpy_uncached_read(void *to, const void *from, size_t len);
+#define memcpy_uncached_read memcpy_uncached_read
 
 #define __HAVE_ARCH_MEMSET
 void *memset(void *s, int c, size_t n);
Index: linux/include/linux/string.h
===================================================================
--- linux.orig/include/linux/string.h
+++ linux/include/linux/string.h
@@ -85,6 +85,9 @@ extern void * memset(void *,int,__kernel
 #ifndef __HAVE_ARCH_MEMCPY
 extern void * memcpy(void *,const void *,__kernel_size_t);
 #endif
+#ifndef memcpy_uncached_read
+#define memcpy_uncached_read(dest, src, count) memcpy((dest), (src), (count))
+#endif
 #ifndef __HAVE_ARCH_MEMMOVE
 extern void * memmove(void *,const void *,__kernel_size_t);
 #endif

  parent reply	other threads:[~2007-02-10 11:51 UTC|newest]

Thread overview: 38+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-02-10 11:50 [PATCH 2.6.21 review I] [1/25] x86_64: Add __copy_from_user_nocache Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [2/25] x86_64: Make the NUMA hash function nodemap allocation Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [3/25] i386: Convert i386 PDA code to use %fs Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [4/25] x86: kernel-mode faults pollute current->thead Andi Kleen
2007-02-12  9:32   ` [patches] " Jan Beulich
2007-02-12 16:42     ` Jeff Dike
2007-02-12 17:01       ` Jan Beulich
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [5/25] i386: revert i386-fix-the-verify_quirk_intel_irqbalance Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [6/25] x86_64: revert x86_64-mm-add-genapic_force Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [7/25] x86: revert x86_64-mm-fix-the-irqbalance-quirk-for-e7320-e7520-e7525 Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [8/25] x86_64: optimize & fix APIC mode setup Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [9/25] x86_64: always use physical delivery mode on > 8 CPUs Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [10/25] x86_64: remove clustered APIC mode Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [11/25] x86: default to physical mode on hotplug CPU kernels Andi Kleen
2007-02-11 11:13   ` Eric W. Biederman
2007-02-12 22:36     ` Andi Kleen
2007-02-12 23:10       ` Eric W. Biederman
2007-02-12 23:51         ` Siddha, Suresh B
2007-02-12 23:43       ` Siddha, Suresh B
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [12/25] x86_64: x86_64-make-the-numa-hash-function-nodemap-allocation fix fix Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [13/25] i386: Fix a typo in an IRQ handler name Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [14/25] x86: Share what's shareable Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [15/25] i386: Only call unreachable_devices() when type 1 is available Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [16/25] i386: Detect and support the E7520 and the 945G/GZ/P/PL Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [17/25] i386: Reserve resources but only when we're sure about them Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [18/25] x86_64: Fix x86_64 ioremap base_address Andi Kleen
2007-02-10 11:58   ` Arjan van de Ven
2007-02-10 12:07     ` Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [19/25] x86: Reject a broken MCFG tables on Asus etc Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [20/25] x86_64: get rid of ARCH_HAVE_XTIME_LOCK Andi Kleen
2007-02-10 11:50 ` Andi Kleen [this message]
2007-02-12  9:57   ` [patches] [PATCH 2.6.21 review I] [21/25] x86_64: a memcpy that tries to reduce cache pressure Jan Beulich
2007-02-12 10:25     ` Andi Kleen
2007-02-13 11:27   ` Eric Dumazet
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [22/25] x86_64: use memcpy_uncached_read() in RDMA interrupt handler to reduce packet loss Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [23/25] x86_64: improved iommu documentation Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [24/25] x86_64: do not always end the stack trace with ULONG_MAX Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [25/25] i386: arch/i386/kernel/e820.c should #include <asm/setup.h Andi Kleen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20070210115034.694B013DBF@wotan.suse.de \
    --to=ak@suse.de \
    --cc=bos@pathscale.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=patches@x86-64.org \
    --cc=rolandd@cisco.com \
    --subject='Re: [PATCH 2.6.21 review I] [21/25] x86_64: a memcpy that tries to reduce cache pressure' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).