/*
 * Copyright (c) 2008 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * The bcopy/memcpy loops, tuned for Nehalem.  This is the 64-bit version.
 *
 * The following #defines are tightly coupled to the microarchitecture:
 */

#define kShort  80                      // too short to bother with SSE (must be >=80)


// void bcopy(const void *src, void *dst, size_t len);

        .text
        .code64
        .align  5, 0x90
Lbcopy_sse42_64:                        // void bcopy(const void *src, void *dst, size_t len)
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        movq    %rsi,%rax               // copy dest ptr
        movq    %rdi,%rsi               // exchange source and dest ptrs
        movq    %rax,%rdi
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        jbe     LShort                  // no
        jmp     LNotShort

//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():
//

        .align  5, 0x90
Lmemcpy:                                // void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:                               // void *memmove(void *dst, const void *src, size_t len)
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        movq    %rdi,%r11               // save return value here
        movq    %rdi,%rax
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LNotShort               // yes

// Handle short forward copies.  As the most common case, this is the fall-through path.
//      rdx = length (<= kShort)
//      rsi = source ptr
//      rdi = dest ptr

LShort:
        movl    %edx,%ecx               // copy length using 32-bit operation
        shrl    $2,%ecx                 // get #doublewords
        jz      3f
2:                                      // loop copying doublewords
        movl    (%rsi),%eax
        addq    $4,%rsi
        movl    %eax,(%rdi)
        addq    $4,%rdi
        decl    %ecx
        jnz     2b
3:                                      // handle leftover bytes (0..3) in last word
        andl    $3,%edx                 // any leftover bytes?
        jz      5f
4:                                      // loop copying bytes
        movb    (%rsi),%al
        incq    %rsi
        movb    %al,(%rdi)
        incq    %rdi
        decl    %edx
        jnz     4b
5:
        movq    %r11,%rax               // get return value (dst ptr) for memcpy/memmove
        popq    %rbp
        ret
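
// Illustrative note (not part of the build): the "jb LReverseIsland" tests in
// both prologues above implement the overlap check with a single unsigned
// compare.  A hedged C-level sketch of that test, using hypothetical names,
// might look like this:
//
//      #include <stdint.h>
//      #include <stddef.h>
//
//      static int must_copy_in_reverse(void *dst, const void *src, size_t len)
//      {
//          // Unsigned wraparound makes (dst - src) huge when dst is below src,
//          // so the test is false there and the forward path is used.  It is
//          // true only when dst lands inside [src, src+len), i.e. when a
//          // forward copy would overwrite source bytes before reading them.
//          return (uintptr_t)dst - (uintptr_t)src < (uintptr_t)len;
//      }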


LReverseIsland:                         // keep the "jb" above a short branch...
        jmp     LReverse                // ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
//      rdx = length (> kShort)
//      rsi = source ptr
//      rdi = dest ptr

LNotShort:
        movl    %edi,%ecx               // copy low half of destination ptr
        negl    %ecx
        andl    $15,%ecx                // get #bytes to align destination
        jz      LDestAligned            // already aligned
        subl    %ecx,%edx               // decrement length
1:                                      // loop copying 1..15 bytes
        movb    (%rsi),%al
        inc     %rsi
        movb    %al,(%rdi)
        inc     %rdi
        dec     %ecx
        jnz     1b


// Destination is now aligned.  Nehalem does a great job with unaligned SSE loads,
// so we use MOVDQU rather than aligned loads and shifts.  Since kShort>=80, we
// know there is at least one 64-byte chunk to move.
// When we enter the copy loops, the following registers are set up:
//      rdx = residual length (0..63)
//      rcx = -(length to move), a multiple of 64 less than 2GB
//      rsi = ptr to 1st source byte not to move (unaligned)
//      rdi = ptr to 1st dest byte not to move (aligned)

LDestAligned:
        movq    %rdx,%rcx               // copy length
        andl    $63,%edx                // get remaining bytes for LShort
        andq    $-64,%rcx               // get number of bytes we will copy in inner loop
        addq    %rcx,%rsi               // point to 1st byte not copied
        addq    %rcx,%rdi
        negq    %rcx                    // now generate offset to 1st byte to be copied
        testl   $15,%esi                // source also aligned?
        jnz     LUnalignedLoop
        jmp     LAlignedLoop


// Forward loop for aligned operands.

        .align  4,0x90                  // 16-byte align inner loops
LAlignedLoop:                           // loop over 64-byte chunks
        movdqa  (%rsi,%rcx),%xmm0
        movdqa  16(%rsi,%rcx),%xmm1
        movdqa  32(%rsi,%rcx),%xmm2
        movdqa  48(%rsi,%rcx),%xmm3

        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm1,16(%rdi,%rcx)
        movdqa  %xmm2,32(%rdi,%rcx)
        movdqa  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     LAlignedLoop

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for unaligned operands.

        .align  4,0x90                  // 16-byte align inner loops
LUnalignedLoop:                         // loop over 64-byte chunks
        movdqu  (%rsi,%rcx),%xmm0
        movdqu  16(%rsi,%rcx),%xmm1
        movdqu  32(%rsi,%rcx),%xmm2
        movdqu  48(%rsi,%rcx),%xmm3

        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm1,16(%rdi,%rcx)
        movdqa  %xmm2,32(%rdi,%rcx)
        movdqa  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     LUnalignedLoop

        jmp     LShort                  // copy remaining 0..63 bytes and done
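
// Illustrative note (not part of the build): a hedged C-level sketch of the
// negative-offset chunking used by LDestAligned and the two forward loops
// above, after the destination has already been 16-byte aligned by LNotShort.
// Names are hypothetical, and the 64-byte memcpy below merely stands in for
// the four 16-byte XMM loads/stores per iteration:
//
//      #include <stddef.h>
//      #include <string.h>
//
//      static void chunked_forward_copy(unsigned char *dst, const unsigned char *src,
//                                       size_t len)
//      {
//          size_t chunked = len & ~(size_t)63;         // bytes moved by the inner loop
//          size_t tail    = len & 63;                  // 0..63 bytes left for LShort
//          const unsigned char *send = src + chunked;  // 1st source byte not moved here
//          unsigned char *dend = dst + chunked;        // 1st dest byte not moved here
//          for (ptrdiff_t off = -(ptrdiff_t)chunked; off != 0; off += 64)
//              memcpy(dend + off, send + off, 64);     // one 64-byte chunk
//          memcpy(dend, send, tail);                   // short tail, as in LShort
//      }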


// Reverse moves.  These are only used with destructive overlap.
//      rdx = length
//      rsi = source ptr
//      rdi = dest ptr

LReverse:
        addq    %rdx,%rsi               // point to end of strings
        addq    %rdx,%rdi
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      rdx = length (<= kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

LReverseShort:
        movl    %edx,%ecx               // copy length
        shrl    $3,%ecx                 // get #quadwords
        jz      3f
1:                                      // loop copying quadwords
        subq    $8,%rsi
        movq    (%rsi),%rax
        subq    $8,%rdi
        movq    %rax,(%rdi)
        decl    %ecx
        jnz     1b
3:
        andl    $7,%edx                 // any leftover bytes?
        jz      5f
4:                                      // loop copying bytes
        decq    %rsi
        movb    (%rsi),%al
        decq    %rdi
        movb    %al,(%rdi)
        decl    %edx
        jnz     4b
5:
        movq    %r11,%rax               // get return value (dst ptr) for memcpy/memmove
        popq    %rbp
        ret

// Handle a reverse move long enough to justify using SSE.
//      rdx = length (> kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

LReverseNotShort:
        movl    %edi,%ecx               // copy low half of destination ptr
        andl    $15,%ecx                // get #bytes to align destination
        jz      LReverseDestAligned     // already aligned
        subq    %rcx,%rdx               // adjust length
1:                                      // loop copying 1..15 bytes
        decq    %rsi
        movb    (%rsi),%al
        decq    %rdi
        movb    %al,(%rdi)
        decl    %ecx
        jnz     1b

// Destination is now aligned.  Prepare for reverse loops.

LReverseDestAligned:
        movq    %rdx,%rcx               // copy length
        andl    $63,%edx                // get remaining bytes for LReverseShort
        andq    $-64,%rcx               // get number of bytes we will copy in inner loop
        subq    %rcx,%rsi               // point to endpoint of copy
        subq    %rcx,%rdi
        testl   $15,%esi                // is source aligned too?
        jnz     LReverseUnalignedLoop   // no

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%rsi,%rcx),%xmm0
        movdqa  -32(%rsi,%rcx),%xmm1
        movdqa  -48(%rsi,%rcx),%xmm2
        movdqa  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%rsi,%rcx),%xmm0
        movdqu  -32(%rsi,%rcx),%xmm1
        movdqu  -48(%rsi,%rcx),%xmm2
        movdqu  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


        COMMPAGE_DESCRIPTOR(bcopy_sse42_64,_COMM_PAGE_BCOPY,kHasSSE4_2,0)
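
// Illustrative note (not part of the build): a hedged C-level sketch of the
// reverse path above, which runs only for destructively overlapping buffers
// (dest above source).  The name is hypothetical; the real code steps by 8
// and by 64 bytes rather than byte-at-a-time:
//
//      static void reverse_copy(unsigned char *dst, const unsigned char *src,
//                               unsigned long len)
//      {
//          src += len;                 // point one byte past the end of each buffer
//          dst += len;
//          while (len--)               // copy from the high end downward so no
//              *--dst = *--src;        // source byte is clobbered before it is read
//      }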