/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/* The common path for nonzero memset and the memset_pattern routines,
 * tuned for Pentium-M class processors with SSE2 and 64-byte cache lines.
 * This is the 64-bit version.  It is used by the following functions:
 *
 *	void *memset(void *b, int c, size_t len);                   // when c!=0
 *	void memset_pattern4(void *b, const void *c4, size_t len);
 *	void memset_pattern8(void *b, const void *c8, size_t len);
 *	void memset_pattern16(void *b, const void *c16, size_t len);
 *
 * Note bzero() and memset() of 0 are handled separately.
 * (A reference C sketch of the memset_pattern16() semantics appears at the
 * end of this file.)
 */

#define	kShort		63
#define	kVeryLong	(1024*1024)

// Initial entry from Libc with parameters passed in registers.  Although we
// correctly handle misaligned ptrs and short operands, they are inefficient.
// Therefore our caller should filter out short operands and exploit local
// knowledge (ie, original pattern length) to align the ptr if possible.
// When called, we expect:
//	%rdi = ptr to memory to set (not necessarily aligned)
//	%rdx = length (may be short or even 0)
//	%xmm0 = the pattern to store
// Return conditions:
//	%rax, %rdi, %rsi, %rcx, and %rdx all trashed
//	we preserve %r8, %r9, %r10, and %r11

	.text
	.align	5, 0x90
	.code64
Lmemset_pattern_sse2_64:
	cmpq	$(kShort),%rdx		// long enough to bother aligning?
	ja	LNotShort		// yes
	jmp	LShort			// no

// Here for short operands or the end of long ones.
//	%rdx = length (<= kShort)
//	%rdi = ptr (may not be aligned)
//	%xmm0 = pattern

LUnalignedStore16:
	movdqu	%xmm0,(%rdi)		// stuff in another 16 bytes
	subl	$16,%edx
	addq	$16,%rdi
LShort:
	cmpl	$16,%edx		// room for another vector?
	jge	LUnalignedStore16	// yes
LLessThan16:				// here at end of copy with < 16 bytes remaining
	test	$8,%dl			// 8-byte store required?
	jz	2f			// no
	movq	%xmm0,(%rdi)		// pack in 8 low bytes
	psrldq	$8,%xmm0		// then shift vector down 8 bytes
	addq	$8,%rdi
2:
	test	$4,%dl			// 4-byte store required?
	jz	3f			// no
	movd	%xmm0,(%rdi)		// pack in 4 low bytes
	psrldq	$4,%xmm0		// then shift vector down 4 bytes
	addq	$4,%rdi
3:
	andl	$3,%edx			// more to go?
	jz	5f			// no
	movd	%xmm0,%eax		// move remainders out into %eax
4:					// loop on up to three bytes
	movb	%al,(%rdi)		// pack in next byte
	shrl	$8,%eax			// shift next byte into position
	incq	%rdi
	dec	%edx
	jnz	4b
5:	ret

// Long enough to justify aligning ptr.  Note that we have to rotate the
// pattern to account for any alignment.  We do this by doing two unaligned
// stores, and then an aligned load from the middle of the two stores.
// (For example, if the ptr needs 5 more bytes to reach 16-byte alignment,
// the aligned reload yields the pattern rotated so that byte 5 comes first,
// keeping the subsequent aligned stores in phase with the bytes already
// written by the unaligned stores.)
// This will stall on store forwarding alignment mismatch, and the unaligned
// stores can be pretty slow too, but the alternatives aren't any better.
// Fortunately, in most cases our caller has already aligned the ptr.
//	%rdx = length (> kShort)
//	%rdi = ptr (may not be aligned)
//	%xmm0 = pattern

LNotShort:
	movl	%edi,%ecx		// copy low bits of dest ptr
	negl	%ecx
	andl	$15,%ecx		// mask down to #bytes to 16-byte align
	jz	LAligned		// skip if already aligned
	movdqu	%xmm0,(%rdi)		// store 16 unaligned bytes
	movdqu	%xmm0,16(%rdi)		// and 16 more, to be sure we have an aligned chunk
	addq	%rcx,%rdi		// now point to the aligned chunk
	subq	%rcx,%rdx		// adjust remaining count
	movdqa	(%rdi),%xmm0		// get the rotated pattern (probably stalling)
	addq	$16,%rdi		// skip past the aligned chunk
	subq	$16,%rdx

// Set up for 64-byte loops.
//	%rdx = length remaining
//	%rdi = ptr (aligned)
//	%xmm0 = rotated pattern

LAligned:
	movq	%rdx,%rcx		// copy length remaining
	andl	$63,%edx		// mask down to residual length (0..63)
	andq	$-64,%rcx		// %rcx <- #bytes we will store in by-64 loop
	jz	LNoMoreChunks		// no 64-byte chunks
	addq	%rcx,%rdi		// increment ptr by length to move
	cmpq	$(kVeryLong),%rcx	// long enough to justify non-temporal stores?
	jge	LVeryLong		// yes
	negq	%rcx			// negate length to move
	jmp	1f

// Loop over 64-byte chunks, storing into cache.

	.align	4,0x90			// keep inner loops 16-byte aligned
1:
	movdqa	%xmm0,(%rdi,%rcx)
	movdqa	%xmm0,16(%rdi,%rcx)
	movdqa	%xmm0,32(%rdi,%rcx)
	movdqa	%xmm0,48(%rdi,%rcx)
	addq	$64,%rcx
	jne	1b

	jmp	LNoMoreChunks

// Very long operands: use non-temporal stores to bypass cache.

LVeryLong:
	negq	%rcx			// negate length to move
	jmp	1f

	.align	4,0x90			// keep inner loops 16-byte aligned
1:
	movntdq	%xmm0,(%rdi,%rcx)
	movntdq	%xmm0,16(%rdi,%rcx)
	movntdq	%xmm0,32(%rdi,%rcx)
	movntdq	%xmm0,48(%rdi,%rcx)
	addq	$64,%rcx
	jne	1b

	sfence				// required by non-temporal stores
	jmp	LNoMoreChunks

// Handle leftovers: loop by 16.
//	%edx = length remaining (<64)
//	%rdi = ptr (aligned)
//	%xmm0 = rotated pattern

LLoopBy16:
	movdqa	%xmm0,(%rdi)		// pack in 16 more bytes
	subl	$16,%edx		// decrement count
	addq	$16,%rdi		// increment ptr
LNoMoreChunks:
	cmpl	$16,%edx		// more to go?
	jge	LLoopBy16		// yes
	jmp	LLessThan16		// handle up to 15 remaining bytes

	COMMPAGE_DESCRIPTOR(memset_pattern_sse2_64,_COMM_PAGE_MEMSET_PATTERN,kHasSSE2,0)
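
/*
 * For reference: a portable C-level sketch of the memset_pattern16() store
 * semantics that the routine above implements.  This is an illustrative
 * sketch only, not the actual Libc entry point (per the entry comment above,
 * the real wrapper loads the 16-byte pattern into %xmm0, puts the length in
 * %rdx, and branches into the commpage routine); the _ref name below is
 * made up for illustration.
 *
 *	#include <stddef.h>
 *
 *	// Hypothetical reference version, for illustration only.
 *	void
 *	memset_pattern16_ref(void *b, const void *c16, size_t len)
 *	{
 *		unsigned char *dst = (unsigned char *)b;
 *		const unsigned char *pat = (const unsigned char *)c16;
 *		size_t i;
 *
 *		// Repeat the 16-byte pattern from the start of the buffer,
 *		// truncating the final copy if len is not a multiple of 16.
 *		for (i = 0; i < len; i++)
 *			dst[i] = pat[i % 16];
 *	}
 */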