/*
 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>


// *****************
// * S T R N C P Y *
// *****************
//
// char *strncpy(const char *dst, const char *src, size_t n);
//
// We optimize the move by doing it vector parallel.  This introduces
// a complication: if we blindly did vector load/stores until finding
// a 0, we might get a spurious page fault by touching bytes past it.
// To avoid this, we never do a load that crosses a page boundary,
// and never store a byte we don't have to.
//
// We align the destination, because unaligned vector stores are slow.
//
// Recall that strncpy() zero fills the remainder of the dest buffer,
// and does not terminate the string if its length is greater than or
// equal to n.
44 45#define kShort 31 // too short to bother with vector loop 46 47 .text 48 .globl _strncpy 49 50 .align 4 51_strncpy: // char *strncpy(const char *dst, const char *src, size_t n); 52 pushl %edi 53 pushl %esi 54 movl 12(%esp),%edi // get dest ptr 55 movl 16(%esp),%esi // get source ptr 56 movl 20(%esp),%ecx // get length 57 movl %edi,%edx // copy dest ptr 58 negl %edx 59 andl $15,%edx // how many bytes to align dest ptr? 60 jnz LCheckShortCopy // align destination first 61 62 63// In order to avoid spurious page faults, we loop until nearing the source page 64// end. Then we revert to a byte-by-byte loop for 16 bytes until the page is crossed, 65// then resume the vector loop. 66// %esi = source ptr (unaligned) 67// %edi = dest ptr (aligned) 68// %ecx = buffer length remaining 69 70LNextChunk: // NB: can drop down to here 71 movl %esi,%eax // copy source ptr 72 movl $4096,%edx 73 andl $4095,%eax // get offset into source page 74 subl %eax,%edx // get #bytes remaining in source page 75 cmpl %ecx,%edx // will buffer run out before the page end? 76 cmova %ecx,%edx // get min(length remaining, bytes to page end) 77 shrl $4,%edx // get #chunks till end of page 78 jnz LLoopOverChunks // enter vector loop 79 80// We can't use the chunk loop yet. Check for short and empty buffers, then use byte loop. 81 82LCrossPage: // if buffer is large enough, cross source page 83 movl $16,%edx // move 16 bytes to cross page but keep dest aligned 84LCheckShortCopy: // we propose to copy %edx bytes in byte loop 85 cmpl $(kShort),%ecx // much left? 86 ja LLoopOverBytes // yes, loop over bytes then more chunks 87 movl %ecx,%edx // no, use the byte loop for everything 88 testl %ecx,%ecx // have we filled buffer? 89 jnz LLoopOverBytes // no 90 jmp LDone 91 92 93// Loop over bytes. 
94// %esi = source ptr 95// %edi = dest ptr 96// %ecx = buffer length remaining 97// %edx = count of bytes to loop over (<= buffer length) 98 99 .align 4,0x90 // align inner loops to optimize I-fetch 100LLoopOverBytes: 101 movzb (%esi),%eax // get source byte 102 inc %esi 103 dec %ecx // decrement length 104 movb %al,(%edi) // pack into dest 105 inc %edi 106 testl %eax,%eax // 0? 107 jz LZeroBuffer // yes, we're done copying string 108 dec %edx // more to go? 109 jnz LLoopOverBytes 110 111 testl %ecx,%ecx // at end of buffer? 112 jnz LNextChunk // no, xfer chunks 113 jmp LDone // yes 114 115 116// Loop over 16-byte chunks. 117// %esi = source ptr (unaligned) 118// %edi = dest ptr (aligned) 119// %ecx = buffer length remaining 120// %edx = chunk count 121 122 .align 4,0x90 // align inner loops to optimize I-fetch 123LLoopOverChunks: 124 movdqu (%esi),%xmm1 // get source 125 pxor %xmm0,%xmm0 // get some 0s 126 addl $16,%esi 127 pcmpeqb %xmm1,%xmm0 // compare source to 0s 128 pmovmskb %xmm0,%eax // get result mask for 0 check 129 testl %eax,%eax // any 0s? 130 jnz LFound0 // yes, exit loop 131 movdqa %xmm1,(%edi) // no 0s so do aligned store into destination 132 addl $16,%edi 133 subl $16,%ecx // decrement length remaining 134 dec %edx // more to go? 135 jnz LLoopOverChunks 136 137 jmp LCrossPage // cross page but keep dest aligned 138 139 140// Found a zero in the vector. Figure out where it is, and store the bytes 141// up to it. It is possible that we should check to be sure (%ecx >= 16), and 142// just do an aligned store of %xmm1 if so. But if we did, we'd be doing byte 143// stores into the same double quadword in bzero(), which might hit a hazard. 144// Experimentation needed. 145// %edi = dest ptr (aligned) 146// %eax = result mask 147// %ecx = buffer length remaining 148// %xmm1 = source vector 149 150LFound0: 151 bsf %eax,%edx // find first 0 152 subl %edx,%ecx // decrement remaining buffer length 153 test $8,%dl // 8-byte store required? 
154 jz 4f // no 155 movq %xmm1,(%edi) // pack in 8 low bytes 156 psrldq $8,%xmm1 // then shift vector down 8 bytes 157 addl $8,%edi 1584: 159 test $4,%dl // 4-byte store required? 160 jz 3f // no 161 movd %xmm1,(%edi) // pack in 4 low bytes 162 psrldq $4,%xmm1 // then shift vector down 4 bytes 163 addl $4,%edi 1643: 165 andl $3,%edx // more to go? 166 jz LZeroBuffer // no 167 movd %xmm1,%eax // move remainders out of vector into %eax 1681: // loop on up to three bytes 169 movb %al,(%edi) // pack in next byte 170 shrl $8,%eax // shift next byte into position 171 inc %edi 172 dec %edx 173 jnz 1b 174 175// We've copied the string. Now zero the rest of the buffer, using commpage bzero(). 176// %edi = dest ptr 177// %ecx = buffer length remaining 178 179LZeroBuffer: 180// The stack currently is aligned to 4 mod 16 (it was 0 mod 16 at the time of 181// the call, and the return address, edi, and esi have been pushed). It needs 182// to aligned 0 mod 16 when we call bzero, so we subtract 20 from esp (not 4 183// because we need to have 8 bytes for the arguments to bzero). 184 subl $20,%esp 185 movl %ecx,4(%esp) // remaining buffer size 186 movl %edi, (%esp) // pointer to first unstored byte 187 call _bzero 188 addl $20,%esp 189 190LDone: 191 movl 12(%esp),%eax // original dest ptr is return value 192 popl %esi 193 popl %edi 194 ret 195