/*
 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>


// *****************
// * S T R N C P Y *
// *****************
//
// char *strncpy(char *dst, const char *src, size_t n);
//
// We optimize the move by doing it vector parallel.  This introduces
// a complication: if we blindly did vector load/stores until finding
// a 0, we might get a spurious page fault by touching bytes past it.
// To avoid this, we never do a load that crosses a page boundary,
// and never store a byte we don't have to.
//
// We align the destination, because unaligned vector stores are slow.
//
// Recall that strncpy() zero fills the remainder of the dest buffer,
// and does not terminate the string if its length is greater than or
// equal to n.

#define	kShort	31			// too short to bother with vector loop

// -----------------------------------------------------------------------
// char *strncpy(char *dst, const char *src, size_t n);
//
// Syntax: AT&T, preprocessed by cpp (hence the // comments and #define).
// ABI:    System V AMD64 argument registers, Mach-O symbol naming.
// In:     %rdi = dst, %rsi = src, %rdx = n
// Out:    %rax = dst (the original destination pointer)
// Writes: %rax, %rcx, %rdx, %rsi, %rdi, %r8, %xmm0, %xmm1, flags,
//         plus whatever _bzero clobbers.
// Stack:  one 8-byte push around the call to _bzero; it both saves %r8
//         and keeps %rsp 16-byte aligned at the call.
//
// Copies bytes from src to dst until a 0 byte has been copied or n bytes
// have been written.  If a 0 is found first, the rest of the n-byte
// buffer is zero filled by _bzero.  If no 0 is found within n bytes the
// destination is left unterminated.
// -----------------------------------------------------------------------

	.text
	.globl	_strncpy

	.align	4
_strncpy:				// char *strncpy(char *dst, const char *src, size_t n);
	movq	%rdi,%r8		// preserve destination pointer so we can return it
	movl	%edi,%ecx		// copy low 4 bytes of dest ptr
	negl	%ecx
	andl	$15,%ecx		// how many bytes to align dest ptr?
	jnz	LCheckShortCopy		// align destination first


// In order to avoid spurious page faults, we loop until nearing the source page
// end.  Then we revert to a byte-by-byte loop for 16 bytes until the page is crossed,
// then resume the vector loop.
//	%rsi = source ptr (unaligned)
//	%rdi = dest ptr (aligned)
//	%rdx = buffer length remaining

LNextChunk:				// NB: can drop down to here
	movl	%esi,%eax		// copy the low 4 bytes of the source ptr
	movl	$4096,%ecx
	andl	$4095,%eax		// get offset into source page
	subl	%eax,%ecx		// get #bytes remaining in source page
	cmpq	%rdx,%rcx		// will buffer run out before the page end?
	cmova	%rdx,%rcx		// get min(length remaining, bytes to page end)
	shrl	$4,%ecx			// get #chunks till end of page
	jnz	LLoopOverChunks		// enter vector loop

// We can't use the chunk loop yet.  Check for short and empty buffers, then use byte loop.

LCrossPage:				// if buffer is large enough, cross source page
	movl	$16,%ecx		// move 16 bytes to cross page but keep dest aligned
LCheckShortCopy:			// we propose to copy %ecx bytes in byte loop
	cmpq	$(kShort),%rdx		// much left?
	ja	LLoopOverBytes		// yes, loop over bytes then more chunks
	movl	%edx,%ecx		// no, use the byte loop for everything
	testl	%edx,%edx		// have we filled buffer?
	jnz	LLoopOverBytes		// no
	jmp	LDone


// Loop over bytes.
//	%rsi = source ptr
//	%rdi = dest ptr
//	%rdx = buffer length remaining
//	%rcx = count of bytes to loop over (<= buffer length)

	.align	4,0x90			// align inner loops to optimize I-fetch
LLoopOverBytes:
	movzbl	(%rsi),%eax		// get source byte
	addq	$1,%rsi
	subq	$1,%rdx			// decrement length
	movb	%al,(%rdi)		// pack into dest
	addq	$1,%rdi
	testl	%eax,%eax		// 0?
	jz	LZeroBuffer		// yes, we're done copying string
	subq	$1,%rcx			// more to go?
	jnz	LLoopOverBytes

	testq	%rdx,%rdx		// at end of buffer?
	jnz	LNextChunk		// no, xfer chunks
	jmp	LDone			// yes


// Loop over 16-byte chunks.
//	%rsi = source ptr (unaligned)
//	%rdi = dest ptr (aligned)
//	%rdx = buffer length remaining
//	%ecx = chunk count

	.align	4,0x90			// align inner loops to optimize I-fetch
LLoopOverChunks:
	movdqu	(%rsi),%xmm1		// get source
	pxor	%xmm0,%xmm0		// get some 0s
	addq	$16,%rsi
	pcmpeqb	%xmm1,%xmm0		// compare source to 0s
	pmovmskb %xmm0,%eax		// get result mask for 0 check
	testl	%eax,%eax		// any 0s?
	jnz	LFound0			// yes, exit loop
	movdqa	%xmm1,(%rdi)		// no 0s so do aligned store into destination
	addq	$16,%rdi
	subq	$16,%rdx		// decrement length remaining
	subl	$1,%ecx			// more to go?
	jnz	LLoopOverChunks

	jmp	LCrossPage		// cross page but keep dest aligned


// Found a zero in the vector.  Figure out where it is, and store the bytes
// up to it.  It is possible that we should check to be sure (%rdx >= 16), and
// just do an aligned store of %xmm1 if so.  But if we did, we'd be doing byte
// stores into the same double quadword in bzero(), which might hit a hazard.
// Experimentation needed.
//	%rdi = dest ptr (aligned)
//	%eax = result mask
//	%rdx = buffer length remaining
//	%xmm1 = source vector

LFound0:
	bsfl	%eax,%ecx		// find first 0 (%ecx = #nonzero bytes before it)
	subq	%rcx,%rdx		// decrement remaining buffer length
	testb	$8,%cl			// 8-byte store required?
	jz	4f			// no
	movq	%xmm1,(%rdi)		// pack in 8 low bytes
	psrldq	$8,%xmm1		// then shift vector down 8 bytes
	addq	$8,%rdi
4:
	testb	$4,%cl			// 4-byte store required?
	jz	3f			// no
	movd	%xmm1,(%rdi)		// pack in 4 low bytes
	psrldq	$4,%xmm1		// then shift vector down 4 bytes
	addq	$4,%rdi
3:
	andl	$3,%ecx			// more to go?
	jz	LZeroBuffer		// no
	movd	%xmm1,%eax		// move remainders out of vector into %eax
1:					// loop on up to three bytes
	movb	%al,(%rdi)		// pack in next byte
	shrl	$8,%eax			// shift next byte into position
	addq	$1,%rdi
	subl	$1,%ecx
	jnz	1b

// We've copied the string.  Now zero the rest of the buffer, using bzero().
//	%rdi = dest ptr (1st argument)
//	%rdx = buffer length remaining
//	%r8  = original dest ptr (our return value)
//
// %r8 is a caller-saved register, so the callee is free to overwrite it.
// Save it on the stack across the call instead of merely padding the stack:
// the single 8-byte push also brings %rsp to the 16-byte alignment required
// at a call site.

LZeroBuffer:
	movq	%rdx,%rsi		// remaining buffer size (2nd argument)
	pushq	%r8			// save return value; also aligns stack to 16B before call
	call	_bzero
	popq	%r8			// recover original dest ptr and restore stack

LDone:
	movq	%r8,%rax		// original dest ptr is return value
	ret