/*
 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */


// ***************
// * S T R C P Y *
// ***************
//
// char *strcpy(char *dst, const char *src);
//
// Syntax:   AT&T, 32-bit x86 with SSE2 instructions.
// In:       dst and src are read from the stack (first and second argument).
// Out:      %eax = the dst pointer that was passed in.
// Clobbers: %eax, %ecx, %edx, %xmm0, %xmm1, flags.  %edi is saved and restored.
//
// The string is moved 16 bytes at a time wherever that is possible.  The
// catch is that a vector load running past the terminating 0 could touch a
// page the string does not occupy and take a spurious page fault.  So no
// vector load is ever allowed to straddle a source page boundary (pages are
// taken to be 4096 bytes), and no byte beyond the terminator is ever stored.
//
// The destination is brought to 16-byte alignment before the vector loop
// starts, because unaligned vector stores are slow.

        .text
        .globl  _strcpy

        .p2align 4
_strcpy:
        pushl   %edi                    // keep the caller's %edi intact
        movl    12(%esp),%ecx           // %ecx = src
        movl    8(%esp),%edi            // %edi = dst
        movl    %edi,%edx
        negl    %edx
        andl    $15,%edx                // %edx = bytes needed to 16-byte align dst
        jnz     LCopyBytes              // misaligned: copy that many bytes first
                                        // already aligned: fall into the planner


// Decide how many vector chunks fit before the end of the source page.
// If not even one fits, copy 16 single bytes instead: that walks across the
// page boundary safely and leaves the destination 16-byte aligned.
//      %ecx = source ptr (unaligned)
//      %edi = dest ptr (aligned)

LPlanChunks:
        movl    $4096,%edx
        movl    %ecx,%eax
        andl    $4095,%eax              // %eax = offset of src inside its page
        subl    %eax,%edx               // %edx = bytes left in the source page
        shrl    $4,%edx                 // %edx = whole 16-byte chunks left
        jnz     LCopyChunks             // at least one: run the vector loop
        movl    $16,%edx                // none: step over the boundary bytewise
        jmp     LCopyBytes


// Byte-at-a-time copy of at most %edx bytes, stopping early at the 0.
//      %ecx = source ptr
//      %edi = dest ptr
//      %edx = byte count (nonzero on entry)

        .p2align 4,0x90                 // inner loop on a 16-byte boundary for I-fetch
LCopyBytes:
        movzbl  (%ecx),%eax             // fetch one source byte, zero-extended
        movb    %al,(%edi)              // store it (the terminator is stored too)
        incl    %ecx
        incl    %edi
        testl   %eax,%eax               // was that the terminator?
        jz      LFinish                 // yes: the copy is complete
        decl    %edx                    // count exhausted?
        jnz     LCopyBytes

        jmp     LPlanChunks             // dest aligned / boundary crossed: re-plan


// Vector copy of %edx chunks, none of which crosses the source page end.
//      %ecx = source ptr (unaligned)
//      %edi = dest ptr (aligned)
//      %edx = chunk count (nonzero on entry)

        .p2align 4,0x90                 // inner loop on a 16-byte boundary for I-fetch
LCopyChunks:
        pxor    %xmm0,%xmm0             // fresh zeros; pcmpeqb below overwrites them
        movdqu  (%ecx),%xmm1            // %xmm1 = next 16 source bytes
        addl    $16,%ecx
        pcmpeqb %xmm1,%xmm0             // 0xFF in every lane whose source byte is 0
        pmovmskb %xmm0,%eax             // one mask bit per lane
        testl   %eax,%eax               // terminator somewhere in this chunk?
        jnz     LStoreTail              // yes: store only up to and including it
        movdqa  %xmm1,(%edi)            // no: aligned store of the whole chunk
        addl    $16,%edi
        decl    %edx                    // chunks left in this page?
        jnz     LCopyChunks

        movl    $16,%edx                // page nearly used up: copy 16 bytes one at
        jmp     LCopyBytes              // a time to cross it, keeping dest aligned


// The chunk in %xmm1 holds the terminator.  Store exactly the bytes up to and
// including it, splitting the count (1..16) into 16, 8, 4 and 1-byte pieces.
//      %edi  = dest ptr (aligned)
//      %eax  = zero-byte mask (nonzero)
//      %xmm1 = source chunk

LStoreTail:
        bsfl    %eax,%edx               // index of the first 0 byte
        incl    %edx                    // byte count including the 0
        testb   $16,%dl                 // is the 0 the very last byte?
        jz      LTail8                  // no: piecewise stores
        movdqa  %xmm1,(%edi)            // yes: the whole chunk belongs to the string
        jmp     LFinish
LTail8:
        testb   $8,%dl                  // an 8-byte piece needed?
        jz      LTail4
        movq    %xmm1,(%edi)            // low 8 bytes out
        addl    $8,%edi
        psrldq  $8,%xmm1                // bring the rest down to the low end
LTail4:
        testb   $4,%dl                  // a 4-byte piece needed?
        jz      LTail3
        movd    %xmm1,(%edi)            // low 4 bytes out
        addl    $4,%edi
        psrldq  $4,%xmm1                // bring the rest down to the low end
LTail3:
        andl    $3,%edx                 // 0..3 bytes still to store
        jz      LFinish
        movd    %xmm1,%eax              // remaining bytes, lowest first, in %eax
LTailByte:                              // at most three trips round this loop
        movb    %al,(%edi)
        incl    %edi
        shrl    $8,%eax                 // next byte down into %al
        decl    %edx
        jnz     LTailByte

LFinish:
        popl    %edi                    // restore the caller's %edi
        movl    4(%esp),%eax            // return value = dst as originally passed
        ret