/*
 * Copyright (c) 2008 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * Bzero, tuned for processors with SSE4.2 and 64-byte cache lines, ie Nehalem.
 * We don't actually use SSE4.2, but rather use it to identify Nehalem.
 *
 * We do not use nontemporal operations, but use MOVDQA in preference to REP/STOS.
 *
 * This routine is also used for memset(p,0,n), which is a common case
 * since gcc sometimes silently maps bzero() into memset(). As a result,
 * we always load the original ptr into %eax before returning.
 */

#define kShort          80              // too short to bother with SSE (must be >=80)


        .text
        .align  5, 0x90
Lbzero_sse42:                           // void bzero(void *b, size_t len);
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %edi
        movl    8(%ebp),%edi            // get ptr
        movl    12(%ebp),%edx           // get length

        xorl    %eax,%eax               // set fill data to 0
        cmpl    $(kShort),%edx          // long enough for SSE?
        jg      LNotShort               // yes

// Here for short operands or the end of long ones.
//      %edx = length
//      %edi = ptr
//      %eax = zero

Lshort:
        cmpl    $12,%edx                // long enough to word align?
        jge     3f                      // yes
        test    %edx,%edx               // length==0?
        jz      6f
1:
        movb    %al,(%edi)              // zero a byte
        inc     %edi
        dec     %edx
        jnz     1b
        jmp     6f
2:
        movb    %al,(%edi)              // zero a byte
        inc     %edi
        dec     %edx
3:
        test    $3,%edi                 // is ptr doubleword aligned?
        jnz     2b                      // no
        movl    %edx,%ecx               // copy length
        shrl    $2,%edx                 // #doublewords to store
4:
        movl    %eax,(%edi)             // zero an aligned doubleword
        addl    $4,%edi
        dec     %edx
        jnz     4b
        andl    $3,%ecx                 // mask down to #bytes at end (0..3)
        jz      6f                      // none
5:
        movb    %al,(%edi)              // zero a byte
        inc     %edi
        dec     %ecx
        jnz     5b
6:
        movl    8(%ebp),%eax            // get return value in case this was a call of memset()
        popl    %edi
        popl    %ebp
        ret


// We will be using SSE, so align ptr.
//      %edx = length
//      %edi = ptr
//      %eax = zero

LNotShort:
        testl   $3,%edi                 // 4-byte aligned?
        jz      2f                      // yes
        movb    %al,(%edi)              // zero another byte
        incl    %edi
        decl    %edx
        jmp     LNotShort
1:                                      // zero doublewords until 16-byte aligned
        movl    %eax,(%edi)
        addl    $4,%edi
        subl    $4,%edx
2:
        testl   $15,%edi                // 16-byte aligned?
        jnz     1b                      // no


// Destination is now 16-byte aligned. Prepare to loop over 64-byte chunks.
//      %edx = length
//      %edi = ptr
//      %eax = zero

LDestAligned:
        movl    %edx,%ecx
        andl    $63,%edx                // mask down to residual length (0..63)
        andl    $-64,%ecx               // get #bytes we will zero in this loop
        pxor    %xmm0,%xmm0             // zero an SSE register
        addl    %ecx,%edi               // increment ptr by length to move
        negl    %ecx                    // negate length to move
        jmp     1f

// Loop over 64-byte chunks, storing into cache.

        .align  4,0x90                  // keep inner loops 16-byte aligned
1:
        movdqa  %xmm0,(%edi,%ecx)
        movdqa  %xmm0,16(%edi,%ecx)
        movdqa  %xmm0,32(%edi,%ecx)
        movdqa  %xmm0,48(%edi,%ecx)
        addl    $64,%ecx
        jne     1b

        jmp     Lshort


        COMMPAGE_DESCRIPTOR(bzero_sse42,_COMM_PAGE_BZERO,kHasSSE4_2,0)
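/*
 * The descriptor above registers this variant with the commpage mechanism:
 * at boot the kernel matches each descriptor's capability bits (here
 * kHasSSE4_2) against the CPU's features and copies the chosen routine to
 * _COMM_PAGE_BZERO, which the user-space bzero() stub jumps through.
 */

/*
 * For reference, the overall strategy reads roughly like the C sketch
 * below. It is only an illustration: the helper name bzero_sketch is
 * hypothetical, the short path is simplified to a plain byte loop (the
 * real code also does doubleword stores), and the sketch omits reloading
 * the original ptr for the memset() return-value case. MOVDQA corresponds
 * to _mm_store_si128 on a 16-byte aligned address.
 *
 *      #include <stddef.h>
 *      #include <stdint.h>
 *      #include <emmintrin.h>
 *
 *      static void bzero_sketch(void *b, size_t n)
 *      {
 *              unsigned char *p = b;
 *              if (n > 80) {                           // the kShort cutoff
 *                      while ((uintptr_t)p & 15) {     // byte stores until 16-byte aligned
 *                              *p++ = 0;
 *                              n--;
 *                      }
 *                      __m128i zero = _mm_setzero_si128();
 *                      while (n >= 64) {               // one 64-byte cache line per pass
 *                              _mm_store_si128((__m128i *)(p +  0), zero);
 *                              _mm_store_si128((__m128i *)(p + 16), zero);
 *                              _mm_store_si128((__m128i *)(p + 32), zero);
 *                              _mm_store_si128((__m128i *)(p + 48), zero);
 *                              p += 64;
 *                              n -= 64;
 *                      }
 *              }
 *              while (n--)                             // short operands and the 0..63 byte tail
 *                      *p++ = 0;
 *      }
 */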