/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * Bzero, tuned for Pentium-M class processors with SSE2
 * and 64-byte cache lines.  This is the 64-bit version.
 *
 * This routine is also used for memset(p,0,n), which is a common case
 * since gcc sometimes silently maps bzero() into memset().  As a result,
 * we always load the original ptr into %rax before returning.
 */

#define kShort		80		// too short to bother with SSE (must be >=80)
#define kVeryLong	(1024*1024)


	.text
	.code64
	.align	5, 0x90
Lbzero_sse2_64:				// void bzero(void *b, size_t len);
	pushq	%rbp			// set up a frame for backtraces
	movq	%rsp,%rbp
	xorl	%eax,%eax		// set fill data to 0
	movq	%rdi,%r11		// save original ptr as return value
	cmpq	$(kShort),%rsi		// long enough for SSE?
	jg	LNotShort		// yes

// Here for short operands or the end of long ones.
//	%esi = length (<= kShort)
//	%rdi = ptr
//	%eax = zero

Lshort:
	cmpl	$16,%esi		// long enough to word align?
	jge	3f			// yes
	test	%esi,%esi		// length==0?
	jz	6f
1:
	movb	%al,(%rdi)		// zero a byte
	incq	%rdi
	decl	%esi
	jnz	1b
	jmp	6f
2:
	movb	%al,(%rdi)		// zero a byte
	incq	%rdi
	decl	%esi
3:
	testl	$3,%edi			// is ptr doubleword aligned?
	jnz	2b			// no
	movl	%esi,%ecx		// copy length
	shrl	$2,%esi			// #doublewords to store
4:
	movl	%eax,(%rdi)		// zero an aligned doubleword
	addq	$4,%rdi
	decl	%esi
	jnz	4b
	andl	$3,%ecx			// mask down to #bytes at end (0..3)
	jz	6f			// none
5:
	movb	%al,(%rdi)		// zero a byte
	incq	%rdi
	decl	%ecx
	jnz	5b
6:
	movq	%r11,%rax		// set return value in case this was a call of memset()
	popq	%rbp
	ret


// We will be using SSE, so align ptr.
//	%rsi = length (> kShort)
//	%rdi = ptr
//	%eax = zero

LNotShort:
	movl	%edi,%ecx		// get #bytes to 16-byte align ptr
	negl	%ecx
	andl	$15,%ecx
	jz	LDestAligned		// already aligned
	subq	%rcx,%rsi		// decrement length
0:					// loop storing bytes to align the ptr
	movb	%al,(%rdi)		// pack in a byte
	incq	%rdi
	decl	%ecx
	jnz	0b

// Destination is now 16-byte aligned.  Prepare to loop over 64-byte chunks.
//	%rsi = length (> (kShort-15))
//	%rdi = ptr (aligned)
//	%eax = zero

LDestAligned:
	movq	%rsi,%rcx
	andl	$63,%esi		// mask down to residual length (0..63)
	andq	$-64,%rcx		// get #bytes we will zero in this loop
	pxor	%xmm0,%xmm0		// zero an SSE register
	addq	%rcx,%rdi		// increment ptr by length to move
	cmpq	$(kVeryLong),%rcx	// long enough to justify non-temporal stores?
	jae	LVeryLong		// yes
	negq	%rcx			// negate length to move
	jmp	1f

// Loop over 64-byte chunks, storing into cache.

	.align	4,0x90			// keep inner loops 16-byte aligned
1:
	movdqa	%xmm0,(%rdi,%rcx)
	movdqa	%xmm0,16(%rdi,%rcx)
	movdqa	%xmm0,32(%rdi,%rcx)
	movdqa	%xmm0,48(%rdi,%rcx)
	addq	$64,%rcx
	jne	1b

	jmp	Lshort

// Very long operands: use non-temporal stores to bypass cache.

LVeryLong:
	negq	%rcx			// negate length to move
	jmp	1f

	.align	4,0x90			// keep inner loops 16-byte aligned
1:
	movntdq	%xmm0,(%rdi,%rcx)
	movntdq	%xmm0,16(%rdi,%rcx)
	movntdq	%xmm0,32(%rdi,%rcx)
	movntdq	%xmm0,48(%rdi,%rcx)
	addq	$64,%rcx
	jne	1b

	sfence				// required by non-temporal stores
	jmp	Lshort


	COMMPAGE_DESCRIPTOR(bzero_sse2_64,_COMM_PAGE_BZERO,kHasSSE2,kHasSSE4_2)
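
/*
 * The following is a rough C sketch (not part of the original commpage
 * source) of the size-based strategy implemented above, kept inside a
 * comment so the file still assembles.  The function name bzero_sketch and
 * the use of SSE2 intrinsics are illustrative assumptions; the thresholds
 * mirror kShort and kVeryLong.  Short fills are done byte- and
 * doubleword-wise, medium fills use 16-byte cached stores (movdqa), and
 * fills of at least kVeryLong bytes use non-temporal stores (movntdq)
 * followed by an sfence.
 *
 *	#include <emmintrin.h>			// SSE2 intrinsics
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	void bzero_sketch(void *b, size_t len) {
 *	    uint8_t *p = (uint8_t *)b;
 *	    if (len > 80) {				// kShort
 *	        while (((uintptr_t)p & 15) != 0) {	// byte-fill to a 16-byte boundary
 *	            *p++ = 0;
 *	            len--;
 *	        }
 *	        size_t chunk = len & ~(size_t)63;	// whole 64-byte chunks
 *	        __m128i zero = _mm_setzero_si128();
 *	        if (chunk >= 1024*1024) {		// kVeryLong: bypass the cache
 *	            for (size_t i = 0; i < chunk; i += 16)
 *	                _mm_stream_si128((__m128i *)(p + i), zero);
 *	            _mm_sfence();			// order the non-temporal stores
 *	        } else {				// store through the cache
 *	            for (size_t i = 0; i < chunk; i += 16)
 *	                _mm_store_si128((__m128i *)(p + i), zero);
 *	        }
 *	        p += chunk;
 *	        len &= 63;				// residual bytes
 *	    }
 *	    while (len--)				// short or residual fill
 *	        *p++ = 0;
 *	}
 */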