/*
 * Copyright (c) 2008 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * Bzero, tuned for processors with SSE4.2 and 64-byte cache lines, i.e. Nehalem.
 * We don't actually use SSE4.2, but rather use it to identify Nehalem.
 * This is the 64-bit version.
 *
 * We do not use nontemporal operations, but use MOVDQA in preference to REP/STOS.
 *
 * This routine is also used for memset(p,0,n), which is a common case
 * since gcc sometimes silently maps bzero() into memset(). As a result,
 * we always load the original ptr into %rax before returning.
 */

#define kShort          80              // too short to bother with SSE (must be >= 80)


        .text
        .code64
        .align  5, 0x90
Lbzero_sse42_64:                        // void bzero(void *b, size_t len);
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        xorl    %eax,%eax               // set fill data to 0
        movq    %rdi,%r11               // save original ptr as return value
        cmpq    $(kShort),%rsi          // long enough for SSE?
        jg      LNotShort               // yes

// Here for short operands or the end of long ones.
//      %esi = length (<= kShort)
//      %rdi = ptr
//      %eax = zero

Lshort:
        cmpl    $12,%esi                // long enough to doubleword align?
        jge     3f                      // yes
        testl   %esi,%esi               // length==0?
        jz      6f
1:
        movb    %al,(%rdi)              // zero a byte
        incq    %rdi
        decl    %esi
        jnz     1b
        jmp     6f
2:
        movb    %al,(%rdi)              // zero a byte
        incq    %rdi
        decl    %esi
3:
        testl   $3,%edi                 // is ptr doubleword aligned?
        jnz     2b                      // no
        movl    %esi,%ecx               // copy length
        shrl    $2,%esi                 // #doublewords to store
4:
        movl    %eax,(%rdi)             // zero an aligned doubleword
        addq    $4,%rdi
        decl    %esi
        jnz     4b
        andl    $3,%ecx                 // mask down to #bytes at end (0..3)
        jz      6f                      // none
5:
        movb    %al,(%rdi)              // zero a byte
        incq    %rdi
        decl    %ecx
        jnz     5b
6:
        movq    %r11,%rax               // set return value in case this was a call of memset()
        popq    %rbp
        ret
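
// Worked example of the short path above (derived from the code): bzero(p,14)
// with (p & 3) == 2 branches to 3f, loops through label 2 twice (two byte
// stores) to reach doubleword alignment, then stores three doublewords at
// label 4; the residual (%ecx & 3) is 0, so label 5 stores no trailing bytes.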

// We will be using SSE, so align ptr.
//      %rsi = length (> kShort)
//      %rdi = ptr
//      %eax = zero

LNotShort:
        testl   $3,%edi                 // 4-byte aligned?
        jz      2f                      // yes
        movb    %al,(%rdi)              // zero another byte
        incq    %rdi
        decq    %rsi
        jmp     LNotShort
1:                                      // zero doublewords until 16-byte aligned
        movl    %eax,(%rdi)
        addq    $4,%rdi
        subq    $4,%rsi
2:
        testl   $15,%edi                // 16-byte aligned?
        jnz     1b                      // no

// Destination is now 16-byte aligned.  Prepare to loop over 64-byte chunks.
//      %rsi = length (> (kShort-15))
//      %rdi = ptr (aligned)
//      %eax = zero

LDestAligned:
        movq    %rsi,%rcx
        andl    $63,%esi                // mask down to residual length (0..63)
        andq    $-64,%rcx               // get #bytes we will zero in this loop
        pxor    %xmm0,%xmm0             // zero an SSE register
        addq    %rcx,%rdi               // increment ptr by length to move
        negq    %rcx                    // negate length to move
        jmp     1f

// Loop over 64-byte chunks, storing into cache.

        .align  4,0x90                  // keep inner loops 16-byte aligned
1:
        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm0,16(%rdi,%rcx)
        movdqa  %xmm0,32(%rdi,%rcx)
        movdqa  %xmm0,48(%rdi,%rcx)
        addq    $64,%rcx
        jne     1b

        jmp     Lshort


        COMMPAGE_DESCRIPTOR(bzero_sse42_64,_COMM_PAGE_BZERO,kHasSSE4_2,0)
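
/*
 * For reference: user programs do not link this routine directly.  The
 * COMMPAGE_DESCRIPTOR above publishes it at the fixed commpage address
 * _COMM_PAGE_BZERO on processors that report kHasSSE4_2, and the user-level
 * bzero() entry point dispatches through that slot.  A minimal sketch of
 * such a dispatch stub (illustrative only; the real Libc stub may differ):
 *
 *              .text
 *              .code64
 *      _bzero:
 *              movq    $(_COMM_PAGE_BZERO),%rax        // fixed commpage slot
 *              jmpq    *%rax                           // args stay in %rdi/%rsi
 */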