/*
 * Copyright (c) 2008 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * Bzero, tuned for processors with SSE4.2 and 64-byte cache lines, i.e. Nehalem.
 * We don't actually use any SSE4.2 instructions; the SSE4.2 feature bit merely
 * identifies Nehalem-class processors.
 * This is the 64-bit version.
 *
 * We do not use nontemporal operations; we use MOVDQA in preference to REP/STOS.
 *
 * This routine is also used for memset(p,0,n), which is a common case
 * since gcc sometimes silently maps bzero() into memset().  As a result,
 * we always load the original ptr into %rax before returning.
 */
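
/*
 * For illustration only (a hedged sketch; "buf" is a hypothetical caller
 * buffer, not anything defined in this file):
 *
 *     char buf[256];
 *     bzero(buf, sizeof(buf));            // may be serviced by this routine
 *     memset(buf, 0, sizeof(buf));        // zero-fill memset may land here too
 *
 * Since memset() returns its first argument, a caller reached via the memset
 * path expects the original pointer back in %rax; bzero() itself returns void.
 */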

#define kShort		80		// too short to bother with SSE (must be >=80)
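// (Presumably >=80 so that, even after spending up to 15 bytes aligning the ptr
// to a 16-byte boundary, the SSE path below still has at least one full
// 64-byte chunk to zero.)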


        .text
	.code64
        .align  5, 0x90
Lbzero_sse42_64:                        // void	bzero(void *b, size_t len);
	pushq	%rbp			// set up a frame for backtraces
	movq	%rsp,%rbp
        xorl    %eax,%eax               // set fill data to 0
	movq	%rdi,%r11		// save original ptr as return value
        cmpq    $(kShort),%rsi          // long enough for SSE?
        jg	LNotShort               // yes

// Here for short operands or the end of long ones.
//      %esi = length (<= kShort)
//      %rdi = ptr
//      %eax = zero
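//
// Strategy: if at least 12 bytes remain, store single bytes until the ptr is
// doubleword aligned, then store zero doublewords, then mop up any trailing
// bytes.  Shorter operands are zeroed a byte at a time.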

Lshort:
	cmpl	$12,%esi		// long enough to be worth doubleword aligning?
	jge	3f			// yes
	test	%esi,%esi		// length==0?
	jz	6f
1:
	movb	%al,(%rdi)		// zero a byte
	incq	%rdi
	decl	%esi
	jnz	1b
	jmp	6f
2:
	movb	%al,(%rdi)		// zero a byte
	incq	%rdi
	decl	%esi
3:
	testl	$3,%edi			// is ptr doubleword aligned?
	jnz	2b			// no
	movl	%esi,%ecx		// copy length
	shrl	$2,%esi			// #doublewords to store
4:
	movl	%eax,(%rdi)		// zero an aligned doubleword
	addq	$4,%rdi
	decl	%esi
	jnz	4b
	andl	$3,%ecx			// mask down to #bytes at end (0..3)
	jz	6f			// none
5:
	movb	%al,(%rdi)		// zero a byte
	incq	%rdi
	decl	%ecx
	jnz	5b
6:
	movq	%r11,%rax		// set return value in case this was a call of memset()
	popq	%rbp
        ret


// We will be using SSE, so align ptr.
//      %rsi = length (> kShort)
//      %rdi = ptr
//      %eax = zero
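//
// First zero single bytes until the ptr is 4-byte aligned, then zero aligned
// doublewords until it is 16-byte aligned.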

LNotShort:
	testl	$3,%edi			// 4-byte aligned?
	jz	2f			// yes
	movb	%al,(%rdi)		// zero another byte
	incq	%rdi
	decq	%rsi
	jmp	LNotShort
1:					// zero doublewords until 16-byte aligned
	movl	%eax,(%rdi)
	addq	$4,%rdi
	subq	$4,%rsi
2:
	testl	$15,%edi		// 16-byte aligned?
	jnz	1b			// no

// Destination is now 16-byte aligned.  Prepare to loop over 64-byte chunks.
//      %rsi = length (> (kShort-15))
//      %rdi = ptr (aligned)
//      %eax = zero

LDestAligned:
        movq    %rsi,%rcx
        andl    $63,%esi                // mask down to residual length (0..63)
        andq    $-64,%rcx               // get #bytes we will zero in this loop
        pxor    %xmm0,%xmm0             // zero an SSE register
        addq    %rcx,%rdi               // increment ptr by length to move
        negq    %rcx			// negate length to move
	jmp	1f

// Loop over 64-byte chunks, storing into cache.
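// %rcx holds the negated chunk count in bytes; stores are addressed as
// offset(%rdi,%rcx), and the loop counts %rcx up toward zero, so the same ADD
// both advances the index and terminates the loop.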

	.align	4,0x90			// keep inner loops 16-byte aligned
1:
        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm0,16(%rdi,%rcx)
        movdqa  %xmm0,32(%rdi,%rcx)
        movdqa  %xmm0,48(%rdi,%rcx)
        addq    $64,%rcx
        jne     1b

	jmp	Lshort			// handle residual bytes (0..63) and return

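// COMMPAGE_DESCRIPTOR registers this variant for the _COMM_PAGE_BZERO slot;
// the kernel installs it on CPUs whose capability bits include kHasSSE4_2
// (the final 0 means no capability bits disqualify it).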
	COMMPAGE_DESCRIPTOR(bzero_sse42_64,_COMM_PAGE_BZERO,kHasSSE4_2,0)