/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * Bzero, tuned for Pentium-M class processors with SSE2
 * and 64-byte cache lines.  This is the 64-bit version.
 *
 * This routine is also used for memset(p,0,n), which is a common case
 * since gcc sometimes silently maps bzero() into memset().  As a result,
 * we always load the original ptr into %rax before returning.
 */

#define kShort		80		// too short to bother with SSE (must be >=80)
#define	kVeryLong	(1024*1024)
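
/*
 * For reference, a rough C equivalent of the control flow below.  This is
 * an illustrative sketch only, not part of the build: it is simplified
 * (the real short path also uses doubleword stores, and the real chunk
 * loops use 16-byte movdqa/movntdq stores), and the name bzero_sketch is
 * made up for this comment.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static void bzero_sketch(void *b, size_t len)
 *	{
 *		unsigned char *p = b;
 *
 *		if (len > 80) {				// kShort
 *			while ((uintptr_t)p & 15) {	// byte stores to 16-byte align
 *				*p++ = 0;
 *				len--;
 *			}
 *			size_t chunk = len & ~(size_t)63; // whole 64-byte chunks
 *			len &= 63;			// residual for the byte loop
 *			for (size_t i = 0; i < chunk; i += 8)
 *				*(uint64_t *)(p + i) = 0; // cached stores; at or above
 *							  // kVeryLong the real code
 *							  // uses non-temporal stores
 *			p += chunk;
 *		}
 *		while (len--)				// short/residual path
 *			*p++ = 0;
 *	}
 */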


        .text
	.code64
        .align  5, 0x90
Lbzero_sse2_64:                         // void	bzero(void *b, size_t len);
	pushq	%rbp			// set up a frame for backtraces
	movq	%rsp,%rbp
        xorl    %eax,%eax               // set fill data to 0
	movq	%rdi,%r11		// save original ptr as return value
        cmpq    $(kShort),%rsi          // long enough for SSE?
        jg	LNotShort               // yes

// Here for short operands or the end of long ones.
//      %esi = length (<= kShort)
//      %rdi = ptr
//      %eax = zero
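//
// If the length is at least 16, the short path stores single bytes until the
// ptr is 4-byte aligned, then zeroes a doubleword at a time, then finishes
// the last 0..3 bytes; anything shorter is zeroed a byte at a time.  Before
// returning it reloads the original ptr into %rax, since (per the header
// comment) this code also serves memset(p,0,n), which must return p.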

Lshort:
	cmpl	$16,%esi		// long enough to word align?
	jge	3f			// yes
	test	%esi,%esi		// length==0?
	jz	6f
1:
	movb	%al,(%rdi)		// zero a byte
	incq	%rdi
	decl	%esi
	jnz	1b
	jmp	6f
2:
	movb	%al,(%rdi)		// zero a byte
	incq	%rdi
	decl	%esi
3:
	testl	$3,%edi			// is ptr doubleword aligned?
	jnz	2b			// no
	movl	%esi,%ecx		// copy length
	shrl	$2,%esi			// #doublewords to store
4:
	movl	%eax,(%rdi)		// zero an aligned doubleword
	addq	$4,%rdi
	decl	%esi
	jnz	4b
	andl	$3,%ecx			// mask down to #bytes at end (0..3)
	jz	6f			// none
5:
	movb	%al,(%rdi)		// zero a byte
	incq	%rdi
	decl	%ecx
	jnz	5b
6:
	movq	%r11,%rax		// set return value in case this was a call of memset()
	popq	%rbp
        ret


// We will be using SSE, so align ptr.
//      %rsi = length (> kShort)
//      %rdi = ptr
//      %eax = zero
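//
// The alignment count below is computed as ((16 - (ptr & 15)) & 15), i.e.
// negate the ptr and keep the low four bits: that is the number of byte
// stores needed to reach the next 16-byte boundary (for example, a ptr
// ending in 0x9 needs 7 bytes, and an already-aligned ptr needs 0 and
// skips the loop).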

LNotShort:
        movl    %edi,%ecx		// get #bytes to 16-byte align ptr
        negl    %ecx
        andl    $15,%ecx
	jz	LDestAligned		// already aligned
        subq    %rcx,%rsi               // decrement length
0:					// loop storing bytes to align the ptr
	movb	%al,(%rdi)		// pack in a byte
	incq	%rdi
	decl	%ecx
	jnz	0b

// Destination is now 16-byte aligned.  Prepare to loop over 64-byte chunks.
//      %rsi = length (> (kShort-15))
//      %rdi = ptr (aligned)
//      %eax = zero
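//
// Below, %rcx gets the byte count covered by whole 64-byte chunks and %esi
// keeps the 0..63 residual.  %rdi is advanced past the chunked region and
// %rcx is then negated, so (%rdi,%rcx) starts at the beginning of that
// region and walks forward as %rcx counts up toward zero; when the loop
// finishes, %rdi already points at the residual bytes for Lshort.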

LDestAligned:
        movq    %rsi,%rcx
        andl    $63,%esi                // mask down to residual length (0..63)
        andq    $-64,%rcx               // get #bytes we will zero in this loop
        pxor    %xmm0,%xmm0             // zero an SSE register
        addq    %rcx,%rdi               // increment ptr by length to move
	cmpq	$(kVeryLong),%rcx	// long enough to justify non-temporal stores?
	jae	LVeryLong		// yes
        negq    %rcx			// negate length to move
	jmp	1f

// Loop over 64-byte chunks, storing into cache.
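// Each iteration writes one 64-byte cache line with four 16-byte movdqa
// stores, which rely on the 16-byte alignment established above.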

	.align	4,0x90			// keep inner loops 16-byte aligned
1:
        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm0,16(%rdi,%rcx)
        movdqa  %xmm0,32(%rdi,%rcx)
        movdqa  %xmm0,48(%rdi,%rcx)
        addq    $64,%rcx
        jne     1b

	jmp	Lshort

// Very long operands: use non-temporal stores to bypass cache.
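// movntdq writes around the cache, so zeroing a buffer of kVeryLong (1MB)
// or more does not evict the working set; the sfence at the end is needed
// because non-temporal stores are weakly ordered relative to other stores.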

LVeryLong:
        negq    %rcx			// negate length to move
	jmp	1f

	.align	4,0x90			// keep inner loops 16-byte aligned
1:
        movntdq %xmm0,(%rdi,%rcx)
        movntdq %xmm0,16(%rdi,%rcx)
        movntdq %xmm0,32(%rdi,%rcx)
        movntdq %xmm0,48(%rdi,%rcx)
        addq    $64,%rcx
        jne     1b

        sfence                          // required by non-temporal stores
	jmp	Lshort

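// The commpage descriptor below registers this variant: the last two
// arguments are the capability bits a CPU must have (kHasSSE2) and must not
// have (kHasSSE4_2, which is presumably served by a separate variant) for
// this code to be selected.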
	COMMPAGE_DESCRIPTOR(bzero_sse2_64,_COMM_PAGE_BZERO,kHasSSE2,kHasSSE4_2)