/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/* The common path for nonzero memset and the memset_pattern routines,
 * tuned for Pentium-M class processors with SSE2 and 64-byte cache lines.
 * This is the 64-bit version.  It is used by the following functions:
 *
 *	void *memset(void *b, int c, size_t len);                   // when c!=0
 *	void memset_pattern4(void *b, const void *c4, size_t len);
 *	void memset_pattern8(void *b, const void *c8, size_t len);
 *	void memset_pattern16(void *b, const void *c16, size_t len);
 *
 * Note that bzero() and memset() of 0 are handled separately.
 */
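
// Caller-side usage, for reference (memset_pattern4/8/16 are the public
// Darwin <string.h> interfaces this routine backs; the values shown are
// illustrative):
//
//	char buf[10];
//	memset_pattern4(buf, "\xAA\xBB\xCC\xDD", sizeof(buf));
//	// buf now holds AA BB CC DD AA BB CC DD AA BB -- the pattern repeats
//	// from the start of the buffer and is truncated at len bytes.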

#define	kShort		63		// operands this short skip the alignment path
#define	kVeryLong	(1024*1024)	// at/above this length, use non-temporal stores

// Initial entry from Libc with parameters passed in registers.  Although we
// correctly handle misaligned ptrs and short operands, they are handled
// inefficiently.  Therefore our caller should filter out short operands and
// exploit local knowledge (i.e., the original pattern length) to align the
// ptr if possible.
// When called, we expect:
//	%rdi = ptr to memory to set (not necessarily aligned)
//	%rdx = length (may be short or even 0)
//	%xmm0 = the pattern to store
// Return conditions:
//	%rax, %rdi, %rsi, %rcx, and %rdx all trashed
//	we preserve %r8, %r9, %r10, and %r11
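//
// Note: the stores below always write full 16-byte copies of %xmm0, so the
// Libc wrappers are expected to arrive here with the pattern already
// replicated across all 16 bytes of %xmm0 (the memset byte splatted 16x,
// a 4-byte pattern repeated 4x, an 8-byte pattern repeated 2x).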

        .text
        .align  5, 0x90
	.code64
Lmemset_pattern_sse2_64:
        cmpq    $(kShort),%rdx		// long enough to bother aligning?
        ja	LNotShort		// yes
	jmp	LShort			// no

// Here for short operands or the end of long ones.
//      %rdx = length (<= kShort)
//      %rdi = ptr (may not be aligned)
//      %xmm0 = pattern
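//
// The tail decomposes the (< 16 byte) remainder as 8 + 4 + (0..3).  A C-level
// sketch of the logic below (illustrative only; "pat" stands for the 16
// pattern bytes in %xmm0, low4()/low8()/byteshift() are hypothetical helpers):
//
//	if (len & 8) { *(uint64_t *)p = low8(pat); pat = byteshift(pat, 8); p += 8; }
//	if (len & 4) { *(uint32_t *)p = low4(pat); pat = byteshift(pat, 4); p += 4; }
//	for (uint32_t w = low4(pat), n = len & 3; n; n--, w >>= 8)
//		*p++ = (uint8_t)w;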

LUnalignedStore16:
	movdqu	%xmm0,(%rdi)		// stuff in another 16 bytes
	subl	$16,%edx
	addq	$16,%rdi
LShort:
	cmpl	$16,%edx		// room for another vector?
	jge	LUnalignedStore16	// yes
LLessThan16:				// here at end of copy with < 16 bytes remaining
	test	$8,%dl			// 8-byte store required?
	jz	2f			// no
	movq	%xmm0,(%rdi)		// pack in 8 low bytes
	psrldq	$8,%xmm0		// then shift vector down 8 bytes
	addq	$8,%rdi
2:
	test	$4,%dl			// 4-byte store required?
	jz	3f			// no
	movd	%xmm0,(%rdi)		// pack in 4 low bytes
	psrldq	$4,%xmm0		// then shift vector down 4 bytes
	addq	$4,%rdi
3:
	andl	$3,%edx			// more to go?
	jz	5f			// no
	movd	%xmm0,%eax		// move remainders out into %eax
4:					// loop on up to three bytes
	movb	%al,(%rdi)		// pack in next byte
	shrl	$8,%eax			// shift next byte into position
	incq	%rdi
	dec	%edx
	jnz	4b
5:	ret

// Long enough to justify aligning ptr.  Note that we have to rotate the
// pattern to account for any alignment.  We do this by doing two unaligned
// stores, and then an aligned load from the middle of the two stores.
// The aligned load will stall on a store-forwarding alignment mismatch, and
// the unaligned stores can be pretty slow too, but the alternatives aren't
// any better.  Fortunately, in most cases our caller has already aligned
// the ptr.
//      %rdx = length (> kShort)
//      %rdi = ptr (may not be aligned)
//      %xmm0 = pattern
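//
// Worked example (illustrative): if the ptr ends in ...0x3, then %rcx below
// becomes (-3 & 15) = 13.  The two unaligned stores lay down pattern bytes
// 0..15 at ptr and again at ptr+16; the aligned load at ptr+13 then picks up
// bytes 13,14,15,0,1,...,12 -- the pattern rotated by 13 -- so subsequent
// aligned 16-byte stores stay in phase with the bytes already written.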

LNotShort:
        movl    %edi,%ecx		// copy low bits of dest ptr
        negl    %ecx
        andl    $15,%ecx                // mask down to #bytes to 16-byte align
	jz	LAligned		// skip if already aligned
	movdqu	%xmm0,(%rdi)		// store 16 unaligned bytes
	movdqu	%xmm0,16(%rdi)		// and 16 more, to be sure we have an aligned chunk
	addq	%rcx,%rdi		// now point to the aligned chunk
	subq	%rcx,%rdx		// adjust remaining count
	movdqa	(%rdi),%xmm0		// get the rotated pattern (probably stalling)
	addq	$16,%rdi		// skip past the aligned chunk
	subq	$16,%rdx

// Set up for 64-byte loops.
//      %rdx = length remaining
//      %rdi = ptr (aligned)
//      %xmm0 = rotated pattern

LAligned:
	movq	%rdx,%rcx		// copy length remaining
        andl    $63,%edx                // mask down to residual length (0..63)
        andq    $-64,%rcx               // %rcx <- #bytes we will store in by-64 loop
	jz	LNoMoreChunks		// no 64-byte chunks
        addq    %rcx,%rdi               // increment ptr by length to move
	cmpq	$(kVeryLong),%rcx	// long enough to justify non-temporal stores?
	jge	LVeryLong		// yes
        negq    %rcx			// negate length to move
	jmp	1f

// Loop over 64-byte chunks, storing into cache.
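// (The chunk count was added to the ptr and then negated above, so the loop
// can address as (%rdi,%rcx) and count %rcx up toward zero: the single addq
// both advances the offset and provides the termination condition for jne,
// with no separate compare or pointer update in the inner loop.)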

	.align	4,0x90			// keep inner loops 16-byte aligned
1:
        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm0,16(%rdi,%rcx)
        movdqa  %xmm0,32(%rdi,%rcx)
        movdqa  %xmm0,48(%rdi,%rcx)
        addq    $64,%rcx
        jne     1b

	jmp	LNoMoreChunks

// Very long operands: use non-temporal stores to bypass cache.
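// (At kVeryLong -- 1 MB -- and above, the fill is presumed larger than the
// caches, so movntdq writes around them rather than evicting the caller's
// working set; the sfence below orders the weakly-ordered non-temporal
// stores before we return.)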

LVeryLong:
        negq    %rcx			// negate length to move
	jmp	1f

	.align	4,0x90			// keep inner loops 16-byte aligned
1:
        movntdq %xmm0,(%rdi,%rcx)
        movntdq %xmm0,16(%rdi,%rcx)
        movntdq %xmm0,32(%rdi,%rcx)
        movntdq %xmm0,48(%rdi,%rcx)
        addq    $64,%rcx
        jne     1b

        sfence                          // required by non-temporal stores
	jmp	LNoMoreChunks

// Handle leftovers: loop by 16.
//      %edx = length remaining (<64)
//      %rdi = ptr (aligned)
//      %xmm0 = rotated pattern

LLoopBy16:
	movdqa	%xmm0,(%rdi)		// pack in 16 more bytes
	subl	$16,%edx		// decrement count
	addq	$16,%rdi		// increment ptr
LNoMoreChunks:
	cmpl	$16,%edx		// more to go?
	jge	LLoopBy16		// yes
	jmp	LLessThan16		// handle up to 15 remaining bytes

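// Commpage registration: publish this routine in the _COMM_PAGE_MEMSET_PATTERN
// slot, gated on the kHasSSE2 capability bit (see machine/commpage.h for the
// descriptor macro's exact fields).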
	COMMPAGE_DESCRIPTOR(memset_pattern_sse2_64,_COMM_PAGE_MEMSET_PATTERN,kHasSSE2,0)