/*
 * Copyright (c) 2010-2013 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <System/machine/cpu_capabilities.h>

// bool save_xxm = (*((uint32_t*)_COMM_PAGE_CPU_CAPABILITIES) & kHasAVX1_0) != 0;

#if __x86_64__

#define RDI_SAVE_RBP			-8
#define RSI_SAVE_RBP			-16
#define RDX_SAVE_RBP			-24
#define RCX_SAVE_RBP			-32
#define RBX_SAVE_RBP			-40
#define R8_SAVE_RBP 			-48
#define R9_SAVE_RBP 			-56
#define R10_SAVE_RBP			-64
#define R11_SAVE_RBP			-72
#define STATIC_STACK_SIZE		256	// extra padding so the xsave buffer below can be 64-byte aligned

#define XMM0_SAVE_RSP			0x00
#define XMM1_SAVE_RSP			0x10
#define XMM2_SAVE_RSP			0x20
#define XMM3_SAVE_RSP			0x30
#define XMM4_SAVE_RSP			0x40
#define XMM5_SAVE_RSP			0x50
#define XMM6_SAVE_RSP			0x60
#define XMM7_SAVE_RSP			0x70


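/*
 * A rough C sketch of what _tlv_get_addr computes (for orientation only;
 * the field offsets come from the loads below, and the struct and
 * variable names here are illustrative assumptions, not dyld API):
 *
 *   struct TLVDescriptor { void* (*thunk)(); unsigned long key; unsigned long offset; };
 *
 *   void* tlv_get_addr(struct TLVDescriptor* d) {
 *       void* base = tsd[d->key];    // the %gs-relative load in the fast path
 *       if (base == NULL)            // first use on this thread: slow path
 *           base = tlv_allocate_and_initialize_for_key(d->key);
 *       return (char*)base + d->offset;
 *   }
 */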
	// returns address of TLV in %rax, all other registers preserved
	.globl _tlv_get_addr
	.private_extern _tlv_get_addr
_tlv_get_addr:
	movq	8(%rdi),%rax			// get key from descriptor
	movq	%gs:0x0(,%rax,8),%rax	// get thread value
	testq	%rax,%rax				// if NULL, lazily allocate
	je		LlazyAllocate
	addq	16(%rdi),%rax			// add offset from descriptor
	ret
LlazyAllocate:
	pushq		%rbp
	movq		%rsp,%rbp
	subq		$STATIC_STACK_SIZE,%rsp
	movq		%rdi,RDI_SAVE_RBP(%rbp)	# save registers that might be used as parameters
	movq		%rsi,RSI_SAVE_RBP(%rbp)
	movq		%rdx,RDX_SAVE_RBP(%rbp)
	movq		%rcx,RCX_SAVE_RBP(%rbp)
	movq		%rbx,RBX_SAVE_RBP(%rbp)
	movq		%r8,  R8_SAVE_RBP(%rbp)
	movq		%r9,  R9_SAVE_RBP(%rbp)
	movq		%r10,R10_SAVE_RBP(%rbp)
	movq		%r11,R11_SAVE_RBP(%rbp)

	cmpl		$0, _inited(%rip)
	jne			Linited
	movl		$0x01,%eax
	cpuid		# get cpu features to check for xsave support
	andl		$0x08000000,%ecx		# isolate the OSXSAVE bit (bit 27 of %ecx)
	movl		%ecx,_hasXSave(%rip)
	cmpl		$0, %ecx
	jne			LxsaveInfo
	movl		$1, _inited(%rip)
	jmp			Lsse

LxsaveInfo:
	movl		$0x0D,%eax
	movl		$0x00,%ecx
	cpuid		# get xsave parameter info (leaf 0xD, sub-leaf 0)
	movl		%eax,_features_lo32(%rip)
	movl		%edx,_features_hi32(%rip)
	movl		%ecx,_bufferSize32(%rip)
	movl		$1, _inited(%rip)
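	/*
	 * In rough C terms the probe above does (a sketch of the cpuid usage;
	 * variable names mirror the cached globals declared at the bottom):
	 *
	 *   cpuid(eax=1):         hasXSave = ecx & (1 << 27);       // OSXSAVE
	 *   if (hasXSave)
	 *       cpuid(eax=0xD, ecx=0):
	 *           features   = ((uint64_t)edx << 32) | eax;       // supported XCR0 bits
	 *           bufferSize = ecx;                                // max xsave area size
	 */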

Linited:
	cmpl		$0, _hasXSave(%rip)
	jne			Lxsave

Lsse:
	subq		$128, %rsp
	movdqa      %xmm0, XMM0_SAVE_RSP(%rsp)
	movdqa      %xmm1, XMM1_SAVE_RSP(%rsp)
	movdqa      %xmm2, XMM2_SAVE_RSP(%rsp)
	movdqa      %xmm3, XMM3_SAVE_RSP(%rsp)
	movdqa      %xmm4, XMM4_SAVE_RSP(%rsp)
	movdqa      %xmm5, XMM5_SAVE_RSP(%rsp)
	movdqa      %xmm6, XMM6_SAVE_RSP(%rsp)
	movdqa      %xmm7, XMM7_SAVE_RSP(%rsp)
	jmp			Lalloc

Lxsave:
	movl		_bufferSize32(%rip),%eax
	movq		%rsp, %rdi
	subq		%rax, %rdi				# allocate buffer on the stack
	andq		$-64, %rdi				# 64-byte align it, as xsave requires
	movq		%rdi, %rsp
	# xsave requires the save area (notably its header) to start out zeroed
	movq		$0, %rcx
	movq		%rdi, %r8
	movq		%rdi, %r9
	addq		%rax, %r9
Lz:	movq		%rcx, (%r8)
	addq		$8, %r8
	cmpq		%r8,%r9
	ja			Lz

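	/*
	 * The stack allocation and zeroing above, as a C sketch (names are
	 * illustrative; bufferSize is the value cached from cpuid):
	 *
	 *   uint8_t* buf = (uint8_t*)(((uintptr_t)rsp - bufferSize) & ~63UL);
	 *   for (uint8_t* p = buf; p < buf + bufferSize; p += 8)
	 *       *(uint64_t*)p = 0;
	 *   rsp = buf;                   // xsave target is (%rsp)
	 */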
	movl		_features_lo32(%rip),%eax
	movl		_features_hi32(%rip),%edx
	# xsave with buffer on stack and feature mask in edx:eax
	# note: do not use xsaveopt; it assumes the buffer was used by a
	# previous xsave and that this thread is still on the same cpu.
	xsave		(%rsp)

Lalloc:
	movq		RDI_SAVE_RBP(%rbp),%rdi
	movq		8(%rdi),%rdi		// get key from descriptor
	call		_tlv_allocate_and_initialize_for_key

	cmpl		$0, _hasXSave(%rip)
	jne			Lxrstor

	movdqa      XMM0_SAVE_RSP(%rsp),%xmm0
	movdqa      XMM1_SAVE_RSP(%rsp),%xmm1
	movdqa      XMM2_SAVE_RSP(%rsp),%xmm2
	movdqa      XMM3_SAVE_RSP(%rsp),%xmm3
	movdqa      XMM4_SAVE_RSP(%rsp),%xmm4
	movdqa      XMM5_SAVE_RSP(%rsp),%xmm5
	movdqa      XMM6_SAVE_RSP(%rsp),%xmm6
	movdqa      XMM7_SAVE_RSP(%rsp),%xmm7
	jmp			Ldone

Lxrstor:
	movq		%rax,%r11				# preserve result across xrstor
	movl		_features_lo32(%rip),%eax
	movl		_features_hi32(%rip),%edx
	# xrstor with buffer on stack and feature mask in edx:eax
	xrstor		(%rsp)
	movq		%r11,%rax

Ldone:
	movq		RDI_SAVE_RBP(%rbp),%rdi
	movq		RSI_SAVE_RBP(%rbp),%rsi
	movq		RDX_SAVE_RBP(%rbp),%rdx
	movq		RCX_SAVE_RBP(%rbp),%rcx
	movq		RBX_SAVE_RBP(%rbp),%rbx
	movq		R8_SAVE_RBP(%rbp),%r8
	movq		R9_SAVE_RBP(%rbp),%r9
	movq		R10_SAVE_RBP(%rbp),%r10
	movq		R11_SAVE_RBP(%rbp),%r11
	movq		%rbp,%rsp
	popq		%rbp
	addq		16(%rdi),%rax			// result = buffer + offset
	ret

	.data
# Cached info from cpuid.
_inited:			.long 0
_features_lo32:		.long 0
_features_hi32:		.long 0
_bufferSize32:		.long 0
_hasXSave:			.long 0
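# (Viewed from C these behave like file-scope statics, roughly
#  "static uint32_t inited, features_lo32, features_hi32, bufferSize32, hasXSave;"
#  -- an illustrative mapping, not declarations that exist elsewhere.)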

#endif


#if __i386__
	// returns address of TLV in %eax, all other registers (except %ecx) preserved
	.globl _tlv_get_addr
	.private_extern _tlv_get_addr
_tlv_get_addr:
	movl	4(%eax),%ecx			// get key from descriptor
	movl	%gs:0x0(,%ecx,4),%ecx	// get thread value
	testl	%ecx,%ecx				// if NULL, lazily allocate
	je		LlazyAllocate
	movl	8(%eax),%eax			// add offset from descriptor
	addl	%ecx,%eax
	ret
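	// Frame used by the slow path below, as implied by the offsets (a
	// sketch for orientation): -4(%ebp) holds the saved %edx, -8(%ebp)
	// the saved descriptor, a 16-byte-aligned 512-byte fxsave area sits
	// just below those, and (%esp) holds the key argument; the $548
	// adjustment also leaves %esp 16-byte aligned at the call.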
LlazyAllocate:
	pushl	%ebp
	movl	%esp,%ebp
	pushl	%edx					// save edx
	subl	$548,%esp
	movl	%eax,-8(%ebp)		    // save descriptor
	lea		-528(%ebp),%ecx		    // get 512 byte buffer in frame
	and		$-16, %ecx			    // 16-byte align buffer for fxsave
	fxsave  (%ecx)
	movl	4(%eax),%ecx			// get key from descriptor
	movl	%ecx,(%esp)	            // store key parameter; stack is already 16-byte aligned for the call
	call	_tlv_allocate_and_initialize_for_key
	movl	-8(%ebp),%ecx			// get descriptor
	movl	8(%ecx),%ecx			// get offset from descriptor
	addl	%ecx,%eax				// add offset to buffer
	lea 	-528(%ebp),%ecx
	and 	$-16, %ecx              // 16-byte align buffer for fxrstor
	fxrstor (%ecx)
	addl	$548,%esp
	popl	%edx	                // restore edx
	popl	%ebp
	ret
#endif

#if __arm64__
	// Parameters: x0 = descriptor
	// Result:  x0 = address of TLV
	// Note: all registers except x0, x16, and x17 are preserved
	.align 2
	.globl _tlv_get_addr
	.private_extern _tlv_get_addr
_tlv_get_addr:
	ldr		x16, [x0, #8]			// get key from descriptor
	mrs		x17, TPIDRRO_EL0
	and		x17, x17, #-8			// clear low 3 bits (they carry the cpu number, not part of the thread pointer)
	ldr		x17, [x17, x16, lsl #3]	// get thread allocation address for this key
	cbz		x17, LlazyAllocate		// if NULL, lazily allocate
	ldr		x16, [x0, #16]			// get offset from descriptor
	add		x0, x17, x16			// return allocation+offset
	ret		lr

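	/*
	 * A C sketch of the fast path above (illustrative names; the masking
	 * recovers the thread-local storage base from TPIDRRO_EL0):
	 *
	 *   void** tsd  = (void**)(TPIDRRO_EL0 & ~7UL);
	 *   void*  base = tsd[d->key];
	 *   return base ? (void*)((char*)base + d->offset) : slow_path(d);
	 */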
LlazyAllocate:
	stp		fp, lr, [sp, #-16]!
	mov		fp, sp
	sub		sp, sp, #288
	stp		x1, x2, [sp, #-16]!		// save all registers that C function might trash
	stp		x3, x4, [sp, #-16]!
	stp		x5, x6, [sp, #-16]!
	stp		x7, x8, [sp, #-16]!
	stp		x9, x10,  [sp, #-16]!
	stp		x11, x12, [sp, #-16]!
	stp		x13, x14, [sp, #-16]!
	stp		x15, x16, [sp, #-16]!
	stp		q0,  q1,  [sp, #-32]!
	stp		q2,  q3,  [sp, #-32]!
	stp		q4,  q5,  [sp, #-32]!
	stp		q6,  q7,  [sp, #-32]!
	stp		x0, x17,  [sp, #-16]!	// save descriptor

	mov		x0, x16					// use key from descriptor as parameter
	bl		_tlv_allocate_and_initialize_for_key
	ldp		x16, x17, [sp], #16		// pop descriptor
	ldr		x16, [x16, #16]			// get offset from descriptor
	add		x0, x0, x16				// return allocation+offset

	ldp		q6,  q7,  [sp], #32
	ldp		q4,  q5,  [sp], #32
	ldp		q2,  q3,  [sp], #32
	ldp		q0,  q1,  [sp], #32
	ldp		x15, x16, [sp], #16
	ldp		x13, x14, [sp], #16
	ldp		x11, x12, [sp], #16
	ldp		x9, x10,  [sp], #16
	ldp		x7, x8, [sp], #16
	ldp		x5, x6, [sp], #16
	ldp		x3, x4, [sp], #16
	ldp		x1, x2, [sp], #16

	mov		sp, fp
	ldp		fp, lr, [sp], #16
	ret		lr

#endif

#if 0
#if __arm__
	// returns address of TLV in r0, all other registers preserved
	.globl _tlv_get_addr
	.private_extern _tlv_get_addr
_tlv_get_addr:
	push	{r1,r2,r3,r7,lr}
	mov		r7,r0							// save descriptor in r7
	ldr		r0, [r7, #4]					// get key from descriptor
	bl		_pthread_getspecific			// get thread value
	cmp		r0, #0
	bne		L2								// if not NULL, skip the lazy allocation
	ldr		r0, [r7, #4]					// get key from descriptor
	bl		_tlv_allocate_and_initialize_for_key
L2:	ldr		r1, [r7, #8]					// get offset from descriptor
	add		r0, r1, r0						// add offset into allocation block
	pop		{r1,r2,r3,r7,pc}
#endif
#endif

	.subsections_via_symbols
