/*
 * Copyright (c) 2010-2013 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <System/machine/cpu_capabilities.h>

// C equivalent of the CPU-capabilities test used below: save the full %ymm
// state when AVX 1.0 is available, otherwise just the %xmm state.
// bool save_ymm = (*((uint32_t*)_COMM_PAGE_CPU_CAPABILITIES) & kHasAVX1_0) != 0;

#if __x86_64__
	// returns address of TLV in %rax, all other registers preserved
	#define FP_SAVE			-192	// x87 state saved by fnsave (108 bytes used of 128)
	#define VECTOR_SAVE		-704	// 512 bytes: 16 32-byte slots for %ymm0-%ymm15 (or %xmm0-%xmm15)
	#define STACK_SIZE		704

	.globl _tlv_get_addr
	.private_extern _tlv_get_addr
_tlv_get_addr:
	movq	8(%rdi),%rax			// get key from descriptor
	movq	%gs:0x0(,%rax,8),%rax	// get thread value
	testq	%rax,%rax				// if NULL, lazily allocate
	je		LlazyAllocate
	addq	16(%rdi),%rax			// add offset from descriptor
	ret
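
/*
 * For reference, a hedged C sketch of the function above, assuming the
 * three-word TLV descriptor layout that the offsets 8(%rdi) and 16(%rdi)
 * imply (field names are illustrative):
 *
 *	struct TLVDescriptor {
 *		void*			(*thunk)(struct TLVDescriptor*);
 *		unsigned long	key;
 *		unsigned long	offset;
 *	};
 *
 *	void* tlv_get_addr(struct TLVDescriptor* d) {
 *		char* buffer = (char*)pthread_getspecific(d->key);	// the %gs-relative load above
 *		if ( buffer == NULL )
 *			buffer = tlv_allocate_and_initialize_for_key(d->key);
 *		return buffer + d->offset;
 *	}
 *
 * The assembly cannot simply call pthread_getspecific() because it must
 * preserve every register except %rax, so it inlines the thread-value lookup
 * and hand-saves all volatile state around the allocation call below.
 */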
LlazyAllocate:
	pushq	%rbp
	movq	%rsp, %rbp
	subq	$STACK_SIZE,%rsp		// 704 bytes: 64 for GPRs, 128 for x87 state, 512 for vector registers
	movq	%rdi,-8(%rbp)			// save volatile GPRs across the allocation call
	movq	%rsi,-16(%rbp)
	movq	%rdx,-24(%rbp)
	movq	%rcx,-32(%rbp)
	movq	%r8,-40(%rbp)
	movq	%r9,-48(%rbp)
	movq	%r10,-56(%rbp)
	movq	%r11,-64(%rbp)
	fnsave	FP_SAVE(%rbp)			// save x87 state
	movq    $(_COMM_PAGE_CPU_CAPABILITIES), %rcx
	movl    (%rcx), %ecx
	testl   $kHasAVX1_0, %ecx
	jne     L2						// AVX present: save full %ymm state
	movdqa	%xmm0, VECTOR_SAVE+0x00(%rbp)
	movdqa	%xmm1, VECTOR_SAVE+0x10(%rbp)
	movdqa	%xmm2, VECTOR_SAVE+0x20(%rbp)
	movdqa	%xmm3, VECTOR_SAVE+0x30(%rbp)
	movdqa	%xmm4, VECTOR_SAVE+0x40(%rbp)
	movdqa	%xmm5, VECTOR_SAVE+0x50(%rbp)
	movdqa	%xmm6, VECTOR_SAVE+0x60(%rbp)
	movdqa	%xmm7, VECTOR_SAVE+0x70(%rbp)
	movdqa	%xmm8, VECTOR_SAVE+0x80(%rbp)
	movdqa	%xmm9, VECTOR_SAVE+0x90(%rbp)
	movdqa	%xmm10,VECTOR_SAVE+0xA0(%rbp)
	movdqa	%xmm11,VECTOR_SAVE+0xB0(%rbp)
	movdqa	%xmm12,VECTOR_SAVE+0xC0(%rbp)
	movdqa	%xmm13,VECTOR_SAVE+0xD0(%rbp)
	movdqa	%xmm14,VECTOR_SAVE+0xE0(%rbp)
	movdqa	%xmm15,VECTOR_SAVE+0xF0(%rbp)
	jmp		L3
L2:	vmovdqu	%ymm0, VECTOR_SAVE+0x00(%rbp)
	vmovdqu	%ymm1, VECTOR_SAVE+0x20(%rbp)
	vmovdqu	%ymm2, VECTOR_SAVE+0x40(%rbp)
	vmovdqu	%ymm3, VECTOR_SAVE+0x60(%rbp)
	vmovdqu	%ymm4, VECTOR_SAVE+0x80(%rbp)
	vmovdqu	%ymm5, VECTOR_SAVE+0xA0(%rbp)
	vmovdqu	%ymm6, VECTOR_SAVE+0xC0(%rbp)
	vmovdqu	%ymm7, VECTOR_SAVE+0xE0(%rbp)
	vmovdqu	%ymm8, VECTOR_SAVE+0x100(%rbp)
	vmovdqu	%ymm9, VECTOR_SAVE+0x120(%rbp)
	vmovdqu	%ymm10,VECTOR_SAVE+0x140(%rbp)
	vmovdqu	%ymm11,VECTOR_SAVE+0x160(%rbp)
	vmovdqu	%ymm12,VECTOR_SAVE+0x180(%rbp)
	vmovdqu	%ymm13,VECTOR_SAVE+0x1A0(%rbp)
	vmovdqu	%ymm14,VECTOR_SAVE+0x1C0(%rbp)
	vmovdqu	%ymm15,VECTOR_SAVE+0x1E0(%rbp)
L3:	movq	-32(%rbp),%rcx			// restore %rcx (clobbered by the capabilities check)
	movq	8(%rdi),%rdi			// get key from descriptor
	call	_tlv_allocate_and_initialize_for_key

	frstor	FP_SAVE(%rbp)			// restore x87 state
	movq    $(_COMM_PAGE_CPU_CAPABILITIES), %rcx
	movl    (%rcx), %ecx
	testl   $kHasAVX1_0, %ecx		// re-test AVX so the restore path matches the save path
	jne     L4
	movdqa	VECTOR_SAVE+0x00(%rbp), %xmm0
	movdqa	VECTOR_SAVE+0x10(%rbp), %xmm1
	movdqa	VECTOR_SAVE+0x20(%rbp), %xmm2
	movdqa	VECTOR_SAVE+0x30(%rbp), %xmm3
	movdqa	VECTOR_SAVE+0x40(%rbp), %xmm4
	movdqa	VECTOR_SAVE+0x50(%rbp), %xmm5
	movdqa	VECTOR_SAVE+0x60(%rbp), %xmm6
	movdqa	VECTOR_SAVE+0x70(%rbp), %xmm7
	movdqa	VECTOR_SAVE+0x80(%rbp), %xmm8
	movdqa	VECTOR_SAVE+0x90(%rbp), %xmm9
	movdqa	VECTOR_SAVE+0xA0(%rbp), %xmm10
	movdqa	VECTOR_SAVE+0xB0(%rbp), %xmm11
	movdqa	VECTOR_SAVE+0xC0(%rbp), %xmm12
	movdqa	VECTOR_SAVE+0xD0(%rbp), %xmm13
	movdqa	VECTOR_SAVE+0xE0(%rbp), %xmm14
	movdqa	VECTOR_SAVE+0xF0(%rbp), %xmm15
	jmp		L5
L4:	vmovdqu	VECTOR_SAVE+0x00(%rbp),  %ymm0
	vmovdqu	VECTOR_SAVE+0x20(%rbp),  %ymm1
	vmovdqu	VECTOR_SAVE+0x40(%rbp),  %ymm2
	vmovdqu	VECTOR_SAVE+0x60(%rbp),  %ymm3
	vmovdqu	VECTOR_SAVE+0x80(%rbp),  %ymm4
	vmovdqu	VECTOR_SAVE+0xA0(%rbp),  %ymm5
	vmovdqu	VECTOR_SAVE+0xC0(%rbp),  %ymm6
	vmovdqu	VECTOR_SAVE+0xE0(%rbp),  %ymm7
	vmovdqu	VECTOR_SAVE+0x100(%rbp), %ymm8
	vmovdqu	VECTOR_SAVE+0x120(%rbp), %ymm9
	vmovdqu	VECTOR_SAVE+0x140(%rbp), %ymm10
	vmovdqu	VECTOR_SAVE+0x160(%rbp), %ymm11
	vmovdqu	VECTOR_SAVE+0x180(%rbp), %ymm12
	vmovdqu	VECTOR_SAVE+0x1A0(%rbp), %ymm13
	vmovdqu	VECTOR_SAVE+0x1C0(%rbp), %ymm14
	vmovdqu	VECTOR_SAVE+0x1E0(%rbp), %ymm15
L5:	movq	-64(%rbp),%r11
	movq	-56(%rbp),%r10
	movq	-48(%rbp),%r9
	movq	-40(%rbp),%r8
	movq	-32(%rbp),%rcx
	movq	-24(%rbp),%rdx
	movq	-16(%rbp),%rsi
	movq	-8(%rbp),%rdi
	addq	16(%rdi),%rax			// result = buffer + offset
	addq	$STACK_SIZE,%rsp
	popq	%rbp
	ret
#endif
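
/*
 * Usage sketch (illustrative): compiled code reaches a __thread variable by
 * calling through the first word of its descriptor, which points at
 * _tlv_get_addr, with the descriptor itself as the argument:
 *
 *	struct TLVDescriptor* d = &descriptor_for_variable;	// hypothetical name
 *	int* p = (int*)d->thunk(d);		// address of this thread's copy
 *
 * That calling convention is why the descriptor arrives in %rdi above and
 * why every register except the result register must be preserved.
 */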



#if __i386__
	// returns address of TLV in %eax, all other registers (except %ecx) preserved
	.globl _tlv_get_addr
	.private_extern _tlv_get_addr
_tlv_get_addr:
	movl	4(%eax),%ecx			// get key from descriptor
	movl	%gs:0x0(,%ecx,4),%ecx	// get thread value
	testl	%ecx,%ecx				// if NULL, lazily allocate
	je		LlazyAllocate
	movl	8(%eax),%eax			// get offset from descriptor
	addl	%ecx,%eax				// add offset to buffer
	ret
LlazyAllocate:
	pushl	%ebp
	movl	%esp,%ebp
	pushl	%edx					// save edx
	subl	$548,%esp				// 512-byte fxsave area plus alignment slack and parameter space
	movl	%eax,-8(%ebp)			// save descriptor
	lea		-528(%ebp),%ecx			// get 512 byte buffer in frame
	and		$-16, %ecx				// 16-byte align buffer for fxsave
	fxsave  (%ecx)
	movl	4(%eax),%ecx			// get key from descriptor
	movl	%ecx,(%esp)				// store key parameter, also leaves stack aligned properly
	call	_tlv_allocate_and_initialize_for_key
	movl	-8(%ebp),%ecx			// get descriptor
	movl	8(%ecx),%ecx			// get offset from descriptor
	addl	%ecx,%eax				// add offset to buffer
	lea 	-528(%ebp),%ecx
	and 	$-16, %ecx				// 16-byte align buffer for fxrstor
	fxrstor	(%ecx)
	addl	$548,%esp
	popl	%edx					// restore edx
	popl	%ebp
	ret
#endif
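
/*
 * Note on the buffer math above: fxsave/fxrstor need a 16-byte-aligned
 * 512-byte area, but %ebp is only guaranteed 4-byte alignment here. The code
 * therefore reserves slack and rounds the pointer down, roughly:
 *
 *	char* buf = (char*)(((uintptr_t)ebp - 528) & ~(uintptr_t)15);
 *	fxsave(buf);	// pseudo-call standing in for the fxsave instruction
 *
 * Rounding down moves buf at most 15 bytes below %ebp-528, still above the
 * stack pointer at %ebp-552, so the area never escapes the frame.
 */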


#if 0
#if __arm__
	// returns address of TLV in r0, all other registers preserved
	.globl _tlv_get_addr
	.private_extern _tlv_get_addr
_tlv_get_addr:
	push	{r1,r2,r3,r7,lr}
	mov		r7,r0							// save descriptor in r7
	ldr		r0, [r7, #4]					// get key from descriptor
	bl		_pthread_getspecific			// get thread value
	cmp		r0, #0
	bne		L2								// if not NULL, skip allocation
	ldr		r0, [r7, #4]					// get key from descriptor
	bl		_tlv_allocate_and_initialize_for_key
L2:	ldr		r1, [r7, #8]					// get offset from descriptor
	add		r0, r1, r0						// add offset into allocation block
	pop		{r1,r2,r3,r7,pc}
#endif
#endif

	.subsections_via_symbols
