1160814Ssimon#if defined(__SUNPRO_C) && defined(__sparcv9)
2160814Ssimon# define ABI64  /* They've said -xarch=v9 at command line */
3160814Ssimon#elif defined(__GNUC__) && defined(__arch64__)
4160814Ssimon# define ABI64  /* They've said -m64 at command line */
5160814Ssimon#endif
6160814Ssimon
/*
 * Per-ABI parameters used by the routines in this file:
 *   FRAME - the value every "save" in this file adds to %sp
 *           (-192 in the 64-bit build, -96 otherwise);
 *   BIAS  - the offset added to %sp or %fp whenever a stack address is
 *           formed (2047 in the 64-bit build, 0 otherwise).
 * The 64-bit build additionally declares %g2 and %g3 as scratch
 * registers; this file overwrites both of them.
 */
7160814Ssimon#ifdef ABI64
8160814Ssimon  .register	%g2,#scratch
9160814Ssimon  .register	%g3,#scratch
10160814Ssimon# define	FRAME	-192
11160814Ssimon# define	BIAS	2047
12160814Ssimon#else
13160814Ssimon# define	FRAME	-96
14160814Ssimon# define	BIAS	0
15160814Ssimon#endif
16160814Ssimon
17160814Ssimon.text
18160814Ssimon.align	32
19160814Ssimon.global	OPENSSL_wipe_cpu
20160814Ssimon.type	OPENSSL_wipe_cpu,#function
! OPENSSL_wipe_cpu - overwrite the CPU registers with zeros.
!   In:   no argument register is read.
!   Out:  the %o0 seen by the caller receives %fp + BIAS of the frame
!         opened here (see the last instruction before "ret").
!   Note: %o6, %i6, %i7, %g6 and %g7 are not written by this routine.
21160814Ssimon! Keep in mind that this does not excuse us from wiping the stack!
22160814Ssimon! This routine wipes registers, but not the backing store [which
23160814Ssimon! resides on the stack, toward lower addresses]. To facilitate for
24160814Ssimon! stack wiping I return pointer to the top of stack of the *caller*.
25160814SsimonOPENSSL_wipe_cpu:
26160814Ssimon	save	%sp,FRAME,%sp
27160814Ssimon	nop
	! Take care of the other register windows first. With __sun defined
	! this is a software trap (ST_CLEAN_WINDOWS comes from sys/trap.h and
	! presumably asks the kernel to clean the windows - confirm);
	! otherwise the windows are walked by the recursive helper.
28160814Ssimon#ifdef __sun
29160814Ssimon#include <sys/trap.h>
30160814Ssimon	ta	ST_CLEAN_WINDOWS
31160814Ssimon#else
32160814Ssimon	call	.walk.reg.wins
33160814Ssimon#endif
34160814Ssimon	nop
	! Position-independent address of .zero: the delay slot loads the
	! distance from the call instruction to .zero into %o0, and the
	! helper adds %o7 (the address of that call) to it.
35160814Ssimon	call	.PIC.zero.up
36160814Ssimon	mov	.zero-(.-4),%o0
	! %f0 = %f1 = 0; every other FP register below is copied from them.
37238405Sjkim	ld	[%o0],%f0
38238405Sjkim	ld	[%o0],%f1
39160814Ssimon
	! 0 - 1 sets the "negative" and "borrow" condition codes that the
	! V8/V9 probe below reads back.
40160814Ssimon	subcc	%g0,1,%o0
41160814Ssimon	! Following is V9 "rd %ccr,%o0" instruction. However! V8
42160814Ssimon	! specification says that it ("rd %asr2,%o0" in V8 terms) does
43160814Ssimon	! not cause illegal_instruction trap. It therefore can be used
44160814Ssimon	! to determine if the CPU the code is executing on is V8- or
45160814Ssimon	! V9-compliant, as V9 returns a distinct value of 0x99,
46160814Ssimon	! "negative" and "borrow" bits set in both %icc and %xcc.
47160814Ssimon	.word	0x91408000	!rd	%ccr,%o0
48160814Ssimon	cmp	%o0,0x99
	! Anything other than 0x99: skip the V9-only upper FP bank.
49160814Ssimon	bne	.v8
50160814Ssimon	nop
51160814Ssimon			! Even though we do not use %fp register bank,
52160814Ssimon			! we wipe it as memcpy might have used it...
			! The instructions are emitted as raw words; each one
			! copies the zero double in %f0 to one of %f62 ... %f32.
53160814Ssimon			.word	0xbfa00040	!fmovd	%f0,%f62
54160814Ssimon			.word	0xbba00040	!...
55160814Ssimon			.word	0xb7a00040
56160814Ssimon			.word	0xb3a00040
57160814Ssimon			.word	0xafa00040
58160814Ssimon			.word	0xaba00040
59160814Ssimon			.word	0xa7a00040
60160814Ssimon			.word	0xa3a00040
61160814Ssimon			.word	0x9fa00040
62160814Ssimon			.word	0x9ba00040
63160814Ssimon			.word	0x97a00040
64160814Ssimon			.word	0x93a00040
65160814Ssimon			.word	0x8fa00040
66160814Ssimon			.word	0x8ba00040
67160814Ssimon			.word	0x87a00040
68160814Ssimon			.word	0x83a00040	!fmovd	%f0,%f32
	! Common part: single-precision copies of zero into %f31 ... %f2
	! (right column) interleaved with clearing of the integer registers
	! (left column): outputs, locals, inputs and %g1 ... %g5.
69160814Ssimon.v8:			fmovs	%f1,%f31
70160814Ssimon	clr	%o0
71160814Ssimon			fmovs	%f0,%f30
72160814Ssimon	clr	%o1
73160814Ssimon			fmovs	%f1,%f29
74160814Ssimon	clr	%o2
75160814Ssimon			fmovs	%f0,%f28
76160814Ssimon	clr	%o3
77160814Ssimon			fmovs	%f1,%f27
78160814Ssimon	clr	%o4
79160814Ssimon			fmovs	%f0,%f26
80160814Ssimon	clr	%o5
81160814Ssimon			fmovs	%f1,%f25
82160814Ssimon	clr	%o7
83160814Ssimon			fmovs	%f0,%f24
84160814Ssimon	clr	%l0
85160814Ssimon			fmovs	%f1,%f23
86160814Ssimon	clr	%l1
87160814Ssimon			fmovs	%f0,%f22
88160814Ssimon	clr	%l2
89160814Ssimon			fmovs	%f1,%f21
90160814Ssimon	clr	%l3
91160814Ssimon			fmovs	%f0,%f20
92160814Ssimon	clr	%l4
93160814Ssimon			fmovs	%f1,%f19
94160814Ssimon	clr	%l5
95160814Ssimon			fmovs	%f0,%f18
96160814Ssimon	clr	%l6
97160814Ssimon			fmovs	%f1,%f17
98160814Ssimon	clr	%l7
99160814Ssimon			fmovs	%f0,%f16
100160814Ssimon	clr	%i0
101160814Ssimon			fmovs	%f1,%f15
102160814Ssimon	clr	%i1
103160814Ssimon			fmovs	%f0,%f14
104160814Ssimon	clr	%i2
105160814Ssimon			fmovs	%f1,%f13
106160814Ssimon	clr	%i3
107160814Ssimon			fmovs	%f0,%f12
108160814Ssimon	clr	%i4
109160814Ssimon			fmovs	%f1,%f11
110160814Ssimon	clr	%i5
111160814Ssimon			fmovs	%f0,%f10
112160814Ssimon	clr	%g1
113160814Ssimon			fmovs	%f1,%f9
114160814Ssimon	clr	%g2
115160814Ssimon			fmovs	%f0,%f8
116160814Ssimon	clr	%g3
117160814Ssimon			fmovs	%f1,%f7
118160814Ssimon	clr	%g4
119160814Ssimon			fmovs	%f0,%f6
120160814Ssimon	clr	%g5
121160814Ssimon			fmovs	%f1,%f5
122160814Ssimon			fmovs	%f0,%f4
123160814Ssimon			fmovs	%f1,%f3
124160814Ssimon			fmovs	%f0,%f2
125160814Ssimon
126291721Sjkim	add	%fp,BIAS,%i0	! return pointer to the top of stack of the caller
127160814Ssimon
	! "restore" executes in the delay slot of "ret" and switches back to
	! the register window of the caller.
128160814Ssimon	ret
129160814Ssimon	restore
130160814Ssimon
! .zero holds two zero words; OPENSSL_wipe_cpu loads them into %f0 and %f1.
! .PIC.zero.up - leaf helper that turns a call-relative offset into an
! absolute address.
!   In:   %o0 = offset relative to the calling instruction,
!         %o7 = address of that call (set by "call").
!   Out:  %o0 = %o0 + %o7 (the add sits in the delay slot of "retl").
131160814Ssimon.zero:	.long	0x0,0x0
132160814Ssimon.PIC.zero.up:
133160814Ssimon	retl
134160814Ssimon	add	%o0,%o7,%o0
135160814Ssimon#ifdef DEBUG
136160814Ssimon.global	walk_reg_wins
137160814Ssimon.type	walk_reg_wins,#function
138160814Ssimonwalk_reg_wins:
139160814Ssimon#endif
! .walk.reg.wins - recursive helper that clears the local and output
! registers of successive register windows. With DEBUG defined the same
! code is also exported under the name walk_reg_wins.
!   Out:  on the cleaning path the %o0 of the caller is set to the value
!         returned by the nested call plus one, or to 1 when no nested
!         call was made; on the early exit at label 2 it is left alone.
140160814Ssimon.walk.reg.wins:
141160814Ssimon	save	%sp,FRAME,%sp
	! Leave at once when the %o7 found in the fresh window already equals
	! the return address of this invocation (%i7). The delay slot zeroes
	! %o0 whether or not the branch is taken.
142160814Ssimon	cmp	%i7,%o7
143160814Ssimon	be	2f
144160814Ssimon	clr	%o0
	! A zero %o7 ends the recursion, but this window is still cleaned.
145160814Ssimon	cmp	%o7,0	! compiler never cleans %o7...
146160814Ssimon	be	1f	! could have been a leaf function...
147160814Ssimon	clr	%o1
	! Otherwise descend one more window; the result comes back in %o0.
148160814Ssimon	call	.walk.reg.wins
149160814Ssimon	nop
	! Clear the remaining outputs and all locals. %o0 carries the count,
	! %o1 was cleared in a delay slot above and %o6 is not written.
150160814Ssimon1:	clr	%o2
151160814Ssimon	clr	%o3
152160814Ssimon	clr	%o4
153160814Ssimon	clr	%o5
154160814Ssimon	clr	%o7
155160814Ssimon	clr	%l0
156160814Ssimon	clr	%l1
157160814Ssimon	clr	%l2
158160814Ssimon	clr	%l3
159160814Ssimon	clr	%l4
160160814Ssimon	clr	%l5
161160814Ssimon	clr	%l6
162160814Ssimon	clr	%l7
163160814Ssimon	add	%o0,1,%i0	! used for debugging
164160814Ssimon2:	ret
165160814Ssimon	restore
! The size of OPENSSL_wipe_cpu is computed here, so it also spans the
! .zero data and both helpers that follow the main routine.
166160814Ssimon.size	OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
167160814Ssimon
168160814Ssimon.global	OPENSSL_atomic_add
169160814Ssimon.type	OPENSSL_atomic_add,#function
170238405Sjkim.align	32
! OPENSSL_atomic_add - atomically add a value to a 32-bit word in memory.
!   In:   %o0 = address of the word, %o1 = value to add.
!   Out:  %o0 = the updated value, sign-extended from 32 bits.
!   Presumably declared in C as int OPENSSL_atomic_add(int *, int);
!   TODO confirm against the declaring header.
! A 32-bit build probes the CPU at run time: V9 hardware takes the
! compare-and-swap loop at .v9, anything else takes a lock built on
! "swap". A 64-bit build contains the compare-and-swap loop only.
171160814SsimonOPENSSL_atomic_add:
172160814Ssimon#ifndef ABI64
173160814Ssimon	subcc	%g0,1,%o2
174160814Ssimon	.word	0x95408000	!rd	%ccr,%o2, see comment above
175160814Ssimon	cmp	%o2,0x99
176160814Ssimon	be	.v9
177160814Ssimon	nop
	! V8 path: open a frame (the yield routine may be called) and jump
	! over the yield call straight to the first attempt.
178160814Ssimon	save	%sp,FRAME,%sp
179160814Ssimon	ba	.enter
180160814Ssimon	nop
181160814Ssimon#ifdef __sun
182238405Sjkim! Note that you do not have to link with libthread to call thr_yield,
183160814Ssimon! as libc provides a stub, which is overloaded the moment you link
184160814Ssimon! with *either* libpthread or libthread...
185160814Ssimon#define	YIELD_CPU	thr_yield
186160814Ssimon#else
187160814Ssimon! applies at least to Linux and FreeBSD... Feedback expected...
188160814Ssimon#define	YIELD_CPU	sched_yield
189160814Ssimon#endif
	! Back off by yielding the processor, then try again.
190160814Ssimon.spin:	call	YIELD_CPU
191160814Ssimon	nop
	! NOTE(review): the pre-check below compares the current value with
	! -4096, whereas the marker stored by the swap and tested afterwards
	! is -1 - confirm that the two constants are meant to differ.
192160814Ssimon.enter:	ld	[%i0],%i2
193160814Ssimon	cmp	%i2,-4096
194160814Ssimon	be	.spin
195160814Ssimon	mov	-1,%i2
	! Store the marker -1 and fetch the previous contents in one step.
196160814Ssimon	swap	[%i0],%i2
	! Previous contents of -1: the word already held the marker, retry.
	! The delay slot computes old + addend; the result is only used on
	! the fall-through path.
197160814Ssimon	cmp	%i2,-1
198160814Ssimon	be	.spin
199160814Ssimon	add	%i2,%i1,%i2
	! Storing the sum replaces the marker, i.e. publishes the result and
	! releases the word at the same time.
200160814Ssimon	stbar
201160814Ssimon	st	%i2,[%i0]
	! Shift by zero: same idiom as the final instruction of the V9 path.
202160814Ssimon	sra	%i2,%g0,%i0
203160814Ssimon	ret
204160814Ssimon	restore
205160814Ssimon.v9:
206160814Ssimon#endif
	! V9 path (leaf, no frame): %o2 = expected old value, %o3 = proposed
	! new value. After cas %o3 holds what was found in memory; when it
	! differs from %o2 the loop restarts with that value as the new
	! expectation (the mov in the delay slot runs in either case).
207160814Ssimon	ld	[%o0],%o2
208160814Ssimon1:	add	%o1,%o2,%o3
209160814Ssimon	.word	0xd7e2100a	!cas [%o0],%o2,%o3, compare [%o0] with %o2 and swap %o3
210160814Ssimon	cmp	%o2,%o3
211160814Ssimon	bne	1b
212160814Ssimon	mov	%o3,%o2		! cas is always fetching to dest. register
213160814Ssimon	add	%o1,%o2,%o0	! OpenSSL expects the new value
214160814Ssimon	retl
215160814Ssimon	sra	%o0,%g0,%o0	! we return signed int, remember?
216160814Ssimon.size	OPENSSL_atomic_add,.-OPENSSL_atomic_add
217160814Ssimon
218238405Sjkim.global	_sparcv9_rdtick
219238405Sjkim.align	32
! _sparcv9_rdtick - read the %tick register when the CPU is V9.
!   In:   nothing.
!   Out:  V9: %o0 = value of %tick, %o1 = that value shifted right by 32;
!         otherwise: %o0 = 0 and %o1 = 0.
220238405Sjkim_sparcv9_rdtick:
	! Same V8/V9 probe as in OPENSSL_wipe_cpu: set the flags with 0 - 1,
	! read them back and expect 0x99 on V9.
221160814Ssimon	subcc	%g0,1,%o0
222160814Ssimon	.word	0x91408000	!rd	%ccr,%o0
223160814Ssimon	cmp	%o0,0x99
	! The xor in the delay slot zeroes %o0 on both paths; on the V9 path
	! it is overwritten by the tick value right after.
224238405Sjkim	bne	.notick
225160814Ssimon	xor	%o0,%o0,%o0
226238405Sjkim	.word	0x91410000	!rd	%tick,%o0
227238405Sjkim	retl
228238405Sjkim	.word	0x93323020	!srlx	%o0,32,%o1
229238405Sjkim.notick:
230238405Sjkim	retl
231238405Sjkim	xor	%o1,%o1,%o1
232238405Sjkim.type	_sparcv9_rdtick,#function
233238405Sjkim.size	_sparcv9_rdtick,.-_sparcv9_rdtick
234160814Ssimon
235238405Sjkim.global	_sparcv9_vis1_probe
236238405Sjkim.align	8
! _sparcv9_vis1_probe - execute two instructions that are emitted as raw
! words and annotated as VIS1 (a 16-bit FP load through ASI_FP16_P and
! fxor). Presumably the caller arranges to catch the illegal-instruction
! trap on CPUs without VIS1 - confirm at the call site.
!   In:   nothing.
!   Out:  no integer result is set up; %o1 and %f0 are overwritten
!         (%f0 ends up zero because of the final fxor).
237238405Sjkim_sparcv9_vis1_probe:
	! The load address is derived from %sp (plus BIAS, plus 2).
238238405Sjkim	add	%sp,BIAS+2,%o1
239246772Sjkim	.word	0xc19a5a40	!ldda	[%o1]ASI_FP16_P,%f0
240160814Ssimon	retl
241246772Sjkim	.word	0x81b00d80	!fxor	%f0,%f0,%f0
242238405Sjkim.type	_sparcv9_vis1_probe,#function
243238405Sjkim.size	_sparcv9_vis1_probe,.-_sparcv9_vis1_probe
244238405Sjkim
245238405Sjkim! Probe and instrument VIS1 instruction. Output is number of cycles it
246238405Sjkim! takes to execute rdtick and pair of VIS1 instructions. US-Tx VIS unit
247238405Sjkim! is slow (documented to be 6 cycles on T2) and the core is in-order
248238405Sjkim! single-issue, it should be possible to distinguish Tx reliably...
249238405Sjkim! Observed return values are:
250238405Sjkim!
251238405Sjkim!	UltraSPARC IIe		7
252238405Sjkim!	UltraSPARC III		7
253238405Sjkim!	UltraSPARC T1		24
254238405Sjkim!
255238405Sjkim! Numbers for T2 and SPARC64 V-VII are more than welcomed.
256238405Sjkim!
257238405Sjkim! It would be possible to detect specifically US-T1 by instrumenting
258238405Sjkim! fmul8ulx16, which is emulated on T1 and as such accounts for quite
259238405Sjkim! a lot of %tick-s, couple of thousand on Linux...
!
!   In:   nothing.
!   Out:  %o0 = smallest of the four measured %tick intervals.
!   Overwrites %o1 ... %o4, %f0, %f2 and the condition codes.
260238405Sjkim.global	_sparcv9_vis1_instrument
261238405Sjkim.align	8
262238405Sjkim_sparcv9_vis1_instrument:
	! Five %tick samples (%o0 ... %o4) with one fxor pair between each
	! two consecutive samples.
263238405Sjkim	.word	0x91410000	!rd	%tick,%o0
264238405Sjkim	.word	0x81b00d80	!fxor	%f0,%f0,%f0
265238405Sjkim	.word	0x85b08d82	!fxor	%f2,%f2,%f2
266238405Sjkim	.word	0x93410000	!rd	%tick,%o1
267238405Sjkim	.word	0x81b00d80	!fxor	%f0,%f0,%f0
268238405Sjkim	.word	0x85b08d82	!fxor	%f2,%f2,%f2
269238405Sjkim	.word	0x95410000	!rd	%tick,%o2
270238405Sjkim	.word	0x81b00d80	!fxor	%f0,%f0,%f0
271238405Sjkim	.word	0x85b08d82	!fxor	%f2,%f2,%f2
272238405Sjkim	.word	0x97410000	!rd	%tick,%o3
273238405Sjkim	.word	0x81b00d80	!fxor	%f0,%f0,%f0
274238405Sjkim	.word	0x85b08d82	!fxor	%f2,%f2,%f2
275238405Sjkim	.word	0x99410000	!rd	%tick,%o4
276238405Sjkim
277238405Sjkim	! calculate intervals
278238405Sjkim	sub	%o1,%o0,%o0
279238405Sjkim	sub	%o2,%o1,%o1
280238405Sjkim	sub	%o3,%o2,%o2
281238405Sjkim	sub	%o4,%o3,%o3
282238405Sjkim
283238405Sjkim	! find minimum value
	! Each annulled branch targets the instruction right after its own
	! delay slot, so the mov in the slot runs only when the unsigned
	! 64-bit comparison finds %o0 greater: a conditional move.
284238405Sjkim	cmp	%o0,%o1
285238405Sjkim	.word	0x38680002	!bgu,a	%xcc,.+8
286238405Sjkim	mov	%o1,%o0
287238405Sjkim	cmp	%o0,%o2
288238405Sjkim	.word	0x38680002	!bgu,a	%xcc,.+8
289238405Sjkim	mov	%o2,%o0
290238405Sjkim	cmp	%o0,%o3
291238405Sjkim	.word	0x38680002	!bgu,a	%xcc,.+8
292238405Sjkim	mov	%o3,%o0
293238405Sjkim
294238405Sjkim	retl
295160814Ssimon	nop
296238405Sjkim.type	_sparcv9_vis1_instrument,#function
297238405Sjkim.size	_sparcv9_vis1_instrument,.-_sparcv9_vis1_instrument
298238405Sjkim
299238405Sjkim.global	_sparcv9_vis2_probe
300238405Sjkim.align	8
! _sparcv9_vis2_probe - execute a single instruction emitted as a raw
! word and annotated as bshuffle, in the delay slot of the return.
! Presumably the caller catches the illegal-instruction trap on CPUs
! that lack the instruction - confirm at the call site.
!   In:   nothing.
!   Out:  no integer result is set up; %f0 is the destination of the
!         probed instruction.
301238405Sjkim_sparcv9_vis2_probe:
302238405Sjkim	retl
303238405Sjkim	.word	0x81b00980	!bshuffle	%f0,%f0,%f0
304238405Sjkim.type	_sparcv9_vis2_probe,#function
305238405Sjkim.size	_sparcv9_vis2_probe,.-_sparcv9_vis2_probe
306238405Sjkim
307238405Sjkim.global	_sparcv9_fmadd_probe
308238405Sjkim.align	8
! _sparcv9_fmadd_probe - execute an instruction emitted as a raw word and
! annotated as fmaddd, in the delay slot of the return. Presumably the
! caller catches the illegal-instruction trap on CPUs that lack the
! instruction - confirm at the call site.
!   In:   nothing.
!   Out:  no integer result is set up; %f0 and %f2 are overwritten.
309238405Sjkim_sparcv9_fmadd_probe:
	! Zero both operands first so that the probed instruction works on
	! known values.
310238405Sjkim	.word	0x81b00d80	!fxor	%f0,%f0,%f0
311238405Sjkim	.word	0x85b08d82	!fxor	%f2,%f2,%f2
312238405Sjkim	retl
313238405Sjkim	.word	0x81b80440	!fmaddd	%f0,%f0,%f2,%f0
314238405Sjkim.type	_sparcv9_fmadd_probe,#function
315238405Sjkim.size	_sparcv9_fmadd_probe,.-_sparcv9_fmadd_probe
316238405Sjkim
317238405Sjkim.global	OPENSSL_cleanse
318238405Sjkim.align	32
! OPENSSL_cleanse - fill a memory region with zero bytes.
!   In:   %o0 = start address, %o1 = length in bytes.
!   Out:  nothing; %o0 and %o1 are advanced/consumed, %g1 is overwritten
!         on the 32-bit build, the condition codes are overwritten.
!   Presumably declared in C as void OPENSSL_cleanse(void *, size_t);
!   TODO confirm against the declaring header.
! Lengths up to 14 are cleared byte by byte. Longer regions are first
! brought to an aligned address with byte stores, then cleared with
! 8-byte stores (V9) or 4-byte stores (otherwise), and any remainder is
! finished byte by byte.
319238405SjkimOPENSSL_cleanse:
320238405Sjkim	cmp	%o1,14
321238405Sjkim	nop
322238405Sjkim#ifdef ABI64
323238405Sjkim	bgu	%xcc,.Lot
324238405Sjkim#else
325238405Sjkim	bgu	.Lot
326238405Sjkim#endif
	! Delay slot of the branch above; it also sets up the zero-length
	! test for the fall-through path.
327238405Sjkim	cmp	%o1,0
328238405Sjkim	bne	.Little
329238405Sjkim	nop
330238405Sjkim	retl
331238405Sjkim	nop
332238405Sjkim
	! Byte loop: store a zero, count down, advance the pointer in the
	! delay slot. Entered only with a non-zero count.
333238405Sjkim.Little:
334238405Sjkim	stb	%g0,[%o0]
335238405Sjkim	subcc	%o1,1,%o1
336238405Sjkim	bnz	.Little
337238405Sjkim	add	%o0,1,%o0
338238405Sjkim	retl
339238405Sjkim	nop
340238405Sjkim.align	32
341238405Sjkim.Lot:
342238405Sjkim#ifndef ABI64
	! 32-bit build only: run-time V8/V9 probe, %g1 is the scratch.
343238405Sjkim	subcc	%g0,1,%g1
344238405Sjkim	! see above for explanation
345238405Sjkim	.word	0x83408000	!rd	%ccr,%g1
346238405Sjkim	cmp	%g1,0x99
347238405Sjkim	bne	.v8lot
348238405Sjkim	nop
349238405Sjkim#endif
350238405Sjkim
	! V9: clear single bytes until the address is a multiple of 8.
351238405Sjkim.v9lot:	andcc	%o0,7,%g0
352238405Sjkim	bz	.v9aligned
353238405Sjkim	nop
354238405Sjkim	stb	%g0,[%o0]
355238405Sjkim	sub	%o1,1,%o1
356238405Sjkim	ba	.v9lot
357238405Sjkim	add	%o0,1,%o0
	! Pad up to the loop entry with the fill word given to .align.
358238405Sjkim.align	16,0x01000000
	! 8-byte store loop; it repeats while the remaining count still has
	! a bit set at or above bit 3. The loop branch is emitted as a raw
	! word with a fixed displacement of -3 instructions, so no instruction
	! may be added or removed between .v9aligned and that branch.
359238405Sjkim.v9aligned:
360238405Sjkim	.word	0xc0720000	!stx	%g0,[%o0]
361238405Sjkim	sub	%o1,8,%o1
362238405Sjkim	andcc	%o1,-8,%g0
363238405Sjkim#ifdef ABI64
364238405Sjkim	.word	0x126ffffd	!bnz	%xcc,.v9aligned
365238405Sjkim#else
366238405Sjkim	.word	0x124ffffd	!bnz	%icc,.v9aligned
367238405Sjkim#endif
368238405Sjkim	add	%o0,8,%o0
369238405Sjkim
	! Finish whatever is left with the byte loop.
370238405Sjkim	cmp	%o1,0
371238405Sjkim	bne	.Little
372238405Sjkim	nop
373238405Sjkim	retl
374238405Sjkim	nop
375238405Sjkim#ifndef ABI64
	! Non-V9 variant of the same scheme: align to 4, then 4-byte stores.
376238405Sjkim.v8lot:	andcc	%o0,3,%g0
377238405Sjkim	bz	.v8aligned
378238405Sjkim	nop
379238405Sjkim	stb	%g0,[%o0]
380238405Sjkim	sub	%o1,1,%o1
381238405Sjkim	ba	.v8lot
382238405Sjkim	add	%o0,1,%o0
383238405Sjkim	nop
384238405Sjkim.v8aligned:
385238405Sjkim	st	%g0,[%o0]
386238405Sjkim	sub	%o1,4,%o1
387238405Sjkim	andcc	%o1,-4,%g0
388238405Sjkim	bnz	.v8aligned
389238405Sjkim	add	%o0,4,%o0
390238405Sjkim
391238405Sjkim	cmp	%o1,0
392238405Sjkim	bne	.Little
393238405Sjkim	nop
394238405Sjkim	retl
395238405Sjkim	nop
396238405Sjkim#endif
397238405Sjkim.type	OPENSSL_cleanse,#function
398238405Sjkim.size	OPENSSL_cleanse,.-OPENSSL_cleanse
399238405Sjkim
400238405Sjkim.section	".init",#alloc,#execinstr
	! Fragment placed in the .init section: it calls OPENSSL_cpuid_setup,
	! which is not defined in the code shown here; the nop fills the
	! delay slot of the call. Presumably this makes the setup routine run
	! when the object is loaded - confirm against the link arrangement.
401238405Sjkim	call	OPENSSL_cpuid_setup
402238405Sjkim	nop
403