/*
 * ABI selection.  ABI64 is defined when this file is preprocessed for the
 * 64-bit ABI: Sun Studio announces it with __sparcv9 (-xarch=v9 on the
 * command line), GCC with __arch64__ (-m64 on the command line).
 */
#if (defined(__SUNPRO_C) && defined(__sparcv9)) || \
    (defined(__GNUC__) && defined(__arch64__))
# define ABI64
#endif

/*
 * FRAME is the stack adjustment handed to every "save" in this file and
 * BIAS is the constant added to %sp/%fp to form an address (2047 when
 * ABI64 is defined, 0 otherwise).  Under ABI64 %g2 and %g3 are also
 * declared as scratch registers.
 */
#if defined(ABI64)
  .register	%g2,#scratch
  .register	%g3,#scratch
# define	FRAME	-192
# define	BIAS	2047
#else
# define	FRAME	-96
# define	BIAS	0
#endif
16
.text
.align	32
.global	OPENSSL_wipe_cpu
.type	OPENSSL_wipe_cpu,#function
! OPENSSL_wipe_cpu
!
! Zeroes the integer registers of the current window, %g1-%g5 and the
! floating-point register file, then returns a pointer to the top of
! stack of the *caller* (the frame pointer of this routine plus the
! stack bias).
!
! Keep in mind that only registers are wiped. The backing store for
! the register windows resides on the stack, toward lower addresses,
! and is not touched here, so calling this routine does not excuse the
! caller from wiping the stack. The returned pointer is there to
! facilitate exactly that.
OPENSSL_wipe_cpu:
	save	%sp,FRAME,%sp
	nop
#if defined(__sun)
#include <sys/trap.h>
	ta	ST_CLEAN_WINDOWS	! clean-windows software trap
#else
	call	.walk.reg.wins		! walk and clear the windows by hand
#endif
	nop

	! Form the address of .zero position-independently: the delay
	! slot computes .zero minus the address of the call instruction
	! and the helper adds %o7 (the address of the call) back in.
	call	.PIC.zero.up
	mov	.zero-(.-4),%o0
	ld	[%o0],%f0		! %f0 = 0
	ld	[%o0],%f1		! %f1 = 0

	! V8/V9 detection. The word below is the V9 instruction
	! rd %ccr,%o0. The V8 specification says that the same encoding
	! (rd %asr2,%o0 in V8 terms) does not cause an
	! illegal_instruction trap, so it can be used to find out
	! whether the CPU is V8- or V9-compliant: after the subcc a V9
	! CPU returns the distinct value 0x99, i.e. the negative and
	! borrow bits set in both %icc and %xcc.
	subcc	%g0,1,%o0
	.word	0x91408000	!rd	%ccr,%o0
	cmp	%o0,0x99
	bne	.v8
	nop

	! V9 only: upper half of the FP register file. Even though the
	! %fp register bank is not used by this code, it is wiped
	! because memcpy might have used it...
	.word	0xbfa00040	!fmovd	%f0,%f62
	.word	0xbba00040	!fmovd	%f0,%f60
	.word	0xb7a00040	!fmovd	%f0,%f58
	.word	0xb3a00040	!fmovd	%f0,%f56
	.word	0xafa00040	!fmovd	%f0,%f54
	.word	0xaba00040	!fmovd	%f0,%f52
	.word	0xa7a00040	!fmovd	%f0,%f50
	.word	0xa3a00040	!fmovd	%f0,%f48
	.word	0x9fa00040	!fmovd	%f0,%f46
	.word	0x9ba00040	!fmovd	%f0,%f44
	.word	0x97a00040	!fmovd	%f0,%f42
	.word	0x93a00040	!fmovd	%f0,%f40
	.word	0x8fa00040	!fmovd	%f0,%f38
	.word	0x8ba00040	!fmovd	%f0,%f36
	.word	0x87a00040	!fmovd	%f0,%f34
	.word	0x83a00040	!fmovd	%f0,%f32

	! Common to V8 and V9: %f2-%f31, copied from the zeroed %f0/%f1.
.v8:
	fmovs	%f1,%f31
	fmovs	%f0,%f30
	fmovs	%f1,%f29
	fmovs	%f0,%f28
	fmovs	%f1,%f27
	fmovs	%f0,%f26
	fmovs	%f1,%f25
	fmovs	%f0,%f24
	fmovs	%f1,%f23
	fmovs	%f0,%f22
	fmovs	%f1,%f21
	fmovs	%f0,%f20
	fmovs	%f1,%f19
	fmovs	%f0,%f18
	fmovs	%f1,%f17
	fmovs	%f0,%f16
	fmovs	%f1,%f15
	fmovs	%f0,%f14
	fmovs	%f1,%f13
	fmovs	%f0,%f12
	fmovs	%f1,%f11
	fmovs	%f0,%f10
	fmovs	%f1,%f9
	fmovs	%f0,%f8
	fmovs	%f1,%f7
	fmovs	%f0,%f6
	fmovs	%f1,%f5
	fmovs	%f0,%f4
	fmovs	%f1,%f3
	fmovs	%f0,%f2

	! Out registers; %o6 (the stack pointer) is left alone.
	clr	%o0
	clr	%o1
	clr	%o2
	clr	%o3
	clr	%o4
	clr	%o5
	clr	%o7

	! Local registers.
	clr	%l0
	clr	%l1
	clr	%l2
	clr	%l3
	clr	%l4
	clr	%l5
	clr	%l6
	clr	%l7

	! In registers; %i6 (%fp) and %i7 are still needed to return.
	clr	%i0
	clr	%i1
	clr	%i2
	clr	%i3
	clr	%i4
	clr	%i5

	! Global registers %g1-%g5.
	clr	%g1
	clr	%g2
	clr	%g3
	clr	%g4
	clr	%g5

	add	%fp,BIAS,%i0	! return pointer to top of stack of the caller

	ret
	restore

	! Eight zero bytes, loaded into %f0/%f1 above.
.zero:	.long	0x0,0x0
	! Leaf helper: %o0 += %o7, turning the offset computed in the
	! delay slot of the call into an absolute address.
.PIC.zero.up:
	retl
	add	%o0,%o7,%o0
! .walk.reg.wins
!
! Recursive helper of OPENSSL_wipe_cpu. Each level opens a new register
! window, recurses until a stop condition is met and, on the way back,
! clears %o2-%o5, %o7 and all local registers of its window. The value
! returned in %o0 of the caller is the value returned by the next level
! plus one (zero when the level did not recurse) and is only used for
! debugging. When DEBUG is defined the routine is also exported under
! the name walk_reg_wins.
#if defined(DEBUG)
.global	walk_reg_wins
.type	walk_reg_wins,#function
walk_reg_wins:
#endif
.walk.reg.wins:
	save	%sp,FRAME,%sp
	cmp	%i7,%o7			! %o7 of the fresh window equal to
	be	.Lwrw_return		! our own return address? then stop
	clr	%o0			! (delay slot)
	tst	%o7	! compiler never cleans %o7...
	be	.Lwrw_clear	! could have been a leaf function...
	clr	%o1			! (delay slot)
	call	.walk.reg.wins
	nop
.Lwrw_clear:
	clr	%o2
	clr	%o3
	clr	%o4
	clr	%o5
	clr	%o7
	clr	%l0
	clr	%l1
	clr	%l2
	clr	%l3
	clr	%l4
	clr	%l5
	clr	%l6
	clr	%l7
	add	%o0,1,%i0	! used for debugging
.Lwrw_return:
	ret
	restore
.size	OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
167
.global	OPENSSL_atomic_add
.type	OPENSSL_atomic_add,#function
.align	32
! OPENSSL_atomic_add
!
! In:	%o0 = address of a 32-bit counter
!	%o1 = amount to add
! Out:	%o0 = new value of the counter, sign-extended
!
! On V9 hardware (always the case when built for the 64-bit ABI) the
! addition is a compare-and-swap loop. A 32-bit build first checks at
! run time whether the CPU is V9; if it is not, a spin lock built on
! the V8 swap instruction is used instead. That lock stores -1 in the
! counter while the update is in flight, so the V8 path cannot tell a
! counter whose real value is -1 from a locked one.
OPENSSL_atomic_add:
#ifndef ABI64
	! V8/V9 detection: the word below is rd %ccr,%o2 on V9 and a
	! non-trapping rd %asr2,%o2 on V8. After the subcc only a V9
	! CPU returns 0x99 (negative and borrow set in %icc and %xcc).
	subcc	%g0,1,%o2
	.word	0x95408000	!rd	%ccr,%o2
	cmp	%o2,0x99
	be	.v9
	nop
	! V8 fallback. A register window is needed because the yield
	! routine is called from here.
	save	%sp,FRAME,%sp
	ba	.enter
	nop
#ifdef __sun
! Note that you do not have to link with libthread to call thr_yield,
! as libc provides a stub, which is overloaded the moment you link
! with *either* libpthread or libthread...
#define	YIELD_CPU	thr_yield
#else
! applies at least to Linux and FreeBSD... Feedback expected...
#define	YIELD_CPU	sched_yield
#endif
.spin:	call	YIELD_CPU
	nop
.enter:	ld	[%i0],%i2
	! Test before swapping: -1 is the marker written by the swap
	! below, so it is the only value that means "held". Comparing
	! against any other constant never detects a held lock and
	! makes a counter that really holds that constant yield forever.
	cmp	%i2,-1
	be	.spin
	mov	-1,%i2		! (delay slot) marker to swap in
	swap	[%i0],%i2	! %i2 = old value, counter = -1
	cmp	%i2,-1		! somebody else got there first?
	be	.spin
	add	%i2,%i1,%i2	! (delay slot) new value
	stbar
	st	%i2,[%i0]	! publish new value, releasing the lock
	sra	%i2,%g0,%i0	! return it sign-extended
	ret
	restore
.v9:
#endif
	ld	[%o0],%o2
1:	add	%o1,%o2,%o3
	.word	0xd7e2100a	!cas [%o0],%o2,%o3, compare [%o0] with %o2 and swap %o3
	cmp	%o2,%o3
	bne	1b
	mov	%o3,%o2		! cas is always fetching to dest. register
	add	%o1,%o2,%o0	! OpenSSL expects the new value
	retl
	sra	%o0,%g0,%o0	! we return signed int, remember?
.size	OPENSSL_atomic_add,.-OPENSSL_atomic_add
217
.global	_sparcv9_rdtick
.type	_sparcv9_rdtick,#function
.align	32
! _sparcv9_rdtick
!
! Returns the %tick counter on a V9 CPU: the full value in %o0 and its
! upper 32 bits in %o1. On a CPU that does not pass the V9 check below
! both %o0 and %o1 are returned as zero.
_sparcv9_rdtick:
	! rd %ccr after subcc yields 0x99 only on V9; on V8 the same
	! encoding is rd %asr2.
	subcc	%g0,1,%o0
	.word	0x91408000	!rd	%ccr,%o0
	cmp	%o0,0x99
	bne	.notick
	clr	%o0			! (delay slot) %o0 = 0 either way
	.word	0x91410000	!rd	%tick,%o0
	retl
	.word	0x93323020	!srlx	%o0,32,%o1
.notick:
	retl
	clr	%o1
.size	_sparcv9_rdtick,.-_sparcv9_rdtick
234
.global	_sparcv9_vis1_probe
.type	_sparcv9_vis1_probe,#function
.align	8
! _sparcv9_vis1_probe
!
! Executes two VIS1 instructions: a 16-bit FP load through
! ASI_FP16_P from the stack and an fxor that zeroes %f0.
! NOTE(review): presumably the caller traps the illegal-instruction
! signal to learn that VIS1 is missing - confirm against the caller.
_sparcv9_vis1_probe:
	add	%sp,BIAS+2,%o1		! an address inside our own stack
	.word	0xc19a5a40	!ldda	[%o1]ASI_FP16_P,%f0
	retl
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
.size	_sparcv9_vis1_probe,.-_sparcv9_vis1_probe
244
! _sparcv9_vis1_instrument
!
! Probe and instrument VIS1 instructions. The return value is the
! number of cycles it takes to execute rdtick and a pair of VIS1
! instructions. The US-Tx VIS unit is slow (documented to be 6 cycles
! on T2) and the core is in-order single-issue, so it should be
! possible to distinguish Tx reliably...
! Observed return values are:
!
!	UltraSPARC IIe		7
!	UltraSPARC III		7
!	UltraSPARC T1		24
!	SPARC T4		65(*)
!
! (*)	the result has less to do with VIS instruction latencies,
!	rdtick appears to be that slow, but it does the trick in the
!	sense that FP and VIS code paths are still slower than
!	integer-only ones.
!
! Numbers for T2 and SPARC64 V-VII are more than welcome.
!
! It would be possible to detect specifically US-T1 by instrumenting
! fmul8ulx16, which is emulated on T1 and as such accounts for quite
! a lot of %tick-s, a couple of thousand on Linux...
.global	_sparcv9_vis1_instrument
.type	_sparcv9_vis1_instrument,#function
.align	8
_sparcv9_vis1_instrument:
	! warm-up pair, then five %tick samples (%o0-%o4), each
	! separated from the next by one pair of fxor instructions
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	.word	0x91410000	!rd	%tick,%o0

	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	.word	0x93410000	!rd	%tick,%o1

	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	.word	0x95410000	!rd	%tick,%o2

	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	.word	0x97410000	!rd	%tick,%o3

	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	.word	0x99410000	!rd	%tick,%o4

	! turn the five samples into four intervals, %o0-%o3
	sub	%o1,%o0,%o0
	sub	%o2,%o1,%o1
	sub	%o3,%o2,%o2
	sub	%o4,%o3,%o3

	! find the minimum value: each annulled branch executes the
	! mov in its delay slot only when %o0 is the larger operand
	cmp	%o0,%o1
	.word	0x38680002	!bgu,a	%xcc,.+8
	mov	%o1,%o0
	cmp	%o0,%o2
	.word	0x38680002	!bgu,a	%xcc,.+8
	mov	%o2,%o0
	cmp	%o0,%o3
	.word	0x38680002	!bgu,a	%xcc,.+8
	mov	%o3,%o0

	retl
	nop
.size	_sparcv9_vis1_instrument,.-_sparcv9_vis1_instrument
305
.global	_sparcv9_vis2_probe
.type	_sparcv9_vis2_probe,#function
.align	8
! _sparcv9_vis2_probe
!
! Executes a single bshuffle instruction in the delay slot of the
! return and does nothing else.
_sparcv9_vis2_probe:
	retl
	.word	0x81b00980	!bshuffle	%f0,%f0,%f0
.size	_sparcv9_vis2_probe,.-_sparcv9_vis2_probe
313
.global	_sparcv9_fmadd_probe
.type	_sparcv9_fmadd_probe,#function
.align	8
! _sparcv9_fmadd_probe
!
! Zeroes %f0 and %f2 with fxor and then executes one fmaddd on them in
! the delay slot of the return.
_sparcv9_fmadd_probe:
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	retl
	.word	0x81b80440	!fmaddd	%f0,%f0,%f2,%f0
.size	_sparcv9_fmadd_probe,.-_sparcv9_fmadd_probe
323
.global	_sparcv9_rdcfr
.type	_sparcv9_rdcfr,#function
.align	8
! _sparcv9_rdcfr
!
! Returns the contents of ancillary state register 26 in %o0.
_sparcv9_rdcfr:
	retl
	.word	0x91468000	!rd	%asr26,%o0
.size	_sparcv9_rdcfr,.-_sparcv9_rdcfr
331
.global	_sparcv9_vis3_probe
.type	_sparcv9_vis3_probe,#function
.align	8
! _sparcv9_vis3_probe
!
! Executes a single xmulx instruction, with %g0 for every operand, in
! the delay slot of the return.
_sparcv9_vis3_probe:
	retl
	.word	0x81b022a0	!xmulx	%g0,%g0,%g0
.size	_sparcv9_vis3_probe,.-_sparcv9_vis3_probe
339
.global	_sparcv9_random
.type	_sparcv9_random,#function
.align	8
! _sparcv9_random
!
! Executes the instruction annotated below as random %o0 in the delay
! slot of the return, so its result is handed back in %o0.
_sparcv9_random:
	retl
	.word	0x91b002a0	!random	%o0
! The size must be measured from the start of this routine, not from
! the start of the preceding one.
.size	_sparcv9_random,.-_sparcv9_random
347
.global	OPENSSL_cleanse
.type	OPENSSL_cleanse,#function
.align	32
! OPENSSL_cleanse
!
! In:	%o0 = address of the buffer
!	%o1 = number of bytes to zero
!
! Buffers of up to 14 bytes are zeroed one byte at a time. Longer ones
! are first advanced byte by byte to an aligned address, then zeroed
! with 8-byte stores (V9) or 4-byte stores (V8, 32-bit builds only),
! and the remaining tail is again handled byte by byte.
OPENSSL_cleanse:
	cmp	%o1,14
	nop
#if defined(ABI64)
	bgu	%xcc,.Lbulk
#else
	bgu	.Lbulk
#endif
	tst	%o1			! (delay slot) anything to do at all?
	bne	.Lbytes
	nop
	retl
	nop

	! Byte loop, entered with %o1 != 0.
.Lbytes:
	stb	%g0,[%o0]
	subcc	%o1,1,%o1
	bnz	.Lbytes
	add	%o0,1,%o0
	retl
	nop
.align	32
.Lbulk:
#ifndef ABI64
	! V8/V9 detection: rd %ccr after subcc returns 0x99 only on a
	! V9 CPU; on V8 the same encoding is a non-trapping rd %asr2.
	subcc	%g0,1,%g1
	.word	0x83408000	!rd	%ccr,%g1
	cmp	%g1,0x99
	bne	.v8lot
	nop
#endif

	! V9: byte stores until %o0 is 8-byte aligned...
.v9lot:	btst	7,%o0
	bz	.v9aligned
	nop
	stb	%g0,[%o0]
	sub	%o1,1,%o1
	ba	.v9lot
	add	%o0,1,%o0
.align	16,0x01000000
	! ...then 8 bytes per iteration while at least 8 remain. The
	! branch is hand-encoded with a displacement of -3 instructions,
	! so exactly three instructions must precede it in the loop.
.v9aligned:
	.word	0xc0720000	!stx	%g0,[%o0]
	sub	%o1,8,%o1
	btst	-8,%o1
#if defined(ABI64)
	.word	0x126ffffd	!bnz	%xcc,.v9aligned
#else
	.word	0x124ffffd	!bnz	%icc,.v9aligned
#endif
	add	%o0,8,%o0

	tst	%o1			! tail of fewer than 8 bytes?
	bne	.Lbytes
	nop
	retl
	nop
#ifndef ABI64
	! V8: byte stores until %o0 is 4-byte aligned...
.v8lot:	btst	3,%o0
	bz	.v8aligned
	nop
	stb	%g0,[%o0]
	sub	%o1,1,%o1
	ba	.v8lot
	add	%o0,1,%o0
	nop
	! ...then 4 bytes per iteration while at least 4 remain.
.v8aligned:
	st	%g0,[%o0]
	sub	%o1,4,%o1
	btst	-4,%o1
	bnz	.v8aligned
	add	%o0,4,%o0

	tst	%o1			! tail of fewer than 4 bytes?
	bne	.Lbytes
	nop
	retl
	nop
#endif
.size	OPENSSL_cleanse,.-OPENSSL_cleanse
430
.global	_sparcv9_vis1_instrument_bus
.type	_sparcv9_vis1_instrument_bus,#function
.align	8
! _sparcv9_vis1_instrument_bus
!
! In:	%o0 = out, address of an array of 32-bit words
!	%o1 = cnt, number of array entries to process
! Out:	%o0 = cnt as passed in
!
! For each entry the routine samples %tick, block-loads and
! block-stores the 64-byte aligned line containing the entry, and then
! adds the number of ticks elapsed since the previous sample to the
! entry by means of cas. One such probe, with a difference of zero, is
! performed on the first entry before the loop starts.
! Uses %o3-%o5, %g1, %g4 and the FP registers loaded by the block load.
_sparcv9_vis1_instrument_bus:
	mov	%o1,%o3					! keep cnt for return
	.word	0x99410000	!rd	%tick,%o4	! tick
	mov	%o4,%o5					! lasttick = tick
	clr	%g4					! diff = 0

	andn	%o0,63,%g1				! line holding *out
	.word	0xc1985e00	!ldda	[%g1]0xf0,%f0	! block load
	.word	0x8143e040	!membar	#Sync
	.word	0xc1b85c00	!stda	%f0,[%g1]0xe0	! block store and commit
	.word	0x8143e040	!membar	#Sync
	ld	[%o0],%o4
	add	%o4,%g4,%g4				! *out + diff
	.word	0xc9e2100c	!cas	[%o0],%o4,%g4

.Lbus_sample:
	.word	0x99410000	!rd	%tick,%o4
	sub	%o4,%o5,%g4				! diff = tick - lasttick
	mov	%o4,%o5					! lasttick = tick

	andn	%o0,63,%g1				! line holding *out
	.word	0xc1985e00	!ldda	[%g1]0xf0,%f0	! block load
	.word	0x8143e040	!membar	#Sync
	.word	0xc1b85c00	!stda	%f0,[%g1]0xe0	! block store and commit
	.word	0x8143e040	!membar	#Sync
	ld	[%o0],%o4
	add	%o4,%g4,%g4				! *out + diff
	.word	0xc9e2100c	!cas	[%o0],%o4,%g4
	subcc	%o1,1,%o1				! one entry less to go
	bnz	.Lbus_sample
	add	%o0,4,%o0				! (delay slot) next entry

	retl
	mov	%o3,%o0
.size	_sparcv9_vis1_instrument_bus,.-_sparcv9_vis1_instrument_bus
468
.global	_sparcv9_vis1_instrument_bus2
.type	_sparcv9_vis1_instrument_bus2,#function
.align	8
! _sparcv9_vis1_instrument_bus2
!
! In:	%o0 = out, address of an array of 32-bit words
!	%o1 = cnt, number of array entries available
!	%o2 = max, upper bound on the number of probes in the loop
! Out:	%o0 = cnt minus the number of entries still unused on exit
!
! Same probe as _sparcv9_vis1_instrument_bus (block load and block
! store of the line holding the entry, then a cas that adds the tick
! difference to the entry), but the routine moves on to the next entry
! only when the measured difference is not equal to the previous one.
! The loop ends when either all entries are used or max probes have
! been made.
! Uses %o3-%o5, %g1, %g4, %g5 and the FP registers loaded by the block
! load.
_sparcv9_vis1_instrument_bus2:
	mov	%o1,%o3					! keep cnt for return
	sll	%o1,2,%o1				! cnt in bytes from here on

	.word	0x99410000	!rd	%tick,%o4	! tick
	mov	%o4,%o5					! lasttick = tick
	clr	%g4					! diff = 0

	andn	%o0,63,%g1				! line holding *out
	.word	0xc1985e00	!ldda	[%g1]0xf0,%f0	! block load
	.word	0x8143e040	!membar	#Sync
	.word	0xc1b85c00	!stda	%f0,[%g1]0xe0	! block store and commit
	.word	0x8143e040	!membar	#Sync
	ld	[%o0],%o4
	add	%o4,%g4,%g4				! *out + diff
	.word	0xc9e2100c	!cas	[%o0],%o4,%g4

	.word	0x99410000	!rd	%tick,%o4	! tick
	sub	%o4,%o5,%g4				! diff = tick - lasttick
	mov	%o4,%o5					! lasttick = tick
	mov	%g4,%g5					! lastdiff = diff
.Lbus2_probe:
	andn	%o0,63,%g1				! line holding *out
	.word	0xc1985e00	!ldda	[%g1]0xf0,%f0	! block load
	.word	0x8143e040	!membar	#Sync
	.word	0xc1b85c00	!stda	%f0,[%g1]0xe0	! block store and commit
	.word	0x8143e040	!membar	#Sync
	ld	[%o0],%o4
	add	%o4,%g4,%g4				! *out + diff
	.word	0xc9e2100c	!cas	[%o0],%o4,%g4

	subcc	%o2,1,%o2				! one probe less allowed
	bz	.Lbus2_finish
	nop

	.word	0x99410000	!rd	%tick,%o4	! tick
	sub	%o4,%o5,%g4				! diff = tick - lasttick
	mov	%o4,%o5					! lasttick = tick
	cmp	%g4,%g5					! diff equal to lastdiff?
	mov	%g4,%g5					! lastdiff = diff, flags kept

	! %g1 = 4 when diff differs from lastdiff, 0 when they are equal
	.word	0x83408000	!rd	%ccr,%g1
	and	%g1,4,%g1				! isolate zero flag
	xor	%g1,4,%g1				! flip zero flag

	subcc	%o1,%g1,%o1				! conditionally use up an entry
	bnz	.Lbus2_probe
	add	%o0,%g1,%o0				! (delay slot) conditionally advance out

.Lbus2_finish:
	srl	%o1,2,%o1				! bytes left back to entries
	retl
	sub	%o3,%o1,%o0
.size	_sparcv9_vis1_instrument_bus2,.-_sparcv9_vis1_instrument_bus2
526
! Fragment contributed to the .init section: it calls
! OPENSSL_cpuid_setup, with a nop filling the delay slot of the call.
! NOTE(review): presumably this makes the setup routine run when the
! object is loaded, before main - confirm against the link setup.
.section	".init",#alloc,#execinstr
	call	OPENSSL_cpuid_setup
	nop
530