/* sparccpuid.S revision 306195 */
/*
 * ABI selection. ABI64 is defined when the compiler reports a 64-bit
 * SPARC target (Sun C with __sparcv9, or GNU C with __arch64__).
 */
#if defined(__SUNPRO_C) && defined(__sparcv9)
# define ABI64  /* They've said -xarch=v9 at command line */
#elif defined(__GNUC__) && defined(__arch64__)
# define ABI64  /* They've said -m64 at command line */
#endif

/*
 * FRAME is the stack adjustment handed to every "save" in this file.
 * BIAS is the offset added to %sp/%fp whenever a stack address is
 * formed (the 64-bit ABI keeps the stack pointer biased by 2047).
 * In the 64-bit build %g2 and %g3 are declared as scratch registers
 * because the register-wiping code below writes to them.
 */
#ifdef ABI64
  .register	%g2,#scratch
  .register	%g3,#scratch
# define	FRAME	-192
# define	BIAS	2047
#else
# define	FRAME	-96
# define	BIAS	0
#endif

.text
.align	32
.global	OPENSSL_wipe_cpu
.type	OPENSSL_wipe_cpu,#function
! OPENSSL_wipe_cpu: overwrite CPU registers with zeroes.
!
! Takes no arguments. Clears %o0-%o5, %o7, %l0-%l7, %i0-%i5, %g1-%g5
! and %f0-%f31; on a V9 CPU the upper floating-point bank is
! overwritten as well. The value returned to the caller is %fp+BIAS,
! i.e. the address of the top of the stack of the caller.
!
! Keep in mind that this does not excuse us from wiping the stack!
! This routine wipes registers, but not the backing store [which
! resides on the stack, toward lower addresses]. To facilitate for
! stack wiping I return pointer to the top of stack of the *caller*.
OPENSSL_wipe_cpu:
	save	%sp,FRAME,%sp
	nop
	! Clean the other register windows: via the ST_CLEAN_WINDOWS trap
	! when built for __sun, otherwise by walking them by hand.
#ifdef __sun
#include <sys/trap.h>
	ta	ST_CLEAN_WINDOWS
#else
	call	.walk.reg.wins
#endif
	nop
	! Position-independent address of .zero: the delay-slot mov loads
	! the distance from the call instruction to .zero, and the helper
	! adds %o7 (address of the call) to it.
	call	.PIC.zero.up
	mov	.zero-(.-4),%o0
	ld	[%o0],%f0		! %f0 = 0
	ld	[%o0],%f1		! %f1 = 0

	subcc	%g0,1,%o0
	! Following is V9 "rd %ccr,%o0" instruction. However! V8
	! specification says that it ("rd %asr2,%o0" in V8 terms) does
	! not cause illegal_instruction trap. It therefore can be used
	! to determine if the CPU the code is executing on is V8- or
	! V9-compliant, as V9 returns a distinct value of 0x99,
	! "negative" and "borrow" bits set in both %icc and %xcc.
	.word	0x91408000	!rd	%ccr,%o0
	cmp	%o0,0x99
	bne	.v8
	nop
			! Even though we do not use %fp register bank,
			! we wipe it as memcpy might have used it...
			.word	0xbfa00040	!fmovd	%f0,%f62
			.word	0xbba00040	!...
			.word	0xb7a00040
			.word	0xb3a00040
			.word	0xafa00040
			.word	0xaba00040
			.word	0xa7a00040
			.word	0xa3a00040
			.word	0x9fa00040
			.word	0x9ba00040
			.word	0x97a00040
			.word	0x93a00040
			.word	0x8fa00040
			.word	0x8ba00040
			.word	0x87a00040
			.word	0x83a00040	!fmovd	%f0,%f32
	! Common tail: floating-point copies of the zero in %f0/%f1 are
	! interleaved with integer register clears.
.v8:			fmovs	%f1,%f31
	clr	%o0
			fmovs	%f0,%f30
	clr	%o1
			fmovs	%f1,%f29
	clr	%o2
			fmovs	%f0,%f28
	clr	%o3
			fmovs	%f1,%f27
	clr	%o4
			fmovs	%f0,%f26
	clr	%o5
			fmovs	%f1,%f25
	clr	%o7
			fmovs	%f0,%f24
	clr	%l0
			fmovs	%f1,%f23
	clr	%l1
			fmovs	%f0,%f22
	clr	%l2
			fmovs	%f1,%f21
	clr	%l3
			fmovs	%f0,%f20
	clr	%l4
			fmovs	%f1,%f19
	clr	%l5
			fmovs	%f0,%f18
	clr	%l6
			fmovs	%f1,%f17
	clr	%l7
			fmovs	%f0,%f16
	clr	%i0
			fmovs	%f1,%f15
	clr	%i1
			fmovs	%f0,%f14
	clr	%i2
			fmovs	%f1,%f13
	clr	%i3
			fmovs	%f0,%f12
	clr	%i4
			fmovs	%f1,%f11
	clr	%i5
			fmovs	%f0,%f10
	clr	%g1
			fmovs	%f1,%f9
	clr	%g2
			fmovs	%f0,%f8
	clr	%g3
			fmovs	%f1,%f7
	clr	%g4
			fmovs	%f0,%f6
	clr	%g5
			fmovs	%f1,%f5
			fmovs	%f0,%f4
			fmovs	%f1,%f3
			fmovs	%f0,%f2

	add	%fp,BIAS,%i0	! return pointer to caller's top of stack

	ret
	restore

! Eight zero bytes used as the source for clearing %f0 and %f1.
.zero:	.long	0x0,0x0
! Leaf helper: returns %o0+%o7, turning the call-relative offset
! passed in %o0 into an absolute address.
.PIC.zero.up:
	retl
	add	%o0,%o7,%o0
! With DEBUG defined the window walker is also exported under the
! name walk_reg_wins.
#ifdef DEBUG
.global	walk_reg_wins
.type	walk_reg_wins,#function
walk_reg_wins:
#endif
! Recursive register-window walker. Each level opens a new window
! with save and compares %i7 with the stale %o7 found in that
! window: equal means return at once, zero %o7 means clear this
! window without recursing, anything else recurses first. On the way
! back each level clears its %o and %l registers.
! NOTE(review): equality of %i7 and %o7 is presumably the sign that
! the window file has wrapped around - confirm.
.walk.reg.wins:
	save	%sp,FRAME,%sp
	cmp	%i7,%o7
	be	2f
	clr	%o0
	cmp	%o7,0	! compiler never cleans %o7...
	be	1f	! could have been a leaf function...
	clr	%o1
	call	.walk.reg.wins
	nop
1:	clr	%o2
	clr	%o3
	clr	%o4
	clr	%o5
	clr	%o7
	clr	%l0
	clr	%l1
	clr	%l2
	clr	%l3
	clr	%l4
	clr	%l5
	clr	%l6
	clr	%l7
	add	%o0,1,%i0	! used for debugging
2:	ret
	restore
.size	OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu

.global	OPENSSL_atomic_add
.type	OPENSSL_atomic_add,#function
.align	32
! OPENSSL_atomic_add: atomically add a value to a 32-bit word.
!
! In:   %o0 = address of the word, %o1 = amount to add
! Out:  %o0 = the new value, sign-extended
!
! The 64-bit build always uses the compare-and-swap loop at the end.
! The 32-bit build first probes for a V9 CPU and, when the probe
! fails, falls back to a swap-based spin lock that uses -1 stored in
! the word itself as the "busy" marker and yields the CPU while
! waiting.
OPENSSL_atomic_add:
#ifndef ABI64
	! V9 probe: after subcc %g0,1 a V9 CPU reads 0x99 from %ccr.
	subcc	%g0,1,%o2
	.word	0x95408000	!rd	%ccr,%o2, see comment above
	cmp	%o2,0x99
	be	.v9
	nop
	save	%sp,FRAME,%sp
	ba	.enter
	nop
#ifdef __sun
! Note that you do not have to link with libthread to call thr_yield,
! as libc provides a stub, which is overloaded the moment you link
! with *either* libpthread or libthread...
#define	YIELD_CPU	thr_yield
#else
! applies at least to Linux and FreeBSD... Feedback expected...
#define	YIELD_CPU	sched_yield
#endif
.spin:	call	YIELD_CPU
	nop
	! NOTE(review): this pre-check compares against -4096 while the
	! busy marker written by the swap below is -1 - confirm whether
	! -1 was intended here.
.enter:	ld	[%i0],%i2
	cmp	%i2,-4096
	be	.spin
	mov	-1,%i2			! delay slot: busy marker
	swap	[%i0],%i2		! take the word, leave -1 behind
	cmp	%i2,-1
	be	.spin			! somebody else holds it
	add	%i2,%i1,%i2		! delay slot: old value + amount
	stbar
	st	%i2,[%i0]		! publish result, releasing the word
	sra	%i2,%g0,%i0
	ret
	restore
.v9:
#endif
	ld	[%o0],%o2
	! Retry until cas observes the value the sum was computed from.
1:	add	%o1,%o2,%o3
	.word	0xd7e2100a	!cas [%o0],%o2,%o3, compare [%o0] with %o2 and swap %o3
	cmp	%o2,%o3
	bne	1b
	mov	%o3,%o2		! cas is always fetching to dest. register
	add	%o1,%o2,%o0	! OpenSSL expects the new value
	retl
	sra	%o0,%g0,%o0	! we return signed int, remember?
.size	OPENSSL_atomic_add,.-OPENSSL_atomic_add

.global	_sparcv9_rdtick
.align	32
! _sparcv9_rdtick: read the %tick counter.
!
! Takes no arguments. The CPU is probed with the %ccr trick first
! (a V9 CPU reads 0x99 from %ccr after subcc %g0,1).
! Out:  V9 CPU:  %o0 = %tick, %o1 = %o0 shifted right by 32
!       else:    %o0 = 0, %o1 = 0
_sparcv9_rdtick:
	subcc	%g0,1,%o0
	.word	0x91408000	!rd	%ccr,%o0
	cmp	%o0,0x99
	bne	.notick
	xor	%o0,%o0,%o0		! delay slot: runs on both paths
	.word	0x91410000	!rd	%tick,%o0
	retl
	.word	0x93323020	!srlx	%o0,32,%o1
.notick:
	retl
	xor	%o1,%o1,%o1
.type	_sparcv9_rdtick,#function
.size	_sparcv9_rdtick,.-_sparcv9_rdtick

.global	_sparcv9_vis1_probe
.align	8
! _sparcv9_vis1_probe: execute two VIS1 instructions and return.
!
! Takes no arguments. Performs a 16-bit floating-point load
! (ASI_FP16_P) from %sp+BIAS+2 into %f0, then clears %f0 with fxor
! in the delay slot of the return.
! NOTE(review): presumably the caller arranges to catch the trap
! raised when the CPU lacks VIS1 - confirm against the caller.
_sparcv9_vis1_probe:
	add	%sp,BIAS+2,%o1
	.word	0xc19a5a40	!ldda	[%o1]ASI_FP16_P,%f0
	retl
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
.type	_sparcv9_vis1_probe,#function
.size	_sparcv9_vis1_probe,.-_sparcv9_vis1_probe

! Probe and instrument VIS1 instruction. Output is number of cycles it
! takes to execute rdtick and pair of VIS1 instructions. US-Tx VIS unit
! is slow (documented to be 6 cycles on T2) and the core is in-order
! single-issue, it should be possible to distinguish Tx reliably...
! Observed return values are:
!
!	UltraSPARC IIe		7
!	UltraSPARC III		7
!	UltraSPARC T1		24
!	SPARC T4		65(*)
!
! (*)	result has lesser to do with VIS instruction latencies, rdtick
!	appears that slow, but it does the trick in sense that FP and
!	VIS code paths are still slower than integer-only ones.
!
! Numbers for T2 and SPARC64 V-VII are more than welcomed.
!
! It would be possible to detect specifically US-T1 by instrumenting
! fmul8ulx16, which is emulated on T1 and as such accounts for quite
! a lot of %tick-s, couple of thousand on Linux...
!
! Takes no arguments. Five %tick samples are taken with a pair of
! fxor instructions between consecutive samples; the smallest of the
! four differences is returned in %o0.
.global	_sparcv9_vis1_instrument
.align	8
_sparcv9_vis1_instrument:
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	.word	0x91410000	!rd	%tick,%o0
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	.word	0x93410000	!rd	%tick,%o1
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	.word	0x95410000	!rd	%tick,%o2
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	.word	0x97410000	!rd	%tick,%o3
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	.word	0x99410000	!rd	%tick,%o4

	! calculate intervals
	sub	%o1,%o0,%o0
	sub	%o2,%o1,%o1
	sub	%o3,%o2,%o2
	sub	%o4,%o3,%o3

	! find minimum value
	! Each annulled branch runs its delay-slot mov only when taken,
	! i.e. only when %o0 is the larger of the two (unsigned).
	cmp	%o0,%o1
	.word	0x38680002	!bgu,a	%xcc,.+8
	mov	%o1,%o0
	cmp	%o0,%o2
	.word	0x38680002	!bgu,a	%xcc,.+8
	mov	%o2,%o0
	cmp	%o0,%o3
	.word	0x38680002	!bgu,a	%xcc,.+8
	mov	%o3,%o0

	retl
	nop
.type	_sparcv9_vis1_instrument,#function
.size	_sparcv9_vis1_instrument,.-_sparcv9_vis1_instrument

.global	_sparcv9_vis2_probe
.align	8
! _sparcv9_vis2_probe: execute one bshuffle instruction (in the
! delay slot of the return). Takes no arguments.
! NOTE(review): presumably used to detect VIS2 by whether the
! instruction traps - confirm against the caller.
_sparcv9_vis2_probe:
	retl
	.word	0x81b00980	!bshuffle	%f0,%f0,%f0
.type	_sparcv9_vis2_probe,#function
.size	_sparcv9_vis2_probe,.-_sparcv9_vis2_probe

.global	_sparcv9_fmadd_probe
.align	8
! _sparcv9_fmadd_probe: clear %f0 and %f2 with fxor, then execute
! one fmaddd on them (in the delay slot of the return). Takes no
! arguments.
! NOTE(review): presumably used to detect fused multiply-add support
! by whether the instruction traps - confirm against the caller.
_sparcv9_fmadd_probe:
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	retl
	.word	0x81b80440	!fmaddd	%f0,%f0,%f2,%f0
.type	_sparcv9_fmadd_probe,#function
.size	_sparcv9_fmadd_probe,.-_sparcv9_fmadd_probe

.global	_sparcv9_rdcfr
.align	8
! _sparcv9_rdcfr: return the contents of %asr26 in %o0. The read
! sits in the delay slot of the return. Takes no arguments.
_sparcv9_rdcfr:
	retl
	.word	0x91468000	!rd	%asr26,%o0
.type	_sparcv9_rdcfr,#function
.size	_sparcv9_rdcfr,.-_sparcv9_rdcfr

.global	_sparcv9_vis3_probe
.align	8
! _sparcv9_vis3_probe: execute one xmulx instruction on %g0 (in the
! delay slot of the return). Takes no arguments.
! NOTE(review): presumably used to detect VIS3 by whether the
! instruction traps - confirm against the caller.
_sparcv9_vis3_probe:
	retl
	.word	0x81b022a0	!xmulx	%g0,%g0,%g0
.type	_sparcv9_vis3_probe,#function
.size	_sparcv9_vis3_probe,.-_sparcv9_vis3_probe

.global	_sparcv9_random
.align	8
! _sparcv9_random: execute the random instruction with %o0 as its
! destination (in the delay slot of the return) and hand the result
! back in %o0. Takes no arguments.
_sparcv9_random:
	retl
	.word	0x91b002a0	!random	%o0
.type	_sparcv9_random,#function
! Size is measured from this routine's own label.
.size	_sparcv9_random,.-_sparcv9_random

.global	OPENSSL_cleanse
.align	32
! OPENSSL_cleanse: fill a memory region with zero bytes.
!
! In:   %o0 = start address, %o1 = length in bytes
!
! Lengths up to 14 are handled one byte at a time (.Little); zero
! length returns at once. Longer regions (.Lot) are first advanced
! byte by byte to an aligned address, then cleared with aligned
! stores, and any remainder goes through .Little. A V9 CPU uses
! 8-byte stores; the V8 path (32-bit build only) uses 4-byte stores.
OPENSSL_cleanse:
	cmp	%o1,14
	nop
#ifdef ABI64
	bgu	%xcc,.Lot
#else
	bgu	.Lot
#endif
	cmp	%o1,0			! delay slot: zero-length test
	bne	.Little
	nop
	retl
	nop

! Byte loop: entered only with %o1 != 0.
.Little:
	stb	%g0,[%o0]
	subcc	%o1,1,%o1
	bnz	.Little
	add	%o0,1,%o0
	retl
	nop
.align	32
.Lot:
#ifndef ABI64
	subcc	%g0,1,%g1
	! see above for explanation
	! (a V9 CPU reads 0x99 from %ccr after subcc %g0,1)
	.word	0x83408000	!rd	%ccr,%g1
	cmp	%g1,0x99
	bne	.v8lot
	nop
#endif

! Store single bytes until %o0 is 8-byte aligned.
.v9lot:	andcc	%o0,7,%g0
	bz	.v9aligned
	nop
	stb	%g0,[%o0]
	sub	%o1,1,%o1
	ba	.v9lot
	add	%o0,1,%o0
! Padding word 0x01000000 is the nop encoding.
.align	16,0x01000000
! The branch below is hand-encoded with a displacement of -3
! instructions, i.e. back to the stx. Do not insert instructions
! between .v9aligned and the branch.
.v9aligned:
	.word	0xc0720000	!stx	%g0,[%o0]
	sub	%o1,8,%o1
	andcc	%o1,-8,%g0		! at least 8 bytes left?
#ifdef ABI64
	.word	0x126ffffd	!bnz	%xcc,.v9aligned
#else
	.word	0x124ffffd	!bnz	%icc,.v9aligned
#endif
	add	%o0,8,%o0

	cmp	%o1,0
	bne	.Little
	nop
	retl
	nop
#ifndef ABI64
! Store single bytes until %o0 is 4-byte aligned.
.v8lot:	andcc	%o0,3,%g0
	bz	.v8aligned
	nop
	stb	%g0,[%o0]
	sub	%o1,1,%o1
	ba	.v8lot
	add	%o0,1,%o0
	nop
.v8aligned:
	st	%g0,[%o0]
	sub	%o1,4,%o1
	andcc	%o1,-4,%g0		! at least 4 bytes left?
	bnz	.v8aligned
	add	%o0,4,%o0

	cmp	%o1,0
	bne	.Little
	nop
	retl
	nop
#endif
.type	OPENSSL_cleanse,#function
.size	OPENSSL_cleanse,.-OPENSSL_cleanse

.global	_sparcv9_vis1_instrument_bus
.weak	_sparcv9_vis1_instrument_bus
.align	8
! _sparcv9_vis1_instrument_bus: add %tick deltas to an array of
! 32-bit words.
!
! In:   %o0 = out (address of the first word), %o1 = cnt
! Out:  %o0 = cnt as passed in
!
! For every word: the 64-byte block containing it is block-loaded
! and block-stored with commit, then the time elapsed since the
! previous sample is added to the word with ld/add/cas. One such
! pass with a zero delta is made on the first word before the loop.
! NOTE(review): cnt is decremented before it is tested, so cnt is
! presumably expected to be non-zero - confirm against the caller.
_sparcv9_vis1_instrument_bus:
	mov	%o1,%o3					! save cnt
	.word	0x99410000	!rd	%tick,%o4	! tick
	mov	%o4,%o5					! lasttick = tick
	set	0,%g4					! diff

	andn	%o0,63,%g1				! 64-byte block base
	.word	0xc1985e00	!ldda	[%g1]0xf0,%f0	! block load
	.word	0x8143e040	!membar	#Sync
	.word	0xc1b85c00	!stda	%f0,[%g1]0xe0	! block store and commit
	.word	0x8143e040	!membar	#Sync
	ld	[%o0],%o4
	add	%o4,%g4,%g4
	.word	0xc9e2100c	!cas	[%o0],%o4,%g4

.Loop:	.word	0x99410000	!rd	%tick,%o4
	sub	%o4,%o5,%g4				! diff=tick-lasttick
	mov	%o4,%o5					! lasttick=tick

	andn	%o0,63,%g1				! 64-byte block base
	.word	0xc1985e00	!ldda	[%g1]0xf0,%f0	! block load
	.word	0x8143e040	!membar	#Sync
	.word	0xc1b85c00	!stda	%f0,[%g1]0xe0	! block store and commit
	.word	0x8143e040	!membar	#Sync
	ld	[%o0],%o4
	add	%o4,%g4,%g4				! word + diff
	.word	0xc9e2100c	!cas	[%o0],%o4,%g4
	subcc	%o1,1,%o1				! --$cnt
	bnz	.Loop
	add	%o0,4,%o0				! ++$out

	retl
	mov	%o3,%o0
.type	_sparcv9_vis1_instrument_bus,#function
.size	_sparcv9_vis1_instrument_bus,.-_sparcv9_vis1_instrument_bus

.global	_sparcv9_vis1_instrument_bus2
.weak	_sparcv9_vis1_instrument_bus2
.align	8
! _sparcv9_vis1_instrument_bus2: like the plain bus probe, but only
! moves on to the next output word when the measured delta differs
! from the previous one.
!
! In:   %o0 = out (address of the first 32-bit word), %o1 = cnt,
!       %o2 = max (upper bound on loop iterations)
! Out:  %o0 = cnt minus the number of words still unvisited
!
! cnt is kept multiplied by 4 inside the loop so that the same
! register value (0 or 4) can step both the byte counter and the
! output pointer.
_sparcv9_vis1_instrument_bus2:
	mov	%o1,%o3					! save cnt
	sll	%o1,2,%o1				! cnt*=4

	.word	0x99410000	!rd	%tick,%o4	! tick
	mov	%o4,%o5					! lasttick = tick
	set	0,%g4					! diff

	andn	%o0,63,%g1				! 64-byte block base
	.word	0xc1985e00	!ldda	[%g1]0xf0,%f0	! block load
	.word	0x8143e040	!membar	#Sync
	.word	0xc1b85c00	!stda	%f0,[%g1]0xe0	! block store and commit
	.word	0x8143e040	!membar	#Sync
	ld	[%o0],%o4
	add	%o4,%g4,%g4
	.word	0xc9e2100c	!cas	[%o0],%o4,%g4

	.word	0x99410000	!rd	%tick,%o4	! tick
	sub	%o4,%o5,%g4				! diff=tick-lasttick
	mov	%o4,%o5					! lasttick=tick
	mov	%g4,%g5					! lastdiff=diff
.Loop2:
	andn	%o0,63,%g1				! 64-byte block base
	.word	0xc1985e00	!ldda	[%g1]0xf0,%f0	! block load
	.word	0x8143e040	!membar	#Sync
	.word	0xc1b85c00	!stda	%f0,[%g1]0xe0	! block store and commit
	.word	0x8143e040	!membar	#Sync
	ld	[%o0],%o4
	add	%o4,%g4,%g4				! word + diff
	.word	0xc9e2100c	!cas	[%o0],%o4,%g4

	subcc	%o2,1,%o2				! --max
	bz	.Ldone2
	nop

	.word	0x99410000	!rd	%tick,%o4	! tick
	sub	%o4,%o5,%g4				! diff=tick-lasttick
	mov	%o4,%o5					! lasttick=tick
	cmp	%g4,%g5
	mov	%g4,%g5					! lastdiff=diff

	! %g1 becomes 4 when diff != lastdiff and 0 when they are equal.
	.word	0x83408000	!rd	%ccr,%g1
	and	%g1,4,%g1				! isolate zero flag
	xor	%g1,4,%g1				! flip zero flag

	subcc	%o1,%g1,%o1				! conditional --$cnt
	bnz	.Loop2
	add	%o0,%g1,%o0				! conditional ++$out

.Ldone2:
	srl	%o1,2,%o1				! back to a word count
	retl
	sub	%o3,%o1,%o0
.type	_sparcv9_vis1_instrument_bus2,#function
.size	_sparcv9_vis1_instrument_bus2,.-_sparcv9_vis1_instrument_bus2

! Startup hook: a call to OPENSSL_cpuid_setup is placed in the .init
! section.
.section	".init",#alloc,#execinstr
	call	OPENSSL_cpuid_setup
	nop
