sparccpuid.S revision 306195
#if defined(__SUNPRO_C) && defined(__sparcv9)
# define ABI64  /* They've said -xarch=v9 at command line */
#elif defined(__GNUC__) && defined(__arch64__)
# define ABI64  /* They've said -m64 at command line */
#endif

! FRAME is the (negative) stack frame size passed to "save" and BIAS is
! the offset added to %sp/%fp to obtain the real stack address.
#ifdef ABI64
  .register	%g2,#scratch
  .register	%g3,#scratch
# define	FRAME	-192
# define	BIAS	2047
#else
# define	FRAME	-96
# define	BIAS	0
#endif

.text
.align	32
.global	OPENSSL_wipe_cpu
.type	OPENSSL_wipe_cpu,#function
! Keep in mind that this does not excuse us from wiping the stack!
! This routine wipes registers, but not the backing store [which
! resides on the stack, toward lower addresses]. To facilitate for
! stack wiping I return pointer to the top of stack of the *caller*.
!
! Out:	%o0 (as seen by the caller) = %fp+BIAS of this frame
! Zeroes the integer out/local/in/global registers listed below and the
! floating-point registers; on V9 the upper FP bank is cleared as well.
OPENSSL_wipe_cpu:
	save	%sp,FRAME,%sp
	nop
#ifdef __sun
#include <sys/trap.h>
	ta	ST_CLEAN_WINDOWS
#else
	call	.walk.reg.wins
#endif
	nop
	! PIC address of .zero: the delay-slot mov computes the distance
	! from the call instruction to .zero, and .PIC.zero.up adds %o7
	! (the address of the call) to it.
	call	.PIC.zero.up
	mov	.zero-(.-4),%o0
	ld	[%o0],%f0
	ld	[%o0],%f1

	subcc	%g0,1,%o0
	! Following is V9 "rd %ccr,%o0" instruction. However! V8
	! specification says that it ("rd %asr2,%o0" in V8 terms) does
	! not cause illegal_instruction trap. It therefore can be used
	! to determine if the CPU the code is executing on is V8- or
	! V9-compliant, as V9 returns a distinct value of 0x99,
	! "negative" and "borrow" bits set in both %icc and %xcc.
	.word	0x91408000	!rd	%ccr,%o0
	cmp	%o0,0x99
	bne	.v8
	nop
			! Even though we do not use %fp register bank,
			! we wipe it as memcpy might have used it...
	.word	0xbfa00040	!fmovd	%f0,%f62
	.word	0xbba00040	!...
	.word	0xb7a00040
	.word	0xb3a00040
	.word	0xafa00040
	.word	0xaba00040
	.word	0xa7a00040
	.word	0xa3a00040
	.word	0x9fa00040
	.word	0x9ba00040
	.word	0x97a00040
	.word	0x93a00040
	.word	0x8fa00040
	.word	0x8ba00040
	.word	0x87a00040
	.word	0x83a00040	!fmovd	%f0,%f32
.v8:	fmovs	%f1,%f31
	clr	%o0
	fmovs	%f0,%f30
	clr	%o1
	fmovs	%f1,%f29
	clr	%o2
	fmovs	%f0,%f28
	clr	%o3
	fmovs	%f1,%f27
	clr	%o4
	fmovs	%f0,%f26
	clr	%o5
	fmovs	%f1,%f25
	clr	%o7
	fmovs	%f0,%f24
	clr	%l0
	fmovs	%f1,%f23
	clr	%l1
	fmovs	%f0,%f22
	clr	%l2
	fmovs	%f1,%f21
	clr	%l3
	fmovs	%f0,%f20
	clr	%l4
	fmovs	%f1,%f19
	clr	%l5
	fmovs	%f0,%f18
	clr	%l6
	fmovs	%f1,%f17
	clr	%l7
	fmovs	%f0,%f16
	clr	%i0
	fmovs	%f1,%f15
	clr	%i1
	fmovs	%f0,%f14
	clr	%i2
	fmovs	%f1,%f13
	clr	%i3
	fmovs	%f0,%f12
	clr	%i4
	fmovs	%f1,%f11
	clr	%i5
	fmovs	%f0,%f10
	clr	%g1
	fmovs	%f1,%f9
	clr	%g2
	fmovs	%f0,%f8
	clr	%g3
	fmovs	%f1,%f7
	clr	%g4
	fmovs	%f0,%f6
	clr	%g5
	fmovs	%f1,%f5
	fmovs	%f0,%f4
	fmovs	%f1,%f3
	fmovs	%f0,%f2

	add	%fp,BIAS,%i0	! return pointer to top of stack of the caller

	ret
	restore

! Eight zero bytes, loaded into %f0/%f1 above as the wipe pattern.
.zero:	.long	0x0,0x0
! Leaf helper: turns the call-relative offset in %o0 into an address
! by adding %o7.
.PIC.zero.up:
	retl
	add	%o0,%o7,%o0
#ifdef DEBUG
.global	walk_reg_wins
.type	walk_reg_wins,#function
walk_reg_wins:
#endif
! Recursive helper: each invocation opens a new register window with
! "save", recurses unless %i7 equals %o7 or %o7 is zero, and on the way
! back clears %o2-%o5, %o7 and %l0-%l7 of that window.
.walk.reg.wins:
	save	%sp,FRAME,%sp
	cmp	%i7,%o7
	be	2f
	clr	%o0
	cmp	%o7,0	! compiler never cleans %o7...
	be	1f	! could have been a leaf function...
	clr	%o1
	call	.walk.reg.wins
	nop
1:	clr	%o2
	clr	%o3
	clr	%o4
	clr	%o5
	clr	%o7
	clr	%l0
	clr	%l1
	clr	%l2
	clr	%l3
	clr	%l4
	clr	%l5
	clr	%l6
	clr	%l7
	add	%o0,1,%i0	! used for debugging
2:	ret
	restore
.size	OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu

! OPENSSL_atomic_add
! In:	%o0 = address of a 32-bit word, %o1 = value to add
! Out:	%o0 = new value of the word, sign-extended
! On V9 the update is a cas retry loop. In 32-bit builds a V8 CPU takes
! the swap-based path below, which yields the CPU while the word holds
! the busy marker.
.global	OPENSSL_atomic_add
.type	OPENSSL_atomic_add,#function
.align	32
OPENSSL_atomic_add:
#ifndef ABI64
	subcc	%g0,1,%o2
	.word	0x95408000	!rd	%ccr,%o2, see comment above
	cmp	%o2,0x99
	be	.v9
	nop
	save	%sp,FRAME,%sp
	ba	.enter
	nop
#ifdef __sun
! Note that you do not have to link with libthread to call thr_yield,
! as libc provides a stub, which is overloaded the moment you link
! with *either* libpthread or libthread...
#define	YIELD_CPU	thr_yield
#else
! applies at least to Linux and FreeBSD... Feedback expected...
#define	YIELD_CPU	sched_yield
#endif
.spin:	call	YIELD_CPU
	nop
	! NOTE(review): the pre-check below compares against -4096 while
	! the swap installs -1 as the busy marker - confirm this is intended.
.enter:	ld	[%i0],%i2
	cmp	%i2,-4096
	be	.spin
	mov	-1,%i2
	swap	[%i0],%i2
	cmp	%i2,-1
	be	.spin
	add	%i2,%i1,%i2
	stbar
	st	%i2,[%i0]
	sra	%i2,%g0,%i0
	ret
	restore
.v9:
#endif
	ld	[%o0],%o2
1:	add	%o1,%o2,%o3
	.word	0xd7e2100a	!cas [%o0],%o2,%o3, compare [%o0] with %o2 and swap %o3
	cmp	%o2,%o3
	bne	1b
	mov	%o3,%o2		! cas is always fetching to dest. register
	add	%o1,%o2,%o0	! OpenSSL expects the new value
	retl
	sra	%o0,%g0,%o0	! we return signed int, remember?
.size	OPENSSL_atomic_add,.-OPENSSL_atomic_add

! _sparcv9_rdtick
! Out:	V9: %o0 = %tick, %o1 = %tick shifted right by 32
!	otherwise: %o0 = 0, %o1 = 0
.global	_sparcv9_rdtick
.align	32
_sparcv9_rdtick:
	subcc	%g0,1,%o0
	.word	0x91408000	!rd	%ccr,%o0
	cmp	%o0,0x99
	bne	.notick
	xor	%o0,%o0,%o0
	.word	0x91410000	!rd	%tick,%o0
	retl
	.word	0x93323020	!srlx	%o0,32,%o1
.notick:
	retl
	xor	%o1,%o1,%o1
.type	_sparcv9_rdtick,#function
.size	_sparcv9_rdtick,.-_sparcv9_rdtick

! _sparcv9_vis1_probe
! Executes a VIS1 short load from the stack followed by fxor.
! Presumably the caller catches the trap raised when VIS1 is absent;
! verify against the caller.
.global	_sparcv9_vis1_probe
.align	8
_sparcv9_vis1_probe:
	add	%sp,BIAS+2,%o1
	.word	0xc19a5a40	!ldda	[%o1]ASI_FP16_P,%f0
	retl
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
.type	_sparcv9_vis1_probe,#function
.size	_sparcv9_vis1_probe,.-_sparcv9_vis1_probe

! Probe and instrument VIS1 instruction. Output is number of cycles it
! takes to execute rdtick and pair of VIS1 instructions. US-Tx VIS unit
! is slow (documented to be 6 cycles on T2) and the core is in-order
! single-issue, it should be possible to distinguish Tx reliably...
! Observed return values are:
!
!	UltraSPARC IIe		7
!	UltraSPARC III		7
!	UltraSPARC T1		24
!	SPARC T4		65(*)
!
! (*)	result has lesser to do with VIS instruction latencies, rdtick
!	appears that slow, but it does the trick in sense that FP and
!	VIS code paths are still slower than integer-only ones.
!
! Numbers for T2 and SPARC64 V-VII are more than welcomed.
!
! It would be possible to detect specifically US-T1 by instrumenting
! fmul8ulx16, which is emulated on T1 and as such accounts for quite
! a lot of %tick-s, couple of thousand on Linux...
.global	_sparcv9_vis1_instrument
.align	8
_sparcv9_vis1_instrument:
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	.word	0x91410000	!rd	%tick,%o0
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	.word	0x93410000	!rd	%tick,%o1
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	.word	0x95410000	!rd	%tick,%o2
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	.word	0x97410000	!rd	%tick,%o3
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	.word	0x99410000	!rd	%tick,%o4

	! calculate intervals
	sub	%o1,%o0,%o0
	sub	%o2,%o1,%o1
	sub	%o3,%o2,%o2
	sub	%o4,%o3,%o3

	! find minimum value
	cmp	%o0,%o1
	.word	0x38680002	!bgu,a	%xcc,.+8
	mov	%o1,%o0
	cmp	%o0,%o2
	.word	0x38680002	!bgu,a	%xcc,.+8
	mov	%o2,%o0
	cmp	%o0,%o3
	.word	0x38680002	!bgu,a	%xcc,.+8
	mov	%o3,%o0

	retl
	nop
.type	_sparcv9_vis1_instrument,#function
.size	_sparcv9_vis1_instrument,.-_sparcv9_vis1_instrument

! _sparcv9_vis2_probe: executes a single VIS2 bshuffle and returns.
.global	_sparcv9_vis2_probe
.align	8
_sparcv9_vis2_probe:
	retl
	.word	0x81b00980	!bshuffle	%f0,%f0,%f0
.type	_sparcv9_vis2_probe,#function
.size	_sparcv9_vis2_probe,.-_sparcv9_vis2_probe

! _sparcv9_fmadd_probe: zeroes %f0 and %f2, then executes fmaddd.
.global	_sparcv9_fmadd_probe
.align	8
_sparcv9_fmadd_probe:
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	retl
	.word	0x81b80440	!fmaddd	%f0,%f0,%f2,%f0
.type	_sparcv9_fmadd_probe,#function
.size	_sparcv9_fmadd_probe,.-_sparcv9_fmadd_probe

! _sparcv9_rdcfr: returns the value of %asr26 in %o0.
.global	_sparcv9_rdcfr
.align	8
_sparcv9_rdcfr:
	retl
	.word	0x91468000	!rd	%asr26,%o0
.type	_sparcv9_rdcfr,#function
.size	_sparcv9_rdcfr,.-_sparcv9_rdcfr

! _sparcv9_vis3_probe: executes a single VIS3 xmulx and returns.
.global	_sparcv9_vis3_probe
.align	8
_sparcv9_vis3_probe:
	retl
	.word	0x81b022a0	!xmulx	%g0,%g0,%g0
.type	_sparcv9_vis3_probe,#function
.size	_sparcv9_vis3_probe,.-_sparcv9_vis3_probe

! _sparcv9_random: executes the "random" instruction with %o0 as
! destination and returns.
.global	_sparcv9_random
.align	8
_sparcv9_random:
	retl
	.word	0x91b002a0	!random	%o0
.type	_sparcv9_random,#function
.size	_sparcv9_random,.-_sparcv9_random

! OPENSSL_cleanse
! In:	%o0 = start address, %o1 = number of bytes
! Stores zero over the whole range. Lengths up to 14 are handled one
! byte at a time; longer ranges are first byte-stepped to an aligned
! address, then cleared with 8-byte (V9) or 4-byte (V8) stores, and the
! tail is finished by the byte loop.
.global	OPENSSL_cleanse
.align	32
OPENSSL_cleanse:
	cmp	%o1,14
	nop
#ifdef ABI64
	bgu	%xcc,.Lot
#else
	bgu	.Lot
#endif
	cmp	%o1,0
	bne	.Little
	nop
	retl
	nop

.Little:
	stb	%g0,[%o0]
	subcc	%o1,1,%o1
	bnz	.Little
	add	%o0,1,%o0
	retl
	nop
.align	32
.Lot:
#ifndef ABI64
	subcc	%g0,1,%g1
	! see above for explanation
	.word	0x83408000	!rd	%ccr,%g1
	cmp	%g1,0x99
	bne	.v8lot
	nop
#endif

.v9lot:	andcc	%o0,7,%g0
	bz	.v9aligned
	nop
	stb	%g0,[%o0]
	sub	%o1,1,%o1
	ba	.v9lot
	add	%o0,1,%o0
.align	16,0x01000000
	! The hand-encoded bnz below branches back exactly three
	! instructions; do not insert instructions inside this loop.
.v9aligned:
	.word	0xc0720000	!stx	%g0,[%o0]
	sub	%o1,8,%o1
	andcc	%o1,-8,%g0
#ifdef ABI64
	.word	0x126ffffd	!bnz	%xcc,.v9aligned
#else
	.word	0x124ffffd	!bnz	%icc,.v9aligned
#endif
	add	%o0,8,%o0

	cmp	%o1,0
	bne	.Little
	nop
	retl
	nop
#ifndef ABI64
.v8lot:	andcc	%o0,3,%g0
	bz	.v8aligned
	nop
	stb	%g0,[%o0]
	sub	%o1,1,%o1
	ba	.v8lot
	add	%o0,1,%o0
	nop
.v8aligned:
	st	%g0,[%o0]
	sub	%o1,4,%o1
	andcc	%o1,-4,%g0
	bnz	.v8aligned
	add	%o0,4,%o0

	cmp	%o1,0
	bne	.Little
	nop
	retl
	nop
#endif
.type	OPENSSL_cleanse,#function
.size	OPENSSL_cleanse,.-OPENSSL_cleanse

! _sparcv9_vis1_instrument_bus
! In:	%o0 = output array of 32-bit words, %o1 = element count
! Out:	%o0 = the element count passed in
! For every element: reads %tick, does a block load and a block store
! with commit on the 64-byte line containing the element, then adds the
! tick delta since the previous reading to the element with cas.
.global	_sparcv9_vis1_instrument_bus
.weak	_sparcv9_vis1_instrument_bus
.align	8
_sparcv9_vis1_instrument_bus:
	mov	%o1,%o3					! save cnt
	.word	0x99410000	!rd	%tick,%o4	! tick
	mov	%o4,%o5					! lasttick = tick
	set	0,%g4					! diff

	andn	%o0,63,%g1
	.word	0xc1985e00	!ldda	[%g1]0xf0,%f0	! block load
	.word	0x8143e040	!membar	#Sync
	.word	0xc1b85c00	!stda	%f0,[%g1]0xe0	! block store and commit
	.word	0x8143e040	!membar	#Sync
	ld	[%o0],%o4
	add	%o4,%g4,%g4
	.word	0xc9e2100c	!cas	[%o0],%o4,%g4

.Loop:	.word	0x99410000	!rd	%tick,%o4
	sub	%o4,%o5,%g4				! diff=tick-lasttick
	mov	%o4,%o5					! lasttick=tick

	andn	%o0,63,%g1
	.word	0xc1985e00	!ldda	[%g1]0xf0,%f0	! block load
	.word	0x8143e040	!membar	#Sync
	.word	0xc1b85c00	!stda	%f0,[%g1]0xe0	! block store and commit
	.word	0x8143e040	!membar	#Sync
	ld	[%o0],%o4
	add	%o4,%g4,%g4
	.word	0xc9e2100c	!cas	[%o0],%o4,%g4
	subcc	%o1,1,%o1				! --$cnt
	bnz	.Loop
	add	%o0,4,%o0				! ++$out

	retl
	mov	%o3,%o0
.type	_sparcv9_vis1_instrument_bus,#function
.size	_sparcv9_vis1_instrument_bus,.-_sparcv9_vis1_instrument_bus

! _sparcv9_vis1_instrument_bus2
! In:	%o0 = output array of 32-bit words, %o1 = element count,
!	%o2 = maximum number of loop iterations
! Out:	%o0 = element count minus the elements left unfilled
! Same probing sequence as _sparcv9_vis1_instrument_bus, but the output
! pointer advances only when the tick delta differs from the previous
! one, and the loop also stops once the iteration budget is exhausted.
.global	_sparcv9_vis1_instrument_bus2
.weak	_sparcv9_vis1_instrument_bus2
.align	8
_sparcv9_vis1_instrument_bus2:
	mov	%o1,%o3					! save cnt
	sll	%o1,2,%o1				! cnt*=4

	.word	0x99410000	!rd	%tick,%o4	! tick
	mov	%o4,%o5					! lasttick = tick
	set	0,%g4					! diff

	andn	%o0,63,%g1
	.word	0xc1985e00	!ldda	[%g1]0xf0,%f0	! block load
	.word	0x8143e040	!membar	#Sync
	.word	0xc1b85c00	!stda	%f0,[%g1]0xe0	! block store and commit
	.word	0x8143e040	!membar	#Sync
	ld	[%o0],%o4
	add	%o4,%g4,%g4
	.word	0xc9e2100c	!cas	[%o0],%o4,%g4

	.word	0x99410000	!rd	%tick,%o4	! tick
	sub	%o4,%o5,%g4				! diff=tick-lasttick
	mov	%o4,%o5					! lasttick=tick
	mov	%g4,%g5					! lastdiff=diff
.Loop2:
	andn	%o0,63,%g1
	.word	0xc1985e00	!ldda	[%g1]0xf0,%f0	! block load
	.word	0x8143e040	!membar	#Sync
	.word	0xc1b85c00	!stda	%f0,[%g1]0xe0	! block store and commit
	.word	0x8143e040	!membar	#Sync
	ld	[%o0],%o4
	add	%o4,%g4,%g4
	.word	0xc9e2100c	!cas	[%o0],%o4,%g4

	subcc	%o2,1,%o2				! --max
	bz	.Ldone2
	nop

	.word	0x99410000	!rd	%tick,%o4	! tick
	sub	%o4,%o5,%g4				! diff=tick-lasttick
	mov	%o4,%o5					! lasttick=tick
	cmp	%g4,%g5
	mov	%g4,%g5					! lastdiff=diff

	! %g1 becomes 4 when diff != lastdiff and 0 otherwise, so it
	! serves both as the byte step and as the counter decrement.
	.word	0x83408000	!rd	%ccr,%g1
	and	%g1,4,%g1				! isolate zero flag
	xor	%g1,4,%g1				! flip zero flag

	subcc	%o1,%g1,%o1				! conditional --$cnt
	bnz	.Loop2
	add	%o0,%g1,%o0				! conditional ++$out

.Ldone2:
	srl	%o1,2,%o1
	retl
	sub	%o3,%o1,%o0
.type	_sparcv9_vis1_instrument_bus2,#function
.size	_sparcv9_vis1_instrument_bus2,.-_sparcv9_vis1_instrument_bus2

! Arrange for OPENSSL_cpuid_setup to be called from the .init section.
.section	".init",#alloc,#execinstr
	call	OPENSSL_cpuid_setup
	nop