! Poly1305 message authentication for SPARCv9.
!
! Three code paths selected at runtime in poly1305_init via
! OPENSSL_sparcv9cap_P: a base 32x32->64 (umul) path, a VIS3 path
! (poly1305_blocks_vis3, using .word-encoded addxc/addxccc/umulxhi),
! and an FP/FMA path (poly1305_*_fma, using .word-encoded fmaddd).
! The hand-coded .word opcodes keep the file assemblable by tools that
! predate those instruction extensions; the decoded mnemonic follows
! each one in a comment.
!
! Common register contract (SPARC save/restore window ABI):
!   %i0 = context: hash value at [0..16], key at [+32] (integer paths)
!         or double-precision limbs at [8*0..8*17] (FMA path)
!   %i1 = input pointer (blocks) / 16-byte MAC output (emit)
!   %i2 = byte length (blocks) / nonce pointer (emit)
!   %i3 = padbit (blocks)
! Unaligned input is handled by reading 64-bit little-endian words via
! ldxa ASI 0x88 from the aligned-down address and shift-merging.
!
! NOTE(review): cleaned of version-control annotation residue that had
! been fused into every line; instruction stream itself is unchanged.

#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
# define STPTR	stx
# define SIZE_T	8
#else
# define STPTR	st
# define SIZE_T	4
#endif
#define LOCALS (STACK_BIAS+STACK_FRAME)

.section ".text",#alloc,#execinstr

#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif

! int poly1305_init(void *ctx=%i0, const u8 key[32]=%i1, void *func[2]=%i2)
! Zeroes the hash state, stores the clamped r portion of the key, and on
! VIS3-capable CPUs writes {poly1305_blocks_vis3, poly1305_emit} into
! func[]; returns 1 in that case, 0 otherwise. CPUs advertising FMADD
! (without VIS3) branch to the FP path at .Lpoly1305_init_fma instead.
.globl	poly1305_init
.align	32
poly1305_init:
	save	%sp,-STACK_FRAME-16,%sp
	nop

	SPARC_LOAD_ADDRESS(OPENSSL_sparcv9cap_P,%g1)
	ld	[%g1],%g1

	and	%g1,SPARCV9_FMADD|SPARCV9_VIS3,%g1
	cmp	%g1,SPARCV9_FMADD
	be	.Lpoly1305_init_fma
	nop

	stx	%g0,[%i0+0]
	stx	%g0,[%i0+8]		! zero hash value
	brz,pn	%i1,.Lno_key
	stx	%g0,[%i0+16]

	and	%i1,7,%i5		! alignment factor
	andn	%i1,7,%i1
	sll	%i5,3,%i5		! *8
	neg	%i5,%i4

	sethi	%hi(0x0ffffffc),%o4
	set	8,%o1
	or	%o4,%lo(0x0ffffffc),%o4
	set	16,%o2
	sllx	%o4,32,%o5
	or	%o4,%o5,%o5		! 0x0ffffffc0ffffffc
	or	%o5,3,%o4		! 0x0ffffffc0fffffff

	ldxa	[%i1+%g0]0x88,%o0	! load little-endian key
	brz,pt	%i5,.Lkey_aligned
	ldxa	[%i1+%o1]0x88,%o1

	ldxa	[%i1+%o2]0x88,%o2
	srlx	%o0,%i5,%o0
	sllx	%o1,%i4,%o7
	srlx	%o1,%i5,%o1
	or	%o7,%o0,%o0
	sllx	%o2,%i4,%o2
	or	%o2,%o1,%o1

.Lkey_aligned:
	and	%o4,%o0,%o0		! clamp r per Poly1305 key schedule
	and	%o5,%o1,%o1
	stx	%o0,[%i0+32+0]		! store key
	stx	%o1,[%i0+32+8]

	andcc	%g1,SPARCV9_VIS3,%g0
	be	.Lno_key
	nop

1:	call	.+8			! pc-relative self-location
	add	%o7,poly1305_blocks_vis3-1b,%o7

	add	%o7,poly1305_emit-poly1305_blocks_vis3,%o5
	STPTR	%o7,[%i2]		! func[0] = poly1305_blocks_vis3
	STPTR	%o5,[%i2+SIZE_T]	! func[1] = poly1305_emit
	ret
	restore	%g0,1,%o0		! return 1

.Lno_key:
	ret
	restore	%g0,%g0,%o0		! return 0
.type	poly1305_init,#function
.size	poly1305_init,.-poly1305_init

! void poly1305_blocks(void *ctx=%i0, const u8 *inp=%i1, size_t len=%i2,
!                      int padbit=%i3)
! Base integer path: 130-bit hash held as five 32-bit limbs, multiplied
! by the key with 32x32->64 umul and reduced mod 2^130-5 each block.
.globl	poly1305_blocks
.align	32
poly1305_blocks:
	save	%sp,-STACK_FRAME,%sp
	srln	%i2,4,%i2		! number of 16-byte blocks

	brz,pn	%i2,.Lno_data
	nop

	ld	[%i0+32+0],%l1		! load key
	ld	[%i0+32+4],%l0
	ld	[%i0+32+8],%l3
	ld	[%i0+32+12],%l2

	ld	[%i0+0],%o1		! load hash value
	ld	[%i0+4],%o0
	ld	[%i0+8],%o3
	ld	[%i0+12],%o2
	ld	[%i0+16],%l7

	and	%i1,7,%i5		! alignment factor
	andn	%i1,7,%i1
	set	8,%g2
	sll	%i5,3,%i5		! *8
	set	16,%g3
	neg	%i5,%i4

	srl	%l1,2,%l4		! s[i] = r[i] + r[i]>>2 (5*r/4 trick)
	srl	%l2,2,%l5
	add	%l1,%l4,%l4
	srl	%l3,2,%l6
	add	%l2,%l5,%l5
	add	%l3,%l6,%l6

.Loop:
	ldxa	[%i1+%g0]0x88,%g1	! load little-endian input
	brz,pt	%i5,.Linp_aligned
	ldxa	[%i1+%g2]0x88,%g2

	ldxa	[%i1+%g3]0x88,%g3	! shift-merge the unaligned case
	srlx	%g1,%i5,%g1
	sllx	%g2,%i4,%o5
	srlx	%g2,%i5,%g2
	or	%o5,%g1,%g1
	sllx	%g3,%i4,%g3
	or	%g3,%g2,%g2

.Linp_aligned:
	srlx	%g1,32,%o4
	addcc	%g1,%o0,%o0		! accumulate input
	srlx	%g2,32,%o5
	addccc	%o4,%o1,%o1
	addccc	%g2,%o2,%o2
	addccc	%o5,%o3,%o3
	addc	%i3,%l7,%l7		! + padbit

	umul	%l0,%o0,%g1		! schoolbook h*r, column by column
	umul	%l1,%o0,%g2
	umul	%l2,%o0,%g3
	umul	%l3,%o0,%g4
	 sub	%i2,1,%i2
	 add	%i1,16,%i1

	umul	%l6,%o1,%o4
	umul	%l0,%o1,%o5
	umul	%l1,%o1,%o7
	add	%o4,%g1,%g1
	add	%o5,%g2,%g2
	umul	%l2,%o1,%o4
	add	%o7,%g3,%g3
	add	%o4,%g4,%g4

	umul	%l5,%o2,%o5
	umul	%l6,%o2,%o7
	umul	%l0,%o2,%o4
	add	%o5,%g1,%g1
	add	%o7,%g2,%g2
	umul	%l1,%o2,%o5
	add	%o4,%g3,%g3
	add	%o5,%g4,%g4

	umul	%l4,%o3,%o7
	umul	%l5,%o3,%o4
	umul	%l6,%o3,%o5
	add	%o7,%g1,%g1
	add	%o4,%g2,%g2
	umul	%l0,%o3,%o7
	add	%o5,%g3,%g3
	add	%o7,%g4,%g4

	umul	%l4,%l7,%o4
	umul	%l5,%l7,%o5
	umul	%l6,%l7,%o7
	umul	%l0,%l7,%l7
	add	%o4,%g2,%g2
	add	%o5,%g3,%g3
	srlx	%g1,32,%o1		! propagate the 64-bit column carries
	add	%o7,%g4,%g4
	srlx	%g2,32,%o2

	addcc	%g2,%o1,%o1
	srlx	%g3,32,%o3
	 set	8,%g2			! %g2/%g3 were clobbered above; reload
	addccc	%g3,%o2,%o2
	srlx	%g4,32,%o4
	 set	16,%g3
	addccc	%g4,%o3,%o3
	addc	%o4,%l7,%l7

	srl	%l7,2,%o4		! final reduction step
	andn	%l7,3,%o5
	and	%l7,3,%l7
	add	%o5,%o4,%o4

	addcc	%o4,%g1,%o0
	addccc	%g0,%o1,%o1
	addccc	%g0,%o2,%o2
	addccc	%g0,%o3,%o3
	brnz,pt	%i2,.Loop
	addc	%g0,%l7,%l7

	st	%o1,[%i0+0]		! store hash value
	st	%o0,[%i0+4]
	st	%o3,[%i0+8]
	st	%o2,[%i0+12]
	st	%l7,[%i0+16]

.Lno_data:
	ret
	restore
.type	poly1305_blocks,#function
.size	poly1305_blocks,.-poly1305_blocks

! VIS3 variant of poly1305_blocks: hash held as two 64-bit limbs plus a
! 2-bit overflow limb; uses 64x64->128 via mulx/umulxhi and the
! carry-chain extensions addxc/addxccc (emitted as .word for old
! assemblers). Installed by poly1305_init on VIS3-capable CPUs.
.align	32
poly1305_blocks_vis3:
	save	%sp,-STACK_FRAME,%sp
	srln	%i2,4,%i2		! number of 16-byte blocks

	brz,pn	%i2,.Lno_data
	nop

	ldx	[%i0+32+0],%o3		! load key
	ldx	[%i0+32+8],%o4

	ldx	[%i0+0],%o0		! load hash value
	ldx	[%i0+8],%o1
	ld	[%i0+16],%o2

	and	%i1,7,%i5		! alignment factor
	andn	%i1,7,%i1
	set	8,%l1
	sll	%i5,3,%i5		! *8
	set	16,%l2
	neg	%i5,%i4

	srlx	%o4,2,%o5		! s1 = r1 + r1>>2
	b	.Loop_vis3
	add	%o4,%o5,%o5

.Loop_vis3:
	ldxa	[%i1+%g0]0x88,%g1	! load little-endian input
	brz,pt	%i5,.Linp_aligned_vis3
	ldxa	[%i1+%l1]0x88,%g2

	ldxa	[%i1+%l2]0x88,%g3	! shift-merge the unaligned case
	srlx	%g1,%i5,%g1
	sllx	%g2,%i4,%o7
	srlx	%g2,%i5,%g2
	or	%o7,%g1,%g1
	sllx	%g3,%i4,%g3
	or	%g3,%g2,%g2

.Linp_aligned_vis3:
	addcc	%g1,%o0,%o0		! accumulate input
	 sub	%i2,1,%i2
	.word	0x93b08269		!addxccc %g2,%o1,%o1
	 add	%i1,16,%i1

	mulx	%o3,%o0,%g1		! r0*h0
	.word	0x95b6c22a		!addxc %i3,%o2,%o2
	.word	0x85b2c2c8		!umulxhi %o3,%o0,%g2
	mulx	%o5,%o1,%g4		! s1*h1
	.word	0x9fb342c9		!umulxhi %o5,%o1,%o7
	addcc	%g4,%g1,%g1
	mulx	%o4,%o0,%g4		! r1*h0
	.word	0x85b3c222		!addxc %o7,%g2,%g2
	.word	0x87b302c8		!umulxhi %o4,%o0,%g3
	addcc	%g4,%g2,%g2
	mulx	%o3,%o1,%g4		! r0*h1
	.word	0x87b00223		!addxc %g0,%g3,%g3
	.word	0x9fb2c2c9		!umulxhi %o3,%o1,%o7
	addcc	%g4,%g2,%g2
	mulx	%o5,%o2,%g4		! s1*h2
	.word	0x87b3c223		!addxc %o7,%g3,%g3
	mulx	%o3,%o2,%o7		! r0*h2
	addcc	%g4,%g2,%g2
	.word	0x87b3c223		!addxc %o7,%g3,%g3

	srlx	%g3,2,%g4		! final reduction step
	andn	%g3,3,%o7
	and	%g3,3,%o2
	add	%o7,%g4,%g4

	addcc	%g4,%g1,%o0
	.word	0x93b00262		!addxccc %g0,%g2,%o1
	brnz,pt	%i2,.Loop_vis3
	.word	0x95b0022a		!addxc %g0,%o2,%o2

	stx	%o0,[%i0+0]		! store hash value
	stx	%o1,[%i0+8]
	st	%o2,[%i0+16]

	ret
	restore
.type	poly1305_blocks_vis3,#function
.size	poly1305_blocks_vis3,.-poly1305_blocks_vis3

! void poly1305_emit(void *ctx=%i0, u8 mac[16]=%i1, const u32 nonce[4]=%i2)
! Conditionally subtracts 2^130-5 (by adding 5 and testing bit 2 of the
! top limb), adds the nonce, and writes the 16-byte tag little-endian
! one byte at a time (no alignment assumption on %i1).
.globl	poly1305_emit
.align	32
poly1305_emit:
	save	%sp,-STACK_FRAME,%sp

	ld	[%i0+0],%o1		! load hash value
	ld	[%i0+4],%o0
	ld	[%i0+8],%o3
	ld	[%i0+12],%o2
	ld	[%i0+16],%l7

	addcc	%o0,5,%l0		! compare to modulus
	addccc	%o1,0,%l1
	addccc	%o2,0,%l2
	addccc	%o3,0,%l3
	addc	%l7,0,%l7
	andcc	%l7,4,%g0		! did it carry/borrow?

	movnz	%icc,%l0,%o0		! select h+5-2^130 when h >= 2^130-5
	ld	[%i2+0],%l0		! load nonce
	movnz	%icc,%l1,%o1
	ld	[%i2+4],%l1
	movnz	%icc,%l2,%o2
	ld	[%i2+8],%l2
	movnz	%icc,%l3,%o3
	ld	[%i2+12],%l3

	addcc	%l0,%o0,%o0		! accumulate nonce
	addccc	%l1,%o1,%o1
	addccc	%l2,%o2,%o2
	addc	%l3,%o3,%o3

	srl	%o0,8,%l0
	stb	%o0,[%i1+0]		! store little-endian result
	srl	%o0,16,%l1
	stb	%l0,[%i1+1]
	srl	%o0,24,%l2
	stb	%l1,[%i1+2]
	stb	%l2,[%i1+3]

	srl	%o1,8,%l0
	stb	%o1,[%i1+4]
	srl	%o1,16,%l1
	stb	%l0,[%i1+5]
	srl	%o1,24,%l2
	stb	%l1,[%i1+6]
	stb	%l2,[%i1+7]

	srl	%o2,8,%l0
	stb	%o2,[%i1+8]
	srl	%o2,16,%l1
	stb	%l0,[%i1+9]
	srl	%o2,24,%l2
	stb	%l1,[%i1+10]
	stb	%l2,[%i1+11]

	srl	%o3,8,%l0
	stb	%o3,[%i1+12]
	srl	%o3,16,%l1
	stb	%l0,[%i1+13]
	srl	%o3,24,%l2
	stb	%l1,[%i1+14]
	stb	%l2,[%i1+15]

	ret
	restore
.type	poly1305_emit,#function
.size	poly1305_emit,.-poly1305_emit

! FP/FMA initialization (reached from poly1305_init when only FMADD is
! advertised). Stores the key split into double-precision limb pairs
! (r0..r3 lo/hi plus precomputed s1..s3 = 5/2^130 * r) in the context
! and installs {poly1305_blocks_fma, poly1305_emit_fma} into func[]=%i2.
! Doubles are kept "biased" by large power-of-two constants from
! .Lconsts_fma so integer limbs live in the mantissa.
.align	32
poly1305_init_fma:
	save	%sp,-STACK_FRAME-16,%sp
	nop

.Lpoly1305_init_fma:
1:	call	.+8			! pc-relative address of .Lconsts_fma
	add	%o7,.Lconsts_fma-1b,%o7

	ldd	[%o7+8*0],%f16		! load constants
	ldd	[%o7+8*1],%f18
	ldd	[%o7+8*2],%f20
	ldd	[%o7+8*3],%f22
	ldd	[%o7+8*5],%f26

	std	%f16,[%i0+8*0]		! initial hash value, biased 0
	std	%f18,[%i0+8*1]
	std	%f20,[%i0+8*2]
	std	%f22,[%i0+8*3]

	brz,pn	%i1,.Lno_key_fma
	nop

	stx	%fsr,[%sp+LOCALS]	! save original %fsr
	ldx	[%o7+8*6],%fsr		! load new %fsr (truncate, no exceptions)

	std	%f16,[%i0+8*4]		! key "template"
	std	%f18,[%i0+8*5]
	std	%f20,[%i0+8*6]
	std	%f22,[%i0+8*7]

	and	%i1,7,%l2
	andn	%i1,7,%i1		! align pointer
	mov	8,%l0
	sll	%l2,3,%l2
	mov	16,%l1
	neg	%l2,%l3

	ldxa	[%i1+%g0]0x88,%o0	! load little-endian key
	ldxa	[%i1+%l0]0x88,%o2

	brz	%l2,.Lkey_aligned_fma
	sethi	%hi(0xf0000000),%l0	! 0xf0000000

	ldxa	[%i1+%l1]0x88,%o4

	srlx	%o0,%l2,%o0		! align data
	sllx	%o2,%l3,%o1
	srlx	%o2,%l2,%o2
	or	%o1,%o0,%o0
	sllx	%o4,%l3,%o3
	or	%o3,%o2,%o2

.Lkey_aligned_fma:
	or	%l0,3,%l1		! 0xf0000003
	srlx	%o0,32,%o1
	andn	%o0,%l0,%o0		! &=0x0fffffff
	andn	%o1,%l1,%o1		! &=0x0ffffffc
	srlx	%o2,32,%o3
	andn	%o2,%l1,%o2
	andn	%o3,%l1,%o3

	st	%o0,[%i0+36]		! fill "template"
	st	%o1,[%i0+44]
	st	%o2,[%i0+52]
	st	%o3,[%i0+60]

	ldd	[%i0+8*4],%f0		! load [biased] key
	ldd	[%i0+8*5],%f4
	ldd	[%i0+8*6],%f8
	ldd	[%i0+8*7],%f12

	fsubd	%f0,%f16, %f0		! r0
	 ldd	[%o7+8*7],%f16		! more constants
	fsubd	%f4,%f18,%f4		! r1
	 ldd	[%o7+8*8],%f18
	fsubd	%f8,%f20,%f8		! r2
	 ldd	[%o7+8*9],%f20
	fsubd	%f12,%f22,%f12		! r3
	 ldd	[%o7+8*10],%f22

	fmuld	%f26,%f4,%f52		! s1
	fmuld	%f26,%f8,%f40		! s2
	fmuld	%f26,%f12,%f44		! s3

	faddd	%f0,%f16, %f2		! split each limb into hi+lo doubles
	faddd	%f4,%f18,%f6
	faddd	%f8,%f20,%f10
	faddd	%f12,%f22,%f14

	fsubd	%f2,%f16, %f2
	 ldd	[%o7+8*11],%f16		! more constants
	fsubd	%f6,%f18,%f6
	 ldd	[%o7+8*12],%f18
	fsubd	%f10,%f20,%f10
	 ldd	[%o7+8*13],%f20
	fsubd	%f14,%f22,%f14

	fsubd	%f0,%f2,%f0
	 std	%f2,[%i0+8*5]		! r0hi
	fsubd	%f4,%f6,%f4
	 std	%f6,[%i0+8*7]		! r1hi
	fsubd	%f8,%f10,%f8
	 std	%f10,[%i0+8*9]		! r2hi
	fsubd	%f12,%f14,%f12
	 std	%f14,[%i0+8*11]		! r3hi

	faddd	%f52,%f16, %f54		! same hi/lo split for s1..s3
	faddd	%f40,%f18,%f42
	faddd	%f44,%f20,%f46

	fsubd	%f54,%f16, %f54
	fsubd	%f42,%f18,%f42
	fsubd	%f46,%f20,%f46

	fsubd	%f52,%f54,%f52
	fsubd	%f40,%f42,%f40
	fsubd	%f44,%f46,%f44

	ldx	[%sp+LOCALS],%fsr	! restore %fsr

	std	%f0,[%i0+8*4]		! r0lo
	std	%f4,[%i0+8*6]		! r1lo
	std	%f8,[%i0+8*8]		! r2lo
	std	%f12,[%i0+8*10]		! r3lo

	std	%f54,[%i0+8*13]		! s1hi
	std	%f42,[%i0+8*15]		! s2hi
	std	%f46,[%i0+8*17]		! s3hi

	std	%f52,[%i0+8*12]		! s1lo
	std	%f40,[%i0+8*14]		! s2lo
	std	%f44,[%i0+8*16]		! s3lo

	add	%o7,poly1305_blocks_fma-.Lconsts_fma,%o0
	add	%o7,poly1305_emit_fma-.Lconsts_fma,%o1
	STPTR	%o0,[%i2]		! func[0] = poly1305_blocks_fma
	STPTR	%o1,[%i2+SIZE_T]	! func[1] = poly1305_emit_fma

	ret
	restore	%g0,1,%o0		! return 1

.Lno_key_fma:
	ret
	restore	%g0,%g0,%o0		! return 0
.type	poly1305_init_fma,#function
.size	poly1305_init_fma,.-poly1305_init_fma

! FP/FMA variant of poly1305_blocks. Hash and input limbs are carried in
! biased doubles; products are accumulated with fmaddd (emitted as .word
! opcodes) and re-normalized from base 2^48 back to base 2^32 each
! iteration. Input loads are modulo-scheduled one iteration ahead, with
! a conditional (movrz) pointer advance so the final iteration re-reads
! harmlessly instead of over-running the buffer.
.align	32
poly1305_blocks_fma:
	save	%sp,-STACK_FRAME-48,%sp
	srln	%i2,4,%i2		! number of 16-byte blocks

	brz,pn	%i2,.Labort
	sub	%i2,1,%i2

1:	call	.+8			! pc-relative address of .Lconsts_fma
	add	%o7,.Lconsts_fma-1b,%o7

	ldd	[%o7+8*0],%f16		! load constants
	ldd	[%o7+8*1],%f18
	ldd	[%o7+8*2],%f20
	ldd	[%o7+8*3],%f22
	ldd	[%o7+8*4],%f24
	ldd	[%o7+8*5],%f26

	ldd	[%i0+8*0],%f0		! load [biased] hash value
	ldd	[%i0+8*1],%f4
	ldd	[%i0+8*2],%f8
	ldd	[%i0+8*3],%f12

	std	%f16,[%sp+LOCALS+8*0]	! input "template"
	sethi	%hi((1023+52+96)<<20),%o3
	std	%f18,[%sp+LOCALS+8*1]
	or	%i3,%o3,%o3		! top word embeds padbit
	std	%f20,[%sp+LOCALS+8*2]
	st	%o3,[%sp+LOCALS+8*3]

	and	%i1,7,%l2
	andn	%i1,7,%i1		! align pointer
	mov	8,%l0
	sll	%l2,3,%l2
	mov	16,%l1
	neg	%l2,%l3

	ldxa	[%i1+%g0]0x88,%o0	! load little-endian input
	brz	%l2,.Linp_aligned_fma
	ldxa	[%i1+%l0]0x88,%o2

	ldxa	[%i1+%l1]0x88,%o4
	add	%i1,8,%i1

	srlx	%o0,%l2,%o0		! align data
	sllx	%o2,%l3,%o1
	srlx	%o2,%l2,%o2
	or	%o1,%o0,%o0
	sllx	%o4,%l3,%o3
	srlx	%o4,%l2,%o4		! pre-shift
	or	%o3,%o2,%o2

.Linp_aligned_fma:
	srlx	%o0,32,%o1
	movrz	%i2,0,%l1
	srlx	%o2,32,%o3
	add	%l1,%i1,%i1		! conditional advance

	st	%o0,[%sp+LOCALS+8*0+4]	! fill "template"
	st	%o1,[%sp+LOCALS+8*1+4]
	st	%o2,[%sp+LOCALS+8*2+4]
	st	%o3,[%sp+LOCALS+8*3+4]

	ldd	[%i0+8*4],%f28		! load key
	ldd	[%i0+8*5],%f30
	ldd	[%i0+8*6],%f32
	ldd	[%i0+8*7],%f34
	ldd	[%i0+8*8],%f36
	ldd	[%i0+8*9],%f38
	ldd	[%i0+8*10],%f48
	ldd	[%i0+8*11],%f50
	ldd	[%i0+8*12],%f52
	ldd	[%i0+8*13],%f54
	ldd	[%i0+8*14],%f40
	ldd	[%i0+8*15],%f42
	ldd	[%i0+8*16],%f44
	ldd	[%i0+8*17],%f46

	stx	%fsr,[%sp+LOCALS+8*4]	! save original %fsr
	ldx	[%o7+8*6],%fsr		! load new %fsr (truncate, no exceptions)

	subcc	%i2,1,%i2
	movrz	%i2,0,%l1

	ldd	[%sp+LOCALS+8*0],%f56	! load biased input
	ldd	[%sp+LOCALS+8*1],%f58
	ldd	[%sp+LOCALS+8*2],%f60
	ldd	[%sp+LOCALS+8*3],%f62

	fsubd	%f0,%f16, %f0		! de-bias hash value
	fsubd	%f4,%f18,%f4
	 ldxa	[%i1+%g0]0x88,%o0	! modulo-scheduled input load
	fsubd	%f8,%f20,%f8
	fsubd	%f12,%f22,%f12
	 ldxa	[%i1+%l0]0x88,%o2

	fsubd	%f56,%f16, %f56		! de-bias input
	fsubd	%f58,%f18,%f58
	fsubd	%f60,%f20,%f60
	fsubd	%f62,%f22,%f62

	brz	%l2,.Linp_aligned_fma2
	add	%l1,%i1,%i1		! conditional advance

	sllx	%o0,%l3,%o1		! align data
	srlx	%o0,%l2,%o3
	or	%o1,%o4,%o0
	sllx	%o2,%l3,%o1
	srlx	%o2,%l2,%o4		! pre-shift
	or	%o3,%o1,%o2
.Linp_aligned_fma2:
	srlx	%o0,32,%o1
	srlx	%o2,32,%o3

	faddd	%f0,%f56,%f56		! accumulate input
	 stw	%o0,[%sp+LOCALS+8*0+4]
	faddd	%f4,%f58,%f58
	 stw	%o1,[%sp+LOCALS+8*1+4]
	faddd	%f8,%f60,%f60
	 stw	%o2,[%sp+LOCALS+8*2+4]
	faddd	%f12,%f62,%f62
	 stw	%o3,[%sp+LOCALS+8*3+4]

	b	.Lentry_fma
	nop

.align	16
.Loop_fma:
	ldxa	[%i1+%g0]0x88,%o0	! modulo-scheduled input load
	ldxa	[%i1+%l0]0x88,%o2
	movrz	%i2,0,%l1

	faddd	%f52,%f0,%f0		! accumulate input
	faddd	%f54,%f2,%f2
	faddd	%f62,%f8,%f8
	faddd	%f60,%f10,%f10

	brz,pn	%l2,.Linp_aligned_fma3
	add	%l1,%i1,%i1		! conditional advance

	sllx	%o0,%l3,%o1		! align data
	srlx	%o0,%l2,%o3
	or	%o1,%o4,%o0
	sllx	%o2,%l3,%o1
	srlx	%o2,%l2,%o4		! pre-shift
	or	%o3,%o1,%o2

.Linp_aligned_fma3:
	!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
	faddd	%f20,%f4,%f52
	 srlx	%o0,32,%o1
	faddd	%f20,%f6,%f54
	 srlx	%o2,32,%o3
	faddd	%f24,%f12,%f60
	 st	%o0,[%sp+LOCALS+8*0+4]	! fill "template"
	faddd	%f24,%f14,%f62
	 st	%o1,[%sp+LOCALS+8*1+4]
	faddd	%f18,%f0,%f48
	 st	%o2,[%sp+LOCALS+8*2+4]
	faddd	%f18,%f2,%f50
	 st	%o3,[%sp+LOCALS+8*3+4]
	faddd	%f22,%f8,%f56
	faddd	%f22,%f10,%f58

	fsubd	%f52,%f20,%f52
	fsubd	%f54,%f20,%f54
	fsubd	%f60,%f24,%f60
	fsubd	%f62,%f24,%f62
	fsubd	%f48,%f18,%f48
	fsubd	%f50,%f18,%f50
	fsubd	%f56,%f22,%f56
	fsubd	%f58,%f22,%f58

	fsubd	%f4,%f52,%f4
	fsubd	%f6,%f54,%f6
	fsubd	%f12,%f60,%f12
	fsubd	%f14,%f62,%f14
	fsubd	%f8,%f56,%f8
	fsubd	%f10,%f58,%f10
	fsubd	%f0,%f48,%f0
	fsubd	%f2,%f50,%f2

	faddd	%f4,%f48,%f4
	faddd	%f6,%f50,%f6
	faddd	%f12,%f56,%f12
	faddd	%f14,%f58,%f14
	faddd	%f8,%f52,%f8
	faddd	%f10,%f54,%f10
	.word	0x81be805d		!fmaddd %f26,%f60,%f0,%f0
	.word	0x85be845f		!fmaddd %f26,%f62,%f2,%f2

	faddd	%f4,%f6,%f58
	 ldd	[%i0+8*12],%f52		! reload constants
	faddd	%f12,%f14,%f62
	 ldd	[%i0+8*13],%f54
	faddd	%f8,%f10,%f60
	 ldd	[%i0+8*10],%f48
	faddd	%f0,%f2,%f56
	 ldd	[%i0+8*11],%f50

.Lentry_fma:
	fmuld	%f58,%f44,%f0		! h * {r,s} limb products
	fmuld	%f58,%f46,%f2
	fmuld	%f58,%f32,%f8
	fmuld	%f58,%f34,%f10
	fmuld	%f58,%f28,%f4
	fmuld	%f58,%f30,%f6
	fmuld	%f58,%f36,%f12
	fmuld	%f58,%f38,%f14

	.word	0x81bfc055		!fmaddd %f62,%f52,%f0,%f0
	.word	0x85bfc457		!fmaddd %f62,%f54,%f2,%f2
	.word	0x91bfd04d		!fmaddd %f62,%f44,%f8,%f8
	.word	0x95bfd44f		!fmaddd %f62,%f46,%f10,%f10
	.word	0x89bfc849		!fmaddd %f62,%f40,%f4,%f4
	.word	0x8dbfcc4b		!fmaddd %f62,%f42,%f6,%f6
	.word	0x99bfd85c		!fmaddd %f62,%f28,%f12,%f12
	.word	0x9dbfdc5e		!fmaddd %f62,%f30,%f14,%f14

	.word	0x81bf4049		!fmaddd %f60,%f40,%f0,%f0
	.word	0x85bf444b		!fmaddd %f60,%f42,%f2,%f2
	.word	0x91bf505c		!fmaddd %f60,%f28,%f8,%f8
	.word	0x95bf545e		!fmaddd %f60,%f30,%f10,%f10
	.word	0x89bf484d		!fmaddd %f60,%f44,%f4,%f4
	 ldd	[%sp+LOCALS+8*0],%f52	! load [biased] input
	.word	0x8dbf4c4f		!fmaddd %f60,%f46,%f6,%f6
	 ldd	[%sp+LOCALS+8*1],%f54
	.word	0x99bf5841		!fmaddd %f60,%f32,%f12,%f12
	 ldd	[%sp+LOCALS+8*2],%f62
	.word	0x9dbf5c43		!fmaddd %f60,%f34,%f14,%f14
	 ldd	[%sp+LOCALS+8*3],%f60

	.word	0x81be405c		!fmaddd %f56,%f28,%f0,%f0
	 fsubd	%f52,%f16, %f52		! de-bias input
	.word	0x85be445e		!fmaddd %f56,%f30,%f2,%f2
	 fsubd	%f54,%f18,%f54
	.word	0x91be5045		!fmaddd %f56,%f36,%f8,%f8
	 fsubd	%f62,%f20,%f62
	.word	0x95be5447		!fmaddd %f56,%f38,%f10,%f10
	 fsubd	%f60,%f22,%f60
	.word	0x89be4841		!fmaddd %f56,%f32,%f4,%f4
	.word	0x8dbe4c43		!fmaddd %f56,%f34,%f6,%f6
	.word	0x99be5851		!fmaddd %f56,%f48,%f12,%f12
	.word	0x9dbe5c53		!fmaddd %f56,%f50,%f14,%f14

	bcc	SIZE_T_CC,.Loop_fma
	subcc	%i2,1,%i2

	!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
	faddd	%f0,%f18,%f48
	faddd	%f2,%f18,%f50
	faddd	%f8,%f22,%f56
	faddd	%f10,%f22,%f58
	faddd	%f4,%f20,%f52
	faddd	%f6,%f20,%f54
	faddd	%f12,%f24,%f60
	faddd	%f14,%f24,%f62

	fsubd	%f48,%f18,%f48
	fsubd	%f50,%f18,%f50
	fsubd	%f56,%f22,%f56
	fsubd	%f58,%f22,%f58
	fsubd	%f52,%f20,%f52
	fsubd	%f54,%f20,%f54
	fsubd	%f60,%f24,%f60
	fsubd	%f62,%f24,%f62

	fsubd	%f4,%f52,%f4
	fsubd	%f6,%f54,%f6
	fsubd	%f12,%f60,%f12
	fsubd	%f14,%f62,%f14
	fsubd	%f8,%f56,%f8
	fsubd	%f10,%f58,%f10
	fsubd	%f0,%f48,%f0
	fsubd	%f2,%f50,%f2

	faddd	%f4,%f48,%f4
	faddd	%f6,%f50,%f6
	faddd	%f12,%f56,%f12
	faddd	%f14,%f58,%f14
	faddd	%f8,%f52,%f8
	faddd	%f10,%f54,%f10
	.word	0x81be805d		!fmaddd %f26,%f60,%f0,%f0
	.word	0x85be845f		!fmaddd %f26,%f62,%f2,%f2

	faddd	%f4,%f6,%f58
	faddd	%f12,%f14,%f62
	faddd	%f8,%f10,%f60
	faddd	%f0,%f2,%f56

	faddd	%f58,%f18,%f58		! bias
	faddd	%f62,%f22,%f62
	faddd	%f60,%f20,%f60
	faddd	%f56,%f16, %f56

	ldx	[%sp+LOCALS+8*4],%fsr	! restore saved %fsr

	std	%f58,[%i0+8*1]		! store [biased] hash value
	std	%f62,[%i0+8*3]
	std	%f60,[%i0+8*2]
	std	%f56,[%i0+8*0]

.Labort:
	ret
	restore
.type	poly1305_blocks_fma,#function
.size	poly1305_blocks_fma,.-poly1305_blocks_fma

! FP/FMA variant of poly1305_emit: recovers integer limbs from the
! biased doubles by masking the exponent field, finishes the reduction,
! performs the branch-free conditional subtraction of 2^130-5 via an
! all-ones/all-zeros mask, adds the nonce, and writes the tag bytewise.
.align	32
poly1305_emit_fma:
	save	%sp,-STACK_FRAME,%sp

	ld	[%i0+8*0+0],%l5		! load hash
	ld	[%i0+8*0+4],%l0
	ld	[%i0+8*1+0],%o0
	ld	[%i0+8*1+4],%l1
	ld	[%i0+8*2+0],%o1
	ld	[%i0+8*2+4],%l2
	ld	[%i0+8*3+0],%o2
	ld	[%i0+8*3+4],%l3

	sethi	%hi(0xfff00000),%o3
	andn	%l5,%o3,%l5		! mask exponent
	andn	%o0,%o3,%o0
	andn	%o1,%o3,%o1
	andn	%o2,%o3,%o2		! can be partially reduced...
	mov	3,%o3

	srl	%o2,2,%i3		! ... so reduce
	and	%o2,%o3,%l4
	andn	%o2,%o3,%o2
	add	%i3,%o2,%o2

	addcc	%o2,%l0,%l0
	addccc	%l5,%l1,%l1
	addccc	%o0,%l2,%l2
	addccc	%o1,%l3,%l3
	addc	%g0,%l4,%l4

	addcc	%l0,5,%l5		! compare to modulus
	addccc	%l1,0,%o0
	addccc	%l2,0,%o1
	addccc	%l3,0,%o2
	addc	%l4,0,%o3

	srl	%o3,2,%o3		! did it carry/borrow?
	neg	%o3,%o3
	sra	%o3,31,%o3		! mask

	andn	%l0,%o3,%l0		! select h or h+5-2^130 branch-free
	and	%l5,%o3,%l5
	andn	%l1,%o3,%l1
	and	%o0,%o3,%o0
	or	%l5,%l0,%l0
	 ld	[%i2+0],%l5		! load nonce
	andn	%l2,%o3,%l2
	and	%o1,%o3,%o1
	or	%o0,%l1,%l1
	 ld	[%i2+4],%o0
	andn	%l3,%o3,%l3
	and	%o2,%o3,%o2
	or	%o1,%l2,%l2
	 ld	[%i2+8],%o1
	or	%o2,%l3,%l3
	 ld	[%i2+12],%o2

	addcc	%l5,%l0,%l0		! accumulate nonce
	addccc	%o0,%l1,%l1
	addccc	%o1,%l2,%l2
	addc	%o2,%l3,%l3

	stb	%l0,[%i1+0]		! write little-endian result
	srl	%l0,8,%l0
	stb	%l1,[%i1+4]
	srl	%l1,8,%l1
	stb	%l2,[%i1+8]
	srl	%l2,8,%l2
	stb	%l3,[%i1+12]
	srl	%l3,8,%l3

	stb	%l0,[%i1+1]
	srl	%l0,8,%l0
	stb	%l1,[%i1+5]
	srl	%l1,8,%l1
	stb	%l2,[%i1+9]
	srl	%l2,8,%l2
	stb	%l3,[%i1+13]
	srl	%l3,8,%l3

	stb	%l0,[%i1+2]
	srl	%l0,8,%l0
	stb	%l1,[%i1+6]
	srl	%l1,8,%l1
	stb	%l2,[%i1+10]
	srl	%l2,8,%l2
	stb	%l3,[%i1+14]
	srl	%l3,8,%l3

	stb	%l0,[%i1+3]
	stb	%l1,[%i1+7]
	stb	%l2,[%i1+11]
	stb	%l3,[%i1+15]

	ret
	restore
.type	poly1305_emit_fma,#function
.size	poly1305_emit_fma,.-poly1305_emit_fma

! IEEE-754 double bit patterns used as biases by the FMA path: each
! constant is a power of two large enough that adding it pins an
! integer limb into the mantissa.
.align	64
.Lconsts_fma:
.word	0x43300000,0x00000000	! 2^(52+0)
.word	0x45300000,0x00000000	! 2^(52+32)
.word	0x47300000,0x00000000	! 2^(52+64)
.word	0x49300000,0x00000000	! 2^(52+96)
.word	0x4b500000,0x00000000	! 2^(52+130)

.word	0x37f40000,0x00000000	! 5/2^130
.word	0,1<<30			! fsr: truncate, no exceptions

.word	0x44300000,0x00000000	! 2^(52+16+0)
.word	0x46300000,0x00000000	! 2^(52+16+32)
.word	0x48300000,0x00000000	! 2^(52+16+64)
.word	0x4a300000,0x00000000	! 2^(52+16+96)
.word	0x3e300000,0x00000000	! 2^(52+16+0-96)
.word	0x40300000,0x00000000	! 2^(52+16+32-96)
.word	0x42300000,0x00000000	! 2^(52+16+64-96)
.asciz	"Poly1305 for SPARCv9/VIS3/FMA, CRYPTOGAMS by <appro@openssl.org>"
.align	4