sparcv9_modes.pl revision 306195
#!/usr/bin/env perl

# Specific modes implementations for SPARC Architecture 2011. There
# is T4 dependency though, an ASI value that is not specified in the
# Architecture Manual. But as SPARC universe is rather monocultural,
# we imply that processor capable of executing crypto instructions
# can handle the ASI in question as well. This means that we ought to
# keep eyes open when new processors emerge...
#
# As for above mentioned ASI. It's so called "block initializing
# store" which cancels "read" in "read-update-write" on cache lines.
# This is "cooperative" optimization, as it reduces overall pressure
# on memory interface. Benefits can't be observed/quantified with
# usual benchmarks, on the contrary you can notice that single-thread
# performance for parallelizable modes is ~1.5% worse for largest
# block sizes [though few percent better for not so long ones]. All
# this based on suggestions from David Miller.

# Configure ABI-dependent globals ($::bias, $::frame, $::size_t_cc)
# from the compiler flags.  To be called with @ARGV as argument.
# NOTE(review): this module is require'd by driver scripts and declares
# no package, so $::xxx and $xxx refer to the same (main) globals;
# fully-qualified $::code is used throughout for consistency.
sub asm_init {	# to be called with @ARGV as argument
    for (@_)            { $::abibits=64 if (/\-m64/ || /\-xarch\=v9/); }
    if ($::abibits==64) { $::bias=2047; $::frame=192; $::size_t_cc="%xcc"; }
    else                { $::bias=0;    $::frame=112; $::size_t_cc="%icc"; }
}

# unified interface: input ptr, output ptr, length, key schedule, ivec
my ($inp,$out,$len,$key,$ivec)=map("%i$_",(0..5));
# local variables (%l0-%l5; list has 8 elements but only 6 names, the
# trailing %l6/%l7 are discarded and remain free for scratch use)
my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7));

# Emit ${alg}${bits}_t4_cbc_encrypt.  CBC encryption is inherently
# serial, so only a 1x inner loop is generated; when output is aligned,
# input is distinct and length >= 128 bytes, a block-initializing-store
# path (.L*cbc_enc_blk) is used instead.
sub alg_cbc_encrypt_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl	${alg}${bits}_t4_cbc_encrypt
.align	32
${alg}${bits}_t4_cbc_encrypt:
	save		%sp, -$::frame, %sp
	cmp		$len, 0
	be,pn		$::size_t_cc, .L${bits}_cbc_enc_abort
	srln		$len, 0, $len		! needed on v8+, "nop" on v9
	sub		$inp, $out, $blk_init	! $inp!=$out
___
$::code.=<<___ if (!$::evp);
	andcc		$ivec, 7, $ivoff
	alignaddr	$ivec, %g0, $ivec

	ldd		[$ivec + 0], %f0	! load ivec
	bz,pt		%icc, 1f
	ldd		[$ivec + 8], %f2
	ldd		[$ivec + 16], %f4
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
1:
___
$::code.=<<___ if ($::evp);
	ld		[$ivec + 0], %f0
	ld		[$ivec + 4], %f1
	ld		[$ivec + 8], %f2
	ld		[$ivec + 12], %f3
___
$::code.=<<___;
	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_enckey
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 127
	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!	$len<128 ||
	brnz,pn		$blk_init, .L${bits}cbc_enc_blk	!	$inp==$out)
	srl		$omask, $ooff, $omask

	alignaddrl	$out, %g0, $out
	srlx		$len, 4, $len
	prefetch	[$out], 22

.L${bits}_cbc_enc_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f12
	movxtod		%o1, %f14

	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_encrypt_1x
	add		$inp, 16, $inp

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_cbc_enc_loop
	add		$out, 16, $out
___
$::code.=<<___ if ($::evp);
	st		%f0, [$ivec + 0]
	st		%f1, [$ivec + 4]
	st		%f2, [$ivec + 8]
	st		%f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, 3f
	nop

	std		%f0, [$ivec + 0]	! write out ivec
	std		%f2, [$ivec + 8]
___
$::code.=<<___;
.L${bits}_cbc_enc_abort:
	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8

	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_cbc_enc_loop+4
	orn		%g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
	st		%f0, [$ivec + 0]
	st		%f1, [$ivec + 4]
	st		%f2, [$ivec + 8]
	st		%f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, 3f
	nop

	std		%f0, [$ivec + 0]	! write out ivec
	std		%f2, [$ivec + 8]
	ret
	restore

.align	16
3:	alignaddrl	$ivec, $ivoff, %g0	! handle unaligned ivec
	mov		0xff, $omask
	srl		$omask, $ivoff, $omask
	faligndata	%f0, %f0, %f4
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$ivec + $omask]0xc0
	std		%f6, [$ivec + 8]
	add		$ivec, 16, $ivec
	orn		%g0, $omask, $omask
	stda		%f8, [$ivec + $omask]0xc0
___
$::code.=<<___;
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}cbc_enc_blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init

.L${bits}_cbc_enc_blk_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 5f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
5:
	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f12
	movxtod		%o1, %f14

	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_encrypt_1x
	add		$inp, 16, $inp
	sub		$len, 1, $len

	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	brnz,pt		$len, .L${bits}_cbc_enc_blk_loop
	add		$out, 8, $out

	membar		#StoreLoad|#StoreStore
	brnz,pt		$blk_init, .L${bits}_cbc_enc_loop
	mov		$blk_init, $len
___
$::code.=<<___ if ($::evp);
	st		%f0, [$ivec + 0]
	st		%f1, [$ivec + 4]
	st		%f2, [$ivec + 8]
	st		%f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, 3b
	nop

	std		%f0, [$ivec + 0]	! write out ivec
	std		%f2, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore
.type	${alg}${bits}_t4_cbc_encrypt,#function
.size	${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt
___
}

# Emit ${alg}${bits}_t4_cbc_decrypt.  Decryption is parallelizable, so
# a 2x-unrolled loop is generated alongside the 1x loop (odd block
# counts enter via .L*_cbc_dec_loop); lengths >= 256 with aligned,
# non-overlapping output use the block-initializing-store path.
sub alg_cbc_decrypt_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl	${alg}${bits}_t4_cbc_decrypt
.align	32
${alg}${bits}_t4_cbc_decrypt:
	save		%sp, -$::frame, %sp
	cmp		$len, 0
	be,pn		$::size_t_cc, .L${bits}_cbc_dec_abort
	srln		$len, 0, $len		! needed on v8+, "nop" on v9
	sub		$inp, $out, $blk_init	! $inp!=$out
___
$::code.=<<___ if (!$::evp);
	andcc		$ivec, 7, $ivoff
	alignaddr	$ivec, %g0, $ivec

	ldd		[$ivec + 0], %f12	! load ivec
	bz,pt		%icc, 1f
	ldd		[$ivec + 8], %f14
	ldd		[$ivec + 16], %f0
	faligndata	%f12, %f14, %f12
	faligndata	%f14, %f0, %f14
1:
___
$::code.=<<___ if ($::evp);
	ld		[$ivec + 0], %f12	! load ivec
	ld		[$ivec + 4], %f13
	ld		[$ivec + 8], %f14
	ld		[$ivec + 12], %f15
___
$::code.=<<___;
	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_deckey
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 255
	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
	brnz,pn		$blk_init, .L${bits}cbc_dec_blk	!	$inp==$out)
	srl		$omask, $ooff, $omask

	andcc		$len, 16, %g0		! is number of blocks even?
	srlx		$len, 4, $len
	alignaddrl	$out, %g0, $out
	bz		%icc, .L${bits}_cbc_dec_loop2x
	prefetch	[$out], 22
.L${bits}_cbc_dec_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	xor		%g4, %o0, %o2		! ^= rk[0]
	xor		%g5, %o1, %o3
	movxtod		%o2, %f0
	movxtod		%o3, %f2

	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_decrypt_1x
	add		$inp, 16, $inp

	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	movxtod		%o0, %f12
	movxtod		%o1, %f14

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_cbc_dec_loop2x
	add		$out, 16, $out
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
.L${bits}_cbc_dec_abort:
	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8

	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_cbc_dec_loop2x+4
	orn		%g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_cbc_dec_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 4f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
4:
	xor		%g4, %o0, %o4		! ^= rk[0]
	xor		%g5, %o1, %o5
	movxtod		%o4, %f0
	movxtod		%o5, %f2
	xor		%g4, %o2, %o4
	xor		%g5, %o3, %o5
	movxtod		%o4, %f4
	movxtod		%o5, %f6

	prefetch	[$out + 63], 22
	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_decrypt_2x
	add		$inp, 32, $inp

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	movxtod		%o2, %f12
	movxtod		%o3, %f14
	fxor		%f8, %f4, %f4
	fxor		%f10, %f6, %f6

	brnz,pn		$ooff, 2f
	sub		$len, 2, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	std		%f4, [$out + 16]
	std		%f6, [$out + 24]
	brnz,pt		$len, .L${bits}_cbc_dec_loop2x
	add		$out, 32, $out
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f8		! handle unaligned output
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
	faligndata	%f6, %f6, %f6
	stda		%f8, [$out + $omask]0xc0	! partial store
	std		%f0, [$out + 8]
	std		%f2, [$out + 16]
	std		%f4, [$out + 24]
	add		$out, 32, $out
	orn		%g0, $omask, $omask
	stda		%f6, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_cbc_dec_loop2x+4
	orn		%g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
	ret
	restore

.align	16
.L${bits}_cbc_dec_unaligned_ivec:
	alignaddrl	$ivec, $ivoff, %g0	! handle unaligned ivec
	mov		0xff, $omask
	srl		$omask, $ivoff, $omask
	faligndata	%f12, %f12, %f0
	faligndata	%f12, %f14, %f2
	faligndata	%f14, %f14, %f4
	stda		%f0, [$ivec + $omask]0xc0
	std		%f2, [$ivec + 8]
	add		$ivec, 16, $ivec
	orn		%g0, $omask, $omask
	stda		%f4, [$ivec + $omask]0xc0
___
$::code.=<<___;
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}cbc_dec_blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init
	sub	$len, 1, $len
	add	$blk_init, 1, $blk_init

.L${bits}_cbc_dec_blk_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 5f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
5:
	xor		%g4, %o0, %o4		! ^= rk[0]
	xor		%g5, %o1, %o5
	movxtod		%o4, %f0
	movxtod		%o5, %f2
	xor		%g4, %o2, %o4
	xor		%g5, %o3, %o5
	movxtod		%o4, %f4
	movxtod		%o5, %f6

	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_decrypt_2x
	add		$inp, 32, $inp
	subcc		$len, 2, $len

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	movxtod		%o2, %f12
	movxtod		%o3, %f14
	fxor		%f8, %f4, %f4
	fxor		%f10, %f6, %f6

	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		$::size_t_cc, .L${bits}_cbc_dec_blk_loop2x
	add		$out, 8, $out

	add		$blk_init, $len, $len
	andcc		$len, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L${bits}_cbc_dec_loop
	srl		$len, 0, $len
	brnz,pn		$len, .L${bits}_cbc_dec_loop2x
	nop
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]	! write out ivec
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
# FIX(review): this tail previously branched to numeric local label "3b",
# but no "3:" exists in the decrypt routine; the nearest preceding "3:"
# belongs to the *encrypt* routine and writes %f0/%f2, while decrypt keeps
# its ivec in %f12/%f14.  Branch to this routine's own unaligned-ivec
# handler, as every other decrypt tail does.
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore
.type	${alg}${bits}_t4_cbc_decrypt,#function
.size	${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt
___
}

# Emit ${alg}${bits}_t4_ctr32_encrypt.  32-bit counter mode; the first
# round of each block is computed inline (aes_eround*/camellia_f) while
# the counter increments, then _encrypt_1x+8/_encrypt_2x+16 finish the
# remaining rounds.  %l7 holds the low 32 counter bits.
sub alg_ctr32_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl	${alg}${bits}_t4_ctr32_encrypt
.align	32
${alg}${bits}_t4_ctr32_encrypt:
	save		%sp, -$::frame, %sp
	srln		$len, 0, $len		! needed on v8+, "nop" on v9

	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_enckey
	sllx		$len, 4, $len

	ld		[$ivec + 0], %l4	! counter
	ld		[$ivec + 4], %l5
	ld		[$ivec + 8], %l6
	ld		[$ivec + 12], %l7

	sllx		%l4, 32, %o5
	or		%l5, %o5, %o5
	sllx		%l6, 32, %g1
	xor		%o5, %g4, %g4		! ^= rk[0]
	xor		%g1, %g5, %g5
	movxtod		%g4, %f14		! most significant 64 bits

	sub		$inp, $out, $blk_init	! $inp!=$out
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 255
	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
	brnz,pn		$blk_init, .L${bits}_ctr32_blk	!	$inp==$out)
	srl		$omask, $ooff, $omask

	andcc		$len, 16, %g0		! is number of blocks even?
	alignaddrl	$out, %g0, $out
	bz		%icc, .L${bits}_ctr32_loop2x
	srlx		$len, 4, $len
.L${bits}_ctr32_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	movxtod		%g1, %f2
	srl		%l7, 0, %l7		! clruw
	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01	%f16, %f14, %f2, %f4
	aes_eround23	%f18, %f14, %f2, %f2
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f	%f16, %f2, %f14, %f2
	camellia_f	%f18, %f14, %f2, %f0
___
$::code.=<<___;
	call		_${alg}${bits}_encrypt_1x+8
	add		$inp, 16, $inp

	movxtod		%o0, %f10
	movxtod		%o1, %f12
	fxor		%f10, %f0, %f0		! ^= inp
	fxor		%f12, %f2, %f2

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_ctr32_loop2x
	add		$out, 16, $out

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_ctr32_loop2x+4
	orn		%g0, $omask, $omask

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_ctr32_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 4f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
4:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	movxtod		%g1, %f2
	srl		%l7, 0, %l7		! clruw
	xor		%g5, %l7, %g1
	add		%l7, 1, %l7
	movxtod		%g1, %f6
	srl		%l7, 0, %l7		! clruw
	prefetch	[$out + 63], 22
	prefetch	[$inp + 32+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01	%f16, %f14, %f2, %f8
	aes_eround23	%f18, %f14, %f2, %f2
	aes_eround01	%f16, %f14, %f6, %f10
	aes_eround23	%f18, %f14, %f6, %f6
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f	%f16, %f2, %f14, %f2
	camellia_f	%f16, %f6, %f14, %f6
	camellia_f	%f18, %f14, %f2, %f0
	camellia_f	%f18, %f14, %f6, %f4
___
$::code.=<<___;
	call		_${alg}${bits}_encrypt_2x+16
	add		$inp, 32, $inp

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	movxtod		%o2, %f12
	fxor		%f8, %f0, %f0		! ^= inp
	movxtod		%o3, %f8
	fxor		%f10, %f2, %f2
	fxor		%f12, %f4, %f4
	fxor		%f8, %f6, %f6

	brnz,pn		$ooff, 2f
	sub		$len, 2, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	std		%f4, [$out + 16]
	std		%f6, [$out + 24]
	brnz,pt		$len, .L${bits}_ctr32_loop2x
	add		$out, 32, $out

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f8		! handle unaligned output
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
	faligndata	%f6, %f6, %f6

	stda		%f8, [$out + $omask]0xc0	! partial store
	std		%f0, [$out + 8]
	std		%f2, [$out + 16]
	std		%f4, [$out + 24]
	add		$out, 32, $out
	orn		%g0, $omask, $omask
	stda		%f6, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_ctr32_loop2x+4
	orn		%g0, $omask, $omask

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_ctr32_blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init
	sub	$len, 1, $len
	add	$blk_init, 1, $blk_init

.L${bits}_ctr32_blk_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 5f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
5:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	movxtod		%g1, %f2
	srl		%l7, 0, %l7		! clruw
	xor		%g5, %l7, %g1
	add		%l7, 1, %l7
	movxtod		%g1, %f6
	srl		%l7, 0, %l7		! clruw
	prefetch	[$inp + 32+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01	%f16, %f14, %f2, %f8
	aes_eround23	%f18, %f14, %f2, %f2
	aes_eround01	%f16, %f14, %f6, %f10
	aes_eround23	%f18, %f14, %f6, %f6
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f	%f16, %f2, %f14, %f2
	camellia_f	%f16, %f6, %f14, %f6
	camellia_f	%f18, %f14, %f2, %f0
	camellia_f	%f18, %f14, %f6, %f4
___
$::code.=<<___;
	call		_${alg}${bits}_encrypt_2x+16
	add		$inp, 32, $inp
	subcc		$len, 2, $len

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	movxtod		%o2, %f12
	fxor		%f8, %f0, %f0		! ^= inp
	movxtod		%o3, %f8
	fxor		%f10, %f2, %f2
	fxor		%f12, %f4, %f4
	fxor		%f8, %f6, %f6

	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		$::size_t_cc, .L${bits}_ctr32_blk_loop2x
	add		$out, 8, $out

	add		$blk_init, $len, $len
	andcc		$len, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L${bits}_ctr32_loop
	srl		$len, 0, $len
	brnz,pn		$len, .L${bits}_ctr32_loop2x
	nop

	ret
	restore
.type	${alg}${bits}_t4_ctr32_encrypt,#function
.size	${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt
___
}

# Emit ${alg}${bits}_t4_xts_${dir}crypt (dir is "en" or "de").  The
# tweak lives in %g3:%g2, is byte-swapped via bmask/bshuffle, and is
# advanced by the usual GF(2^128) doubling (srax/addcc/addxc/xor 0x87).
# Ciphertext stealing for trailing partial blocks is handled at
# .L*_xts_*steal.  Note the redefined %i-register map: two keys, and
# $rem aliasing $ivec (ivec is consumed before $rem is needed).
sub alg_xts_implement {
my ($alg,$bits,$dir) = @_;
my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5));
my $rem=$ivec;

$::code.=<<___;
.globl	${alg}${bits}_t4_xts_${dir}crypt
.align	32
${alg}${bits}_t4_xts_${dir}crypt:
	save		%sp, -$::frame-16, %sp
	srln		$len, 0, $len		! needed on v8+, "nop" on v9

	mov		$ivec, %o0
	add		%fp, $::bias-16, %o1
	call		${alg}_t4_encrypt
	mov		$key2, %o2

	add		%fp, $::bias-16, %l7
	ldxa		[%l7]0x88, %g2
	add		%fp, $::bias-8, %l7
	ldxa		[%l7]0x88, %g3		! %g3:%g2 is tweak

	sethi		%hi(0x76543210), %l7
	or		%l7, %lo(0x76543210), %l7
	bmask		%l7, %g0, %g0		! byte swap mask

	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_${dir}ckey
	and		$len, 15, $rem
	and		$len, -16, $len
___
$::code.=<<___ if ($dir eq "de");
	mov		0, %l7
	movrnz		$rem, 16, %l7
	sub		$len, %l7, $len
___
$::code.=<<___;

	sub		$inp, $out, $blk_init	! $inp!=$out
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 255
	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
	brnz,pn		$blk_init, .L${bits}_xts_${dir}blk !	$inp==$out)
	srl		$omask, $ooff, $omask

	andcc		$len, 16, %g0		! is number of blocks even?
___
$::code.=<<___ if ($dir eq "de");
	brz,pn		$len, .L${bits}_xts_${dir}steal
___
$::code.=<<___;
	alignaddrl	$out, %g0, $out
	bz		%icc, .L${bits}_xts_${dir}loop2x
	srlx		$len, 4, $len
.L${bits}_xts_${dir}loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	movxtod		%g2, %f12
	movxtod		%g3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f0
	movxtod		%o1, %f2

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_${dir}crypt_1x
	add		$inp, 16, $inp

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_xts_${dir}loop2x
	add		$out, 16, $out

	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_xts_${dir}loop2x+4
	orn		%g0, $omask, $omask

	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_xts_${dir}loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 4f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
4:
	movxtod		%g2, %f12
	movxtod		%g3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	movxtod		%g2, %f8
	movxtod		%g3, %f10
	bshuffle	%f8, %f8, %f8
	bshuffle	%f10, %f10, %f10

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	xor		%g4, %o2, %o2		! ^= rk[0]
	xor		%g5, %o3, %o3
	movxtod		%o0, %f0
	movxtod		%o1, %f2
	movxtod		%o2, %f4
	movxtod		%o3, %f6

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8, %f4, %f4		! ^= tweak[0]
	fxor		%f10, %f6, %f6

	prefetch	[$out + 63], 22
	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_${dir}crypt_2x
	add		$inp, 32, $inp

	movxtod		%g2, %f8
	movxtod		%g3, %f10

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	bshuffle	%f8, %f8, %f8
	bshuffle	%f10, %f10, %f10

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8, %f4, %f4
	fxor		%f10, %f6, %f6

	brnz,pn		$ooff, 2f
	sub		$len, 2, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	std		%f4, [$out + 16]
	std		%f6, [$out + 24]
	brnz,pt		$len, .L${bits}_xts_${dir}loop2x
	add		$out, 32, $out

	fsrc2		%f4, %f0
	fsrc2		%f6, %f2
	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f8		! handle unaligned output
	faligndata	%f0, %f2, %f10
	faligndata	%f2, %f4, %f12
	faligndata	%f4, %f6, %f14
	faligndata	%f6, %f6, %f0

	stda		%f8, [$out + $omask]0xc0	! partial store
	std		%f10, [$out + 8]
	std		%f12, [$out + 16]
	std		%f14, [$out + 24]
	add		$out, 32, $out
	orn		%g0, $omask, $omask
	stda		%f0, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_xts_${dir}loop2x+4
	orn		%g0, $omask, $omask

	fsrc2		%f4, %f0
	fsrc2		%f6, %f2
	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_xts_${dir}blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init
	sub	$len, 1, $len
	add	$blk_init, 1, $blk_init

.L${bits}_xts_${dir}blk2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 5f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
5:
	movxtod		%g2, %f12
	movxtod		%g3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	movxtod		%g2, %f8
	movxtod		%g3, %f10
	bshuffle	%f8, %f8, %f8
	bshuffle	%f10, %f10, %f10

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	xor		%g4, %o2, %o2		! ^= rk[0]
	xor		%g5, %o3, %o3
	movxtod		%o0, %f0
	movxtod		%o1, %f2
	movxtod		%o2, %f4
	movxtod		%o3, %f6

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8, %f4, %f4		! ^= tweak[0]
	fxor		%f10, %f6, %f6

	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_${dir}crypt_2x
	add		$inp, 32, $inp

	movxtod		%g2, %f8
	movxtod		%g3, %f10

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	bshuffle	%f8, %f8, %f8
	bshuffle	%f10, %f10, %f10

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8, %f4, %f4
	fxor		%f10, %f6, %f6

	subcc		$len, 2, $len
	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		$::size_t_cc, .L${bits}_xts_${dir}blk2x
	add		$out, 8, $out

	add		$blk_init, $len, $len
	andcc		$len, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L${bits}_xts_${dir}loop
	srl		$len, 0, $len
	brnz,pn		$len, .L${bits}_xts_${dir}loop2x
	nop

	fsrc2		%f4, %f0
	fsrc2		%f6, %f2
	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
___
$::code.=<<___ if ($dir eq "en");
.align	32
.L${bits}_xts_${dir}steal:
	std		%f0, [%fp + $::bias-16]	! copy of output
	std		%f2, [%fp + $::bias-8]

	srl		$ileft, 3, $ileft
	add		%fp, $::bias-16, %l7
	add		$inp, $ileft, $inp	! original $inp+$len&-15
	add		$out, $ooff, $out	! original $out+$len&-15
	mov		0, $ileft
	nop					! align

.L${bits}_xts_${dir}stealing:
	ldub		[$inp + $ileft], %o0
	ldub		[%l7  + $ileft], %o1
	dec		$rem
	stb		%o0, [%l7  + $ileft]
	stb		%o1, [$out + $ileft]
	brnz		$rem, .L${bits}_xts_${dir}stealing
	inc		$ileft

	mov		%l7, $inp
	sub		$out, 16, $out
	mov		0, $ileft
	sub		$out, $ooff, $out
	ba		.L${bits}_xts_${dir}loop	! one more time
	mov		1, $len				! $rem is 0
___
$::code.=<<___ if ($dir eq "de");
.align	32
.L${bits}_xts_${dir}steal:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 8f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
8:
	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %o2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %o3
	xor		%l7, %o2, %o2

	movxtod		%o2, %f12
	movxtod		%o3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f0
	movxtod		%o1, %f2

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	call		_${alg}${bits}_${dir}crypt_1x
	add		$inp, 16, $inp

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	std		%f0, [%fp + $::bias-16]
	std		%f2, [%fp + $::bias-8]

	srl		$ileft, 3, $ileft
	add		%fp, $::bias-16, %l7
	add		$inp, $ileft, $inp	! original $inp+$len&-15
	add		$out, $ooff, $out	! original $out+$len&-15
	mov		0, $ileft
	add		$out, 16, $out
	nop					! align

.L${bits}_xts_${dir}stealing:
	ldub		[$inp + $ileft], %o0
	ldub		[%l7  + $ileft], %o1
	dec		$rem
	stb		%o0, [%l7  + $ileft]
	stb		%o1, [$out + $ileft]
	brnz		$rem, .L${bits}_xts_${dir}stealing
	inc		$ileft

	mov		%l7, $inp
	sub		$out, 16, $out
	mov		0, $ileft
	sub		$out, $ooff, $out
	ba		.L${bits}_xts_${dir}loop	! one more time
	mov		1, $len				! $rem is 0
___
$::code.=<<___;
	ret
	restore
.type	${alg}${bits}_t4_xts_${dir}crypt,#function
.size	${alg}${bits}_t4_xts_${dir}crypt,.-${alg}${bits}_t4_xts_${dir}crypt
___
}

# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.
# Explicitly encode a 3-operand VIS1/VIS2 FP instruction as a ".word"
# directive, so the module assembles even when the assembler was not told
# about VIS extensions (e.g. -xarch=v9 instead of -xarch=v9a).  Returns
# the instruction text unchanged when it cannot be encoded.
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf = ( "faligndata" => 0x048,
               "bshuffle"   => 0x04c,
               "fnot2"      => 0x066,
               "fxor"       => 0x06c,
               "fsrc2"      => 0x078 );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    # defined() for consistency with the other encoders (no opf here is 0,
    # so behaviour is unchanged)
    if (defined($opf=$visopf{$mnemonic})) {
        foreach ($rs1,$rs2,$rd) {               # aliases: assignments rewrite the operands
            return $ref if (!/%f([0-9]{1,2})/);
            $_=$1;
            if ($1>=32) {
                return $ref if ($1&1);          # odd upper-bank registers don't exist
                # re-encode for upper double register addressing
                $_=($1|$1>>5)&31;
            }
        }

        return sprintf ".word\t0x%08x !%s",
                       0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
                       $ref;
    } else {
        return $ref;
    }
}

# Explicitly encode a 3-operand VIS3 integer instruction (addxc, umulxhi,
# alignaddr, ...) as a ".word" directive.  Integer registers are mapped to
# their 5-bit numbers via the %bias table (%g0-7 -> 0, %o0-7 -> 8, ...).
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = ( "addxc"      => 0x011,
               "addxccc"    => 0x013,
               "umulxhi"    => 0x016,
               "alignaddr"  => 0x018,
               "bmask"      => 0x019,
               "alignaddrl" => 0x01a );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if (defined($opf=$visopf{$mnemonic})) {
        foreach ($rs1,$rs2,$rd) {
            return $ref if (!/%([goli])([0-9])/);
            $_=$bias{$1}+$2;
        }

        return sprintf ".word\t0x%08x !%s",
                       0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
                       $ref;
    } else {
        return $ref;
    }
}

# Explicitly encode the 4-operand T4 AES round instructions.  rs3 may be
# either a double FP register or a small immediate; FP operands >= %f32
# are re-encoded for upper-bank double addressing.
sub unaes_round { # 4-argument instructions
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref,$opf);
my %aesopf = ( "aes_eround01"   => 0,
               "aes_eround23"   => 1,
               "aes_dround01"   => 2,
               "aes_dround23"   => 3,
               "aes_eround01_l" => 4,
               "aes_eround23_l" => 5,
               "aes_dround01_l" => 6,
               "aes_dround23_l" => 7,
               "aes_kexpand1"   => 8 );

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    # defined() is required here: aes_eround01's opf is legitimately 0
    if (defined($opf=$aesopf{$mnemonic})) {
        # rs3 is either an even FP register or an immediate left as-is
        $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
        foreach ($rs1,$rs2,$rd) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_=$1;
            if ($1>=32) {
                return $ref if ($1&1);
                # re-encode for upper double register addressing
                $_=($1|$1>>5)&31;
            }
        }

        return sprintf ".word\t0x%08x !%s",
                       2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
                       $ref;
    } else {
        return $ref;
    }
}

# Explicitly encode the 3-operand T4 AES key-expansion instructions.
sub unaes_kexpand { # 3-argument instructions
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %aesopf = ( "aes_kexpand0" => 0x130,
               "aes_kexpand2" => 0x131 );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
        foreach ($rs1,$rs2,$rd) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_=$1;
            if ($1>=32) {
                return $ref if ($1&1);
                # re-encode for upper double register addressing
                $_=($1|$1>>5)&31;
            }
        }

        return sprintf ".word\t0x%08x !%s",
                       2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
                       $ref;
    } else {
        return $ref;
    }
}

# Explicitly encode camellia_f, the only 4-operand Camellia instruction;
# its opf is the constant 0xc, so no opcode table is needed (the original
# dead "if (1) ... else" scaffolding has been removed).
sub uncamellia_f { # 4-argument instructions
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref);

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    # rs3 is either an even FP register or an immediate left as-is
    $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
    foreach ($rs1,$rs2,$rd) {
        return $ref if (!/%f([0-9]{1,2})/);
        $_=$1;
        if ($1>=32) {
            return $ref if ($1&1);
            # re-encode for upper double register addressing
            $_=($1|$1>>5)&31;
        }
    }

    return sprintf ".word\t0x%08x !%s",
                   2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|0xc<<5|$rs2,
                   $ref;
}

# Explicitly encode the 3-operand Camellia FL/FLI instructions.
sub uncamellia3 { # 3-argument instructions
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %cmllopf = ( "camellia_fl"  => 0x13c,
                "camellia_fli" => 0x13d );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if (defined($opf=$cmllopf{$mnemonic})) {
        foreach ($rs1,$rs2,$rd) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_=$1;
            if ($1>=32) {
                return $ref if ($1&1);
                # re-encode for upper double register addressing
                $_=($1|$1>>5)&31;
            }
        }

        return sprintf ".word\t0x%08x !%s",
                       2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
                       $ref;
    } else {
        return $ref;
    }
}

# Explicitly encode the 2-operand FP<->integer move instructions
# (movxtod, movdtox, ...).  Either operand may be FP or integer; the
# %bias table maps integer register banks, with "f" biased by 0.
sub unmovxtox { # 2-argument instructions
my ($mnemonic,$rs,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 );
my ($ref,$opf);
my %movxopf = ( "movdtox"  => 0x110,
                "movstouw" => 0x111,
                "movstosw" => 0x113,
                "movxtod"  => 0x118,
                "movwtos"  => 0x119 );

    $ref = "$mnemonic\t$rs,$rd";

    if (defined($opf=$movxopf{$mnemonic})) {
        foreach ($rs,$rd) {
            return $ref if (!/%([fgoli])([0-9]{1,2})/);
            $_=$bias{$1}+$2;
            if ($2>=32) {
                return $ref if ($2&1);
                # re-encode for upper double register addressing
                $_=($2|$2>>5)&31;
            }
        }

        return sprintf ".word\t0x%08x !%s",
                       2<<30|$rd<<25|0x36<<19|$opf<<5|$rs,
                       $ref;
    } else {
        return $ref;
    }
}

# Explicitly encode the T4 DES instructions.  des_round takes 4 operands,
# des_kexpand 3 (middle operand is an immediate), des_ip/des_iip 2.
sub undes {
my ($mnemonic)=shift;
my @args=@_;
my ($ref,$opf);
my %desopf = ( "des_round"   => 0b1001,
               "des_ip"      => 0b100110100,
               "des_iip"     => 0b100110101,
               "des_kexpand" => 0b100110110 );

    $ref = "$mnemonic\t".join(",",@_);

    if (defined($opf=$desopf{$mnemonic})) { # 4-arg
        if ($mnemonic eq "des_round") {
            foreach (@args[0..3]) {
                return $ref if (!/%f([0-9]{1,2})/);
                $_=$1;
                if ($1>=32) {
                    return $ref if ($1&1);
                    # re-encode for upper double register addressing
                    $_=($1|$1>>5)&31;
                }
            }
            return sprintf ".word\t0x%08x !%s",
                           2<<30|0b011001<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<9|$args[3]<<25,
                           $ref;
        } elsif ($mnemonic eq "des_kexpand") { # 3-arg
            foreach (@args[0..2]) {
                return $ref if (!/(%f)?([0-9]{1,2})/);
                $_=$2;
                if ($2>=32) {
                    return $ref if ($2&1);
                    # re-encode for upper double register addressing
                    $_=($2|$2>>5)&31;
                }
            }
            return sprintf ".word\t0x%08x !%s",
                           2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<25,
                           $ref;
        } else { # 2-arg
            foreach (@args[0..1]) {
                return $ref if (!/%f([0-9]{1,2})/);
                $_=$1;
                if ($1>=32) {
                    # BUGFIX: was ($2&1), but this regex has only one capture
                    # group, so $2 was undef and odd upper-bank registers
                    # (e.g. %f33) slipped through and were mis-encoded
                    # instead of being passed to the assembler verbatim.
                    return $ref if ($1&1);
                    # re-encode for upper double register addressing
                    $_=($1|$1>>5)&31;
                }
            }
            return sprintf ".word\t0x%08x !%s",
                           2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]<<25,
                           $ref;
        }
    } else {
        return $ref;
    }
}

# Post-process the generated assembly in $::code and print it: expand
# `...` snippets, normalize 2-operand FP ops to 3-operand form, and
# rewrite every crypto/VIS mnemonic through the encoders above so the
# output assembles on VIS-oblivious assemblers.
sub emit_assembler {
    foreach (split("\n",$::code)) {
        s/\`([^\`]*)\`/eval $1/ge;

        # fsrc2 %fX,%fY and friends -> canonical 3-operand form
        s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/go;

        # each substitution fires at most once per line, hence the "or" chain
        s/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
                &unaes_round($1,$2,$3,$4,$5)
         /geo or
        s/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
                &unaes_kexpand($1,$2,$3,$4)
         /geo or
        s/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
                &uncamellia_f($1,$2,$3,$4,$5)
         /geo or
        s/\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
                &uncamellia3($1,$2,$3,$4)
         /geo or
        s/\b(des_\w+)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+)(?:,\s*(%f[0-9]{1,2})(?:,\s*(%f[0-9]{1,2}))?)?/
                &undes($1,$2,$3,$4,$5)
         /geo or
        s/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/
                &unmovxtox($1,$2,$3)
         /geo or
        s/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/
                &unmovxtox($1,$2,$3)
         /geo or
        s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
                &unvis($1,$2,$3,$4)
         /geo or
        s/\b(umulxhi|bmask|addxc[c]{0,2}|alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
                &unvis3($1,$2,$3,$4)
         /geo;

        print $_,"\n";
    }
}

1;