#!/usr/bin/env perl
# dest4-sparcv9.pl revision 306195

# ====================================================================
# Written by David S. Miller <davem@devemloft.net> and Andy Polyakov
# <appro@openssl.org>. The module is licensed under 2-clause BSD
# license. March 2013. All rights reserved.
# ====================================================================

######################################################################
# DES for SPARC T4.
#
# As with other hardware-assisted ciphers CBC encrypt results [for
# aligned data] are virtually identical to critical path lengths:
#
#		DES		Triple-DES
# CBC encrypt	4.14/4.15(*)	11.7/11.7
# CBC decrypt	1.77/4.11(**)	6.42/7.47
#
#			 (*)	numbers after slash are for
#				misaligned data;
#			 (**)	this is result for largest
#				block size, unlike all other
#				cases smaller blocks results
#				are better[?];

# Locate the directory this script lives in so that the shared helper
# module can be found both next to the script and in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";

# asm_init/emit_assembler and the $::abibits/$::size_t_cc globals come
# from sparcv9_modes.pl; generated assembly is accumulated in $code.
&asm_init(@ARGV);

# Scratch-register declarations are emitted for the 64-bit ABI only.
$code.=<<___ if ($::abibits==64);
.register	%g2,#scratch
.register	%g3,#scratch
___

$code.=<<___;
.text
___

# des_t4_key_expand: %o0 = input key, %o1 = output key schedule.
# Loads one doubleword of key material (a second doubleword plus
# faligndata is used when the input is not 8-byte aligned), runs the
# des_kexpand chain and stores 16 doublewords at out+0x00..out+0x78.
{ my ($inp,$out)=("%o0","%o1");

$code.=<<___;
.align	32
.globl	des_t4_key_expand
.type	des_t4_key_expand,#function
des_t4_key_expand:
	andcc		$inp, 0x7, %g0
	alignaddr	$inp, %g0, $inp
	bz,pt		%icc, 1f
	ldd		[$inp + 0x00], %f0
	ldd		[$inp + 0x08], %f2
	faligndata	%f0, %f2, %f0
1:	des_kexpand	%f0, 0, %f0
	des_kexpand	%f0, 1, %f2
	std		%f0, [$out + 0x00]
	des_kexpand	%f2, 3, %f6
	std		%f2, [$out + 0x08]
	des_kexpand	%f2, 2, %f4
	des_kexpand	%f6, 3, %f10
	std		%f6, [$out + 0x18]
	des_kexpand	%f6, 2, %f8
	std		%f4, [$out + 0x10]
	des_kexpand	%f10, 3, %f14
	std		%f10, [$out + 0x28]
	des_kexpand	%f10, 2, %f12
	std		%f8, [$out + 0x20]
	des_kexpand	%f14, 1, %f16
	std		%f14, [$out + 0x38]
	des_kexpand	%f16, 3, %f20
	std		%f12, [$out + 0x30]
	des_kexpand	%f16, 2, %f18
	std		%f16, [$out + 0x40]
	des_kexpand	%f20, 3, %f24
	std		%f20, [$out + 0x50]
	des_kexpand	%f20, 2, %f22
	std		%f18, [$out + 0x48]
	des_kexpand	%f24, 3, %f28
	std		%f24, [$out + 0x60]
	des_kexpand	%f24, 2, %f26
	std		%f22, [$out + 0x58]
	des_kexpand	%f28, 1, %f30
	std		%f28, [$out + 0x70]
	std		%f26, [$out + 0x68]
	retl
	std		%f30, [$out + 0x78]
.size	des_t4_key_expand,.-des_t4_key_expand
___
}

# CBC routines. Arguments arrive in %o0..%o4 as (inp, out, len, key,
# ivec). All four routines share the same frame:
#
# - len == 0 returns immediately through .Lcbc_abort; otherwise len is
#   shifted right by 3, i.e. it is processed as a count of 8-byte blocks;
# - ivec is loaded on entry and the updated value is stored back on exit;
# - a misaligned input is read as two aligned doublewords merged with
#   the $ileft/$iright shift counts;
# - a misaligned output ($omask != 0) takes the "2:" path, which writes
#   each block with two partial stores. That path has already fetched
#   the next input doubleword into %g4, so it re-enters the loop at
#   "loop+4", skipping the loop's leading ldx instruction.
{ my ($inp,$out,$len,$key,$ivec) = map("%o$_",(0..4));
  my ($ileft,$iright,$omask) = map("%g$_",(1..3));

# Single DES. Encrypt loads the key schedule in ascending order
# (key+0x00..0x78), decrypt loads the same 16 doublewords in
# descending order (key+0x78..0x00).
$code.=<<___;
.globl	des_t4_cbc_encrypt
.align	32
des_t4_cbc_encrypt:
	cmp		$len, 0
	be,pn		$::size_t_cc, .Lcbc_abort
	srln		$len, 0, $len		! needed on v8+, "nop" on v9
	ld		[$ivec + 0], %f0	! load ivec
	ld		[$ivec + 4], %f1

	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		0xff, $omask
	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	sub		%g0, $ileft, $iright
	and		$out, 7, %g4
	alignaddrl	$out, %g0, $out
	srl		$omask, %g4, $omask
	srlx		$len, 3, $len
	movrz		%g4, 0, $omask
	prefetch	[$out], 22

	ldd		[$key + 0x00], %f4	! load key schedule
	ldd		[$key + 0x08], %f6
	ldd		[$key + 0x10], %f8
	ldd		[$key + 0x18], %f10
	ldd		[$key + 0x20], %f12
	ldd		[$key + 0x28], %f14
	ldd		[$key + 0x30], %f16
	ldd		[$key + 0x38], %f18
	ldd		[$key + 0x40], %f20
	ldd		[$key + 0x48], %f22
	ldd		[$key + 0x50], %f24
	ldd		[$key + 0x58], %f26
	ldd		[$key + 0x60], %f28
	ldd		[$key + 0x68], %f30
	ldd		[$key + 0x70], %f32
	ldd		[$key + 0x78], %f34

.Ldes_cbc_enc_loop:
	ldx		[$inp + 0], %g4
	brz,pt		$ileft, 4f
	nop

	ldx		[$inp + 8], %g5
	sllx		%g4, $ileft, %g4
	srlx		%g5, $iright, %g5
	or		%g5, %g4, %g4
4:
	movxtod		%g4, %f2
	prefetch	[$inp + 8+63], 20
	add		$inp, 8, $inp
	fxor		%f2, %f0, %f0		! ^= ivec
	prefetch	[$out + 63], 22

	des_ip		%f0, %f0
	des_round	%f4, %f6, %f0, %f0
	des_round	%f8, %f10, %f0, %f0
	des_round	%f12, %f14, %f0, %f0
	des_round	%f16, %f18, %f0, %f0
	des_round	%f20, %f22, %f0, %f0
	des_round	%f24, %f26, %f0, %f0
	des_round	%f28, %f30, %f0, %f0
	des_round	%f32, %f34, %f0, %f0
	des_iip		%f0, %f0

	brnz,pn		$omask, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	brnz,pt		$len, .Ldes_cbc_enc_loop
	add		$out, 8, $out

	st		%f0, [$ivec + 0]	! write out ivec
	retl
	st		%f1, [$ivec + 4]
.Lcbc_abort:
	retl
	nop

.align	16
2:	ldxa		[$inp]0x82, %g4		! avoid read-after-write hazard
						! and ~4x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f2		! handle unaligned output

	stda		%f2, [$out + $omask]0xc0	! partial store
	add		$out, 8, $out
	orn		%g0, $omask, $omask
	stda		%f2, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .Ldes_cbc_enc_loop+4
	orn		%g0, $omask, $omask

	st		%f0, [$ivec + 0]	! write out ivec
	retl
	st		%f1, [$ivec + 4]
.type	des_t4_cbc_encrypt,#function
.size	des_t4_cbc_encrypt,.-des_t4_cbc_encrypt

.globl	des_t4_cbc_decrypt
.align	32
des_t4_cbc_decrypt:
	cmp		$len, 0
	be,pn		$::size_t_cc, .Lcbc_abort
	srln		$len, 0, $len		! needed on v8+, "nop" on v9
	ld		[$ivec + 0], %f2	! load ivec
	ld		[$ivec + 4], %f3

	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		0xff, $omask
	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	sub		%g0, $ileft, $iright
	and		$out, 7, %g4
	alignaddrl	$out, %g0, $out
	srl		$omask, %g4, $omask
	srlx		$len, 3, $len
	movrz		%g4, 0, $omask
	prefetch	[$out], 22

	ldd		[$key + 0x78], %f4	! load key schedule
	ldd		[$key + 0x70], %f6
	ldd		[$key + 0x68], %f8
	ldd		[$key + 0x60], %f10
	ldd		[$key + 0x58], %f12
	ldd		[$key + 0x50], %f14
	ldd		[$key + 0x48], %f16
	ldd		[$key + 0x40], %f18
	ldd		[$key + 0x38], %f20
	ldd		[$key + 0x30], %f22
	ldd		[$key + 0x28], %f24
	ldd		[$key + 0x20], %f26
	ldd		[$key + 0x18], %f28
	ldd		[$key + 0x10], %f30
	ldd		[$key + 0x08], %f32
	ldd		[$key + 0x00], %f34

.Ldes_cbc_dec_loop:
	ldx		[$inp + 0], %g4
	brz,pt		$ileft, 4f
	nop

	ldx		[$inp + 8], %g5
	sllx		%g4, $ileft, %g4
	srlx		%g5, $iright, %g5
	or		%g5, %g4, %g4
4:
	movxtod		%g4, %f0
	prefetch	[$inp + 8+63], 20
	add		$inp, 8, $inp
	prefetch	[$out + 63], 22

	des_ip		%f0, %f0
	des_round	%f4, %f6, %f0, %f0
	des_round	%f8, %f10, %f0, %f0
	des_round	%f12, %f14, %f0, %f0
	des_round	%f16, %f18, %f0, %f0
	des_round	%f20, %f22, %f0, %f0
	des_round	%f24, %f26, %f0, %f0
	des_round	%f28, %f30, %f0, %f0
	des_round	%f32, %f34, %f0, %f0
	des_iip		%f0, %f0

	fxor		%f2, %f0, %f0		! ^= ivec
	movxtod		%g4, %f2

	brnz,pn		$omask, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	brnz,pt		$len, .Ldes_cbc_dec_loop
	add		$out, 8, $out

	st		%f2, [$ivec + 0]	! write out ivec
	retl
	st		%f3, [$ivec + 4]

.align	16
2:	ldxa		[$inp]0x82, %g4		! avoid read-after-write hazard
						! and ~4x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f0		! handle unaligned output

	stda		%f0, [$out + $omask]0xc0	! partial store
	add		$out, 8, $out
	orn		%g0, $omask, $omask
	stda		%f0, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .Ldes_cbc_dec_loop+4
	orn		%g0, $omask, $omask

	st		%f2, [$ivec + 0]	! write out ivec
	retl
	st		%f3, [$ivec + 4]
.type	des_t4_cbc_decrypt,#function
.size	des_t4_cbc_decrypt,.-des_t4_cbc_decrypt
___

# One might wonder why does one have back-to-back des_iip/des_ip
# pairs between EDE passes. Indeed, aren't they inverse of each other?
# They almost are. Outcome of the pair is 32-bit words being swapped
# in target register. Consider pair of des_iip/des_ip as a way to
# perform the due swap, it's actually fastest way in this case.

# Triple DES (EDE3). The key area is read as three 0x80-byte schedules
# at key+0x000, key+0x080 and key+0x100:
#
# - encrypt: schedule 0 ascending, schedule 1 descending
#   (key+0x100-0x08 .. key+0x100-0x80), schedule 2 ascending;
# - decrypt: schedule 2 descending, schedule 1 ascending,
#   schedule 0 descending (key+0x80-0x08 .. key+0x80-0x80).
#
# Only the first pass's schedule stays resident in %f4..%f34. The other
# two passes are reloaded into %f36..%f62 on every block, with the ldd
# instructions interleaved between des_round instructions.
$code.=<<___;
.globl	des_t4_ede3_cbc_encrypt
.align	32
des_t4_ede3_cbc_encrypt:
	cmp		$len, 0
	be,pn		$::size_t_cc, .Lcbc_abort
	srln		$len, 0, $len		! needed on v8+, "nop" on v9
	ld		[$ivec + 0], %f0	! load ivec
	ld		[$ivec + 4], %f1

	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		0xff, $omask
	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	sub		%g0, $ileft, $iright
	and		$out, 7, %g4
	alignaddrl	$out, %g0, $out
	srl		$omask, %g4, $omask
	srlx		$len, 3, $len
	movrz		%g4, 0, $omask
	prefetch	[$out], 22

	ldd		[$key + 0x00], %f4	! load key schedule
	ldd		[$key + 0x08], %f6
	ldd		[$key + 0x10], %f8
	ldd		[$key + 0x18], %f10
	ldd		[$key + 0x20], %f12
	ldd		[$key + 0x28], %f14
	ldd		[$key + 0x30], %f16
	ldd		[$key + 0x38], %f18
	ldd		[$key + 0x40], %f20
	ldd		[$key + 0x48], %f22
	ldd		[$key + 0x50], %f24
	ldd		[$key + 0x58], %f26
	ldd		[$key + 0x60], %f28
	ldd		[$key + 0x68], %f30
	ldd		[$key + 0x70], %f32
	ldd		[$key + 0x78], %f34

.Ldes_ede3_cbc_enc_loop:
	ldx		[$inp + 0], %g4
	brz,pt		$ileft, 4f
	nop

	ldx		[$inp + 8], %g5
	sllx		%g4, $ileft, %g4
	srlx		%g5, $iright, %g5
	or		%g5, %g4, %g4
4:
	movxtod		%g4, %f2
	prefetch	[$inp + 8+63], 20
	add		$inp, 8, $inp
	fxor		%f2, %f0, %f0		! ^= ivec
	prefetch	[$out + 63], 22

	des_ip		%f0, %f0
	des_round	%f4, %f6, %f0, %f0
	des_round	%f8, %f10, %f0, %f0
	des_round	%f12, %f14, %f0, %f0
	des_round	%f16, %f18, %f0, %f0
	ldd		[$key + 0x100-0x08], %f36
	ldd		[$key + 0x100-0x10], %f38
	des_round	%f20, %f22, %f0, %f0
	ldd		[$key + 0x100-0x18], %f40
	ldd		[$key + 0x100-0x20], %f42
	des_round	%f24, %f26, %f0, %f0
	ldd		[$key + 0x100-0x28], %f44
	ldd		[$key + 0x100-0x30], %f46
	des_round	%f28, %f30, %f0, %f0
	ldd		[$key + 0x100-0x38], %f48
	ldd		[$key + 0x100-0x40], %f50
	des_round	%f32, %f34, %f0, %f0
	ldd		[$key + 0x100-0x48], %f52
	ldd		[$key + 0x100-0x50], %f54
	des_iip		%f0, %f0

	ldd		[$key + 0x100-0x58], %f56
	ldd		[$key + 0x100-0x60], %f58
	des_ip		%f0, %f0
	ldd		[$key + 0x100-0x68], %f60
	ldd		[$key + 0x100-0x70], %f62
	des_round	%f36, %f38, %f0, %f0
	ldd		[$key + 0x100-0x78], %f36
	ldd		[$key + 0x100-0x80], %f38
	des_round	%f40, %f42, %f0, %f0
	des_round	%f44, %f46, %f0, %f0
	des_round	%f48, %f50, %f0, %f0
	ldd		[$key + 0x100+0x00], %f40
	ldd		[$key + 0x100+0x08], %f42
	des_round	%f52, %f54, %f0, %f0
	ldd		[$key + 0x100+0x10], %f44
	ldd		[$key + 0x100+0x18], %f46
	des_round	%f56, %f58, %f0, %f0
	ldd		[$key + 0x100+0x20], %f48
	ldd		[$key + 0x100+0x28], %f50
	des_round	%f60, %f62, %f0, %f0
	ldd		[$key + 0x100+0x30], %f52
	ldd		[$key + 0x100+0x38], %f54
	des_round	%f36, %f38, %f0, %f0
	ldd		[$key + 0x100+0x40], %f56
	ldd		[$key + 0x100+0x48], %f58
	des_iip		%f0, %f0

	ldd		[$key + 0x100+0x50], %f60
	ldd		[$key + 0x100+0x58], %f62
	des_ip		%f0, %f0
	ldd		[$key + 0x100+0x60], %f36
	ldd		[$key + 0x100+0x68], %f38
	des_round	%f40, %f42, %f0, %f0
	ldd		[$key + 0x100+0x70], %f40
	ldd		[$key + 0x100+0x78], %f42
	des_round	%f44, %f46, %f0, %f0
	des_round	%f48, %f50, %f0, %f0
	des_round	%f52, %f54, %f0, %f0
	des_round	%f56, %f58, %f0, %f0
	des_round	%f60, %f62, %f0, %f0
	des_round	%f36, %f38, %f0, %f0
	des_round	%f40, %f42, %f0, %f0
	des_iip		%f0, %f0

	brnz,pn		$omask, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	brnz,pt		$len, .Ldes_ede3_cbc_enc_loop
	add		$out, 8, $out

	st		%f0, [$ivec + 0]	! write out ivec
	retl
	st		%f1, [$ivec + 4]

.align	16
2:	ldxa		[$inp]0x82, %g4		! avoid read-after-write hazard
						! and ~2x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f2		! handle unaligned output

	stda		%f2, [$out + $omask]0xc0	! partial store
	add		$out, 8, $out
	orn		%g0, $omask, $omask
	stda		%f2, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .Ldes_ede3_cbc_enc_loop+4
	orn		%g0, $omask, $omask

	st		%f0, [$ivec + 0]	! write out ivec
	retl
	st		%f1, [$ivec + 4]
.type	des_t4_ede3_cbc_encrypt,#function
.size	des_t4_ede3_cbc_encrypt,.-des_t4_ede3_cbc_encrypt

.globl	des_t4_ede3_cbc_decrypt
.align	32
des_t4_ede3_cbc_decrypt:
	cmp		$len, 0
	be,pn		$::size_t_cc, .Lcbc_abort
	srln		$len, 0, $len		! needed on v8+, "nop" on v9
	ld		[$ivec + 0], %f2	! load ivec
	ld		[$ivec + 4], %f3

	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		0xff, $omask
	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	sub		%g0, $ileft, $iright
	and		$out, 7, %g4
	alignaddrl	$out, %g0, $out
	srl		$omask, %g4, $omask
	srlx		$len, 3, $len
	movrz		%g4, 0, $omask
	prefetch	[$out], 22

	ldd		[$key + 0x100+0x78], %f4	! load key schedule
	ldd		[$key + 0x100+0x70], %f6
	ldd		[$key + 0x100+0x68], %f8
	ldd		[$key + 0x100+0x60], %f10
	ldd		[$key + 0x100+0x58], %f12
	ldd		[$key + 0x100+0x50], %f14
	ldd		[$key + 0x100+0x48], %f16
	ldd		[$key + 0x100+0x40], %f18
	ldd		[$key + 0x100+0x38], %f20
	ldd		[$key + 0x100+0x30], %f22
	ldd		[$key + 0x100+0x28], %f24
	ldd		[$key + 0x100+0x20], %f26
	ldd		[$key + 0x100+0x18], %f28
	ldd		[$key + 0x100+0x10], %f30
	ldd		[$key + 0x100+0x08], %f32
	ldd		[$key + 0x100+0x00], %f34

.Ldes_ede3_cbc_dec_loop:
	ldx		[$inp + 0], %g4
	brz,pt		$ileft, 4f
	nop

	ldx		[$inp + 8], %g5
	sllx		%g4, $ileft, %g4
	srlx		%g5, $iright, %g5
	or		%g5, %g4, %g4
4:
	movxtod		%g4, %f0
	prefetch	[$inp + 8+63], 20
	add		$inp, 8, $inp
	prefetch	[$out + 63], 22

	des_ip		%f0, %f0
	des_round	%f4, %f6, %f0, %f0
	des_round	%f8, %f10, %f0, %f0
	des_round	%f12, %f14, %f0, %f0
	des_round	%f16, %f18, %f0, %f0
	ldd		[$key + 0x80+0x00], %f36
	ldd		[$key + 0x80+0x08], %f38
	des_round	%f20, %f22, %f0, %f0
	ldd		[$key + 0x80+0x10], %f40
	ldd		[$key + 0x80+0x18], %f42
	des_round	%f24, %f26, %f0, %f0
	ldd		[$key + 0x80+0x20], %f44
	ldd		[$key + 0x80+0x28], %f46
	des_round	%f28, %f30, %f0, %f0
	ldd		[$key + 0x80+0x30], %f48
	ldd		[$key + 0x80+0x38], %f50
	des_round	%f32, %f34, %f0, %f0
	ldd		[$key + 0x80+0x40], %f52
	ldd		[$key + 0x80+0x48], %f54
	des_iip		%f0, %f0

	ldd		[$key + 0x80+0x50], %f56
	ldd		[$key + 0x80+0x58], %f58
	des_ip		%f0, %f0
	ldd		[$key + 0x80+0x60], %f60
	ldd		[$key + 0x80+0x68], %f62
	des_round	%f36, %f38, %f0, %f0
	ldd		[$key + 0x80+0x70], %f36
	ldd		[$key + 0x80+0x78], %f38
	des_round	%f40, %f42, %f0, %f0
	des_round	%f44, %f46, %f0, %f0
	des_round	%f48, %f50, %f0, %f0
	ldd		[$key + 0x80-0x08], %f40
	ldd		[$key + 0x80-0x10], %f42
	des_round	%f52, %f54, %f0, %f0
	ldd		[$key + 0x80-0x18], %f44
	ldd		[$key + 0x80-0x20], %f46
	des_round	%f56, %f58, %f0, %f0
	ldd		[$key + 0x80-0x28], %f48
	ldd		[$key + 0x80-0x30], %f50
	des_round	%f60, %f62, %f0, %f0
	ldd		[$key + 0x80-0x38], %f52
	ldd		[$key + 0x80-0x40], %f54
	des_round	%f36, %f38, %f0, %f0
	ldd		[$key + 0x80-0x48], %f56
	ldd		[$key + 0x80-0x50], %f58
	des_iip		%f0, %f0

	ldd		[$key + 0x80-0x58], %f60
	ldd		[$key + 0x80-0x60], %f62
	des_ip		%f0, %f0
	ldd		[$key + 0x80-0x68], %f36
	ldd		[$key + 0x80-0x70], %f38
	des_round	%f40, %f42, %f0, %f0
	ldd		[$key + 0x80-0x78], %f40
	ldd		[$key + 0x80-0x80], %f42
	des_round	%f44, %f46, %f0, %f0
	des_round	%f48, %f50, %f0, %f0
	des_round	%f52, %f54, %f0, %f0
	des_round	%f56, %f58, %f0, %f0
	des_round	%f60, %f62, %f0, %f0
	des_round	%f36, %f38, %f0, %f0
	des_round	%f40, %f42, %f0, %f0
	des_iip		%f0, %f0

	fxor		%f2, %f0, %f0		! ^= ivec
	movxtod		%g4, %f2

	brnz,pn		$omask, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	brnz,pt		$len, .Ldes_ede3_cbc_dec_loop
	add		$out, 8, $out

	st		%f2, [$ivec + 0]	! write out ivec
	retl
	st		%f3, [$ivec + 4]

.align	16
2:	ldxa		[$inp]0x82, %g4		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f0		! handle unaligned output

	stda		%f0, [$out + $omask]0xc0	! partial store
	add		$out, 8, $out
	orn		%g0, $omask, $omask
	stda		%f0, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .Ldes_ede3_cbc_dec_loop+4
	orn		%g0, $omask, $omask

	st		%f2, [$ivec + 0]	! write out ivec
	retl
	st		%f3, [$ivec + 4]
.type	des_t4_ede3_cbc_decrypt,#function
.size	des_t4_ede3_cbc_decrypt,.-des_t4_ede3_cbc_decrypt
___
}
$code.=<<___;
.asciz  "DES for SPARC T4, David S. Miller, Andy Polyakov"
.align  4
___

&emit_assembler();

# Buffered write errors only surface at close, so an unchecked close
# could leave a silently truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";