# sparcv9-mont.pl revision 337982
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# December 2005
#
# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
# for undertaken effort are multiple. First of all, UltraSPARC is not
# the whole SPARCv9 universe and other VIS-free implementations deserve
# optimized code as much. Secondly, newly introduced UltraSPARC T1,
# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
# several integrated RSA/DSA accelerator circuits accessible through
# kernel driver [only(*)], but having decent user-land software
# implementation is important too. Finally, reasons like desire to
# experiment with dedicated squaring procedure. Yes, this module
# implements one, because it was easiest to draft it in SPARCv9
# instructions...

# (*)	Engine accessing the driver in question is on my TODO list.
#	For reference, accelerator is estimated to give 6 to 10 times
#	improvement on single-threaded RSA sign. It should be noted
#	that 6-10x improvement coefficient does not actually mean
#	something extraordinary in terms of absolute [single-threaded]
#	performance, as SPARCv9 instruction set is by all means least
#	suitable for high performance crypto among other 64 bit
#	platforms. 6-10x factor simply places T1 in same performance
#	domain as say AMD64 and IA-64. Improvement of RSA verify doesn't
#	appear impressive at all, but it's the sign operation which is
#	far more critical/interesting.

# You might notice that inner loops are modulo-scheduled:-) This has
# essentially negligible impact on UltraSPARC performance, it's
# Fujitsu SPARC64 V users who should notice and hopefully appreciate
# the advantage... Currently this module surpasses sparcv9a-mont.pl
# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
# module still has hidden potential [see TODO list there], which is
# estimated to be larger than 20%...

# NOTE: this is a code generator: it emits SPARC assembler on STDOUT.
# Register names are kept in Perl variables so the templates below stay
# readable; anything between `backticks` inside the templates is
# evaluated as a Perl expression by the s///gem pass at the bottom of
# the file (used to pick %icc vs %xcc depending on the target ABI).

use strict;
use warnings;

# int bn_mul_mont(
my $rp="%i0";	# BN_ULONG *rp,
my $ap="%i1";	# const BN_ULONG *ap,
my $bp="%i2";	# const BN_ULONG *bp,
my $np="%i3";	# const BN_ULONG *np,
my $n0="%i4";	# const BN_ULONG *n0,
my $num="%i5";	# int num);

# Target ABI selection: 64-bit SPARC ABI keeps %sp biased by 2047 and
# needs a larger register-save area; 32-bit uses no bias and 128 bytes.
my $bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
my ($bias,$frame);
if ($bits==64)	{ $bias=2047; $frame=192; }
else		{ $bias=0;    $frame=128; }

# Scratch registers used by the generated code.
my $car0="%o0";
my $car1="%o1";
my $car2="%o2";	# 1 bit
my $acc0="%o3";
my $acc1="%o4";
my $mask="%g1";	# 32 bits, what a waste...
my $tmp0="%g4";
my $tmp1="%g5";

my $i="%l0";
my $j="%l1";
my $mul0="%l2";
my $mul1="%l3";
my $tp="%l4";
my $apj="%l5";
my $npj="%l6";
my $tpj="%l7";

my $fname="bn_mul_mont_int";

my $code=<<___;
.section	".text",#alloc,#execinstr

.global	$fname
.align	32
$fname:
	cmp	%o5,4			! 128 bits minimum
	bge,pt	%icc,.Lenter
	sethi	%hi(0xffffffff),$mask
	retl
	clr	%o0
.align	32
.Lenter:
	save	%sp,-$frame,%sp
	sll	$num,2,$num		! num*=4
	or	$mask,%lo(0xffffffff),$mask
	ld	[$n0],$n0
	cmp	$ap,$bp
	and	$num,$mask,$num
	ld	[$bp],$mul0		! bp[0]
	nop

	add	%sp,$bias,%o7		! real top of stack
	ld	[$ap],$car0		! ap[0] ! redundant in squaring context
	sub	%o7,$num,%o7
	ld	[$ap+4],$apj		! ap[1]
	and	%o7,-1024,%o7
	ld	[$np],$car1		! np[0]
	sub	%o7,$bias,%sp		! alloca
	ld	[$np+4],$npj		! np[1]
	be,pt	`$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
	mov	12,$j

	mulx	$car0,$mul0,$car0	! ap[0]*bp[0]
	mulx	$apj,$mul0,$tmp0	!prologue! ap[1]*bp[0]
	and	$car0,$mask,$acc0
	add	%sp,$bias+$frame,$tp
	ld	[$ap+8],$apj		!prologue!

	mulx	$n0,$acc0,$mul1		! "t[0]"*n0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
	mulx	$npj,$mul1,$acc1	!prologue! np[1]*"t[0]"*n0
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	ld	[$np+8],$npj		!prologue!
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0		!prologue!

.L1st:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0
	ld	[$ap+$j],$apj		! ap[j]
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj		! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	add	$j,4,$j			! j++
	mov	$tmp0,$acc0
	st	$car1,[$tp]
	cmp	$j,$num
	mov	$tmp1,$acc1
	srlx	$car1,32,$car1
	bl	%icc,.L1st
	add	$tp,4,$tp		! tp++
!.L1st

	mulx	$apj,$mul0,$tmp0	!epilogue!
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$tmp0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car1

	add	$car0,$car1,$car1
	st	$car1,[$tp+8]
	srlx	$car1,32,$car2

	mov	4,$i			! i++
	ld	[$bp+4],$mul0		! bp[1]
.Louter:
	add	%sp,$bias+$frame,$tp
	ld	[$ap],$car0		! ap[0]
	ld	[$ap+4],$apj		! ap[1]
	ld	[$np],$car1		! np[0]
	ld	[$np+4],$npj		! np[1]
	ld	[$tp],$tmp1		! tp[0]
	ld	[$tp+4],$tpj		! tp[1]
	mov	12,$j

	mulx	$car0,$mul0,$car0
	mulx	$apj,$mul0,$tmp0	!prologue!
	add	$tmp1,$car0,$car0
	ld	[$ap+8],$apj		!prologue!
	and	$car0,$mask,$acc0

	mulx	$n0,$acc0,$mul1
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1
	mulx	$npj,$mul1,$acc1	!prologue!
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	ld	[$np+8],$npj		!prologue!
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0		!prologue!

.Linner:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$tpj,$car0,$car0
	ld	[$ap+$j],$apj		! ap[j]
	add	$acc0,$car0,$car0
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj		! np[j]
	and	$car0,$mask,$acc0
	ld	[$tp+8],$tpj		! tp[j]
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	add	$j,4,$j			! j++
	mov	$tmp0,$acc0
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1
	mov	$tmp1,$acc1
	cmp	$j,$num
	bl	%icc,.Linner
	add	$tp,4,$tp		! tp++
!.Linner

	mulx	$apj,$mul0,$tmp0	!epilogue!
	mulx	$npj,$mul1,$tmp1
	add	$tpj,$car0,$car0
	add	$acc0,$car0,$car0
	ld	[$tp+8],$tpj		! tp[j]
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1

	add	$tpj,$car0,$car0
	add	$tmp0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]		! tp[j-1]
	srlx	$car0,32,$car0
	add	$i,4,$i			! i++
	srlx	$car1,32,$car1

	add	$car0,$car1,$car1
	cmp	$i,$num
	add	$car2,$car1,$car1
	st	$car1,[$tp+8]

	srlx	$car1,32,$car2
	bl,a	%icc,.Louter
	ld	[$bp+$i],$mul0		! bp[i]
!.Louter

	add	$tp,12,$tp

.Ltail:
	add	$np,$num,$np
	add	$rp,$num,$rp
	sub	%g0,$num,%o7		! k=-num
	ba	.Lsub
	subcc	%g0,%g0,%g0		! clear %icc.c
.align	16
.Lsub:
	ld	[$tp+%o7],%o0
	ld	[$np+%o7],%o1
	subccc	%o0,%o1,%o1		! tp[j]-np[j]
	add	$rp,%o7,$i
	add	%o7,4,%o7
	brnz	%o7,.Lsub
	st	%o1,[$i]
	subccc	$car2,0,$car2		! handle upmost overflow bit
	sub	%g0,$num,%o7

.Lcopy:
	ld	[$tp+%o7],%o1		! conditional copy
	ld	[$rp+%o7],%o0
	st	%g0,[$tp+%o7]		! zap tp
	movcs	%icc,%o1,%o0
	st	%o0,[$rp+%o7]
	add	%o7,4,%o7
	brnz	%o7,.Lcopy
	nop
	mov	1,%i0
	ret
	restore
___

########
######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
######## code without following dedicated squaring procedure.
########
my $sbit="%o5";

$code.=<<___;
.align	32
.Lbn_sqr_mont:
	mulx	$mul0,$mul0,$car0	! ap[0]*ap[0]
	mulx	$apj,$mul0,$tmp0	!prologue!
	and	$car0,$mask,$acc0
	add	%sp,$bias+$frame,$tp
	ld	[$ap+8],$apj		!prologue!

	mulx	$n0,$acc0,$mul1		! "t[0]"*n0
	srlx	$car0,32,$car0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
	mulx	$npj,$mul1,$acc1	!prologue!
	and	$car0,1,$sbit
	ld	[$np+8],$npj		!prologue!
	srlx	$car0,1,$car0
	add	$acc0,$car1,$car1
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0		!prologue!

.Lsqr_1st:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0	! ap[j]*a0+c0
	add	$acc1,$car1,$car1
	ld	[$ap+$j],$apj		! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj		! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	mov	$tmp1,$acc1
	srlx	$acc0,32,$sbit
	add	$j,4,$j			! j++
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	mov	$tmp0,$acc0
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_1st
	add	$tp,4,$tp		! tp++
!.Lsqr_1st

	mulx	$apj,$mul0,$tmp0	! epilogue
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0	! ap[j]*a0+c0
	add	$acc1,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$tmp0,$car0,$car0	! ap[j]*a0+c0
	add	$tmp1,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	or	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	st	$car1,[$tp+8]
	srlx	$car1,32,$car2

	ld	[%sp+$bias+$frame],$tmp0	! tp[0]
	ld	[%sp+$bias+$frame+4],$tmp1	! tp[1]
	ld	[%sp+$bias+$frame+8],$tpj	! tp[2]
	ld	[$ap+4],$mul0		! ap[1]
	ld	[$ap+8],$apj		! ap[2]
	ld	[$np],$car1		! np[0]
	ld	[$np+4],$npj		! np[1]
	mulx	$n0,$tmp0,$mul1

	mulx	$mul0,$mul0,$car0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1
	mulx	$npj,$mul1,$acc1
	add	$tmp0,$car1,$car1
	and	$car0,$mask,$acc0
	ld	[$np+8],$npj		! np[2]
	srlx	$car1,32,$car1
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	and	$car0,1,$sbit
	add	$acc1,$car1,$car1
	srlx	$car0,1,$car0
	mov	12,$j
	st	$car1,[%sp+$bias+$frame]	! tp[0]=
	srlx	$car1,32,$car1
	add	%sp,$bias+$frame+4,$tp

.Lsqr_2nd:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$acc0,$car0,$car0
	add	$tpj,$sbit,$sbit
	ld	[$ap+$j],$apj		! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj		! np[j]
	srlx	$car0,32,$car0
	add	$acc1,$car1,$car1
	ld	[$tp+8],$tpj		! tp[j]
	add	$acc0,$acc0,$acc0
	add	$j,4,$j			! j++
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_2nd
	add	$tp,4,$tp		! tp++
!.Lsqr_2nd

	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$acc0,$car0,$car0
	add	$tpj,$sbit,$sbit
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc1,$car1,$car1
	add	$acc0,$acc0,$acc0
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
	ld	[$ap+8],$mul0		! ap[2]
	ld	[$np],$car1		! np[0]
	ld	[$np+4],$npj		! np[1]
	mulx	$n0,$tmp1,$mul1
	and	$mul1,$mask,$mul1
	mov	8,$i

	mulx	$mul0,$mul0,$car0
	mulx	$car1,$mul1,$car1
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	%sp,$bias+$frame,$tp
	srlx	$car1,32,$car1
	and	$car0,1,$sbit
	srlx	$car0,1,$car0
	mov	4,$j

.Lsqr_outer:
.Lsqr_inner1:
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$j,4,$j
	ld	[$tp+8],$tpj
	cmp	$j,$i
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj
	st	$car1,[$tp]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_inner1
	add	$tp,4,$tp
!.Lsqr_inner1

	add	$j,4,$j
	ld	[$ap+$j],$apj		! ap[j]
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	ld	[$np+$j],$npj		! np[j]
	srlx	$car1,32,$tmp0
	and	$car1,$mask,$car1
	add	$tmp0,$sbit,$sbit
	add	$acc0,$car1,$car1
	ld	[$tp+8],$tpj		! tp[j]
	add	$acc1,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$j,4,$j
	cmp	$j,$num
	be,pn	%icc,.Lsqr_no_inner2
	add	$tp,4,$tp

.Lsqr_inner2:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$tpj,$sbit,$sbit
	add	$acc0,$car0,$car0
	ld	[$ap+$j],$apj		! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj		! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	ld	[$tp+8],$tpj		! tp[j]
	add	$sbit,$acc0,$acc0
	add	$j,4,$j			! j++
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_inner2
	add	$tp,4,$tp		! tp++

.Lsqr_no_inner2:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$tpj,$sbit,$sbit
	add	$acc0,$car0,$car0
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	add	$i,4,$i			! i++
	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
	ld	[$ap+$i],$mul0		! ap[j]
	ld	[$np],$car1		! np[0]
	ld	[$np+4],$npj		! np[1]
	mulx	$n0,$tmp1,$mul1
	and	$mul1,$mask,$mul1
	add	$i,4,$tmp0

	mulx	$mul0,$mul0,$car0
	mulx	$car1,$mul1,$car1
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	%sp,$bias+$frame,$tp
	srlx	$car1,32,$car1
	and	$car0,1,$sbit
	srlx	$car0,1,$car0

	cmp	$tmp0,$num		! i<num-1
	bl	%icc,.Lsqr_outer
	mov	4,$j

.Lsqr_last:
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$j,4,$j
	ld	[$tp+8],$tpj
	cmp	$j,$i
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj
	st	$car1,[$tp]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_last
	add	$tp,4,$tp
!.Lsqr_last

	mulx	$npj,$mul1,$acc1
	add	$tpj,$acc0,$acc0
	srlx	$acc0,32,$tmp0
	and	$acc0,$mask,$acc0
	add	$tmp0,$sbit,$sbit
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0	! recover $car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	ba	.Ltail
	add	$tp,8,$tp
.type	$fname,#function
.size	$fname,(.-$fname)
.asciz	"Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	32
___

# Resolve `...` sequences embedded in the templates by evaluating them
# as Perl expressions (e.g. choosing %icc vs %xcc based on $bits).
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";	# catch buffered write errors