sparct4-mont.pl revision 337982
1#!/usr/bin/env perl 2 3# ==================================================================== 4# Written by David S. Miller <davem@devemloft.net> and Andy Polyakov 5# <appro@openssl.org>. The module is licensed under 2-clause BSD 6# license. November 2012. All rights reserved. 7# ==================================================================== 8 9###################################################################### 10# Montgomery squaring-n-multiplication module for SPARC T4. 11# 12# The module consists of three parts: 13# 14# 1) collection of "single-op" subroutines that perform single 15# operation, Montgomery squaring or multiplication, on 512-, 16# 1024-, 1536- and 2048-bit operands; 17# 2) collection of "multi-op" subroutines that perform 5 squaring and 18# 1 multiplication operations on operands of above lengths; 19# 3) fall-back and helper VIS3 subroutines. 20# 21# RSA sign is dominated by multi-op subroutine, while RSA verify and 22# DSA - by single-op. Special note about 4096-bit RSA verify result. 23# Operands are too long for dedicated hardware and it's handled by 24# VIS3 code, which is why you don't see any improvement. It's surely 25# possible to improve it [by deploying 'mpmul' instruction], maybe in 26# the future... 27# 28# Performance improvement. 29# 30# 64-bit process, VIS3: 31# sign verify sign/s verify/s 32# rsa 1024 bits 0.000628s 0.000028s 1592.4 35434.4 33# rsa 2048 bits 0.003282s 0.000106s 304.7 9438.3 34# rsa 4096 bits 0.025866s 0.000340s 38.7 2940.9 35# dsa 1024 bits 0.000301s 0.000332s 3323.7 3013.9 36# dsa 2048 bits 0.001056s 0.001233s 946.9 810.8 37# 38# 64-bit process, this module: 39# sign verify sign/s verify/s 40# rsa 1024 bits 0.000256s 0.000016s 3904.4 61411.9 41# rsa 2048 bits 0.000946s 0.000029s 1056.8 34292.7 42# rsa 4096 bits 0.005061s 0.000340s 197.6 2940.5 43# dsa 1024 bits 0.000176s 0.000195s 5674.7 5130.5 44# dsa 2048 bits 0.000296s 0.000354s 3383.2 2827.6 45# 46###################################################################### 47# 32-bit process, VIS3: 48# sign verify sign/s verify/s 49# rsa 1024 bits 0.000665s 0.000028s 1504.8 35233.3 50# rsa 2048 bits 0.003349s 0.000106s 298.6 9433.4 51# rsa 4096 bits 0.025959s 0.000341s 38.5 2934.8 52# dsa 1024 bits 0.000320s 0.000341s 3123.3 2929.6 53# dsa 2048 bits 0.001101s 0.001260s 908.2 793.4 54# 55# 32-bit process, this module: 56# sign verify sign/s verify/s 57# rsa 1024 bits 0.000301s 0.000017s 3317.1 60240.0 58# rsa 2048 bits 0.001034s 0.000030s 966.9 33812.7 59# rsa 4096 bits 0.005244s 0.000341s 190.7 2935.4 60# dsa 1024 bits 0.000201s 0.000205s 4976.1 4879.2 61# dsa 2048 bits 0.000328s 0.000360s 3051.1 2774.2 62# 63# 32-bit code is prone to performance degradation as interrupt rate 64# dispatched to CPU executing the code grows. This is because in 65# standard process of handling interrupt in 32-bit process context 66# upper halves of most integer registers used as input or output are 67# zeroed. This renders result invalid, and operation has to be re-run. 68# If CPU is "bothered" with timer interrupts only, the penalty is 69# hardly measurable. But in order to mitigate this problem for higher 70# interrupt rates contemporary Linux kernel recognizes biased stack 71# even in 32-bit process context and preserves full register contents. 72# See http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=517ffce4e1a03aea979fe3a18a3dd1761a24fafb 73# for details. 74 75$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 76push(@INC,"${dir}","${dir}../../perlasm"); 77require "sparcv9_modes.pl"; 78 79$code.=<<___; 80#include "sparc_arch.h" 81 82#ifdef __arch64__ 83.register %g2,#scratch 84.register %g3,#scratch 85#endif 86 87.section ".text",#alloc,#execinstr 88 89#ifdef __PIC__ 90SPARC_PIC_THUNK(%g1) 91#endif 92___ 93 94######################################################################## 95# Register layout for mont[mul|sqr] instructions. 96# For details see "Oracle SPARC Architecture 2011" manual at 97# http://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/. 98# 99my @R=map("%f".2*$_,(0..11,30,31,12..29)); 100my @N=(map("%l$_",(0..7)),map("%o$_",(0..5))); @N=(@N,@N,@N[0..3]); 101my @A=(@N[0..13],@R[14..31]); 102my @B=(map("%i$_",(0..5)),map("%l$_",(0..7))); @B=(@B,@B,map("%o$_",(0..3))); 103 104######################################################################## 105# int bn_mul_mont_t4_$NUM(u64 *rp,const u64 *ap,const u64 *bp, 106# const u64 *np,const BN_ULONG *n0); 107# 108sub generate_bn_mul_mont_t4() { 109my $NUM=shift; 110my ($rp,$ap,$bp,$np,$sentinel)=map("%g$_",(1..5)); 111 112$code.=<<___; 113.globl bn_mul_mont_t4_$NUM 114.align 32 115bn_mul_mont_t4_$NUM: 116#ifdef __arch64__ 117 mov 0,$sentinel 118 mov -128,%g4 119#elif defined(SPARCV9_64BIT_STACK) 120 SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5) 121 ld [%g1+0],%g1 ! OPENSSL_sparcv9_P[0] 122 mov -2047,%g4 123 and %g1,SPARCV9_64BIT_STACK,%g1 124 movrz %g1,0,%g4 125 mov -1,$sentinel 126 add %g4,-128,%g4 127#else 128 mov -1,$sentinel 129 mov -128,%g4 130#endif 131 sllx $sentinel,32,$sentinel 132 save %sp,%g4,%sp 133#ifndef __arch64__ 134 save %sp,-128,%sp ! warm it up 135 save %sp,-128,%sp 136 save %sp,-128,%sp 137 save %sp,-128,%sp 138 save %sp,-128,%sp 139 save %sp,-128,%sp 140 restore 141 restore 142 restore 143 restore 144 restore 145 restore 146#endif 147 and %sp,1,%g4 148 or $sentinel,%fp,%fp 149 or %g4,$sentinel,$sentinel 150 151 ! copy arguments to global registers 152 mov %i0,$rp 153 mov %i1,$ap 154 mov %i2,$bp 155 mov %i3,$np 156 ld [%i4+0],%f1 ! load *n0 157 ld [%i4+4],%f0 158 fsrc2 %f0,%f60 159___ 160 161# load ap[$NUM] ######################################################## 162$code.=<<___; 163 save %sp,-128,%sp; or $sentinel,%fp,%fp 164___ 165for($i=0; $i<14 && $i<$NUM; $i++) { 166my $lo=$i<13?@A[$i+1]:"%o7"; 167$code.=<<___; 168 ld [$ap+$i*8+0],$lo 169 ld [$ap+$i*8+4],@A[$i] 170 sllx @A[$i],32,@A[$i] 171 or $lo,@A[$i],@A[$i] 172___ 173} 174for(; $i<$NUM; $i++) { 175my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1)); 176$code.=<<___; 177 ld [$ap+$i*8+0],$lo 178 ld [$ap+$i*8+4],$hi 179 fsrc2 $hi,@A[$i] 180___ 181} 182# load np[$NUM] ######################################################## 183$code.=<<___; 184 save %sp,-128,%sp; or $sentinel,%fp,%fp 185___ 186for($i=0; $i<14 && $i<$NUM; $i++) { 187my $lo=$i<13?@N[$i+1]:"%o7"; 188$code.=<<___; 189 ld [$np+$i*8+0],$lo 190 ld [$np+$i*8+4],@N[$i] 191 sllx @N[$i],32,@N[$i] 192 or $lo,@N[$i],@N[$i] 193___ 194} 195$code.=<<___; 196 save %sp,-128,%sp; or $sentinel,%fp,%fp 197___ 198for(; $i<28 && $i<$NUM; $i++) { 199my $lo=$i<27?@N[$i+1]:"%o7"; 200$code.=<<___; 201 ld [$np+$i*8+0],$lo 202 ld [$np+$i*8+4],@N[$i] 203 sllx @N[$i],32,@N[$i] 204 or $lo,@N[$i],@N[$i] 205___ 206} 207$code.=<<___; 208 save %sp,-128,%sp; or $sentinel,%fp,%fp 209___ 210for(; $i<$NUM; $i++) { 211my $lo=($i<$NUM-1)?@N[$i+1]:"%o7"; 212$code.=<<___; 213 ld [$np+$i*8+0],$lo 214 ld [$np+$i*8+4],@N[$i] 215 sllx @N[$i],32,@N[$i] 216 or $lo,@N[$i],@N[$i] 217___ 218} 219$code.=<<___; 220 cmp $ap,$bp 221 be SIZE_T_CC,.Lmsquare_$NUM 222 nop 223___ 224 225# load bp[$NUM] ######################################################## 226$code.=<<___; 227 save %sp,-128,%sp; or $sentinel,%fp,%fp 228___ 229for($i=0; $i<14 && $i<$NUM; $i++) { 230my $lo=$i<13?@B[$i+1]:"%o7"; 231$code.=<<___; 232 ld [$bp+$i*8+0],$lo 233 ld [$bp+$i*8+4],@B[$i] 234 sllx @B[$i],32,@B[$i] 235 or $lo,@B[$i],@B[$i] 236___ 237} 238$code.=<<___; 239 save %sp,-128,%sp; or $sentinel,%fp,%fp 240___ 241for(; $i<$NUM; $i++) { 242my $lo=($i<$NUM-1)?@B[$i+1]:"%o7"; 243$code.=<<___; 244 ld [$bp+$i*8+0],$lo 245 ld [$bp+$i*8+4],@B[$i] 246 sllx @B[$i],32,@B[$i] 247 or $lo,@B[$i],@B[$i] 248___ 249} 250# magic ################################################################ 251$code.=<<___; 252 .word 0x81b02920+$NUM-1 ! montmul $NUM-1 253.Lmresume_$NUM: 254 fbu,pn %fcc3,.Lmabort_$NUM 255#ifndef __arch64__ 256 and %fp,$sentinel,$sentinel 257 brz,pn $sentinel,.Lmabort_$NUM 258#endif 259 nop 260#ifdef __arch64__ 261 restore 262 restore 263 restore 264 restore 265 restore 266#else 267 restore; and %fp,$sentinel,$sentinel 268 restore; and %fp,$sentinel,$sentinel 269 restore; and %fp,$sentinel,$sentinel 270 restore; and %fp,$sentinel,$sentinel 271 brz,pn $sentinel,.Lmabort1_$NUM 272 restore 273#endif 274___ 275 276# save tp[$NUM] ######################################################## 277for($i=0; $i<14 && $i<$NUM; $i++) { 278$code.=<<___; 279 movxtod @A[$i],@R[$i] 280___ 281} 282$code.=<<___; 283#ifdef __arch64__ 284 restore 285#else 286 and %fp,$sentinel,$sentinel 287 restore 288 and $sentinel,1,%o7 289 and %fp,$sentinel,$sentinel 290 srl %fp,0,%fp ! just in case? 291 or %o7,$sentinel,$sentinel 292 brz,a,pn $sentinel,.Lmdone_$NUM 293 mov 0,%i0 ! return failure 294#endif 295___ 296for($i=0; $i<12 && $i<$NUM; $i++) { 297@R[$i] =~ /%f([0-9]+)/; 298my $lo = "%f".($1+1); 299$code.=<<___; 300 st $lo,[$rp+$i*8+0] 301 st @R[$i],[$rp+$i*8+4] 302___ 303} 304for(; $i<$NUM; $i++) { 305my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1)); 306$code.=<<___; 307 fsrc2 @R[$i],$hi 308 st $lo,[$rp+$i*8+0] 309 st $hi,[$rp+$i*8+4] 310___ 311} 312$code.=<<___; 313 mov 1,%i0 ! return success 314.Lmdone_$NUM: 315 ret 316 restore 317 318.Lmabort_$NUM: 319 restore 320 restore 321 restore 322 restore 323 restore 324.Lmabort1_$NUM: 325 restore 326 327 mov 0,%i0 ! return failure 328 ret 329 restore 330 331.align 32 332.Lmsquare_$NUM: 333 save %sp,-128,%sp; or $sentinel,%fp,%fp 334 save %sp,-128,%sp; or $sentinel,%fp,%fp 335 .word 0x81b02940+$NUM-1 ! montsqr $NUM-1 336 ba .Lmresume_$NUM 337 nop 338.type bn_mul_mont_t4_$NUM, #function 339.size bn_mul_mont_t4_$NUM, .-bn_mul_mont_t4_$NUM 340___ 341} 342 343for ($i=8;$i<=32;$i+=8) { 344 &generate_bn_mul_mont_t4($i); 345} 346 347######################################################################## 348# 349sub load_ccr { 350my ($ptbl,$pwr,$ccr,$skip_wr)=@_; 351$code.=<<___; 352 srl $pwr, 2, %o4 353 and $pwr, 3, %o5 354 and %o4, 7, %o4 355 sll %o5, 3, %o5 ! offset within first cache line 356 add %o5, $ptbl, $ptbl ! of the pwrtbl 357 or %g0, 1, %o5 358 sll %o5, %o4, $ccr 359___ 360$code.=<<___ if (!$skip_wr); 361 wr $ccr, %g0, %ccr 362___ 363} 364sub load_b_pair { 365my ($pwrtbl,$B0,$B1)=@_; 366 367$code.=<<___; 368 ldx [$pwrtbl+0*32], $B0 369 ldx [$pwrtbl+8*32], $B1 370 ldx [$pwrtbl+1*32], %o4 371 ldx [$pwrtbl+9*32], %o5 372 movvs %icc, %o4, $B0 373 ldx [$pwrtbl+2*32], %o4 374 movvs %icc, %o5, $B1 375 ldx [$pwrtbl+10*32],%o5 376 move %icc, %o4, $B0 377 ldx [$pwrtbl+3*32], %o4 378 move %icc, %o5, $B1 379 ldx [$pwrtbl+11*32],%o5 380 movneg %icc, %o4, $B0 381 ldx [$pwrtbl+4*32], %o4 382 movneg %icc, %o5, $B1 383 ldx [$pwrtbl+12*32],%o5 384 movcs %xcc, %o4, $B0 385 ldx [$pwrtbl+5*32],%o4 386 movcs %xcc, %o5, $B1 387 ldx [$pwrtbl+13*32],%o5 388 movvs %xcc, %o4, $B0 389 ldx [$pwrtbl+6*32], %o4 390 movvs %xcc, %o5, $B1 391 ldx [$pwrtbl+14*32],%o5 392 move %xcc, %o4, $B0 393 ldx [$pwrtbl+7*32], %o4 394 move %xcc, %o5, $B1 395 ldx [$pwrtbl+15*32],%o5 396 movneg %xcc, %o4, $B0 397 add $pwrtbl,16*32, $pwrtbl 398 movneg %xcc, %o5, $B1 399___ 400} 401sub load_b { 402my ($pwrtbl,$Bi)=@_; 403 404$code.=<<___; 405 ldx [$pwrtbl+0*32], $Bi 406 ldx [$pwrtbl+1*32], %o4 407 ldx [$pwrtbl+2*32], %o5 408 movvs %icc, %o4, $Bi 409 ldx [$pwrtbl+3*32], %o4 410 move %icc, %o5, $Bi 411 ldx [$pwrtbl+4*32], %o5 412 movneg %icc, %o4, $Bi 413 ldx [$pwrtbl+5*32], %o4 414 movcs %xcc, %o5, $Bi 415 ldx [$pwrtbl+6*32], %o5 416 movvs %xcc, %o4, $Bi 417 ldx [$pwrtbl+7*32], %o4 418 move %xcc, %o5, $Bi 419 add $pwrtbl,8*32, $pwrtbl 420 movneg %xcc, %o4, $Bi 421___ 422} 423 424######################################################################## 425# int bn_pwr5_mont_t4_$NUM(u64 *tp,const u64 *np,const BN_ULONG *n0, 426# const u64 *pwrtbl,int pwr,int stride); 427# 428sub generate_bn_pwr5_mont_t4() { 429my $NUM=shift; 430my ($tp,$np,$pwrtbl,$pwr,$sentinel)=map("%g$_",(1..5)); 431 432$code.=<<___; 433.globl bn_pwr5_mont_t4_$NUM 434.align 32 435bn_pwr5_mont_t4_$NUM: 436#ifdef __arch64__ 437 mov 0,$sentinel 438 mov -128,%g4 439#elif defined(SPARCV9_64BIT_STACK) 440 SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5) 441 ld [%g1+0],%g1 ! OPENSSL_sparcv9_P[0] 442 mov -2047,%g4 443 and %g1,SPARCV9_64BIT_STACK,%g1 444 movrz %g1,0,%g4 445 mov -1,$sentinel 446 add %g4,-128,%g4 447#else 448 mov -1,$sentinel 449 mov -128,%g4 450#endif 451 sllx $sentinel,32,$sentinel 452 save %sp,%g4,%sp 453#ifndef __arch64__ 454 save %sp,-128,%sp ! warm it up 455 save %sp,-128,%sp 456 save %sp,-128,%sp 457 save %sp,-128,%sp 458 save %sp,-128,%sp 459 save %sp,-128,%sp 460 restore 461 restore 462 restore 463 restore 464 restore 465 restore 466#endif 467 and %sp,1,%g4 468 or $sentinel,%fp,%fp 469 or %g4,$sentinel,$sentinel 470 471 ! copy arguments to global registers 472 mov %i0,$tp 473 mov %i1,$np 474 ld [%i2+0],%f1 ! load *n0 475 ld [%i2+4],%f0 476 mov %i3,$pwrtbl 477 srl %i4,%g0,%i4 ! pack last arguments 478 sllx %i5,32,$pwr 479 or %i4,$pwr,$pwr 480 fsrc2 %f0,%f60 481___ 482 483# load tp[$NUM] ######################################################## 484$code.=<<___; 485 save %sp,-128,%sp; or $sentinel,%fp,%fp 486___ 487for($i=0; $i<14 && $i<$NUM; $i++) { 488$code.=<<___; 489 ldx [$tp+$i*8],@A[$i] 490___ 491} 492for(; $i<$NUM; $i++) { 493$code.=<<___; 494 ldd [$tp+$i*8],@A[$i] 495___ 496} 497# load np[$NUM] ######################################################## 498$code.=<<___; 499 save %sp,-128,%sp; or $sentinel,%fp,%fp 500___ 501for($i=0; $i<14 && $i<$NUM; $i++) { 502$code.=<<___; 503 ldx [$np+$i*8],@N[$i] 504___ 505} 506$code.=<<___; 507 save %sp,-128,%sp; or $sentinel,%fp,%fp 508___ 509for(; $i<28 && $i<$NUM; $i++) { 510$code.=<<___; 511 ldx [$np+$i*8],@N[$i] 512___ 513} 514$code.=<<___; 515 save %sp,-128,%sp; or $sentinel,%fp,%fp 516___ 517for(; $i<$NUM; $i++) { 518$code.=<<___; 519 ldx [$np+$i*8],@N[$i] 520___ 521} 522# load pwrtbl[pwr] ######################################################## 523$code.=<<___; 524 save %sp,-128,%sp; or $sentinel,%fp,%fp 525 526 srlx $pwr, 32, %o4 ! unpack $pwr 527 srl $pwr, %g0, %o5 528 sub %o4, 5, %o4 529 mov $pwrtbl, %o7 530 sllx %o4, 32, $pwr ! re-pack $pwr 531 or %o5, $pwr, $pwr 532 srl %o5, %o4, %o5 533___ 534 &load_ccr("%o7","%o5","%o4"); 535$code.=<<___; 536 b .Lstride_$NUM 537 nop 538.align 16 539.Lstride_$NUM: 540___ 541for($i=0; $i<14 && $i<$NUM; $i+=2) { 542 &load_b_pair("%o7",@B[$i],@B[$i+1]); 543} 544$code.=<<___; 545 save %sp,-128,%sp; or $sentinel,%fp,%fp 546___ 547for(; $i<$NUM; $i+=2) { 548 &load_b_pair("%i7",@B[$i],@B[$i+1]); 549} 550$code.=<<___; 551 srax $pwr, 32, %o4 ! unpack $pwr 552 srl $pwr, %g0, %o5 553 sub %o4, 5, %o4 554 mov $pwrtbl, %i7 555 sllx %o4, 32, $pwr ! re-pack $pwr 556 or %o5, $pwr, $pwr 557 srl %o5, %o4, %o5 558___ 559 &load_ccr("%i7","%o5","%o4",1); 560 561# magic ################################################################ 562for($i=0; $i<5; $i++) { 563$code.=<<___; 564 .word 0x81b02940+$NUM-1 ! montsqr $NUM-1 565 fbu,pn %fcc3,.Labort_$NUM 566#ifndef __arch64__ 567 and %fp,$sentinel,$sentinel 568 brz,pn $sentinel,.Labort_$NUM 569#endif 570 nop 571___ 572} 573$code.=<<___; 574 wr %o4, %g0, %ccr 575 .word 0x81b02920+$NUM-1 ! montmul $NUM-1 576 fbu,pn %fcc3,.Labort_$NUM 577#ifndef __arch64__ 578 and %fp,$sentinel,$sentinel 579 brz,pn $sentinel,.Labort_$NUM 580#endif 581 582 srax $pwr, 32, %o4 583#ifdef __arch64__ 584 brgez %o4,.Lstride_$NUM 585 restore 586 restore 587 restore 588 restore 589 restore 590#else 591 brgez %o4,.Lstride_$NUM 592 restore; and %fp,$sentinel,$sentinel 593 restore; and %fp,$sentinel,$sentinel 594 restore; and %fp,$sentinel,$sentinel 595 restore; and %fp,$sentinel,$sentinel 596 brz,pn $sentinel,.Labort1_$NUM 597 restore 598#endif 599___ 600 601# save tp[$NUM] ######################################################## 602for($i=0; $i<14 && $i<$NUM; $i++) { 603$code.=<<___; 604 movxtod @A[$i],@R[$i] 605___ 606} 607$code.=<<___; 608#ifdef __arch64__ 609 restore 610#else 611 and %fp,$sentinel,$sentinel 612 restore 613 and $sentinel,1,%o7 614 and %fp,$sentinel,$sentinel 615 srl %fp,0,%fp ! just in case? 616 or %o7,$sentinel,$sentinel 617 brz,a,pn $sentinel,.Ldone_$NUM 618 mov 0,%i0 ! return failure 619#endif 620___ 621for($i=0; $i<$NUM; $i++) { 622$code.=<<___; 623 std @R[$i],[$tp+$i*8] 624___ 625} 626$code.=<<___; 627 mov 1,%i0 ! return success 628.Ldone_$NUM: 629 ret 630 restore 631 632.Labort_$NUM: 633 restore 634 restore 635 restore 636 restore 637 restore 638.Labort1_$NUM: 639 restore 640 641 mov 0,%i0 ! return failure 642 ret 643 restore 644.type bn_pwr5_mont_t4_$NUM, #function 645.size bn_pwr5_mont_t4_$NUM, .-bn_pwr5_mont_t4_$NUM 646___ 647} 648 649for ($i=8;$i<=32;$i+=8) { 650 &generate_bn_pwr5_mont_t4($i); 651} 652 653{ 654######################################################################## 655# Fall-back subroutines 656# 657# copy of bn_mul_mont_vis3 adjusted for vectors of 64-bit values 658# 659($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)= 660 (map("%g$_",(1..5)),map("%o$_",(0..5,7))); 661 662# int bn_mul_mont( 663$rp="%o0"; # u64 *rp, 664$ap="%o1"; # const u64 *ap, 665$bp="%o2"; # const u64 *bp, 666$np="%o3"; # const u64 *np, 667$n0p="%o4"; # const BN_ULONG *n0, 668$num="%o5"; # int num); # caller ensures that num is >=3 669$code.=<<___; 670.globl bn_mul_mont_t4 671.align 32 672bn_mul_mont_t4: 673 add %sp, STACK_BIAS, %g4 ! real top of stack 674 sll $num, 3, $num ! size in bytes 675 add $num, 63, %g1 676 andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes 677 sub %g4, %g1, %g1 678 andn %g1, 63, %g1 ! align at 64 byte 679 sub %g1, STACK_FRAME, %g1 ! new top of stack 680 sub %g1, %g4, %g1 681 682 save %sp, %g1, %sp 683___ 684# +-------------------------------+<----- %sp 685# . . 686# +-------------------------------+<----- aligned at 64 bytes 687# | __int64 tmp[0] | 688# +-------------------------------+ 689# . . 690# . . 691# +-------------------------------+<----- aligned at 64 bytes 692# . . 693($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5)); 694($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz)=map("%l$_",(0..7)); 695($ovf,$i)=($t0,$t1); 696$code.=<<___; 697 ld [$n0p+0], $t0 ! pull n0[0..1] value 698 ld [$n0p+4], $t1 699 add %sp, STACK_BIAS+STACK_FRAME, $tp 700 ldx [$bp+0], $m0 ! m0=bp[0] 701 sllx $t1, 32, $n0 702 add $bp, 8, $bp 703 or $t0, $n0, $n0 704 705 ldx [$ap+0], $aj ! ap[0] 706 707 mulx $aj, $m0, $lo0 ! ap[0]*bp[0] 708 umulxhi $aj, $m0, $hi0 709 710 ldx [$ap+8], $aj ! ap[1] 711 add $ap, 16, $ap 712 ldx [$np+0], $nj ! np[0] 713 714 mulx $lo0, $n0, $m1 ! "tp[0]"*n0 715 716 mulx $aj, $m0, $alo ! ap[1]*bp[0] 717 umulxhi $aj, $m0, $aj ! ahi=aj 718 719 mulx $nj, $m1, $lo1 ! np[0]*m1 720 umulxhi $nj, $m1, $hi1 721 722 ldx [$np+8], $nj ! np[1] 723 724 addcc $lo0, $lo1, $lo1 725 add $np, 16, $np 726 addxc %g0, $hi1, $hi1 727 728 mulx $nj, $m1, $nlo ! np[1]*m1 729 umulxhi $nj, $m1, $nj ! nhi=nj 730 731 ba .L1st 732 sub $num, 24, $cnt ! cnt=num-3 733 734.align 16 735.L1st: 736 addcc $alo, $hi0, $lo0 737 addxc $aj, %g0, $hi0 738 739 ldx [$ap+0], $aj ! ap[j] 740 addcc $nlo, $hi1, $lo1 741 add $ap, 8, $ap 742 addxc $nj, %g0, $hi1 ! nhi=nj 743 744 ldx [$np+0], $nj ! np[j] 745 mulx $aj, $m0, $alo ! ap[j]*bp[0] 746 add $np, 8, $np 747 umulxhi $aj, $m0, $aj ! ahi=aj 748 749 mulx $nj, $m1, $nlo ! np[j]*m1 750 addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0] 751 umulxhi $nj, $m1, $nj ! nhi=nj 752 addxc %g0, $hi1, $hi1 753 stxa $lo1, [$tp]0xe2 ! tp[j-1] 754 add $tp, 8, $tp ! tp++ 755 756 brnz,pt $cnt, .L1st 757 sub $cnt, 8, $cnt ! j-- 758!.L1st 759 addcc $alo, $hi0, $lo0 760 addxc $aj, %g0, $hi0 ! ahi=aj 761 762 addcc $nlo, $hi1, $lo1 763 addxc $nj, %g0, $hi1 764 addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0] 765 addxc %g0, $hi1, $hi1 766 stxa $lo1, [$tp]0xe2 ! tp[j-1] 767 add $tp, 8, $tp 768 769 addcc $hi0, $hi1, $hi1 770 addxc %g0, %g0, $ovf ! upmost overflow bit 771 stxa $hi1, [$tp]0xe2 772 add $tp, 8, $tp 773 774 ba .Louter 775 sub $num, 16, $i ! i=num-2 776 777.align 16 778.Louter: 779 ldx [$bp+0], $m0 ! m0=bp[i] 780 add $bp, 8, $bp 781 782 sub $ap, $num, $ap ! rewind 783 sub $np, $num, $np 784 sub $tp, $num, $tp 785 786 ldx [$ap+0], $aj ! ap[0] 787 ldx [$np+0], $nj ! np[0] 788 789 mulx $aj, $m0, $lo0 ! ap[0]*bp[i] 790 ldx [$tp], $tj ! tp[0] 791 umulxhi $aj, $m0, $hi0 792 ldx [$ap+8], $aj ! ap[1] 793 addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0] 794 mulx $aj, $m0, $alo ! ap[1]*bp[i] 795 addxc %g0, $hi0, $hi0 796 mulx $lo0, $n0, $m1 ! tp[0]*n0 797 umulxhi $aj, $m0, $aj ! ahi=aj 798 mulx $nj, $m1, $lo1 ! np[0]*m1 799 add $ap, 16, $ap 800 umulxhi $nj, $m1, $hi1 801 ldx [$np+8], $nj ! np[1] 802 add $np, 16, $np 803 addcc $lo1, $lo0, $lo1 804 mulx $nj, $m1, $nlo ! np[1]*m1 805 addxc %g0, $hi1, $hi1 806 umulxhi $nj, $m1, $nj ! nhi=nj 807 808 ba .Linner 809 sub $num, 24, $cnt ! cnt=num-3 810.align 16 811.Linner: 812 addcc $alo, $hi0, $lo0 813 ldx [$tp+8], $tj ! tp[j] 814 addxc $aj, %g0, $hi0 ! ahi=aj 815 ldx [$ap+0], $aj ! ap[j] 816 add $ap, 8, $ap 817 addcc $nlo, $hi1, $lo1 818 mulx $aj, $m0, $alo ! ap[j]*bp[i] 819 addxc $nj, %g0, $hi1 ! nhi=nj 820 ldx [$np+0], $nj ! np[j] 821 add $np, 8, $np 822 umulxhi $aj, $m0, $aj ! ahi=aj 823 addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j] 824 mulx $nj, $m1, $nlo ! np[j]*m1 825 addxc %g0, $hi0, $hi0 826 umulxhi $nj, $m1, $nj ! nhi=nj 827 addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j] 828 addxc %g0, $hi1, $hi1 829 stx $lo1, [$tp] ! tp[j-1] 830 add $tp, 8, $tp 831 brnz,pt $cnt, .Linner 832 sub $cnt, 8, $cnt 833!.Linner 834 ldx [$tp+8], $tj ! tp[j] 835 addcc $alo, $hi0, $lo0 836 addxc $aj, %g0, $hi0 ! ahi=aj 837 addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j] 838 addxc %g0, $hi0, $hi0 839 840 addcc $nlo, $hi1, $lo1 841 addxc $nj, %g0, $hi1 ! nhi=nj 842 addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j] 843 addxc %g0, $hi1, $hi1 844 stx $lo1, [$tp] ! tp[j-1] 845 846 subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc 847 addxccc $hi1, $hi0, $hi1 848 addxc %g0, %g0, $ovf 849 stx $hi1, [$tp+8] 850 add $tp, 16, $tp 851 852 brnz,pt $i, .Louter 853 sub $i, 8, $i 854 855 sub $ap, $num, $ap ! rewind 856 sub $np, $num, $np 857 sub $tp, $num, $tp 858 ba .Lsub 859 subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc 860 861.align 16 862.Lsub: 863 ldx [$tp], $tj 864 add $tp, 8, $tp 865 ldx [$np+0], $nj 866 add $np, 8, $np 867 subccc $tj, $nj, $t2 ! tp[j]-np[j] 868 srlx $tj, 32, $tj 869 srlx $nj, 32, $nj 870 subccc $tj, $nj, $t3 871 add $rp, 8, $rp 872 st $t2, [$rp-4] ! reverse order 873 st $t3, [$rp-8] 874 brnz,pt $cnt, .Lsub 875 sub $cnt, 8, $cnt 876 877 sub $np, $num, $np ! rewind 878 sub $tp, $num, $tp 879 sub $rp, $num, $rp 880 881 subccc $ovf, %g0, $ovf ! handle upmost overflow bit 882 ba .Lcopy 883 sub $num, 8, $cnt 884 885.align 16 886.Lcopy: ! conditional copy 887 ldx [$tp], $tj 888 ldx [$rp+0], $t2 889 stx %g0, [$tp] ! zap 890 add $tp, 8, $tp 891 movcs %icc, $tj, $t2 892 stx $t2, [$rp+0] 893 add $rp, 8, $rp 894 brnz $cnt, .Lcopy 895 sub $cnt, 8, $cnt 896 897 mov 1, %o0 898 ret 899 restore 900.type bn_mul_mont_t4, #function 901.size bn_mul_mont_t4, .-bn_mul_mont_t4 902___ 903 904# int bn_mul_mont_gather5( 905$rp="%o0"; # u64 *rp, 906$ap="%o1"; # const u64 *ap, 907$bp="%o2"; # const u64 *pwrtbl, 908$np="%o3"; # const u64 *np, 909$n0p="%o4"; # const BN_ULONG *n0, 910$num="%o5"; # int num, # caller ensures that num is >=3 911 # int power); 912$code.=<<___; 913.globl bn_mul_mont_gather5_t4 914.align 32 915bn_mul_mont_gather5_t4: 916 add %sp, STACK_BIAS, %g4 ! real top of stack 917 sll $num, 3, $num ! size in bytes 918 add $num, 63, %g1 919 andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes 920 sub %g4, %g1, %g1 921 andn %g1, 63, %g1 ! align at 64 byte 922 sub %g1, STACK_FRAME, %g1 ! new top of stack 923 sub %g1, %g4, %g1 924 LDPTR [%sp+STACK_7thARG], %g4 ! load power, 7th argument 925 926 save %sp, %g1, %sp 927___ 928# +-------------------------------+<----- %sp 929# . . 930# +-------------------------------+<----- aligned at 64 bytes 931# | __int64 tmp[0] | 932# +-------------------------------+ 933# . . 934# . . 935# +-------------------------------+<----- aligned at 64 bytes 936# . . 937($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5)); 938($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$ccr)=map("%l$_",(0..7)); 939($ovf,$i)=($t0,$t1); 940 &load_ccr($bp,"%g4",$ccr); 941 &load_b($bp,$m0,"%o7"); # m0=bp[0] 942 943$code.=<<___; 944 ld [$n0p+0], $t0 ! pull n0[0..1] value 945 ld [$n0p+4], $t1 946 add %sp, STACK_BIAS+STACK_FRAME, $tp 947 sllx $t1, 32, $n0 948 or $t0, $n0, $n0 949 950 ldx [$ap+0], $aj ! ap[0] 951 952 mulx $aj, $m0, $lo0 ! ap[0]*bp[0] 953 umulxhi $aj, $m0, $hi0 954 955 ldx [$ap+8], $aj ! ap[1] 956 add $ap, 16, $ap 957 ldx [$np+0], $nj ! np[0] 958 959 mulx $lo0, $n0, $m1 ! "tp[0]"*n0 960 961 mulx $aj, $m0, $alo ! ap[1]*bp[0] 962 umulxhi $aj, $m0, $aj ! ahi=aj 963 964 mulx $nj, $m1, $lo1 ! np[0]*m1 965 umulxhi $nj, $m1, $hi1 966 967 ldx [$np+8], $nj ! np[1] 968 969 addcc $lo0, $lo1, $lo1 970 add $np, 16, $np 971 addxc %g0, $hi1, $hi1 972 973 mulx $nj, $m1, $nlo ! np[1]*m1 974 umulxhi $nj, $m1, $nj ! nhi=nj 975 976 ba .L1st_g5 977 sub $num, 24, $cnt ! cnt=num-3 978 979.align 16 980.L1st_g5: 981 addcc $alo, $hi0, $lo0 982 addxc $aj, %g0, $hi0 983 984 ldx [$ap+0], $aj ! ap[j] 985 addcc $nlo, $hi1, $lo1 986 add $ap, 8, $ap 987 addxc $nj, %g0, $hi1 ! nhi=nj 988 989 ldx [$np+0], $nj ! np[j] 990 mulx $aj, $m0, $alo ! ap[j]*bp[0] 991 add $np, 8, $np 992 umulxhi $aj, $m0, $aj ! ahi=aj 993 994 mulx $nj, $m1, $nlo ! np[j]*m1 995 addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0] 996 umulxhi $nj, $m1, $nj ! nhi=nj 997 addxc %g0, $hi1, $hi1 998 stxa $lo1, [$tp]0xe2 ! tp[j-1] 999 add $tp, 8, $tp ! tp++ 1000 1001 brnz,pt $cnt, .L1st_g5 1002 sub $cnt, 8, $cnt ! j-- 1003!.L1st_g5 1004 addcc $alo, $hi0, $lo0 1005 addxc $aj, %g0, $hi0 ! ahi=aj 1006 1007 addcc $nlo, $hi1, $lo1 1008 addxc $nj, %g0, $hi1 1009 addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0] 1010 addxc %g0, $hi1, $hi1 1011 stxa $lo1, [$tp]0xe2 ! tp[j-1] 1012 add $tp, 8, $tp 1013 1014 addcc $hi0, $hi1, $hi1 1015 addxc %g0, %g0, $ovf ! upmost overflow bit 1016 stxa $hi1, [$tp]0xe2 1017 add $tp, 8, $tp 1018 1019 ba .Louter_g5 1020 sub $num, 16, $i ! i=num-2 1021 1022.align 16 1023.Louter_g5: 1024 wr $ccr, %g0, %ccr 1025___ 1026 &load_b($bp,$m0); # m0=bp[i] 1027$code.=<<___; 1028 sub $ap, $num, $ap ! rewind 1029 sub $np, $num, $np 1030 sub $tp, $num, $tp 1031 1032 ldx [$ap+0], $aj ! ap[0] 1033 ldx [$np+0], $nj ! np[0] 1034 1035 mulx $aj, $m0, $lo0 ! ap[0]*bp[i] 1036 ldx [$tp], $tj ! tp[0] 1037 umulxhi $aj, $m0, $hi0 1038 ldx [$ap+8], $aj ! ap[1] 1039 addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0] 1040 mulx $aj, $m0, $alo ! ap[1]*bp[i] 1041 addxc %g0, $hi0, $hi0 1042 mulx $lo0, $n0, $m1 ! tp[0]*n0 1043 umulxhi $aj, $m0, $aj ! ahi=aj 1044 mulx $nj, $m1, $lo1 ! np[0]*m1 1045 add $ap, 16, $ap 1046 umulxhi $nj, $m1, $hi1 1047 ldx [$np+8], $nj ! np[1] 1048 add $np, 16, $np 1049 addcc $lo1, $lo0, $lo1 1050 mulx $nj, $m1, $nlo ! np[1]*m1 1051 addxc %g0, $hi1, $hi1 1052 umulxhi $nj, $m1, $nj ! nhi=nj 1053 1054 ba .Linner_g5 1055 sub $num, 24, $cnt ! cnt=num-3 1056.align 16 1057.Linner_g5: 1058 addcc $alo, $hi0, $lo0 1059 ldx [$tp+8], $tj ! tp[j] 1060 addxc $aj, %g0, $hi0 ! ahi=aj 1061 ldx [$ap+0], $aj ! ap[j] 1062 add $ap, 8, $ap 1063 addcc $nlo, $hi1, $lo1 1064 mulx $aj, $m0, $alo ! ap[j]*bp[i] 1065 addxc $nj, %g0, $hi1 ! nhi=nj 1066 ldx [$np+0], $nj ! np[j] 1067 add $np, 8, $np 1068 umulxhi $aj, $m0, $aj ! ahi=aj 1069 addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j] 1070 mulx $nj, $m1, $nlo ! np[j]*m1 1071 addxc %g0, $hi0, $hi0 1072 umulxhi $nj, $m1, $nj ! nhi=nj 1073 addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j] 1074 addxc %g0, $hi1, $hi1 1075 stx $lo1, [$tp] ! tp[j-1] 1076 add $tp, 8, $tp 1077 brnz,pt $cnt, .Linner_g5 1078 sub $cnt, 8, $cnt 1079!.Linner_g5 1080 ldx [$tp+8], $tj ! tp[j] 1081 addcc $alo, $hi0, $lo0 1082 addxc $aj, %g0, $hi0 ! ahi=aj 1083 addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j] 1084 addxc %g0, $hi0, $hi0 1085 1086 addcc $nlo, $hi1, $lo1 1087 addxc $nj, %g0, $hi1 ! nhi=nj 1088 addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j] 1089 addxc %g0, $hi1, $hi1 1090 stx $lo1, [$tp] ! tp[j-1] 1091 1092 subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc 1093 addxccc $hi1, $hi0, $hi1 1094 addxc %g0, %g0, $ovf 1095 stx $hi1, [$tp+8] 1096 add $tp, 16, $tp 1097 1098 brnz,pt $i, .Louter_g5 1099 sub $i, 8, $i 1100 1101 sub $ap, $num, $ap ! rewind 1102 sub $np, $num, $np 1103 sub $tp, $num, $tp 1104 ba .Lsub_g5 1105 subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc 1106 1107.align 16 1108.Lsub_g5: 1109 ldx [$tp], $tj 1110 add $tp, 8, $tp 1111 ldx [$np+0], $nj 1112 add $np, 8, $np 1113 subccc $tj, $nj, $t2 ! tp[j]-np[j] 1114 srlx $tj, 32, $tj 1115 srlx $nj, 32, $nj 1116 subccc $tj, $nj, $t3 1117 add $rp, 8, $rp 1118 st $t2, [$rp-4] ! reverse order 1119 st $t3, [$rp-8] 1120 brnz,pt $cnt, .Lsub_g5 1121 sub $cnt, 8, $cnt 1122 1123 sub $np, $num, $np ! rewind 1124 sub $tp, $num, $tp 1125 sub $rp, $num, $rp 1126 1127 subccc $ovf, %g0, $ovf ! handle upmost overflow bit 1128 ba .Lcopy_g5 1129 sub $num, 8, $cnt 1130 1131.align 16 1132.Lcopy_g5: ! conditional copy 1133 ldx [$tp], $tj 1134 ldx [$rp+0], $t2 1135 stx %g0, [$tp] ! zap 1136 add $tp, 8, $tp 1137 movcs %icc, $tj, $t2 1138 stx $t2, [$rp+0] 1139 add $rp, 8, $rp 1140 brnz $cnt, .Lcopy_g5 1141 sub $cnt, 8, $cnt 1142 1143 mov 1, %o0 1144 ret 1145 restore 1146.type bn_mul_mont_gather5_t4, #function 1147.size bn_mul_mont_gather5_t4, .-bn_mul_mont_gather5_t4 1148___ 1149} 1150 1151$code.=<<___; 1152.globl bn_flip_t4 1153.align 32 1154bn_flip_t4: 1155.Loop_flip: 1156 ld [%o1+0], %o4 1157 sub %o2, 1, %o2 1158 ld [%o1+4], %o5 1159 add %o1, 8, %o1 1160 st %o5, [%o0+0] 1161 st %o4, [%o0+4] 1162 brnz %o2, .Loop_flip 1163 add %o0, 8, %o0 1164 retl 1165 nop 1166.type bn_flip_t4, #function 1167.size bn_flip_t4, .-bn_flip_t4 1168 1169.globl bn_flip_n_scatter5_t4 1170.align 32 1171bn_flip_n_scatter5_t4: 1172 sll %o3, 3, %o3 1173 srl %o1, 1, %o1 1174 add %o3, %o2, %o2 ! &pwrtbl[pwr] 1175 sub %o1, 1, %o1 1176.Loop_flip_n_scatter5: 1177 ld [%o0+0], %o4 ! inp[i] 1178 ld [%o0+4], %o5 1179 add %o0, 8, %o0 1180 sllx %o5, 32, %o5 1181 or %o4, %o5, %o5 1182 stx %o5, [%o2] 1183 add %o2, 32*8, %o2 1184 brnz %o1, .Loop_flip_n_scatter5 1185 sub %o1, 1, %o1 1186 retl 1187 nop 1188.type bn_flip_n_scatter5_t4, #function 1189.size bn_flip_n_scatter5_t4, .-bn_flip_n_scatter5_t4 1190 1191.globl bn_gather5_t4 1192.align 32 1193bn_gather5_t4: 1194___ 1195 &load_ccr("%o2","%o3","%g1"); 1196$code.=<<___; 1197 sub %o1, 1, %o1 1198.Loop_gather5: 1199___ 1200 &load_b("%o2","%g1"); 1201$code.=<<___; 1202 stx %g1, [%o0] 1203 add %o0, 8, %o0 1204 brnz %o1, .Loop_gather5 1205 sub %o1, 1, %o1 1206 1207 retl 1208 nop 1209.type bn_gather5_t4, #function 1210.size bn_gather5_t4, .-bn_gather5_t4 1211 1212.asciz "Montgomery Multiplication for SPARC T4, David S. Miller, Andy Polyakov" 1213.align 4 1214___ 1215 1216&emit_assembler(); 1217 1218close STDOUT; 1219