#!/usr/bin/env perl

# ====================================================================
# Written by David S. Miller <davem@devemloft.net> and Andy Polyakov
# <appro@openssl.org>. The module is licensed under 2-clause BSD
# license. November 2012. All rights reserved.
# ====================================================================

######################################################################
# Montgomery squaring-n-multiplication module for SPARC T4.
#
# The module consists of three parts:
#
# 1) collection of "single-op" subroutines that perform single
#    operation, Montgomery squaring or multiplication, on 512-,
#    1024-, 1536- and 2048-bit operands;
# 2) collection of "multi-op" subroutines that perform 5 squaring and
#    1 multiplication operations on operands of above lengths;
# 3) fall-back and helper VIS3 subroutines.
#
# RSA sign is dominated by multi-op subroutine, while RSA verify and
# DSA - by single-op. Special note about 4096-bit RSA verify result.
# Operands are too long for dedicated hardware and it's handled by
# VIS3 code, which is why you don't see any improvement. It's surely
# possible to improve it [by deploying 'mpmul' instruction], maybe in
# the future...
#
# Performance improvement.
#
# 64-bit process, VIS3:
#			sign	verify	sign/s	verify/s
# rsa 1024 bits	0.000628s	0.000028s	1592.4	35434.4
# rsa 2048 bits	0.003282s	0.000106s	304.7	9438.3
# rsa 4096 bits	0.025866s	0.000340s	38.7	2940.9
# dsa 1024 bits	0.000301s	0.000332s	3323.7	3013.9
# dsa 2048 bits	0.001056s	0.001233s	946.9	810.8
#
# 64-bit process, this module:
#			sign	verify	sign/s	verify/s
# rsa 1024 bits	0.000256s	0.000016s	3904.4	61411.9
# rsa 2048 bits	0.000946s	0.000029s	1056.8	34292.7
# rsa 4096 bits	0.005061s	0.000340s	197.6	2940.5
# dsa 1024 bits	0.000176s	0.000195s	5674.7	5130.5
# dsa 2048 bits	0.000296s	0.000354s	3383.2	2827.6
#
######################################################################
# 32-bit process, VIS3:
#			sign	verify	sign/s	verify/s
# rsa 1024 bits	0.000665s	0.000028s	1504.8	35233.3
# rsa 2048 bits	0.003349s	0.000106s	298.6	9433.4
# rsa 4096 bits	0.025959s	0.000341s	38.5	2934.8
# dsa 1024 bits	0.000320s	0.000341s	3123.3	2929.6
# dsa 2048 bits	0.001101s	0.001260s	908.2	793.4
#
# 32-bit process, this module:
#			sign	verify	sign/s	verify/s
# rsa 1024 bits	0.000301s	0.000017s	3317.1	60240.0
# rsa 2048 bits	0.001034s	0.000030s	966.9	33812.7
# rsa 4096 bits	0.005244s	0.000341s	190.7	2935.4
# dsa 1024 bits	0.000201s	0.000205s	4976.1	4879.2
# dsa 2048 bits	0.000328s	0.000360s	3051.1	2774.2
#
# 32-bit code is prone to performance degradation as interrupt rate
# dispatched to CPU executing the code grows. This is because in
# standard process of handling interrupt in 32-bit process context
# upper halves of most integer registers used as input or output are
# zeroed. This renders result invalid, and operation has to be re-run.
# If CPU is "bothered" with timer interrupts only, the penalty is
# hardly measurable. But in order to mitigate this problem for higher
# interrupt rates contemporary Linux kernel recognizes biased stack
# even in 32-bit process context and preserves full register contents.
# See http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=517ffce4e1a03aea979fe3a18a3dd1761a24fafb
# for details.

# Locate the directory this script lives in and pull in the shared
# SPARCv9 perlasm helpers (they provide emit_assembler() and the
# SPARC_* assembler macros used in the generated code below).
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";

$code.=<<___;
#include "sparc_arch.h"

#ifdef	__arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

#ifdef	__PIC__
SPARC_PIC_THUNK(%g1)
#endif
___

########################################################################
# Register layout for mont[mul|sqr] instructions.
# For details see "Oracle SPARC Architecture 2011" manual at
# http://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/.
#
# @R - even-numbered FP registers holding the result vector;
# @A/@N/@B - register banks for the A operand, the modulus N and the
# B operand respectively, spread over several register windows (hence
# the duplication when the lists are extended past one window).
#
my @R=map("%f".2*$_,(0..11,30,31,12..29));
my @N=(map("%l$_",(0..7)),map("%o$_",(0..5)));	@N=(@N,@N,@N[0..3]);
my @A=(@N[0..13],@R[14..31]);
my @B=(map("%i$_",(0..5)),map("%l$_",(0..7)));	@B=(@B,@B,map("%o$_",(0..3)));

########################################################################
# int bn_mul_mont_t4_$NUM(u64 *rp,const u64 *ap,const u64 *bp,
#			  const u64 *np,const BN_ULONG *n0);
#
# Emits one "single-op" Montgomery multiplication routine for a fixed
# operand length of $NUM 64-bit words. On 32-bit a sentinel pattern is
# kept in the upper half of %fp of every window; if interrupt handling
# zeroes the upper register halves (see header note), the sentinel
# vanishes and the routine returns 0 so the caller can fall back/retry.
#
sub generate_bn_mul_mont_t4() {
my $NUM=shift;
my ($rp,$ap,$bp,$np,$sentinel)=map("%g$_",(1..5));

$code.=<<___;
.globl	bn_mul_mont_t4_$NUM
.align	32
bn_mul_mont_t4_$NUM:
#ifdef	__arch64__
	mov	0,$sentinel
	mov	-128,%g4
#elif defined(SPARCV9_64BIT_STACK)
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+0],%g1	! OPENSSL_sparcv9_P[0]
	mov	-2047,%g4
	and	%g1,SPARCV9_64BIT_STACK,%g1
	movrz	%g1,0,%g4
	mov	-1,$sentinel
	add	%g4,-128,%g4
#else
	mov	-1,$sentinel
	mov	-128,%g4
#endif
	sllx	$sentinel,32,$sentinel
	save	%sp,%g4,%sp
#ifndef	__arch64__
	save	%sp,-128,%sp	! warm it up
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	restore
	restore
	restore
	restore
	restore
	restore
#endif
	and	%sp,1,%g4
	or	$sentinel,%fp,%fp
	or	%g4,$sentinel,$sentinel

	! copy arguments to global registers
	mov	%i0,$rp
	mov	%i1,$ap
	mov	%i2,$bp
	mov	%i3,$np
	ld	[%i4+0],%f1	! load *n0
	ld	[%i4+4],%f0
	fsrc2	%f0,%f60
___

# load ap[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@A[$i+1]:"%o7";
$code.=<<___;
	ld	[$ap+$i*8+0],$lo
	ld	[$ap+$i*8+4],@A[$i]
	sllx	@A[$i],32,@A[$i]
	or	$lo,@A[$i],@A[$i]
___
}
# remaining limbs go to FP registers
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
$code.=<<___;
	ld	[$ap+$i*8+0],$lo
	ld	[$ap+$i*8+4],$hi
	fsrc2	$hi,@A[$i]
___
}
# load np[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@N[$i+1]:"%o7";
$code.=<<___;
	ld	[$np+$i*8+0],$lo
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	or	$lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<28 && $i<$NUM; $i++) {
my $lo=$i<27?@N[$i+1]:"%o7";
$code.=<<___;
	ld	[$np+$i*8+0],$lo
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	or	$lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@N[$i+1]:"%o7";
$code.=<<___;
	ld	[$np+$i*8+0],$lo
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	or	$lo,@N[$i],@N[$i]
___
}
# ap==bp means squaring; branch to the montsqr entry instead
$code.=<<___;
	cmp	$ap,$bp
	be	SIZE_T_CC,.Lmsquare_$NUM
	nop
___

# load bp[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@B[$i+1]:"%o7";
$code.=<<___;
	ld	[$bp+$i*8+0],$lo
	ld	[$bp+$i*8+4],@B[$i]
	sllx	@B[$i],32,@B[$i]
	or	$lo,@B[$i],@B[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@B[$i+1]:"%o7";
$code.=<<___;
	ld	[$bp+$i*8+0],$lo
	ld	[$bp+$i*8+4],@B[$i]
	sllx	@B[$i],32,@B[$i]
	or	$lo,@B[$i],@B[$i]
___
}
# magic ################################################################
# 0x81b02920/0x81b02940 are the T4 montmul/montsqr opcodes, emitted as
# raw .word since assemblers may not know them; %fcc3 set signals that
# the hardware refused the operation.
$code.=<<___;
	.word	0x81b02920+$NUM-1	! montmul	$NUM-1
.Lmresume_$NUM:
	fbu,pn	%fcc3,.Lmabort_$NUM
#ifndef	__arch64__
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Lmabort_$NUM
#endif
	nop
#ifdef	__arch64__
	restore
	restore
	restore
	restore
	restore
#else
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	 brz,pn	$sentinel,.Lmabort1_$NUM
	restore
#endif
___

# save tp[$NUM] ########################################################
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	movxtod	@A[$i],@R[$i]
___
}
$code.=<<___;
#ifdef	__arch64__
	restore
#else
	and	%fp,$sentinel,$sentinel
	restore
	and	$sentinel,1,%o7
	and	%fp,$sentinel,$sentinel
	srl	%fp,0,%fp		! just in case?
	or	%o7,$sentinel,$sentinel
	brz,a,pn $sentinel,.Lmdone_$NUM
	mov	0,%i0		! return failure
#endif
___
for($i=0; $i<12 && $i<$NUM; $i++) {
@R[$i] =~ /%f([0-9]+)/;
my $lo = "%f".($1+1);
$code.=<<___;
	st	$lo,[$rp+$i*8+0]
	st	@R[$i],[$rp+$i*8+4]
___
}
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
$code.=<<___;
	fsrc2	@R[$i],$hi
	st	$lo,[$rp+$i*8+0]
	st	$hi,[$rp+$i*8+4]
___
}
$code.=<<___;
	mov	1,%i0		! return success
.Lmdone_$NUM:
	ret
	restore

.Lmabort_$NUM:
	restore
	restore
	restore
	restore
	restore
.Lmabort1_$NUM:
	restore

	mov	0,%i0		! return failure
	ret
	restore

.align	32
.Lmsquare_$NUM:
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
	.word	0x81b02940+$NUM-1	! montsqr	$NUM-1
	ba	.Lmresume_$NUM
	nop
.type	bn_mul_mont_t4_$NUM, #function
.size	bn_mul_mont_t4_$NUM, .-bn_mul_mont_t4_$NUM
___
}

for ($i=8;$i<=32;$i+=8) {
	&generate_bn_mul_mont_t4($i);
}

########################################################################
# Compute the %ccr mask that selects power-table entry $pwr; the table
# is laid out so that consecutive entries sit in different 32-byte
# cache lines, and load_b/load_b_pair read every candidate slot and
# pick one with conditional moves keyed on these flags.
#
sub load_ccr {
my ($ptbl,$pwr,$ccr,$skip_wr)=@_;
$code.=<<___;
	srl	$pwr,	2,	%o4
	and	$pwr,	3,	%o5
	and	%o4,	7,	%o4
	sll	%o5,	3,	%o5	! offset within first cache line
	add	%o5,	$ptbl,	$ptbl	! of the pwrtbl
	or	%g0,	1,	%o5
	sll	%o5,	%o4,	$ccr
___
$code.=<<___	if (!$skip_wr);
	wr	$ccr,	%g0,	%ccr
___
}
# Gather two adjacent 64-bit values from the power table: all 16 table
# slots for each value are loaded, one is retained via %icc/%xcc
# conditional moves (flags pre-set by load_ccr).
sub load_b_pair {
my ($pwrtbl,$B0,$B1)=@_;

$code.=<<___;
	ldx	[$pwrtbl+0*32],	$B0
	ldx	[$pwrtbl+8*32],	$B1
	ldx	[$pwrtbl+1*32],	%o4
	ldx	[$pwrtbl+9*32],	%o5
	movvs	%icc,	%o4,	$B0
	ldx	[$pwrtbl+2*32],	%o4
	movvs	%icc,	%o5,	$B1
	ldx	[$pwrtbl+10*32],%o5
	move	%icc,	%o4,	$B0
	ldx	[$pwrtbl+3*32],	%o4
	move	%icc,	%o5,	$B1
	ldx	[$pwrtbl+11*32],%o5
	movneg	%icc,	%o4,	$B0
	ldx	[$pwrtbl+4*32],	%o4
	movneg	%icc,	%o5,	$B1
	ldx	[$pwrtbl+12*32],%o5
	movcs	%xcc,	%o4,	$B0
	ldx	[$pwrtbl+5*32],%o4
	movcs	%xcc,	%o5,	$B1
	ldx	[$pwrtbl+13*32],%o5
	movvs	%xcc,	%o4,	$B0
	ldx	[$pwrtbl+6*32],	%o4
	movvs	%xcc,	%o5,	$B1
	ldx	[$pwrtbl+14*32],%o5
	move	%xcc,	%o4,	$B0
	ldx	[$pwrtbl+7*32],	%o4
	move	%xcc,	%o5,	$B1
	ldx	[$pwrtbl+15*32],%o5
	movneg	%xcc,	%o4,	$B0
	add	$pwrtbl,16*32,	$pwrtbl
	movneg	%xcc,	%o5,	$B1
___
}
# Gather a single 64-bit value from the power table, same conditional-
# move selection technique as load_b_pair but over 8 slots.
sub load_b {
my ($pwrtbl,$Bi)=@_;

$code.=<<___;
	ldx	[$pwrtbl+0*32],	$Bi
	ldx	[$pwrtbl+1*32],	%o4
	ldx	[$pwrtbl+2*32],	%o5
	movvs	%icc,	%o4,	$Bi
	ldx	[$pwrtbl+3*32],	%o4
	move	%icc,	%o5,	$Bi
	ldx	[$pwrtbl+4*32],	%o5
	movneg	%icc,	%o4,	$Bi
	ldx	[$pwrtbl+5*32],	%o4
	movcs	%xcc,	%o5,	$Bi
	ldx	[$pwrtbl+6*32],	%o5
	movvs	%xcc,	%o4,	$Bi
	ldx	[$pwrtbl+7*32],	%o4
	move	%xcc,	%o5,	$Bi
	add	$pwrtbl,8*32,	$pwrtbl
	movneg	%xcc,	%o4,	$Bi
___
}

########################################################################
# int bn_pwr5_mont_t4_$NUM(u64 *tp,const u64 *np,const BN_ULONG *n0,
#			   const u64 *pwrtbl,int pwr,int stride);
#
# Emits one "multi-op" routine: repeated blocks of 5 Montgomery
# squarings followed by 1 multiplication by a value gathered from
# pwrtbl (the RSA-sign workhorse). Same 32-bit sentinel/abort logic
# as in generate_bn_mul_mont_t4.
#
sub generate_bn_pwr5_mont_t4() {
my $NUM=shift;
my ($tp,$np,$pwrtbl,$pwr,$sentinel)=map("%g$_",(1..5));

$code.=<<___;
.globl	bn_pwr5_mont_t4_$NUM
.align	32
bn_pwr5_mont_t4_$NUM:
#ifdef	__arch64__
	mov	0,$sentinel
	mov	-128,%g4
#elif defined(SPARCV9_64BIT_STACK)
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+0],%g1	! OPENSSL_sparcv9_P[0]
	mov	-2047,%g4
	and	%g1,SPARCV9_64BIT_STACK,%g1
	movrz	%g1,0,%g4
	mov	-1,$sentinel
	add	%g4,-128,%g4
#else
	mov	-1,$sentinel
	mov	-128,%g4
#endif
	sllx	$sentinel,32,$sentinel
	save	%sp,%g4,%sp
#ifndef	__arch64__
	save	%sp,-128,%sp	! warm it up
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	restore
	restore
	restore
	restore
	restore
	restore
#endif
	and	%sp,1,%g4
	or	$sentinel,%fp,%fp
	or	%g4,$sentinel,$sentinel

	! copy arguments to global registers
	mov	%i0,$tp
	mov	%i1,$np
	ld	[%i2+0],%f1	! load *n0
	ld	[%i2+4],%f0
	mov	%i3,$pwrtbl
	srl	%i4,%g0,%i4	! pack last arguments
	sllx	%i5,32,$pwr
	or	%i4,$pwr,$pwr
	fsrc2	%f0,%f60
___

# load tp[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$tp+$i*8],@A[$i]
___
}
for(; $i<$NUM; $i++) {
$code.=<<___;
	ldd	[$tp+$i*8],@A[$i]
___
}
# load np[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$np+$i*8],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<28 && $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$np+$i*8],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$np+$i*8],@N[$i]
___
}
# load pwrtbl[pwr] ########################################################
# $pwr packs the remaining stride count (upper 32 bits) with the bit
# offset of the next table index (lower 32 bits); each iteration peels
# 5 bits off and re-packs.
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp

	srlx	$pwr,	32,	%o4		! unpack $pwr
	srl	$pwr,	%g0,	%o5
	sub	%o4,	5,	%o4
	mov	$pwrtbl,	%o7
	sllx	%o4,	32,	$pwr		! re-pack $pwr
	or	%o5,	$pwr,	$pwr
	srl	%o5,	%o4,	%o5
___
	&load_ccr("%o7","%o5","%o4");
$code.=<<___;
	b	.Lstride_$NUM
	nop
.align	16
.Lstride_$NUM:
___
for($i=0; $i<14 && $i<$NUM; $i+=2) {
	&load_b_pair("%o7",@B[$i],@B[$i+1]);
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i+=2) {
	&load_b_pair("%i7",@B[$i],@B[$i+1]);
}
$code.=<<___;
	srax	$pwr,	32,	%o4		! unpack $pwr
	srl	$pwr,	%g0,	%o5
	sub	%o4,	5,	%o4
	mov	$pwrtbl,	%i7
	sllx	%o4,	32,	$pwr		! re-pack $pwr
	or	%o5,	$pwr,	$pwr
	srl	%o5,	%o4,	%o5
___
	&load_ccr("%i7","%o5","%o4",1);

# magic ################################################################
# 5 hardware Montgomery squarings, then one multiplication by the
# gathered table entry; any %fcc3/sentinel failure aborts.
for($i=0; $i<5; $i++) {
$code.=<<___;
	.word	0x81b02940+$NUM-1	! montsqr	$NUM-1
	fbu,pn	%fcc3,.Labort_$NUM
#ifndef	__arch64__
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Labort_$NUM
#endif
	nop
___
}
$code.=<<___;
	wr	%o4,	%g0,	%ccr
	.word	0x81b02920+$NUM-1	! montmul	$NUM-1
	fbu,pn	%fcc3,.Labort_$NUM
#ifndef	__arch64__
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Labort_$NUM
#endif

	srax	$pwr,	32,	%o4
#ifdef	__arch64__
	brgez	%o4,.Lstride_$NUM
	restore
	restore
	restore
	restore
	restore
#else
	brgez	%o4,.Lstride_$NUM
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	 brz,pn	$sentinel,.Labort1_$NUM
	restore
#endif
___

# save tp[$NUM] ########################################################
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	movxtod	@A[$i],@R[$i]
___
}
$code.=<<___;
#ifdef	__arch64__
	restore
#else
	and	%fp,$sentinel,$sentinel
	restore
	and	$sentinel,1,%o7
	and	%fp,$sentinel,$sentinel
	srl	%fp,0,%fp		! just in case?
	or	%o7,$sentinel,$sentinel
	brz,a,pn $sentinel,.Ldone_$NUM
	mov	0,%i0		! return failure
#endif
___
for($i=0; $i<$NUM; $i++) {
$code.=<<___;
	std	@R[$i],[$tp+$i*8]
___
}
$code.=<<___;
	mov	1,%i0		! return success
.Ldone_$NUM:
	ret
	restore

.Labort_$NUM:
	restore
	restore
	restore
	restore
	restore
.Labort1_$NUM:
	restore

	mov	0,%i0		! return failure
	ret
	restore
.type	bn_pwr5_mont_t4_$NUM, #function
.size	bn_pwr5_mont_t4_$NUM, .-bn_pwr5_mont_t4_$NUM
___
}

for ($i=8;$i<=32;$i+=8) {
	&generate_bn_pwr5_mont_t4($i);
}

{
########################################################################
# Fall-back subroutines
#
# copy of bn_mul_mont_vis3 adjusted for vectors of 64-bit values
#
($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
	(map("%g$_",(1..5)),map("%o$_",(0..5,7)));

# int bn_mul_mont(
$rp="%o0";	# u64 *rp,
$ap="%o1";	# const u64 *ap,
$bp="%o2";	# const u64 *bp,
$np="%o3";	# const u64 *np,
$n0p="%o4";	# const BN_ULONG *n0,
$num="%o5";	# int num);	# caller ensures that num is >=3
$code.=<<___;
.globl	bn_mul_mont_t4
.align	32
bn_mul_mont_t4:
	add	%sp,	STACK_BIAS,	%g4	! real top of stack
	sll	$num,	3,	$num		! size in bytes
	add	$num,	63,	%g1
	andn	%g1,	63,	%g1		! buffer size rounded up to 64 bytes
	sub	%g4,	%g1,	%g1
	andn	%g1,	63,	%g1		! align at 64 byte
	sub	%g1,	STACK_FRAME,	%g1	! new top of stack
	sub	%g1,	%g4,	%g1

	save	%sp,	%g1,	%sp
___
# +-------------------------------+<-----	%sp
# .				.
# +-------------------------------+<-----	aligned at 64 bytes
# | __int64 tmp[0]		|
# +-------------------------------+
# .				.
# .				.
# +-------------------------------+<-----	aligned at 64 bytes
# .				.
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz)=map("%l$_",(0..7));
($ovf,$i)=($t0,$t1);
$code.=<<___;
	ld	[$n0p+0],	$t0	! pull n0[0..1] value
	ld	[$n0p+4],	$t1
	add	%sp, STACK_BIAS+STACK_FRAME, $tp
	ldx	[$bp+0],	$m0	! m0=bp[0]
	sllx	$t1,	32,	$n0
	add	$bp,	8,	$bp
	or	$t0,	$n0,	$n0

	ldx	[$ap+0],	$aj	! ap[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[0]
	umulxhi	$aj,	$m0,	$hi0

	ldx	[$ap+8],	$aj	! ap[1]
	add	$ap,	16,	$ap
	ldx	[$np+0],	$nj	! np[0]

	mulx	$lo0,	$n0,	$m1	! "tp[0]"*n0

	mulx	$aj,	$m0,	$alo	! ap[1]*bp[0]
	umulxhi	$aj,	$m0,	$aj	! ahi=aj

	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	umulxhi	$nj,	$m1,	$hi1

	ldx	[$np+8],	$nj	! np[1]

	addcc	$lo0,	$lo1,	$lo1
	add	$np,	16,	$np
	addxc	%g0,	$hi1,	$hi1

	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj

	ba	.L1st
	sub	$num,	24,	$cnt	! cnt=num-3

.align	16
.L1st:
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0

	ldx	[$ap+0],	$aj	! ap[j]
	addcc	$nlo,	$hi1,	$lo1
	add	$ap,	8,	$ap
	addxc	$nj,	%g0,	$hi1	! nhi=nj

	ldx	[$np+0],	$nj	! np[j]
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[0]
	add	$np,	8,	$np
	umulxhi	$aj,	$m0,	$aj	! ahi=aj

	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
	add	$tp,	8,	$tp	! tp++

	brnz,pt	$cnt,	.L1st
	sub	$cnt,	8,	$cnt	! j--
!.L1st
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
	add	$tp,	8,	$tp

	addcc	$hi0,	$hi1,	$hi1
	addxc	%g0,	%g0,	$ovf	! upmost overflow bit
	stxa	$hi1,	[$tp]0xe2
	add	$tp,	8,	$tp

	ba	.Louter
	sub	$num,	16,	$i	! i=num-2

.align	16
.Louter:
	ldx	[$bp+0],	$m0	! m0=bp[i]
	add	$bp,	8,	$bp

	sub	$ap,	$num,	$ap	! rewind
	sub	$np,	$num,	$np
	sub	$tp,	$num,	$tp

	ldx	[$ap+0],	$aj	! ap[0]
	ldx	[$np+0],	$nj	! np[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[i]
	ldx	[$tp],	$tj		! tp[0]
	umulxhi	$aj,	$m0,	$hi0
	ldx	[$ap+8],	$aj	! ap[1]
	addcc	$lo0,	$tj,	$lo0	! ap[0]*bp[i]+tp[0]
	mulx	$aj,	$m0,	$alo	! ap[1]*bp[i]
	addxc	%g0,	$hi0,	$hi0
	mulx	$lo0,	$n0,	$m1	! tp[0]*n0
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	add	$ap,	16,	$ap
	umulxhi	$nj,	$m1,	$hi1
	ldx	[$np+8],	$nj	! np[1]
	add	$np,	16,	$np
	addcc	$lo1,	$lo0,	$lo1
	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	addxc	%g0,	$hi1,	$hi1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj

	ba	.Linner
	sub	$num,	24,	$cnt	! cnt=num-3
.align	16
.Linner:
	addcc	$alo,	$hi0,	$lo0
	ldx	[$tp+8],	$tj	! tp[j]
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	ldx	[$ap+0],	$aj	! ap[j]
	add	$ap,	8,	$ap
	addcc	$nlo,	$hi1,	$lo1
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[i]
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	ldx	[$np+0],	$nj	! np[j]
	add	$np,	8,	$np
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addxc	%g0,	$hi0,	$hi0
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]
	add	$tp,	8,	$tp
	brnz,pt	$cnt,	.Linner
	sub	$cnt,	8,	$cnt
!.Linner
	ldx	[$tp+8],	$tj	! tp[j]
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi0,	$hi0

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]

	subcc	%g0,	$ovf,	%g0	! move upmost overflow to CCR.xcc
	addxccc	$hi1,	$hi0,	$hi1
	addxc	%g0,	%g0,	$ovf
	stx	$hi1,	[$tp+8]
	add	$tp,	16,	$tp

	brnz,pt	$i,	.Louter
	sub	$i,	8,	$i

	sub	$ap,	$num,	$ap	! rewind
	sub	$np,	$num,	$np
	sub	$tp,	$num,	$tp
	ba	.Lsub
	subcc	$num,	8,	$cnt	! cnt=num-1 and clear CCR.xcc

.align	16
.Lsub:
	ldx	[$tp],	$tj
	add	$tp,	8,	$tp
	ldx	[$np+0],	$nj
	add	$np,	8,	$np
	subccc	$tj,	$nj,	$t2	! tp[j]-np[j]
	srlx	$tj,	32,	$tj
	srlx	$nj,	32,	$nj
	subccc	$tj,	$nj,	$t3
	add	$rp,	8,	$rp
	st	$t2,	[$rp-4]		! reverse order
	st	$t3,	[$rp-8]
	brnz,pt	$cnt,	.Lsub
	sub	$cnt,	8,	$cnt

	sub	$np,	$num,	$np	! rewind
	sub	$tp,	$num,	$tp
	sub	$rp,	$num,	$rp

	subc	$ovf,	%g0,	$ovf	! handle upmost overflow bit
	and	$tp,	$ovf,	$ap
	andn	$rp,	$ovf,	$np
	or	$np,	$ap,	$ap	! ap=borrow?tp:rp
	ba	.Lcopy
	sub	$num,	8,	$cnt

.align	16
.Lcopy:					! copy or in-place refresh
	ldx	[$ap+0],	$t2
	add	$ap,	8,	$ap
	stx	%g0,	[$tp]		! zap
	add	$tp,	8,	$tp
	stx	$t2,	[$rp+0]
	add	$rp,	8,	$rp
	brnz	$cnt,	.Lcopy
	sub	$cnt,	8,	$cnt

	mov	1,	%o0
	ret
	restore
.type	bn_mul_mont_t4, #function
.size	bn_mul_mont_t4, .-bn_mul_mont_t4
___

# int bn_mul_mont_gather5(
$rp="%o0";	# u64 *rp,
$ap="%o1";	# const u64 *ap,
$bp="%o2";	# const u64 *pwrtbl,
$np="%o3";	# const u64 *np,
$n0p="%o4";	# const BN_ULONG *n0,
$num="%o5";	# int num,	# caller ensures that num is >=3
		# int power);
$code.=<<___;
.globl	bn_mul_mont_gather5_t4
.align	32
bn_mul_mont_gather5_t4:
	add	%sp,	STACK_BIAS,	%g4	! real top of stack
	sll	$num,	3,	$num		! size in bytes
	add	$num,	63,	%g1
	andn	%g1,	63,	%g1		! buffer size rounded up to 64 bytes
	sub	%g4,	%g1,	%g1
	andn	%g1,	63,	%g1		! align at 64 byte
	sub	%g1,	STACK_FRAME,	%g1	! new top of stack
	sub	%g1,	%g4,	%g1
	LDPTR	[%sp+STACK_7thARG],	%g4	! load power, 7th argument

	save	%sp,	%g1,	%sp
___
# +-------------------------------+<-----	%sp
# .				.
# +-------------------------------+<-----	aligned at 64 bytes
# | __int64 tmp[0]		|
# +-------------------------------+
# .				.
# .				.
# +-------------------------------+<-----	aligned at 64 bytes
# .				.
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$ccr)=map("%l$_",(0..7));
($ovf,$i)=($t0,$t1);
	&load_ccr($bp,"%g4",$ccr);
	&load_b($bp,$m0,"%o7");	# m0=bp[0]

$code.=<<___;
	ld	[$n0p+0],	$t0	! pull n0[0..1] value
	ld	[$n0p+4],	$t1
	add	%sp, STACK_BIAS+STACK_FRAME, $tp
	sllx	$t1,	32,	$n0
	or	$t0,	$n0,	$n0

	ldx	[$ap+0],	$aj	! ap[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[0]
	umulxhi	$aj,	$m0,	$hi0

	ldx	[$ap+8],	$aj	! ap[1]
	add	$ap,	16,	$ap
	ldx	[$np+0],	$nj	! np[0]

	mulx	$lo0,	$n0,	$m1	! "tp[0]"*n0

	mulx	$aj,	$m0,	$alo	! ap[1]*bp[0]
	umulxhi	$aj,	$m0,	$aj	! ahi=aj

	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	umulxhi	$nj,	$m1,	$hi1

	ldx	[$np+8],	$nj	! np[1]

	addcc	$lo0,	$lo1,	$lo1
	add	$np,	16,	$np
	addxc	%g0,	$hi1,	$hi1

	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj

	ba	.L1st_g5
	sub	$num,	24,	$cnt	! cnt=num-3

.align	16
.L1st_g5:
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0

	ldx	[$ap+0],	$aj	! ap[j]
	addcc	$nlo,	$hi1,	$lo1
	add	$ap,	8,	$ap
	addxc	$nj,	%g0,	$hi1	! nhi=nj

	ldx	[$np+0],	$nj	! np[j]
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[0]
	add	$np,	8,	$np
	umulxhi	$aj,	$m0,	$aj	! ahi=aj

	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
	add	$tp,	8,	$tp	! tp++

	brnz,pt	$cnt,	.L1st_g5
	sub	$cnt,	8,	$cnt	! j--
!.L1st_g5
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
	add	$tp,	8,	$tp

	addcc	$hi0,	$hi1,	$hi1
	addxc	%g0,	%g0,	$ovf	! upmost overflow bit
	stxa	$hi1,	[$tp]0xe2
	add	$tp,	8,	$tp

	ba	.Louter_g5
	sub	$num,	16,	$i	! i=num-2

.align	16
.Louter_g5:
	wr	$ccr,	%g0,	%ccr
___
	&load_b($bp,$m0);	# m0=bp[i]
$code.=<<___;
	sub	$ap,	$num,	$ap	! rewind
	sub	$np,	$num,	$np
	sub	$tp,	$num,	$tp

	ldx	[$ap+0],	$aj	! ap[0]
	ldx	[$np+0],	$nj	! np[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[i]
	ldx	[$tp],	$tj		! tp[0]
	umulxhi	$aj,	$m0,	$hi0
	ldx	[$ap+8],	$aj	! ap[1]
	addcc	$lo0,	$tj,	$lo0	! ap[0]*bp[i]+tp[0]
	mulx	$aj,	$m0,	$alo	! ap[1]*bp[i]
	addxc	%g0,	$hi0,	$hi0
	mulx	$lo0,	$n0,	$m1	! tp[0]*n0
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	add	$ap,	16,	$ap
	umulxhi	$nj,	$m1,	$hi1
	ldx	[$np+8],	$nj	! np[1]
	add	$np,	16,	$np
	addcc	$lo1,	$lo0,	$lo1
	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	addxc	%g0,	$hi1,	$hi1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj

	ba	.Linner_g5
	sub	$num,	24,	$cnt	! cnt=num-3
.align	16
.Linner_g5:
	addcc	$alo,	$hi0,	$lo0
	ldx	[$tp+8],	$tj	! tp[j]
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	ldx	[$ap+0],	$aj	! ap[j]
	add	$ap,	8,	$ap
	addcc	$nlo,	$hi1,	$lo1
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[i]
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	ldx	[$np+0],	$nj	! np[j]
	add	$np,	8,	$np
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addxc	%g0,	$hi0,	$hi0
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]
	add	$tp,	8,	$tp
	brnz,pt	$cnt,	.Linner_g5
	sub	$cnt,	8,	$cnt
!.Linner_g5
	ldx	[$tp+8],	$tj	! tp[j]
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi0,	$hi0

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]

	subcc	%g0,	$ovf,	%g0	! move upmost overflow to CCR.xcc
	addxccc	$hi1,	$hi0,	$hi1
	addxc	%g0,	%g0,	$ovf
	stx	$hi1,	[$tp+8]
	add	$tp,	16,	$tp

	brnz,pt	$i,	.Louter_g5
	sub	$i,	8,	$i

	sub	$ap,	$num,	$ap	! rewind
	sub	$np,	$num,	$np
	sub	$tp,	$num,	$tp
	ba	.Lsub_g5
	subcc	$num,	8,	$cnt	! cnt=num-1 and clear CCR.xcc

.align	16
.Lsub_g5:
	ldx	[$tp],	$tj
	add	$tp,	8,	$tp
	ldx	[$np+0],	$nj
	add	$np,	8,	$np
	subccc	$tj,	$nj,	$t2	! tp[j]-np[j]
	srlx	$tj,	32,	$tj
	srlx	$nj,	32,	$nj
	subccc	$tj,	$nj,	$t3
	add	$rp,	8,	$rp
	st	$t2,	[$rp-4]		! reverse order
	st	$t3,	[$rp-8]
	brnz,pt	$cnt,	.Lsub_g5
	sub	$cnt,	8,	$cnt

	sub	$np,	$num,	$np	! rewind
	sub	$tp,	$num,	$tp
	sub	$rp,	$num,	$rp

	subc	$ovf,	%g0,	$ovf	! handle upmost overflow bit
	and	$tp,	$ovf,	$ap
	andn	$rp,	$ovf,	$np
	or	$np,	$ap,	$ap	! ap=borrow?tp:rp
	ba	.Lcopy_g5
	sub	$num,	8,	$cnt

.align	16
.Lcopy_g5:				! copy or in-place refresh
	ldx	[$ap+0],	$t2
	add	$ap,	8,	$ap
	stx	%g0,	[$tp]		! zap
	add	$tp,	8,	$tp
	stx	$t2,	[$rp+0]
	add	$rp,	8,	$rp
	brnz	$cnt,	.Lcopy_g5
	sub	$cnt,	8,	$cnt

	mov	1,	%o0
	ret
	restore
.type	bn_mul_mont_gather5_t4, #function
.size	bn_mul_mont_gather5_t4, .-bn_mul_mont_gather5_t4
___
}

# Helper routines: bn_flip_t4 swaps the 32-bit halves of each 64-bit
# word while copying; bn_flip_n_scatter5_t4 does the same and scatters
# into the cache-line-interleaved power table; bn_gather5_t4 gathers
# back through load_ccr/load_b.
$code.=<<___;
.globl	bn_flip_t4
.align	32
bn_flip_t4:
.Loop_flip:
	ld	[%o1+0],	%o4
	sub	%o2,	1,	%o2
	ld	[%o1+4],	%o5
	add	%o1,	8,	%o1
	st	%o5,	[%o0+0]
	st	%o4,	[%o0+4]
	brnz	%o2,	.Loop_flip
	add	%o0,	8,	%o0
	retl
	nop
.type	bn_flip_t4, #function
.size	bn_flip_t4, .-bn_flip_t4

.globl	bn_flip_n_scatter5_t4
.align	32
bn_flip_n_scatter5_t4:
	sll	%o3,	3,	%o3
	srl	%o1,	1,	%o1
	add	%o3,	%o2,	%o2	! &pwrtbl[pwr]
	sub	%o1,	1,	%o1
.Loop_flip_n_scatter5:
	ld	[%o0+0],	%o4	! inp[i]
	ld	[%o0+4],	%o5
	add	%o0,	8,	%o0
	sllx	%o5,	32,	%o5
	or	%o4,	%o5,	%o5
	stx	%o5,	[%o2]
	add	%o2,	32*8,	%o2
	brnz	%o1,	.Loop_flip_n_scatter5
	sub	%o1,	1,	%o1
	retl
	nop
.type	bn_flip_n_scatter5_t4, #function
.size	bn_flip_n_scatter5_t4, .-bn_flip_n_scatter5_t4

.globl	bn_gather5_t4
.align	32
bn_gather5_t4:
___
	&load_ccr("%o2","%o3","%g1");
$code.=<<___;
	sub	%o1,	1,	%o1
.Loop_gather5:
___
	&load_b("%o2","%g1");
$code.=<<___;
	stx	%g1,	[%o0]
	add	%o0,	8,	%o0
	brnz	%o1,	.Loop_gather5
	sub	%o1,	1,	%o1

	retl
	nop
.type	bn_gather5_t4, #function
.size	bn_gather5_t4, .-bn_gather5_t4

.asciz	"Montgomery Multiplication for SPARC T4, David S. Miller, Andy Polyakov"
.align	4
___

&emit_assembler();

# The generated assembly goes to STDOUT; a failed final flush (full
# disk, broken pipe into the assembler) must not exit with status 0,
# or the build would silently consume truncated assembly.
close STDOUT or die "error closing STDOUT: $!";