sparcv9-mont.pl (302408) | sparcv9-mont.pl (325335) |
---|---|
1#!/usr/bin/env perl 2 3# ==================================================================== 4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL 5# project. The module is, however, dual licensed under OpenSSL and 6# CRYPTOGAMS licenses depending on where you obtain it. For further 7# details see http://www.openssl.org/~appro/cryptogams/. 8# ==================================================================== --- 276 unchanged lines hidden (view full) --- 285 ret 286 restore 287___ 288 289######## 290######## .Lbn_sqr_mont gives up to 20% *overall* improvement over 291######## code without following dedicated squaring procedure. 292######## | 1#!/usr/bin/env perl 2 3# ==================================================================== 4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL 5# project. The module is, however, dual licensed under OpenSSL and 6# CRYPTOGAMS licenses depending on where you obtain it. For further 7# details see http://www.openssl.org/~appro/cryptogams/. 8# ==================================================================== --- 276 unchanged lines hidden (view full) --- 285 ret 286 restore 287___ 288 289######## 290######## .Lbn_sqr_mont gives up to 20% *overall* improvement over 291######## code without following dedicated squaring procedure. 292######## |
293$sbit="%i2"; # re-use $bp! | 293$sbit="%o5"; |
294 295$code.=<<___; 296.align 32 297.Lbn_sqr_mont: 298 mulx $mul0,$mul0,$car0 ! ap[0]*ap[0] 299 mulx $apj,$mul0,$tmp0 !prologue! 300 and $car0,$mask,$acc0 301 add %sp,$bias+$frame,$tp --- 96 unchanged lines hidden (view full) --- 398 st $car1,[%sp+$bias+$frame] ! tp[0]= 399 srlx $car1,32,$car1 400 add %sp,$bias+$frame+4,$tp 401 402.Lsqr_2nd: 403 mulx $apj,$mul0,$acc0 404 mulx $npj,$mul1,$acc1 405 add $acc0,$car0,$car0 | 294 295$code.=<<___; 296.align 32 297.Lbn_sqr_mont: 298 mulx $mul0,$mul0,$car0 ! ap[0]*ap[0] 299 mulx $apj,$mul0,$tmp0 !prologue! 300 and $car0,$mask,$acc0 301 add %sp,$bias+$frame,$tp --- 96 unchanged lines hidden (view full) --- 398 st $car1,[%sp+$bias+$frame] ! tp[0]= 399 srlx $car1,32,$car1 400 add %sp,$bias+$frame+4,$tp 401 402.Lsqr_2nd: 403 mulx $apj,$mul0,$acc0 404 mulx $npj,$mul1,$acc1 405 add $acc0,$car0,$car0 |
406 add $tpj,$car1,$car1 | 406 add $tpj,$sbit,$sbit |
407 ld [$ap+$j],$apj ! ap[j] 408 and $car0,$mask,$acc0 409 ld [$np+$j],$npj ! np[j] 410 srlx $car0,32,$car0 411 add $acc1,$car1,$car1 412 ld [$tp+8],$tpj ! tp[j] 413 add $acc0,$acc0,$acc0 414 add $j,4,$j ! j++ | 407 ld [$ap+$j],$apj ! ap[j] 408 and $car0,$mask,$acc0 409 ld [$np+$j],$npj ! np[j] 410 srlx $car0,32,$car0 411 add $acc1,$car1,$car1 412 ld [$tp+8],$tpj ! tp[j] 413 add $acc0,$acc0,$acc0 414 add $j,4,$j ! j++ |
415 or $sbit,$acc0,$acc0 | 415 add $sbit,$acc0,$acc0 |
416 srlx $acc0,32,$sbit 417 and $acc0,$mask,$acc0 418 cmp $j,$num 419 add $acc0,$car1,$car1 420 st $car1,[$tp] ! tp[j-1] 421 srlx $car1,32,$car1 422 bl %icc,.Lsqr_2nd 423 add $tp,4,$tp ! tp++ 424!.Lsqr_2nd 425 426 mulx $apj,$mul0,$acc0 427 mulx $npj,$mul1,$acc1 428 add $acc0,$car0,$car0 | 416 srlx $acc0,32,$sbit 417 and $acc0,$mask,$acc0 418 cmp $j,$num 419 add $acc0,$car1,$car1 420 st $car1,[$tp] ! tp[j-1] 421 srlx $car1,32,$car1 422 bl %icc,.Lsqr_2nd 423 add $tp,4,$tp ! tp++ 424!.Lsqr_2nd 425 426 mulx $apj,$mul0,$acc0 427 mulx $npj,$mul1,$acc1 428 add $acc0,$car0,$car0 |
429 add $tpj,$car1,$car1 | 429 add $tpj,$sbit,$sbit |
430 and $car0,$mask,$acc0 431 srlx $car0,32,$car0 432 add $acc1,$car1,$car1 433 add $acc0,$acc0,$acc0 | 430 and $car0,$mask,$acc0 431 srlx $car0,32,$car0 432 add $acc1,$car1,$car1 433 add $acc0,$acc0,$acc0 |
434 or $sbit,$acc0,$acc0 | 434 add $sbit,$acc0,$acc0 |
435 srlx $acc0,32,$sbit 436 and $acc0,$mask,$acc0 437 add $acc0,$car1,$car1 438 st $car1,[$tp] ! tp[j-1] 439 srlx $car1,32,$car1 440 441 add $car0,$car0,$car0 | 435 srlx $acc0,32,$sbit 436 and $acc0,$mask,$acc0 437 add $acc0,$car1,$car1 438 st $car1,[$tp] ! tp[j-1] 439 srlx $car1,32,$car1 440 441 add $car0,$car0,$car0 |
442 or $sbit,$car0,$car0 | 442 add $sbit,$car0,$car0 |
443 add $car0,$car1,$car1 444 add $car2,$car1,$car1 445 st $car1,[$tp+4] 446 srlx $car1,32,$car2 447 448 ld [%sp+$bias+$frame],$tmp1 ! tp[0] 449 ld [%sp+$bias+$frame+4],$tpj ! tp[1] 450 ld [$ap+8],$mul0 ! ap[2] --- 43 unchanged lines hidden (view full) --- 494 add $j,4,$j 495 cmp $j,$num 496 be,pn %icc,.Lsqr_no_inner2 497 add $tp,4,$tp 498 499.Lsqr_inner2: 500 mulx $apj,$mul0,$acc0 501 mulx $npj,$mul1,$acc1 | 443 add $car0,$car1,$car1 444 add $car2,$car1,$car1 445 st $car1,[$tp+4] 446 srlx $car1,32,$car2 447 448 ld [%sp+$bias+$frame],$tmp1 ! tp[0] 449 ld [%sp+$bias+$frame+4],$tpj ! tp[1] 450 ld [$ap+8],$mul0 ! ap[2] --- 43 unchanged lines hidden (view full) --- 494 add $j,4,$j 495 cmp $j,$num 496 be,pn %icc,.Lsqr_no_inner2 497 add $tp,4,$tp 498 499.Lsqr_inner2: 500 mulx $apj,$mul0,$acc0 501 mulx $npj,$mul1,$acc1 |
502 add $tpj,$car1,$car1 | 502 add $tpj,$sbit,$sbit |
503 add $acc0,$car0,$car0 504 ld [$ap+$j],$apj ! ap[j] 505 and $car0,$mask,$acc0 506 ld [$np+$j],$npj ! np[j] 507 srlx $car0,32,$car0 508 add $acc0,$acc0,$acc0 509 ld [$tp+8],$tpj ! tp[j] | 503 add $acc0,$car0,$car0 504 ld [$ap+$j],$apj ! ap[j] 505 and $car0,$mask,$acc0 506 ld [$np+$j],$npj ! np[j] 507 srlx $car0,32,$car0 508 add $acc0,$acc0,$acc0 509 ld [$tp+8],$tpj ! tp[j] |
510 or $sbit,$acc0,$acc0 | 510 add $sbit,$acc0,$acc0 |
511 add $j,4,$j ! j++ 512 srlx $acc0,32,$sbit 513 and $acc0,$mask,$acc0 514 cmp $j,$num 515 add $acc0,$car1,$car1 516 add $acc1,$car1,$car1 517 st $car1,[$tp] ! tp[j-1] 518 srlx $car1,32,$car1 519 bl %icc,.Lsqr_inner2 520 add $tp,4,$tp ! tp++ 521 522.Lsqr_no_inner2: 523 mulx $apj,$mul0,$acc0 524 mulx $npj,$mul1,$acc1 | 511 add $j,4,$j ! j++ 512 srlx $acc0,32,$sbit 513 and $acc0,$mask,$acc0 514 cmp $j,$num 515 add $acc0,$car1,$car1 516 add $acc1,$car1,$car1 517 st $car1,[$tp] ! tp[j-1] 518 srlx $car1,32,$car1 519 bl %icc,.Lsqr_inner2 520 add $tp,4,$tp ! tp++ 521 522.Lsqr_no_inner2: 523 mulx $apj,$mul0,$acc0 524 mulx $npj,$mul1,$acc1 |
525 add $tpj,$car1,$car1 | 525 add $tpj,$sbit,$sbit |
526 add $acc0,$car0,$car0 527 and $car0,$mask,$acc0 528 srlx $car0,32,$car0 529 add $acc0,$acc0,$acc0 | 526 add $acc0,$car0,$car0 527 and $car0,$mask,$acc0 528 srlx $car0,32,$car0 529 add $acc0,$acc0,$acc0 |
530 or $sbit,$acc0,$acc0 | 530 add $sbit,$acc0,$acc0 |
531 srlx $acc0,32,$sbit 532 and $acc0,$mask,$acc0 533 add $acc0,$car1,$car1 534 add $acc1,$car1,$car1 535 st $car1,[$tp] ! tp[j-1] 536 srlx $car1,32,$car1 537 538 add $car0,$car0,$car0 | 531 srlx $acc0,32,$sbit 532 and $acc0,$mask,$acc0 533 add $acc0,$car1,$car1 534 add $acc1,$car1,$car1 535 st $car1,[$tp] ! tp[j-1] 536 srlx $car1,32,$car1 537 538 add $car0,$car0,$car0 |
539 or $sbit,$car0,$car0 | 539 add $sbit,$car0,$car0 |
540 add $car0,$car1,$car1 541 add $car2,$car1,$car1 542 st $car1,[$tp+4] 543 srlx $car1,32,$car2 544 545 add $i,4,$i ! i++ 546 ld [%sp+$bias+$frame],$tmp1 ! tp[0] 547 ld [%sp+$bias+$frame+4],$tpj ! tp[1] --- 28 unchanged lines hidden (view full) --- 576 ld [$np+$j],$npj 577 st $car1,[$tp] 578 srlx $car1,32,$car1 579 bl %icc,.Lsqr_last 580 add $tp,4,$tp 581!.Lsqr_last 582 583 mulx $npj,$mul1,$acc1 | 540 add $car0,$car1,$car1 541 add $car2,$car1,$car1 542 st $car1,[$tp+4] 543 srlx $car1,32,$car2 544 545 add $i,4,$i ! i++ 546 ld [%sp+$bias+$frame],$tmp1 ! tp[0] 547 ld [%sp+$bias+$frame+4],$tpj ! tp[1] --- 28 unchanged lines hidden (view full) --- 576 ld [$np+$j],$npj 577 st $car1,[$tp] 578 srlx $car1,32,$car1 579 bl %icc,.Lsqr_last 580 add $tp,4,$tp 581!.Lsqr_last 582 583 mulx $npj,$mul1,$acc1 |
584 add $tpj,$car1,$car1 | 584 add $tpj,$acc0,$acc0 585 srlx $acc0,32,$tmp0 586 and $acc0,$mask,$acc0 587 add $tmp0,$sbit,$sbit |
585 add $acc0,$car1,$car1 586 add $acc1,$car1,$car1 587 st $car1,[$tp] 588 srlx $car1,32,$car1 589 590 add $car0,$car0,$car0 ! recover $car0 | 588 add $acc0,$car1,$car1 589 add $acc1,$car1,$car1 590 st $car1,[$tp] 591 srlx $car1,32,$car1 592 593 add $car0,$car0,$car0 ! recover $car0 |
591 or $sbit,$car0,$car0 | 594 add $sbit,$car0,$car0 |
592 add $car0,$car1,$car1 593 add $car2,$car1,$car1 594 st $car1,[$tp+4] 595 srlx $car1,32,$car2 596 597 ba .Ltail 598 add $tp,8,$tp 599.type $fname,#function 600.size $fname,(.-$fname) 601.asciz "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" 602.align 32 603___ 604$code =~ s/\`([^\`]*)\`/eval($1)/gem; 605print $code; 606close STDOUT; | 595 add $car0,$car1,$car1 596 add $car2,$car1,$car1 597 st $car1,[$tp+4] 598 srlx $car1,32,$car2 599 600 ba .Ltail 601 add $tp,8,$tp 602.type $fname,#function 603.size $fname,(.-$fname) 604.asciz "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" 605.align 32 606___ 607$code =~ s/\`([^\`]*)\`/eval($1)/gem; 608print $code; 609close STDOUT; |