#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2012.
#
# SPARCv9 VIS3 Montgomery multiplication procedure suitable for T3 and
# onward. There are three new instructions used here: umulxhi,
# addxc[cc] and initializing store. On T3 RSA private key operations
# are 1.54/1.87/2.11/2.26 times faster for 512/1024/2048/4096-bit key
# lengths. This is without dedicated squaring procedure. On T4
# corresponding coefficients are 1.47/2.10/2.80/2.90x, which is mostly
# for reference purposes, because T4 has dedicated Montgomery
# multiplication and squaring *instructions* that deliver even more.
200SN/A 212362SN/A$bits=32; 222362SN/Afor (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } 232362SN/Aif ($bits==64) { $bias=2047; $frame=192; } 240SN/Aelse { $bias=0; $frame=112; } 250SN/A 260SN/A$code.=<<___ if ($bits==64); 270SN/A.register %g2,#scratch 280SN/A.register %g3,#scratch 290SN/A___ 300SN/A$code.=<<___; 310SN/A.section ".text",#alloc,#execinstr 320SN/A___ 330SN/A 3417490Sxuelei($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)= 350SN/A (map("%g$_",(1..5)),map("%o$_",(0..5,7))); 360SN/A 370SN/A# int bn_mul_mont( 3817490Sxuelei$rp="%o0"; # BN_ULONG *rp, 390SN/A$ap="%o1"; # const BN_ULONG *ap, 4017490Sxuelei$bp="%o2"; # const BN_ULONG *bp, 410SN/A$np="%o3"; # const BN_ULONG *np, 420SN/A$n0p="%o4"; # const BN_ULONG *n0, 430SN/A$num="%o5"; # int num); # caller ensures that num is even 4417490Sxuelei # and >=6 450SN/A$code.=<<___; 460SN/A.globl bn_mul_mont_vis3 470SN/A.align 32 480SN/Abn_mul_mont_vis3: 490SN/A add %sp, $bias, %g4 ! real top of stack 500SN/A sll $num, 2, $num ! size in bytes 510SN/A add $num, 63, %g5 520SN/A andn %g5, 63, %g5 ! buffer size rounded up to 64 bytes 530SN/A add %g5, %g5, %g1 540SN/A add %g5, %g1, %g1 ! 3*buffer size 550SN/A sub %g4, %g1, %g1 560SN/A andn %g1, 63, %g1 ! align at 64 byte 570SN/A sub %g1, $frame, %g1 ! new top of stack 580SN/A sub %g1, %g4, %g1 593002SN/A 603002SN/A save %sp, %g1, %sp 610SN/A___ 620SN/A 630SN/A# +-------------------------------+<----- %sp 640SN/A# . . 650SN/A# +-------------------------------+<----- aligned at 64 bytes 660SN/A# | __int64 tmp[0] | 670SN/A# +-------------------------------+ 680SN/A# . . 690SN/A# . . 700SN/A# +-------------------------------+<----- aligned at 64 bytes 710SN/A# | __int64 ap[1..0] | converted ap[] 720SN/A# +-------------------------------+ 730SN/A# | __int64 np[1..0] | converted np[] 740SN/A# +-------------------------------+ 750SN/A# | __int64 ap[3..2] | 760SN/A# . . 770SN/A# . . 
780SN/A# +-------------------------------+ 790SN/A($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5)); 800SN/A($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$anp)=map("%l$_",(0..7)); 810SN/A($ovf,$i)=($t0,$t1); 820SN/A$code.=<<___; 830SN/A ld [$n0p+0], $t0 ! pull n0[0..1] value 840SN/A add %sp, $bias+$frame, $tp 850SN/A ld [$n0p+4], $t1 860SN/A add $tp, %g5, $anp 870SN/A ld [$bp+0], $t2 ! m0=bp[0] 880SN/A sllx $t1, 32, $n0 890SN/A ld [$bp+4], $t3 900SN/A or $t0, $n0, $n0 910SN/A add $bp, 8, $bp 920SN/A 930SN/A ld [$ap+0], $t0 ! ap[0] 940SN/A sllx $t3, 32, $m0 950SN/A ld [$ap+4], $t1 960SN/A or $t2, $m0, $m0 970SN/A 980SN/A ld [$ap+8], $t2 ! ap[1] 990SN/A sllx $t1, 32, $aj 1000SN/A ld [$ap+12], $t3 1010SN/A or $t0, $aj, $aj 1020SN/A add $ap, 16, $ap 1030SN/A stx $aj, [$anp] ! converted ap[0] 1040SN/A 1050SN/A mulx $aj, $m0, $lo0 ! ap[0]*bp[0] 1060SN/A umulxhi $aj, $m0, $hi0 1070SN/A 1080SN/A ld [$np+0], $t0 ! np[0] 1090SN/A sllx $t3, 32, $aj 1100SN/A ld [$np+4], $t1 111508SN/A or $t2, $aj, $aj 112508SN/A 113508SN/A ld [$np+8], $t2 ! np[1] 114508SN/A sllx $t1, 32, $nj 115508SN/A ld [$np+12], $t3 116508SN/A or $t0, $nj, $nj 1170SN/A add $np, 16, $np 1180SN/A stx $nj, [$anp+8] ! converted np[0] 1190SN/A 1200SN/A mulx $lo0, $n0, $m1 ! "tp[0]"*n0 1210SN/A stx $aj, [$anp+16] ! converted ap[1] 1220SN/A 1230SN/A mulx $aj, $m0, $alo ! ap[1]*bp[0] 124508SN/A umulxhi $aj, $m0, $aj ! ahi=aj 125508SN/A 1260SN/A mulx $nj, $m1, $lo1 ! np[0]*m1 1270SN/A umulxhi $nj, $m1, $hi1 1280SN/A 1290SN/A sllx $t3, 32, $nj 1300SN/A or $t2, $nj, $nj 1310SN/A stx $nj, [$anp+24] ! converted np[1] 1320SN/A add $anp, 32, $anp 1330SN/A 1340SN/A addcc $lo0, $lo1, $lo1 1350SN/A addxc %g0, $hi1, $hi1 1360SN/A 1370SN/A mulx $nj, $m1, $nlo ! np[1]*m1 1380SN/A umulxhi $nj, $m1, $nj ! nhi=nj 1390SN/A 1400SN/A ba .L1st 1410SN/A sub $num, 24, $cnt ! cnt=num-3 1420SN/A 1430SN/A.align 16 1440SN/A.L1st: 1450SN/A ld [$ap+0], $t0 ! 
ap[j] 1460SN/A addcc $alo, $hi0, $lo0 1470SN/A ld [$ap+4], $t1 1480SN/A addxc $aj, %g0, $hi0 1490SN/A 1500SN/A sllx $t1, 32, $aj 1510SN/A add $ap, 8, $ap 1520SN/A or $t0, $aj, $aj 1530SN/A stx $aj, [$anp] ! converted ap[j] 1540SN/A 1550SN/A ld [$np+0], $t2 ! np[j] 156508SN/A addcc $nlo, $hi1, $lo1 157508SN/A ld [$np+4], $t3 158508SN/A addxc $nj, %g0, $hi1 ! nhi=nj 159508SN/A 160508SN/A sllx $t3, 32, $nj 161508SN/A add $np, 8, $np 1620SN/A mulx $aj, $m0, $alo ! ap[j]*bp[0] 1630SN/A or $t2, $nj, $nj 1640SN/A umulxhi $aj, $m0, $aj ! ahi=aj 1650SN/A stx $nj, [$anp+8] ! converted np[j] 1660SN/A add $anp, 16, $anp ! anp++ 167508SN/A 168508SN/A mulx $nj, $m1, $nlo ! np[j]*m1 1690SN/A addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0] 1700SN/A umulxhi $nj, $m1, $nj ! nhi=nj 1710SN/A addxc %g0, $hi1, $hi1 1720SN/A stx $lo1, [$tp] ! tp[j-1] 1730SN/A add $tp, 8, $tp ! tp++ 1740SN/A 1750SN/A brnz,pt $cnt, .L1st 1760SN/A sub $cnt, 8, $cnt ! j-- 1770SN/A!.L1st 1780SN/A addcc $alo, $hi0, $lo0 1790SN/A addxc $aj, %g0, $hi0 ! ahi=aj 1800SN/A 1810SN/A addcc $nlo, $hi1, $lo1 1820SN/A addxc $nj, %g0, $hi1 1830SN/A addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0] 1840SN/A addxc %g0, $hi1, $hi1 1850SN/A stx $lo1, [$tp] ! tp[j-1] 1860SN/A add $tp, 8, $tp 1870SN/A 1880SN/A addcc $hi0, $hi1, $hi1 1890SN/A addxc %g0, %g0, $ovf ! upmost overflow bit 1900SN/A stx $hi1, [$tp] 1910SN/A add $tp, 8, $tp 1920SN/A 19313754Sxuelei ba .Louter 19413754Sxuelei sub $num, 16, $i ! i=num-2 19513754Sxuelei 19613754Sxuelei.align 16 19713754Sxuelei.Louter: 19815376Sjnimeh ld [$bp+0], $t2 ! m0=bp[i] 19915376Sjnimeh ld [$bp+4], $t3 20015376Sjnimeh 20117205Sihse sub $anp, $num, $anp ! rewind 20215376Sjnimeh sub $tp, $num, $tp 20315376Sjnimeh sub $anp, $num, $anp 20415376Sjnimeh 2050SN/A add $bp, 8, $bp 2060SN/A sllx $t3, 32, $m0 2070SN/A ldx [$anp+0], $aj ! ap[0] 2080SN/A or $t2, $m0, $m0 2090SN/A ldx [$anp+8], $nj ! np[0] 2100SN/A 2110SN/A mulx $aj, $m0, $lo0 ! ap[0]*bp[i] 2120SN/A ldx [$tp], $tj ! 
tp[0] 2130SN/A umulxhi $aj, $m0, $hi0 2140SN/A ldx [$anp+16], $aj ! ap[1] 2150SN/A addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0] 2160SN/A mulx $aj, $m0, $alo ! ap[1]*bp[i] 2170SN/A addxc %g0, $hi0, $hi0 2180SN/A mulx $lo0, $n0, $m1 ! tp[0]*n0 2190SN/A umulxhi $aj, $m0, $aj ! ahi=aj 2200SN/A mulx $nj, $m1, $lo1 ! np[0]*m1 2210SN/A umulxhi $nj, $m1, $hi1 2220SN/A ldx [$anp+24], $nj ! np[1] 2230SN/A add $anp, 32, $anp 2240SN/A addcc $lo1, $lo0, $lo1 22515376Sjnimeh mulx $nj, $m1, $nlo ! np[1]*m1 22615376Sjnimeh addxc %g0, $hi1, $hi1 22717205Sihse umulxhi $nj, $m1, $nj ! nhi=nj 22815376Sjnimeh 22915376Sjnimeh ba .Linner 23015376Sjnimeh sub $num, 24, $cnt ! cnt=num-3 23115376Sjnimeh.align 16 23215376Sjnimeh.Linner: 2330SN/A addcc $alo, $hi0, $lo0 2340SN/A ldx [$tp+8], $tj ! tp[j] 2350SN/A addxc $aj, %g0, $hi0 ! ahi=aj 2360SN/A ldx [$anp+0], $aj ! ap[j] 2370SN/A addcc $nlo, $hi1, $lo1 2380SN/A mulx $aj, $m0, $alo ! ap[j]*bp[i] 2390SN/A addxc $nj, %g0, $hi1 ! nhi=nj 2400SN/A ldx [$anp+8], $nj ! np[j] 2410SN/A add $anp, 16, $anp 2420SN/A umulxhi $aj, $m0, $aj ! ahi=aj 2430SN/A addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j] 2440SN/A mulx $nj, $m1, $nlo ! np[j]*m1 2450SN/A addxc %g0, $hi0, $hi0 2460SN/A umulxhi $nj, $m1, $nj ! nhi=nj 2470SN/A addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j] 2480SN/A addxc %g0, $hi1, $hi1 2490SN/A stx $lo1, [$tp] ! tp[j-1] 2500SN/A add $tp, 8, $tp 2510SN/A brnz,pt $cnt, .Linner 2520SN/A sub $cnt, 8, $cnt 2530SN/A!.Linner 25415376Sjnimeh ldx [$tp+8], $tj ! tp[j] 25515376Sjnimeh addcc $alo, $hi0, $lo0 25615376Sjnimeh addxc $aj, %g0, $hi0 ! ahi=aj 25717205Sihse addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j] 25815376Sjnimeh addxc %g0, $hi0, $hi0 25915376Sjnimeh 26015376Sjnimeh addcc $nlo, $hi1, $lo1 2610SN/A addxc $nj, %g0, $hi1 ! nhi=nj 2620SN/A addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j] 2630SN/A addxc %g0, $hi1, $hi1 2640SN/A stx $lo1, [$tp] ! tp[j-1] 2650SN/A 2660SN/A subcc %g0, $ovf, %g0 ! 
move upmost overflow to CCR.xcc 2670SN/A addxccc $hi1, $hi0, $hi1 2680SN/A addxc %g0, %g0, $ovf 2690SN/A stx $hi1, [$tp+8] 2700SN/A add $tp, 16, $tp 2710SN/A 2720SN/A brnz,pt $i, .Louter 2730SN/A sub $i, 8, $i 2740SN/A 2750SN/A sub $anp, $num, $anp ! rewind 2760SN/A sub $tp, $num, $tp 2770SN/A sub $anp, $num, $anp 2780SN/A ba .Lsub 2790SN/A subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc 2800SN/A 2810SN/A.align 16 28213754Sxuelei.Lsub: 28313754Sxuelei ldx [$tp], $tj 28413754Sxuelei add $tp, 8, $tp 28513754Sxuelei ldx [$anp+8], $nj 28613754Sxuelei add $anp, 16, $anp 2870SN/A subccc $tj, $nj, $t2 ! tp[j]-np[j] 2880SN/A srlx $tj, 32, $tj 2890SN/A srlx $nj, 32, $nj 2900SN/A subccc $tj, $nj, $t3 2910SN/A add $rp, 8, $rp 2920SN/A st $t2, [$rp-4] ! reverse order 2930SN/A st $t3, [$rp-8] 2940SN/A brnz,pt $cnt, .Lsub 2950SN/A sub $cnt, 8, $cnt 2960SN/A 2970SN/A sub $anp, $num, $anp ! rewind 2980SN/A sub $tp, $num, $tp 2990SN/A sub $anp, $num, $anp 3000SN/A sub $rp, $num, $rp 3010SN/A 3020SN/A subc $ovf, %g0, $ovf ! handle upmost overflow bit 3030SN/A and $tp, $ovf, $ap 3040SN/A andn $rp, $ovf, $np 3050SN/A or $np, $ap, $ap ! ap=borrow?tp:rp 3060SN/A ba .Lcopy 3070SN/A sub $num, 8, $cnt 3080SN/A 3090SN/A.align 16 3100SN/A.Lcopy: ! copy or in-place refresh 3110SN/A ld [$ap+0], $t2 3120SN/A ld [$ap+4], $t3 3130SN/A add $ap, 8, $ap 3140SN/A stx %g0, [$tp] ! zap 3150SN/A add $tp, 8, $tp 3160SN/A stx %g0, [$anp] ! zap 3170SN/A stx %g0, [$anp+8] 3180SN/A add $anp, 16, $anp 3190SN/A st $t3, [$rp+0] ! 
flip order 3200SN/A st $t2, [$rp+4] 3210SN/A add $rp, 8, $rp 3220SN/A brnz $cnt, .Lcopy 3230SN/A sub $cnt, 8, $cnt 3240SN/A 3250SN/A mov 1, %o0 3260SN/A ret 3270SN/A restore 3280SN/A.type bn_mul_mont_vis3, #function 3290SN/A.size bn_mul_mont_vis3, .-bn_mul_mont_vis3 3300SN/A.asciz "Montgomery Multiplication for SPARCv9 VIS3, CRYPTOGAMS by <appro\@openssl.org>" 3310SN/A.align 4 3320SN/A___ 3330SN/A 3340SN/A# Purpose of these subroutines is to explicitly encode VIS instructions, 3350SN/A# so that one can compile the module without having to specify VIS 3360SN/A# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. 3370SN/A# Idea is to reserve for option to produce "universal" binary and let 3380SN/A# programmer detect if current CPU is VIS capable at run-time. 3390SN/Asub unvis3 { 3400SN/Amy ($mnemonic,$rs1,$rs2,$rd)=@_; 3410SN/Amy %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 ); 3420SN/Amy ($ref,$opf); 3430SN/Amy %visopf = ( "addxc" => 0x011, 3440SN/A "addxccc" => 0x013, 3450SN/A "umulxhi" => 0x016 ); 3460SN/A 3470SN/A $ref = "$mnemonic\t$rs1,$rs2,$rd"; 3480SN/A 3490SN/A if ($opf=$visopf{$mnemonic}) { 3500SN/A foreach ($rs1,$rs2,$rd) { 3510SN/A return $ref if (!/%([goli])([0-9])/); 3520SN/A $_=$bias{$1}+$2; 3530SN/A } 3540SN/A 3550SN/A return sprintf ".word\t0x%08x !%s", 3560SN/A 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2, 3570SN/A $ref; 3580SN/A } else { 3590SN/A return $ref; 3600SN/A } 3610SN/A} 3620SN/A 3630SN/Aforeach (split("\n",$code)) { 3640SN/A s/\`([^\`]*)\`/eval $1/ge; 3650SN/A 3660SN/A s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/ 3670SN/A &unvis3($1,$2,$3,$4) 3680SN/A /ge; 3690SN/A 3700SN/A print $_,"\n"; 3710SN/A} 3720SN/A 3730SN/Aclose STDOUT; 3740SN/A