#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# December 2005
#
# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
# for undertaken effort are multiple. First of all, UltraSPARC is not
# the whole SPARCv9 universe and other VIS-free implementations deserve
# optimized code as much. Secondly, newly introduced UltraSPARC T1,
# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
# several integrated RSA/DSA accelerator circuits accessible through
# kernel driver [only(*)], but having decent user-land software
# implementation is important too. Finally, reasons like desire to
# experiment with dedicated squaring procedure. Yes, this module
# implements one, because it was easiest to draft it in SPARCv9
# instructions...

# (*)	Engine accessing the driver in question is on my TODO list.
#	For reference, accelerator is estimated to give 6 to 10 times
#	improvement on single-threaded RSA sign. It should be noted
#	that 6-10x improvement coefficient does not actually mean
#	something extraordinary in terms of absolute [single-threaded]
#	performance, as SPARCv9 instruction set is by all means least
#	suitable for high performance crypto among other 64 bit
#	platforms. 6-10x factor simply places T1 in same performance
#	domain as say AMD64 and IA-64. Improvement of RSA verify doesn't
#	appear impressive at all, but it's the sign operation which is
#	far more critical/interesting.

# You might notice that inner loops are modulo-scheduled:-) This has
# essentially negligible impact on UltraSPARC performance, it's
# Fujitsu SPARC64 V users who should notice and hopefully appreciate
# the advantage... Currently this module surpasses sparcv9a-mont.pl
# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
# module still has hidden potential [see TODO list there], which is
# estimated to be larger than 20%...

# Usage: perl <this script> [compiler flags...] > output.s
#
# The script scans its arguments for -m64 or -xarch=v9 to select the
# 64-bit ABI variant, fills in the assembly template below and prints
# the result to STDOUT.
#
# The generated routine returns 0 without touching rp when num < 4
# (the caller is then expected to use another implementation) and
# returns 1 after storing the result otherwise.

# int bn_mul_mont(
$rp="%i0";	# BN_ULONG *rp,
$ap="%i1";	# const BN_ULONG *ap,
$bp="%i2";	# const BN_ULONG *bp,
$np="%i3";	# const BN_ULONG *np,
$n0="%i4";	# const BN_ULONG *n0,
$num="%i5";	# int num);

# $bias is added to %sp to obtain the real top of stack, $frame is the
# size of the register save area reserved by the "save" instruction.
$bits=32;
for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64)	{ $bias=2047; $frame=192; }
else		{ $bias=0;    $frame=128; }

$car0="%o0";
$car1="%o1";
$car2="%o2";	# 1 bit
$acc0="%o3";
$acc1="%o4";
$mask="%g1";	# 32 bits, what a waste...
$tmp0="%g4";
$tmp1="%g5";

$i="%l0";
$j="%l1";
$mul0="%l2";
$mul1="%l3";
$tp="%l4";
$apj="%l5";
$npj="%l6";
$tpj="%l7";

$fname="bn_mul_mont_int";

# Generic multiplication path plus the shared tail (final subtraction
# of the modulus and copy-out). Entry compares ap with bp and branches
# to .Lbn_sqr_mont when they are the same pointer.
$code=<<___;
.section	".text",#alloc,#execinstr

.global	$fname
.align	32
$fname:
	cmp	%o5,4			! 128 bits minimum
	bge,pt	%icc,.Lenter
	sethi	%hi(0xffffffff),$mask
	retl
	clr	%o0
.align	32
.Lenter:
	save	%sp,-$frame,%sp
	sll	$num,2,$num		! num*=4
	or	$mask,%lo(0xffffffff),$mask
	ld	[$n0],$n0
	cmp	$ap,$bp
	and	$num,$mask,$num
	ld	[$bp],$mul0		! bp[0]
	nop

	add	%sp,$bias,%o7		! real top of stack
	ld	[$ap],$car0		! ap[0] ! redundant in squaring context
	sub	%o7,$num,%o7
	ld	[$ap+4],$apj		! ap[1]
	and	%o7,-1024,%o7
	ld	[$np],$car1		! np[0]
	sub	%o7,$bias,%sp		! alloca
	ld	[$np+4],$npj		! np[1]
	be,pt	`$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
	mov	12,$j

	mulx	$car0,$mul0,$car0	! ap[0]*bp[0]
	mulx	$apj,$mul0,$tmp0	!prologue! ap[1]*bp[0]
	and	$car0,$mask,$acc0
	add	%sp,$bias+$frame,$tp
	ld	[$ap+8],$apj		!prologue!

	mulx	$n0,$acc0,$mul1		! "t[0]"*n0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
	mulx	$npj,$mul1,$acc1	!prologue! np[1]*"t[0]"*n0
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	ld	[$np+8],$npj		!prologue!
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0		!prologue!

.L1st:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0
	ld	[$ap+$j],$apj		! ap[j]
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj		! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	add	$j,4,$j			! j++
	mov	$tmp0,$acc0
	st	$car1,[$tp]
	cmp	$j,$num
	mov	$tmp1,$acc1
	srlx	$car1,32,$car1
	bl	%icc,.L1st
	add	$tp,4,$tp		! tp++
!.L1st

	mulx	$apj,$mul0,$tmp0	!epilogue!
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$tmp0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car1

	add	$car0,$car1,$car1
	st	$car1,[$tp+8]
	srlx	$car1,32,$car2

	mov	4,$i			! i++
	ld	[$bp+4],$mul0		! bp[1]
.Louter:
	add	%sp,$bias+$frame,$tp
	ld	[$ap],$car0		! ap[0]
	ld	[$ap+4],$apj		! ap[1]
	ld	[$np],$car1		! np[0]
	ld	[$np+4],$npj		! np[1]
	ld	[$tp],$tmp1		! tp[0]
	ld	[$tp+4],$tpj		! tp[1]
	mov	12,$j

	mulx	$car0,$mul0,$car0
	mulx	$apj,$mul0,$tmp0	!prologue!
	add	$tmp1,$car0,$car0
	ld	[$ap+8],$apj		!prologue!
	and	$car0,$mask,$acc0

	mulx	$n0,$acc0,$mul1
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1
	mulx	$npj,$mul1,$acc1	!prologue!
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	ld	[$np+8],$npj		!prologue!
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0		!prologue!

.Linner:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$tpj,$car0,$car0
	ld	[$ap+$j],$apj		! ap[j]
	add	$acc0,$car0,$car0
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj		! np[j]
	and	$car0,$mask,$acc0
	ld	[$tp+8],$tpj		! tp[j]
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	add	$j,4,$j			! j++
	mov	$tmp0,$acc0
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1
	mov	$tmp1,$acc1
	cmp	$j,$num
	bl	%icc,.Linner
	add	$tp,4,$tp		! tp++
!.Linner

	mulx	$apj,$mul0,$tmp0	!epilogue!
	mulx	$npj,$mul1,$tmp1
	add	$tpj,$car0,$car0
	add	$acc0,$car0,$car0
	ld	[$tp+8],$tpj		! tp[j]
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1

	add	$tpj,$car0,$car0
	add	$tmp0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]		! tp[j-1]
	srlx	$car0,32,$car0
	add	$i,4,$i			! i++
	srlx	$car1,32,$car1

	add	$car0,$car1,$car1
	cmp	$i,$num
	add	$car2,$car1,$car1
	st	$car1,[$tp+8]

	srlx	$car1,32,$car2
	bl,a	%icc,.Louter
	ld	[$bp+$i],$mul0		! bp[i]
!.Louter

	add	$tp,12,$tp

.Ltail:
	add	$np,$num,$np
	add	$rp,$num,$rp
	mov	$tp,$ap
	sub	%g0,$num,%o7		! k=-num
	ba	.Lsub
	subcc	%g0,%g0,%g0		! clear %icc.c
.align	16
.Lsub:
	ld	[$tp+%o7],%o0
	ld	[$np+%o7],%o1
	subccc	%o0,%o1,%o1		! tp[j]-np[j]
	add	$rp,%o7,$i
	add	%o7,4,%o7
	brnz	%o7,.Lsub
	st	%o1,[$i]
	subc	$car2,0,$car2		! handle upmost overflow bit
	and	$tp,$car2,$ap
	andn	$rp,$car2,$np
	or	$ap,$np,$ap
	sub	%g0,$num,%o7

.Lcopy:
	ld	[$ap+%o7],%o0		! copy or in-place refresh
	st	%g0,[$tp+%o7]		! zap tp
	st	%o0,[$rp+%o7]
	add	%o7,4,%o7
	brnz	%o7,.Lcopy
	nop
	mov	1,%i0
	ret
	restore
___

########
######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
######## code without following dedicated squaring procedure.
########
# NOTE(review): after .Lsqr_inner1 and .Lsqr_last (and inside
# .Lsqr_inner2/.Lsqr_no_inner2) $car1 accumulates a 32-bit carry, a
# 32-bit tp[j], a 32-bit $acc0 and a full 64-bit np[j]*m product; the
# worst-case bound of that sum is above 2^64. It has not been verified
# here whether the algorithm's invariants exclude that case -- compare
# against the current upstream revision of this module before relying
# on the squaring path.
$sbit="%i2";		# re-use $bp!

$code.=<<___;
.align	32
.Lbn_sqr_mont:
	mulx	$mul0,$mul0,$car0		! ap[0]*ap[0]
	mulx	$apj,$mul0,$tmp0		!prologue!
	and	$car0,$mask,$acc0
	add	%sp,$bias+$frame,$tp
	ld	[$ap+8],$apj			!prologue!

	mulx	$n0,$acc0,$mul1			! "t[0]"*n0
	srlx	$car0,32,$car0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1		! np[0]*"t[0]"*n0
	mulx	$npj,$mul1,$acc1		!prologue!
	and	$car0,1,$sbit
	ld	[$np+8],$npj			!prologue!
	srlx	$car0,1,$car0
	add	$acc0,$car1,$car1
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0			!prologue!

.Lsqr_1st:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0		! ap[j]*a0+c0
	add	$acc1,$car1,$car1
	ld	[$ap+$j],$apj			! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj			! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	mov	$tmp1,$acc1
	srlx	$acc0,32,$sbit
	add	$j,4,$j				! j++
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	mov	$tmp0,$acc0
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_1st
	add	$tp,4,$tp			! tp++
!.Lsqr_1st

	mulx	$apj,$mul0,$tmp0		! epilogue
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0		! ap[j]*a0+c0
	add	$acc1,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$tmp0,$car0,$car0		! ap[j]*a0+c0
	add	$tmp1,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	or	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	st	$car1,[$tp+8]
	srlx	$car1,32,$car2

	ld	[%sp+$bias+$frame],$tmp0	! tp[0]
	ld	[%sp+$bias+$frame+4],$tmp1	! tp[1]
	ld	[%sp+$bias+$frame+8],$tpj	! tp[2]
	ld	[$ap+4],$mul0			! ap[1]
	ld	[$ap+8],$apj			! ap[2]
	ld	[$np],$car1			! np[0]
	ld	[$np+4],$npj			! np[1]
	mulx	$n0,$tmp0,$mul1

	mulx	$mul0,$mul0,$car0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1
	mulx	$npj,$mul1,$acc1
	add	$tmp0,$car1,$car1
	and	$car0,$mask,$acc0
	ld	[$np+8],$npj			! np[2]
	srlx	$car1,32,$car1
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	and	$car0,1,$sbit
	add	$acc1,$car1,$car1
	srlx	$car0,1,$car0
	mov	12,$j
	st	$car1,[%sp+$bias+$frame]	! tp[0]=
	srlx	$car1,32,$car1
	add	%sp,$bias+$frame+4,$tp

.Lsqr_2nd:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$acc0,$car0,$car0
	add	$tpj,$car1,$car1
	ld	[$ap+$j],$apj			! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj			! np[j]
	srlx	$car0,32,$car0
	add	$acc1,$car1,$car1
	ld	[$tp+8],$tpj			! tp[j]
	add	$acc0,$acc0,$acc0
	add	$j,4,$j				! j++
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	st	$car1,[$tp]			! tp[j-1]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_2nd
	add	$tp,4,$tp			! tp++
!.Lsqr_2nd

	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$acc0,$car0,$car0
	add	$tpj,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc1,$car1,$car1
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]			! tp[j-1]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	or	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
	ld	[$ap+8],$mul0			! ap[2]
	ld	[$np],$car1			! np[0]
	ld	[$np+4],$npj			! np[1]
	mulx	$n0,$tmp1,$mul1
	and	$mul1,$mask,$mul1
	mov	8,$i

	mulx	$mul0,$mul0,$car0
	mulx	$car1,$mul1,$car1
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	%sp,$bias+$frame,$tp
	srlx	$car1,32,$car1
	and	$car0,1,$sbit
	srlx	$car0,1,$car0
	mov	4,$j

.Lsqr_outer:
.Lsqr_inner1:
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$j,4,$j
	ld	[$tp+8],$tpj
	cmp	$j,$i
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj
	st	$car1,[$tp]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_inner1
	add	$tp,4,$tp
!.Lsqr_inner1

	add	$j,4,$j
	ld	[$ap+$j],$apj			! ap[j]
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	ld	[$np+$j],$npj			! np[j]
	add	$acc0,$car1,$car1
	ld	[$tp+8],$tpj			! tp[j]
	add	$acc1,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$j,4,$j
	cmp	$j,$num
	be,pn	%icc,.Lsqr_no_inner2
	add	$tp,4,$tp

.Lsqr_inner2:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$acc0,$car0,$car0
	ld	[$ap+$j],$apj			! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj			! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	ld	[$tp+8],$tpj			! tp[j]
	or	$sbit,$acc0,$acc0
	add	$j,4,$j				! j++
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]			! tp[j-1]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_inner2
	add	$tp,4,$tp			! tp++

.Lsqr_no_inner2:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$acc0,$car0,$car0
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]			! tp[j-1]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	or	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	add	$i,4,$i				! i++
	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
	ld	[$ap+$i],$mul0			! ap[j]
	ld	[$np],$car1			! np[0]
	ld	[$np+4],$npj			! np[1]
	mulx	$n0,$tmp1,$mul1
	and	$mul1,$mask,$mul1
	add	$i,4,$tmp0

	mulx	$mul0,$mul0,$car0
	mulx	$car1,$mul1,$car1
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	%sp,$bias+$frame,$tp
	srlx	$car1,32,$car1
	and	$car0,1,$sbit
	srlx	$car0,1,$car0

	cmp	$tmp0,$num			! i<num-1
	bl	%icc,.Lsqr_outer
	mov	4,$j

.Lsqr_last:
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$j,4,$j
	ld	[$tp+8],$tpj
	cmp	$j,$i
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj
	st	$car1,[$tp]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_last
	add	$tp,4,$tp
!.Lsqr_last

	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0		! recover $car0
	or	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	ba	.Ltail
	add	$tp,8,$tp
.type	$fname,#function
.size	$fname,(.-$fname)
.asciz	"Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	32
___

# Expand every `...` span in the template by evaluating it as Perl
# (used above to pick %icc or %xcc depending on $bits).
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
# Buffered write errors only surface at close; fail loudly so a build
# never continues with a truncated assembly file.
close STDOUT or die "error closing STDOUT: $!";