#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# On PA-7100LC this module performs ~90-50% better, less for longer
# keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
# that compiler utilized xmpyu instruction to perform 32x32=64-bit
# multiplication, which in turn means that "baseline" performance was
# optimal in respect to instruction set capabilities. Fair comparison
# with vendor compiler is problematic, because OpenSSL doesn't define
# BN_LLONG [presumably] for historical reasons, which drives compiler
# toward 4 times 16x16=32-bit multiplications [plus complementary
# shifts and additions] instead. This means that you should observe
# several times improvement over code generated by vendor compiler
# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
# improvement coefficient was never collected on PA-7100LC, or any
# other 1.1 CPU, because I don't have access to such machine with
# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
# of ~5x on PA-8600.
#
# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
# reportedly ~2x faster than vendor compiler generated code [according
# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
# this implementation is actually 32-bit one, in the sense that it
# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
# 64-bit BN_LONGs... How do they interoperate then? No problem. This
# module picks halves of 64-bit values in reverse order and pretends
# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
# i.e. there is no "wider" multiplication like on most other 64-bit
# platforms. This means that even being effectively 32-bit, this
# implementation performs "64-bit" computational task in same amount
# of arithmetic operations, most notably multiplications. It requires
# more memory references, most notably to tp[num], but this doesn't
# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
# 2.0 code path provides virtually same performance as pa-risc2[W].s:
# it's ~10% better for shortest key length and ~10% worse for longest
# one.
#
# In case it wasn't clear. The module has two distinct code paths:
# PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
# additions and 64-bit integer loads, not to mention specific
# instruction scheduling. In 64-bit build naturally only 2.0 code path
# is assembled. In 32-bit application context both code paths are
# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
# is taken automatically. Also, in 32-bit build the module imposes
# couple of limitations: vector lengths has to be even and vector
# addresses has to be 64-bit aligned. Normally neither is a problem:
# most common key lengths are even and vectors are commonly malloc-ed,
# which ensures alignment.
#
# Special thanks to polarhome.com for providing HP-UX account on
# PA-RISC 1.1 machine, and to correspondent who chose to remain
# anonymous for testing the code on PA-RISC 2.0 machine.
62 63$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 64 65$flavour = shift; 66$output = shift; 67 68open STDOUT,">$output"; 69 70if ($flavour =~ /64/) { 71 $LEVEL ="2.0W"; 72 $SIZE_T =8; 73 $FRAME_MARKER =80; 74 $SAVED_RP =16; 75 $PUSH ="std"; 76 $PUSHMA ="std,ma"; 77 $POP ="ldd"; 78 $POPMB ="ldd,mb"; 79 $BN_SZ =$SIZE_T; 80} else { 81 $LEVEL ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0"; 82 $SIZE_T =4; 83 $FRAME_MARKER =48; 84 $SAVED_RP =20; 85 $PUSH ="stw"; 86 $PUSHMA ="stwm"; 87 $POP ="ldw"; 88 $POPMB ="ldwm"; 89 $BN_SZ =$SIZE_T; 90 if (open CONF,"<${dir}../../opensslconf.h") { 91 while(<CONF>) { 92 if (m/#\s*define\s+SIXTY_FOUR_BIT/) { 93 $BN_SZ=8; 94 $LEVEL="2.0"; 95 last; 96 } 97 } 98 close CONF; 99 } 100} 101 102$FRAME=8*$SIZE_T+$FRAME_MARKER; # 8 saved regs + frame marker 103 # [+ argument transfer] 104$LOCALS=$FRAME-$FRAME_MARKER; 105$FRAME+=32; # local variables 106 107$tp="%r31"; 108$ti1="%r29"; 109$ti0="%r28"; 110 111$rp="%r26"; 112$ap="%r25"; 113$bp="%r24"; 114$np="%r23"; 115$n0="%r22"; # passed through stack in 32-bit 116$num="%r21"; # passed through stack in 32-bit 117$idx="%r20"; 118$arrsz="%r19"; 119 120$nm1="%r7"; 121$nm0="%r6"; 122$ab1="%r5"; 123$ab0="%r4"; 124 125$fp="%r3"; 126$hi1="%r2"; 127$hi0="%r1"; 128 129$xfer=$n0; # accomodates [-16..15] offset in fld[dw]s 130 131$fm0="%fr4"; $fti=$fm0; 132$fbi="%fr5L"; 133$fn0="%fr5R"; 134$fai="%fr6"; $fab0="%fr7"; $fab1="%fr8"; 135$fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11"; 136 137$code=<<___; 138 .LEVEL $LEVEL 139 .SPACE \$TEXT\$ 140 .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY 141 142 .EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR 143 .ALIGN 64 144bn_mul_mont 145 .PROC 146 .CALLINFO FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6 147 .ENTRY 148 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue 149 $PUSHMA %r3,$FRAME(%sp) 150 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) 151 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) 152 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) 153 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) 154 $PUSH 
%r8,`-$FRAME+5*$SIZE_T`(%sp) 155 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) 156 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) 157 ldo -$FRAME(%sp),$fp 158___ 159$code.=<<___ if ($SIZE_T==4); 160 ldw `-$FRAME_MARKER-4`($fp),$n0 161 ldw `-$FRAME_MARKER-8`($fp),$num 162 nop 163 nop ; alignment 164___ 165$code.=<<___ if ($BN_SZ==4); 166 comiclr,<= 6,$num,%r0 ; are vectors long enough? 167 b L\$abort 168 ldi 0,%r28 ; signal "unhandled" 169 add,ev %r0,$num,$num ; is $num even? 170 b L\$abort 171 nop 172 or $ap,$np,$ti1 173 extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned? 174 b L\$abort 175 nop 176 nop ; alignment 177 nop 178 179 fldws 0($n0),${fn0} 180 fldws,ma 4($bp),${fbi} ; bp[0] 181___ 182$code.=<<___ if ($BN_SZ==8); 183 comib,> 3,$num,L\$abort ; are vectors long enough? 184 ldi 0,%r28 ; signal "unhandled" 185 addl $num,$num,$num ; I operate on 32-bit values 186 187 fldws 4($n0),${fn0} ; only low part of n0 188 fldws 4($bp),${fbi} ; bp[0] in flipped word order 189___ 190$code.=<<___; 191 fldds 0($ap),${fai} ; ap[0,1] 192 fldds 0($np),${fni} ; np[0,1] 193 194 sh2addl $num,%r0,$arrsz 195 ldi 31,$hi0 196 ldo 36($arrsz),$hi1 ; space for tp[num+1] 197 andcm $hi1,$hi0,$hi1 ; align 198 addl $hi1,%sp,%sp 199 $PUSH $fp,-$SIZE_T(%sp) 200 201 ldo `$LOCALS+16`($fp),$xfer 202 ldo `$LOCALS+32+4`($fp),$tp 203 204 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0] 205 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0] 206 xmpyu ${fn0},${fab0}R,${fm0} 207 208 addl $arrsz,$ap,$ap ; point at the end 209 addl $arrsz,$np,$np 210 subi 0,$arrsz,$idx ; j=0 211 ldo 8($idx),$idx ; j++++ 212 213 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m 214 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m 215 fstds ${fab0},-16($xfer) 216 fstds ${fnm0},-8($xfer) 217 fstds ${fab1},0($xfer) 218 fstds ${fnm1},8($xfer) 219 flddx $idx($ap),${fai} ; ap[2,3] 220 flddx $idx($np),${fni} ; np[2,3] 221___ 222$code.=<<___ if ($BN_SZ==4); 223 mtctl $hi0,%cr11 ; $hi0 still holds 31 224 extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0 225 b L\$parisc11 226 nop 
227___ 228$code.=<<___; # PA-RISC 2.0 code-path 229 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] 230 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 231 ldd -16($xfer),$ab0 232 fstds ${fab0},-16($xfer) 233 234 extrd,u $ab0,31,32,$hi0 235 extrd,u $ab0,63,32,$ab0 236 ldd -8($xfer),$nm0 237 fstds ${fnm0},-8($xfer) 238 ldo 8($idx),$idx ; j++++ 239 addl $ab0,$nm0,$nm0 ; low part is discarded 240 extrd,u $nm0,31,32,$hi1 241 242L\$1st 243 xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0] 244 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m 245 ldd 0($xfer),$ab1 246 fstds ${fab1},0($xfer) 247 addl $hi0,$ab1,$ab1 248 extrd,u $ab1,31,32,$hi0 249 ldd 8($xfer),$nm1 250 fstds ${fnm1},8($xfer) 251 extrd,u $ab1,63,32,$ab1 252 addl $hi1,$nm1,$nm1 253 flddx $idx($ap),${fai} ; ap[j,j+1] 254 flddx $idx($np),${fni} ; np[j,j+1] 255 addl $ab1,$nm1,$nm1 256 extrd,u $nm1,31,32,$hi1 257 258 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] 259 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 260 ldd -16($xfer),$ab0 261 fstds ${fab0},-16($xfer) 262 addl $hi0,$ab0,$ab0 263 extrd,u $ab0,31,32,$hi0 264 ldd -8($xfer),$nm0 265 fstds ${fnm0},-8($xfer) 266 extrd,u $ab0,63,32,$ab0 267 addl $hi1,$nm0,$nm0 268 stw $nm1,-4($tp) ; tp[j-1] 269 addl $ab0,$nm0,$nm0 270 stw,ma $nm0,8($tp) ; tp[j-1] 271 addib,<> 8,$idx,L\$1st ; j++++ 272 extrd,u $nm0,31,32,$hi1 273 274 xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0] 275 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m 276 ldd 0($xfer),$ab1 277 fstds ${fab1},0($xfer) 278 addl $hi0,$ab1,$ab1 279 extrd,u $ab1,31,32,$hi0 280 ldd 8($xfer),$nm1 281 fstds ${fnm1},8($xfer) 282 extrd,u $ab1,63,32,$ab1 283 addl $hi1,$nm1,$nm1 284 ldd -16($xfer),$ab0 285 addl $ab1,$nm1,$nm1 286 ldd -8($xfer),$nm0 287 extrd,u $nm1,31,32,$hi1 288 289 addl $hi0,$ab0,$ab0 290 extrd,u $ab0,31,32,$hi0 291 stw $nm1,-4($tp) ; tp[j-1] 292 extrd,u $ab0,63,32,$ab0 293 addl $hi1,$nm0,$nm0 294 ldd 0($xfer),$ab1 295 addl $ab0,$nm0,$nm0 296 ldd,mb 8($xfer),$nm1 297 extrd,u $nm0,31,32,$hi1 298 stw,ma $nm0,8($tp) ; tp[j-1] 299 300 ldo -1($num),$num ; 
i-- 301 subi 0,$arrsz,$idx ; j=0 302___ 303$code.=<<___ if ($BN_SZ==4); 304 fldws,ma 4($bp),${fbi} ; bp[1] 305___ 306$code.=<<___ if ($BN_SZ==8); 307 fldws 0($bp),${fbi} ; bp[1] in flipped word order 308___ 309$code.=<<___; 310 flddx $idx($ap),${fai} ; ap[0,1] 311 flddx $idx($np),${fni} ; np[0,1] 312 fldws 8($xfer),${fti}R ; tp[0] 313 addl $hi0,$ab1,$ab1 314 extrd,u $ab1,31,32,$hi0 315 extrd,u $ab1,63,32,$ab1 316 ldo 8($idx),$idx ; j++++ 317 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1] 318 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1] 319 addl $hi1,$nm1,$nm1 320 addl $ab1,$nm1,$nm1 321 extrd,u $nm1,31,32,$hi1 322 fstws,mb ${fab0}L,-8($xfer) ; save high part 323 stw $nm1,-4($tp) ; tp[j-1] 324 325 fcpy,sgl %fr0,${fti}L ; zero high part 326 fcpy,sgl %fr0,${fab0}L 327 addl $hi1,$hi0,$hi0 328 extrd,u $hi0,31,32,$hi1 329 fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double 330 fcnvxf,dbl,dbl ${fab0},${fab0} 331 stw $hi0,0($tp) 332 stw $hi1,4($tp) 333 334 fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] 335 fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int 336 xmpyu ${fn0},${fab0}R,${fm0} 337 ldo `$LOCALS+32+4`($fp),$tp 338L\$outer 339 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m 340 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m 341 fstds ${fab0},-16($xfer) ; 33-bit value 342 fstds ${fnm0},-8($xfer) 343 flddx $idx($ap),${fai} ; ap[2] 344 flddx $idx($np),${fni} ; np[2] 345 ldo 8($idx),$idx ; j++++ 346 ldd -16($xfer),$ab0 ; 33-bit value 347 ldd -8($xfer),$nm0 348 ldw 0($xfer),$hi0 ; high part 349 350 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] 351 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 352 extrd,u $ab0,31,32,$ti0 ; carry bit 353 extrd,u $ab0,63,32,$ab0 354 fstds ${fab1},0($xfer) 355 addl $ti0,$hi0,$hi0 ; account carry bit 356 fstds ${fnm1},8($xfer) 357 addl $ab0,$nm0,$nm0 ; low part is discarded 358 ldw 0($tp),$ti1 ; tp[1] 359 extrd,u $nm0,31,32,$hi1 360 fstds ${fab0},-16($xfer) 361 fstds ${fnm0},-8($xfer) 362 363L\$inner 364 xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i] 365 
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m 366 ldd 0($xfer),$ab1 367 fstds ${fab1},0($xfer) 368 addl $hi0,$ti1,$ti1 369 addl $ti1,$ab1,$ab1 370 ldd 8($xfer),$nm1 371 fstds ${fnm1},8($xfer) 372 extrd,u $ab1,31,32,$hi0 373 extrd,u $ab1,63,32,$ab1 374 flddx $idx($ap),${fai} ; ap[j,j+1] 375 flddx $idx($np),${fni} ; np[j,j+1] 376 addl $hi1,$nm1,$nm1 377 addl $ab1,$nm1,$nm1 378 ldw 4($tp),$ti0 ; tp[j] 379 stw $nm1,-4($tp) ; tp[j-1] 380 381 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] 382 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 383 ldd -16($xfer),$ab0 384 fstds ${fab0},-16($xfer) 385 addl $hi0,$ti0,$ti0 386 addl $ti0,$ab0,$ab0 387 ldd -8($xfer),$nm0 388 fstds ${fnm0},-8($xfer) 389 extrd,u $ab0,31,32,$hi0 390 extrd,u $nm1,31,32,$hi1 391 ldw 8($tp),$ti1 ; tp[j] 392 extrd,u $ab0,63,32,$ab0 393 addl $hi1,$nm0,$nm0 394 addl $ab0,$nm0,$nm0 395 stw,ma $nm0,8($tp) ; tp[j-1] 396 addib,<> 8,$idx,L\$inner ; j++++ 397 extrd,u $nm0,31,32,$hi1 398 399 xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i] 400 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m 401 ldd 0($xfer),$ab1 402 fstds ${fab1},0($xfer) 403 addl $hi0,$ti1,$ti1 404 addl $ti1,$ab1,$ab1 405 ldd 8($xfer),$nm1 406 fstds ${fnm1},8($xfer) 407 extrd,u $ab1,31,32,$hi0 408 extrd,u $ab1,63,32,$ab1 409 ldw 4($tp),$ti0 ; tp[j] 410 addl $hi1,$nm1,$nm1 411 addl $ab1,$nm1,$nm1 412 ldd -16($xfer),$ab0 413 ldd -8($xfer),$nm0 414 extrd,u $nm1,31,32,$hi1 415 416 addl $hi0,$ab0,$ab0 417 addl $ti0,$ab0,$ab0 418 stw $nm1,-4($tp) ; tp[j-1] 419 extrd,u $ab0,31,32,$hi0 420 ldw 8($tp),$ti1 ; tp[j] 421 extrd,u $ab0,63,32,$ab0 422 addl $hi1,$nm0,$nm0 423 ldd 0($xfer),$ab1 424 addl $ab0,$nm0,$nm0 425 ldd,mb 8($xfer),$nm1 426 extrd,u $nm0,31,32,$hi1 427 stw,ma $nm0,8($tp) ; tp[j-1] 428 429 addib,= -1,$num,L\$outerdone ; i-- 430 subi 0,$arrsz,$idx ; j=0 431___ 432$code.=<<___ if ($BN_SZ==4); 433 fldws,ma 4($bp),${fbi} ; bp[i] 434___ 435$code.=<<___ if ($BN_SZ==8); 436 ldi 12,$ti0 ; bp[i] in flipped word order 437 addl,ev %r0,$num,$num 438 ldi -4,$ti0 439 addl $ti0,$bp,$bp 
440 fldws 0($bp),${fbi} 441___ 442$code.=<<___; 443 flddx $idx($ap),${fai} ; ap[0] 444 addl $hi0,$ab1,$ab1 445 flddx $idx($np),${fni} ; np[0] 446 fldws 8($xfer),${fti}R ; tp[0] 447 addl $ti1,$ab1,$ab1 448 extrd,u $ab1,31,32,$hi0 449 extrd,u $ab1,63,32,$ab1 450 451 ldo 8($idx),$idx ; j++++ 452 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i] 453 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i] 454 ldw 4($tp),$ti0 ; tp[j] 455 456 addl $hi1,$nm1,$nm1 457 fstws,mb ${fab0}L,-8($xfer) ; save high part 458 addl $ab1,$nm1,$nm1 459 extrd,u $nm1,31,32,$hi1 460 fcpy,sgl %fr0,${fti}L ; zero high part 461 fcpy,sgl %fr0,${fab0}L 462 stw $nm1,-4($tp) ; tp[j-1] 463 464 fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double 465 fcnvxf,dbl,dbl ${fab0},${fab0} 466 addl $hi1,$hi0,$hi0 467 fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] 468 addl $ti0,$hi0,$hi0 469 extrd,u $hi0,31,32,$hi1 470 fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int 471 stw $hi0,0($tp) 472 stw $hi1,4($tp) 473 xmpyu ${fn0},${fab0}R,${fm0} 474 475 b L\$outer 476 ldo `$LOCALS+32+4`($fp),$tp 477 478L\$outerdone 479 addl $hi0,$ab1,$ab1 480 addl $ti1,$ab1,$ab1 481 extrd,u $ab1,31,32,$hi0 482 extrd,u $ab1,63,32,$ab1 483 484 ldw 4($tp),$ti0 ; tp[j] 485 486 addl $hi1,$nm1,$nm1 487 addl $ab1,$nm1,$nm1 488 extrd,u $nm1,31,32,$hi1 489 stw $nm1,-4($tp) ; tp[j-1] 490 491 addl $hi1,$hi0,$hi0 492 addl $ti0,$hi0,$hi0 493 extrd,u $hi0,31,32,$hi1 494 stw $hi0,0($tp) 495 stw $hi1,4($tp) 496 497 ldo `$LOCALS+32`($fp),$tp 498 sub %r0,%r0,%r0 ; clear borrow 499___ 500$code.=<<___ if ($BN_SZ==4); 501 ldws,ma 4($tp),$ti0 502 extru,= $rp,31,3,%r0 ; is rp 64-bit aligned? 
503 b L\$sub_pa11 504 addl $tp,$arrsz,$tp 505L\$sub 506 ldwx $idx($np),$hi0 507 subb $ti0,$hi0,$hi1 508 ldwx $idx($tp),$ti0 509 addib,<> 4,$idx,L\$sub 510 stws,ma $hi1,4($rp) 511 512 subb $ti0,%r0,$hi1 513___ 514$code.=<<___ if ($BN_SZ==8); 515 ldd,ma 8($tp),$ti0 516L\$sub 517 ldd $idx($np),$hi0 518 shrpd $ti0,$ti0,32,$ti0 ; flip word order 519 std $ti0,-8($tp) ; save flipped value 520 sub,db $ti0,$hi0,$hi1 521 ldd,ma 8($tp),$ti0 522 addib,<> 8,$idx,L\$sub 523 std,ma $hi1,8($rp) 524 525 extrd,u $ti0,31,32,$ti0 ; carry in flipped word order 526 sub,db $ti0,%r0,$hi1 527___ 528$code.=<<___; 529 ldo `$LOCALS+32`($fp),$tp 530 sub $rp,$arrsz,$rp ; rewind rp 531 subi 0,$arrsz,$idx 532L\$copy 533 ldd 0($tp),$ti0 534 ldd 0($rp),$hi0 535 std,ma %r0,8($tp) 536 comiclr,= 0,$hi1,%r0 537 copy $ti0,$hi0 538 addib,<> 8,$idx,L\$copy 539 std,ma $hi0,8($rp) 540___ 541 542if ($BN_SZ==4) { # PA-RISC 1.1 code-path 543$ablo=$ab0; 544$abhi=$ab1; 545$nmlo0=$nm0; 546$nmhi0=$nm1; 547$nmlo1="%r9"; 548$nmhi1="%r8"; 549 550$code.=<<___; 551 b L\$done 552 nop 553 554 .ALIGN 8 555L\$parisc11 556 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] 557 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 558 ldw -12($xfer),$ablo 559 ldw -16($xfer),$hi0 560 ldw -4($xfer),$nmlo0 561 ldw -8($xfer),$nmhi0 562 fstds ${fab0},-16($xfer) 563 fstds ${fnm0},-8($xfer) 564 565 ldo 8($idx),$idx ; j++++ 566 add $ablo,$nmlo0,$nmlo0 ; discarded 567 addc %r0,$nmhi0,$hi1 568 ldw 4($xfer),$ablo 569 ldw 0($xfer),$abhi 570 nop 571 572L\$1st_pa11 573 xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0] 574 flddx $idx($ap),${fai} ; ap[j,j+1] 575 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m 576 flddx $idx($np),${fni} ; np[j,j+1] 577 add $hi0,$ablo,$ablo 578 ldw 12($xfer),$nmlo1 579 addc %r0,$abhi,$hi0 580 ldw 8($xfer),$nmhi1 581 add $ablo,$nmlo1,$nmlo1 582 fstds ${fab1},0($xfer) 583 addc %r0,$nmhi1,$nmhi1 584 fstds ${fnm1},8($xfer) 585 add $hi1,$nmlo1,$nmlo1 586 ldw -12($xfer),$ablo 587 addc %r0,$nmhi1,$hi1 588 ldw -16($xfer),$abhi 589 590 xmpyu 
${fai}L,${fbi},${fab0} ; ap[j]*bp[0] 591 ldw -4($xfer),$nmlo0 592 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 593 ldw -8($xfer),$nmhi0 594 add $hi0,$ablo,$ablo 595 stw $nmlo1,-4($tp) ; tp[j-1] 596 addc %r0,$abhi,$hi0 597 fstds ${fab0},-16($xfer) 598 add $ablo,$nmlo0,$nmlo0 599 fstds ${fnm0},-8($xfer) 600 addc %r0,$nmhi0,$nmhi0 601 ldw 0($xfer),$abhi 602 add $hi1,$nmlo0,$nmlo0 603 ldw 4($xfer),$ablo 604 stws,ma $nmlo0,8($tp) ; tp[j-1] 605 addib,<> 8,$idx,L\$1st_pa11 ; j++++ 606 addc %r0,$nmhi0,$hi1 607 608 ldw 8($xfer),$nmhi1 609 ldw 12($xfer),$nmlo1 610 xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0] 611 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m 612 add $hi0,$ablo,$ablo 613 fstds ${fab1},0($xfer) 614 addc %r0,$abhi,$hi0 615 fstds ${fnm1},8($xfer) 616 add $ablo,$nmlo1,$nmlo1 617 ldw -16($xfer),$abhi 618 addc %r0,$nmhi1,$nmhi1 619 ldw -12($xfer),$ablo 620 add $hi1,$nmlo1,$nmlo1 621 ldw -8($xfer),$nmhi0 622 addc %r0,$nmhi1,$hi1 623 ldw -4($xfer),$nmlo0 624 625 add $hi0,$ablo,$ablo 626 stw $nmlo1,-4($tp) ; tp[j-1] 627 addc %r0,$abhi,$hi0 628 ldw 0($xfer),$abhi 629 add $ablo,$nmlo0,$nmlo0 630 ldw 4($xfer),$ablo 631 addc %r0,$nmhi0,$nmhi0 632 ldws,mb 8($xfer),$nmhi1 633 add $hi1,$nmlo0,$nmlo0 634 ldw 4($xfer),$nmlo1 635 addc %r0,$nmhi0,$hi1 636 stws,ma $nmlo0,8($tp) ; tp[j-1] 637 638 ldo -1($num),$num ; i-- 639 subi 0,$arrsz,$idx ; j=0 640 641 fldws,ma 4($bp),${fbi} ; bp[1] 642 flddx $idx($ap),${fai} ; ap[0,1] 643 flddx $idx($np),${fni} ; np[0,1] 644 fldws 8($xfer),${fti}R ; tp[0] 645 add $hi0,$ablo,$ablo 646 addc %r0,$abhi,$hi0 647 ldo 8($idx),$idx ; j++++ 648 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1] 649 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1] 650 add $hi1,$nmlo1,$nmlo1 651 addc %r0,$nmhi1,$nmhi1 652 add $ablo,$nmlo1,$nmlo1 653 addc %r0,$nmhi1,$hi1 654 fstws,mb ${fab0}L,-8($xfer) ; save high part 655 stw $nmlo1,-4($tp) ; tp[j-1] 656 657 fcpy,sgl %fr0,${fti}L ; zero high part 658 fcpy,sgl %fr0,${fab0}L 659 add $hi1,$hi0,$hi0 660 addc %r0,%r0,$hi1 661 fcnvxf,dbl,dbl 
${fti},${fti} ; 32-bit unsigned int -> double 662 fcnvxf,dbl,dbl ${fab0},${fab0} 663 stw $hi0,0($tp) 664 stw $hi1,4($tp) 665 666 fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] 667 fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int 668 xmpyu ${fn0},${fab0}R,${fm0} 669 ldo `$LOCALS+32+4`($fp),$tp 670L\$outer_pa11 671 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m 672 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m 673 fstds ${fab0},-16($xfer) ; 33-bit value 674 fstds ${fnm0},-8($xfer) 675 flddx $idx($ap),${fai} ; ap[2,3] 676 flddx $idx($np),${fni} ; np[2,3] 677 ldw -16($xfer),$abhi ; carry bit actually 678 ldo 8($idx),$idx ; j++++ 679 ldw -12($xfer),$ablo 680 ldw -8($xfer),$nmhi0 681 ldw -4($xfer),$nmlo0 682 ldw 0($xfer),$hi0 ; high part 683 684 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] 685 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 686 fstds ${fab1},0($xfer) 687 addl $abhi,$hi0,$hi0 ; account carry bit 688 fstds ${fnm1},8($xfer) 689 add $ablo,$nmlo0,$nmlo0 ; discarded 690 ldw 0($tp),$ti1 ; tp[1] 691 addc %r0,$nmhi0,$hi1 692 fstds ${fab0},-16($xfer) 693 fstds ${fnm0},-8($xfer) 694 ldw 4($xfer),$ablo 695 ldw 0($xfer),$abhi 696 697L\$inner_pa11 698 xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i] 699 flddx $idx($ap),${fai} ; ap[j,j+1] 700 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m 701 flddx $idx($np),${fni} ; np[j,j+1] 702 add $hi0,$ablo,$ablo 703 ldw 4($tp),$ti0 ; tp[j] 704 addc %r0,$abhi,$abhi 705 ldw 12($xfer),$nmlo1 706 add $ti1,$ablo,$ablo 707 ldw 8($xfer),$nmhi1 708 addc %r0,$abhi,$hi0 709 fstds ${fab1},0($xfer) 710 add $ablo,$nmlo1,$nmlo1 711 fstds ${fnm1},8($xfer) 712 addc %r0,$nmhi1,$nmhi1 713 ldw -12($xfer),$ablo 714 add $hi1,$nmlo1,$nmlo1 715 ldw -16($xfer),$abhi 716 addc %r0,$nmhi1,$hi1 717 718 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] 719 ldw 8($tp),$ti1 ; tp[j] 720 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 721 ldw -4($xfer),$nmlo0 722 add $hi0,$ablo,$ablo 723 ldw -8($xfer),$nmhi0 724 addc %r0,$abhi,$abhi 725 stw $nmlo1,-4($tp) ; tp[j-1] 726 add $ti0,$ablo,$ablo 727 
fstds ${fab0},-16($xfer) 728 addc %r0,$abhi,$hi0 729 fstds ${fnm0},-8($xfer) 730 add $ablo,$nmlo0,$nmlo0 731 ldw 4($xfer),$ablo 732 addc %r0,$nmhi0,$nmhi0 733 ldw 0($xfer),$abhi 734 add $hi1,$nmlo0,$nmlo0 735 stws,ma $nmlo0,8($tp) ; tp[j-1] 736 addib,<> 8,$idx,L\$inner_pa11 ; j++++ 737 addc %r0,$nmhi0,$hi1 738 739 xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i] 740 ldw 12($xfer),$nmlo1 741 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m 742 ldw 8($xfer),$nmhi1 743 add $hi0,$ablo,$ablo 744 ldw 4($tp),$ti0 ; tp[j] 745 addc %r0,$abhi,$abhi 746 fstds ${fab1},0($xfer) 747 add $ti1,$ablo,$ablo 748 fstds ${fnm1},8($xfer) 749 addc %r0,$abhi,$hi0 750 ldw -16($xfer),$abhi 751 add $ablo,$nmlo1,$nmlo1 752 ldw -12($xfer),$ablo 753 addc %r0,$nmhi1,$nmhi1 754 ldw -8($xfer),$nmhi0 755 add $hi1,$nmlo1,$nmlo1 756 ldw -4($xfer),$nmlo0 757 addc %r0,$nmhi1,$hi1 758 759 add $hi0,$ablo,$ablo 760 stw $nmlo1,-4($tp) ; tp[j-1] 761 addc %r0,$abhi,$abhi 762 add $ti0,$ablo,$ablo 763 ldw 8($tp),$ti1 ; tp[j] 764 addc %r0,$abhi,$hi0 765 ldw 0($xfer),$abhi 766 add $ablo,$nmlo0,$nmlo0 767 ldw 4($xfer),$ablo 768 addc %r0,$nmhi0,$nmhi0 769 ldws,mb 8($xfer),$nmhi1 770 add $hi1,$nmlo0,$nmlo0 771 ldw 4($xfer),$nmlo1 772 addc %r0,$nmhi0,$hi1 773 stws,ma $nmlo0,8($tp) ; tp[j-1] 774 775 addib,= -1,$num,L\$outerdone_pa11; i-- 776 subi 0,$arrsz,$idx ; j=0 777 778 fldws,ma 4($bp),${fbi} ; bp[i] 779 flddx $idx($ap),${fai} ; ap[0] 780 add $hi0,$ablo,$ablo 781 addc %r0,$abhi,$abhi 782 flddx $idx($np),${fni} ; np[0] 783 fldws 8($xfer),${fti}R ; tp[0] 784 add $ti1,$ablo,$ablo 785 addc %r0,$abhi,$hi0 786 787 ldo 8($idx),$idx ; j++++ 788 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i] 789 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i] 790 ldw 4($tp),$ti0 ; tp[j] 791 792 add $hi1,$nmlo1,$nmlo1 793 addc %r0,$nmhi1,$nmhi1 794 fstws,mb ${fab0}L,-8($xfer) ; save high part 795 add $ablo,$nmlo1,$nmlo1 796 addc %r0,$nmhi1,$hi1 797 fcpy,sgl %fr0,${fti}L ; zero high part 798 fcpy,sgl %fr0,${fab0}L 799 stw $nmlo1,-4($tp) ; tp[j-1] 800 801 
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double 802 fcnvxf,dbl,dbl ${fab0},${fab0} 803 add $hi1,$hi0,$hi0 804 addc %r0,%r0,$hi1 805 fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] 806 add $ti0,$hi0,$hi0 807 addc %r0,$hi1,$hi1 808 fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int 809 stw $hi0,0($tp) 810 stw $hi1,4($tp) 811 xmpyu ${fn0},${fab0}R,${fm0} 812 813 b L\$outer_pa11 814 ldo `$LOCALS+32+4`($fp),$tp 815 816L\$outerdone_pa11 817 add $hi0,$ablo,$ablo 818 addc %r0,$abhi,$abhi 819 add $ti1,$ablo,$ablo 820 addc %r0,$abhi,$hi0 821 822 ldw 4($tp),$ti0 ; tp[j] 823 824 add $hi1,$nmlo1,$nmlo1 825 addc %r0,$nmhi1,$nmhi1 826 add $ablo,$nmlo1,$nmlo1 827 addc %r0,$nmhi1,$hi1 828 stw $nmlo1,-4($tp) ; tp[j-1] 829 830 add $hi1,$hi0,$hi0 831 addc %r0,%r0,$hi1 832 add $ti0,$hi0,$hi0 833 addc %r0,$hi1,$hi1 834 stw $hi0,0($tp) 835 stw $hi1,4($tp) 836 837 ldo `$LOCALS+32+4`($fp),$tp 838 sub %r0,%r0,%r0 ; clear borrow 839 ldw -4($tp),$ti0 840 addl $tp,$arrsz,$tp 841L\$sub_pa11 842 ldwx $idx($np),$hi0 843 subb $ti0,$hi0,$hi1 844 ldwx $idx($tp),$ti0 845 addib,<> 4,$idx,L\$sub_pa11 846 stws,ma $hi1,4($rp) 847 848 subb $ti0,%r0,$hi1 849 850 ldo `$LOCALS+32`($fp),$tp 851 sub $rp,$arrsz,$rp ; rewind rp 852 subi 0,$arrsz,$idx 853L\$copy_pa11 854 ldw 0($tp),$ti0 855 ldw 0($rp),$hi0 856 stws,ma %r0,4($tp) 857 comiclr,= 0,$hi1,%r0 858 copy $ti0,$hi0 859 addib,<> 4,$idx,L\$copy_pa11 860 stws,ma $hi0,4($rp) 861 862 nop ; alignment 863L\$done 864___ 865} 866 867$code.=<<___; 868 ldi 1,%r28 ; signal "handled" 869 ldo $FRAME($fp),%sp ; destroy tp[num+1] 870 871 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue 872 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 873 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 874 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 875 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 876 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 877 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 878 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 879L\$abort 880 bv (%r2) 881 .EXIT 882 $POPMB -$FRAME(%sp),%r3 883 .PROCEND 884 .STRINGZ "Montgomery 
Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>" 885___ 886 887# Explicitly encode PA-RISC 2.0 instructions used in this module, so 888# that it can be compiled with .LEVEL 1.0. It should be noted that I 889# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0 890# directive... 891 892my $ldd = sub { 893 my ($mod,$args) = @_; 894 my $orig = "ldd$mod\t$args"; 895 896 if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4 897 { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3; 898 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; 899 } 900 elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5 901 { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3; 902 $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset 903 $opcode|=(1<<5) if ($mod =~ /^,m/); 904 $opcode|=(1<<13) if ($mod =~ /^,mb/); 905 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; 906 } 907 else { "\t".$orig; } 908}; 909 910my $std = sub { 911 my ($mod,$args) = @_; 912 my $orig = "std$mod\t$args"; 913 914 if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 6 915 { my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6); 916 $opcode|=(($2&0xF)<<1)|(($2&0x10)>>4); # encode offset 917 $opcode|=(1<<5) if ($mod =~ /^,m/); 918 $opcode|=(1<<13) if ($mod =~ /^,mb/); 919 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; 920 } 921 else { "\t".$orig; } 922}; 923 924my $extrd = sub { 925 my ($mod,$args) = @_; 926 my $orig = "extrd$mod\t$args"; 927 928 # I only have ",u" completer, it's implicitly encoded... 
929 if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15 930 { my $opcode=(0x36<<26)|($1<<21)|($4<<16); 931 my $len=32-$3; 932 $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos 933 $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len 934 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; 935 } 936 elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12 937 { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9); 938 my $len=32-$2; 939 $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len 940 $opcode |= (1<<13) if ($mod =~ /,\**=/); 941 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; 942 } 943 else { "\t".$orig; } 944}; 945 946my $shrpd = sub { 947 my ($mod,$args) = @_; 948 my $orig = "shrpd$mod\t$args"; 949 950 if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14 951 { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4; 952 my $cpos=63-$3; 953 $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa 954 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; 955 } 956 else { "\t".$orig; } 957}; 958 959my $sub = sub { 960 my ($mod,$args) = @_; 961 my $orig = "sub$mod\t$args"; 962 963 if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) { 964 my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3; 965 $opcode|=(1<<10); # e1 966 $opcode|=(1<<8); # e2 967 $opcode|=(1<<5); # d 968 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig 969 } 970 else { "\t".$orig; } 971}; 972 973sub assemble { 974 my ($mnemonic,$mod,$args)=@_; 975 my $opcode = eval("\$$mnemonic"); 976 977 ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args"; 978} 979 980foreach (split("\n",$code)) { 981 s/\`([^\`]*)\`/eval $1/ge; 982 # flip word order in 64-bit mode... 983 s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8); 984 # assemble 2.0 instructions in 32-bit mode... 985 s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4); 986 987 s/\bbv\b/bve/gm if ($SIZE_T==8); 988 989 print $_,"\n"; 990} 991close STDOUT; 992