1#!/usr/bin/env perl 2 3# ==================================================================== 4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL 5# project. The module is, however, dual licensed under OpenSSL and 6# CRYPTOGAMS licenses depending on where you obtain it. For further 7# details see http://www.openssl.org/~appro/cryptogams/. 8# ==================================================================== 9 10# December 2007 11 12# The reason for undertaken effort is basically following. Even though 13# Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI 14# performance was observed to be less than impressive, essentially as 15# fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope. 16# Well, it's not surprising that IBM had to make some sacrifices to 17# boost the clock frequency that much, but no overall improvement? 18# Having observed how much difference did switching to FPU make on 19# UltraSPARC, playing same stunt on Power 6 appeared appropriate... 20# Unfortunately the resulting performance improvement is not as 21# impressive, ~30%, and in absolute terms is still very far from what 22# one would expect from 4.7GHz CPU. There is a chance that I'm doing 23# something wrong, but in the lack of assembler level micro-profiling 24# data or at least decent platform guide I can't tell... Or better 25# results might be achieved with VMX... Anyway, this module provides 26# *worse* performance on other PowerPC implementations, ~40-15% slower 27# on PPC970 depending on key length and ~40% slower on Power 5 for all 28# key lengths. As it's obviously inappropriate as "best all-round" 29# alternative, it has to be complemented with run-time CPU family 30# detection. Oh! It should also be noted that unlike other PowerPC 31# implementation IALU ppc-mont.pl module performs *suboptimaly* on 32# >=1024-bit key lengths on Power 6. It should also be noted that 33# *everything* said so far applies to 64-bit builds! As far as 32-bit 34# application executed on 64-bit CPU goes, this module is likely to 35# become preferred choice, because it's easy to adapt it for such 36# case and *is* faster than 32-bit ppc-mont.pl on *all* processors. 37 38# February 2008 39 40# Micro-profiling assisted optimization results in ~15% improvement 41# over original ppc64-mont.pl version, or overall ~50% improvement 42# over ppc.pl module on Power 6. If compared to ppc-mont.pl on same 43# Power 6 CPU, this module is 5-150% faster depending on key length, 44# [hereafter] more for longer keys. But if compared to ppc-mont.pl 45# on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive 46# in absolute terms, but it's apparently the way Power 6 is... 47 48$flavour = shift; 49 50if ($flavour =~ /32/) { 51 $SIZE_T=4; 52 $RZONE= 224; 53 $FRAME= $SIZE_T*12+8*12; 54 $fname= "bn_mul_mont_ppc64"; 55 56 $STUX= "stwux"; # store indexed and update 57 $PUSH= "stw"; 58 $POP= "lwz"; 59 die "not implemented yet"; 60} elsif ($flavour =~ /64/) { 61 $SIZE_T=8; 62 $RZONE= 288; 63 $FRAME= $SIZE_T*12+8*12; 64 $fname= "bn_mul_mont"; 65 66 # same as above, but 64-bit mnemonics... 67 $STUX= "stdux"; # store indexed and update 68 $PUSH= "std"; 69 $POP= "ld"; 70} else { die "nonsense $flavour"; } 71 72$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 73( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or 74( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or 75die "can't locate ppc-xlate.pl"; 76 77open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; 78 79$FRAME=($FRAME+63)&~63; 80$TRANSFER=16*8; 81 82$carry="r0"; 83$sp="r1"; 84$toc="r2"; 85$rp="r3"; $ovf="r3"; 86$ap="r4"; 87$bp="r5"; 88$np="r6"; 89$n0="r7"; 90$num="r8"; 91$rp="r9"; # $rp is reassigned 92$tp="r10"; 93$j="r11"; 94$i="r12"; 95# non-volatile registers 96$nap_d="r14"; # interleaved ap and np in double format 97$a0="r15"; # ap[0] 98$t0="r16"; # temporary registers 99$t1="r17"; 100$t2="r18"; 101$t3="r19"; 102$t4="r20"; 103$t5="r21"; 104$t6="r22"; 105$t7="r23"; 106 107# PPC offers enough register bank capacity to unroll inner loops twice 108# 109# ..A3A2A1A0 110# dcba 111# ----------- 112# A0a 113# A0b 114# A0c 115# A0d 116# A1a 117# A1b 118# A1c 119# A1d 120# A2a 121# A2b 122# A2c 123# A2d 124# A3a 125# A3b 126# A3c 127# A3d 128# ..a 129# ..b 130# 131$ba="f0"; $bb="f1"; $bc="f2"; $bd="f3"; 132$na="f4"; $nb="f5"; $nc="f6"; $nd="f7"; 133$dota="f8"; $dotb="f9"; 134$A0="f10"; $A1="f11"; $A2="f12"; $A3="f13"; 135$N0="f14"; $N1="f15"; $N2="f16"; $N3="f17"; 136$T0a="f18"; $T0b="f19"; 137$T1a="f20"; $T1b="f21"; 138$T2a="f22"; $T2b="f23"; 139$T3a="f24"; $T3b="f25"; 140 141# sp----------->+-------------------------------+ 142# | saved sp | 143# +-------------------------------+ 144# | | 145# +-------------------------------+ 146# | 10 saved gpr, r14-r23 | 147# . . 148# . . 149# +12*size_t +-------------------------------+ 150# | 12 saved fpr, f14-f25 | 151# . . 152# . . 153# +12*8 +-------------------------------+ 154# | padding to 64 byte boundary | 155# . . 156# +X +-------------------------------+ 157# | 16 gpr<->fpr transfer zone | 158# . . 159# . . 160# +16*8 +-------------------------------+ 161# | __int64 tmp[-1] | 162# +-------------------------------+ 163# | __int64 tmp[num] | 164# . . 165# . . 166# . . 167# +(num+1)*8 +-------------------------------+ 168# | padding to 64 byte boundary | 169# . . 170# +X +-------------------------------+ 171# | double nap_d[4*num] | 172# . . 173# . . 174# . . 175# +-------------------------------+ 176 177$code=<<___; 178.machine "any" 179.text 180 181.globl .$fname 182.align 5 183.$fname: 184 cmpwi $num,4 185 mr $rp,r3 ; $rp is reassigned 186 li r3,0 ; possible "not handled" return code 187 bltlr- 188 andi. r0,$num,1 ; $num has to be even 189 bnelr- 190 191 slwi $num,$num,3 ; num*=8 192 li $i,-4096 193 slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num 194 add $tp,$tp,$num ; place for tp[num+1] 195 addi $tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE` 196 subf $tp,$tp,$sp ; $sp-$tp 197 and $tp,$tp,$i ; minimize TLB usage 198 subf $tp,$sp,$tp ; $tp-$sp 199 $STUX $sp,$sp,$tp ; alloca 200 201 $PUSH r14,`2*$SIZE_T`($sp) 202 $PUSH r15,`3*$SIZE_T`($sp) 203 $PUSH r16,`4*$SIZE_T`($sp) 204 $PUSH r17,`5*$SIZE_T`($sp) 205 $PUSH r18,`6*$SIZE_T`($sp) 206 $PUSH r19,`7*$SIZE_T`($sp) 207 $PUSH r20,`8*$SIZE_T`($sp) 208 $PUSH r21,`9*$SIZE_T`($sp) 209 $PUSH r22,`10*$SIZE_T`($sp) 210 $PUSH r23,`11*$SIZE_T`($sp) 211 stfd f14,`12*$SIZE_T+0`($sp) 212 stfd f15,`12*$SIZE_T+8`($sp) 213 stfd f16,`12*$SIZE_T+16`($sp) 214 stfd f17,`12*$SIZE_T+24`($sp) 215 stfd f18,`12*$SIZE_T+32`($sp) 216 stfd f19,`12*$SIZE_T+40`($sp) 217 stfd f20,`12*$SIZE_T+48`($sp) 218 stfd f21,`12*$SIZE_T+56`($sp) 219 stfd f22,`12*$SIZE_T+64`($sp) 220 stfd f23,`12*$SIZE_T+72`($sp) 221 stfd f24,`12*$SIZE_T+80`($sp) 222 stfd f25,`12*$SIZE_T+88`($sp) 223 224 ld $a0,0($ap) ; pull ap[0] value 225 ld $n0,0($n0) ; pull n0[0] value 226 ld $t3,0($bp) ; bp[0] 227 228 addi $tp,$sp,`$FRAME+$TRANSFER+8+64` 229 li $i,-64 230 add $nap_d,$tp,$num 231 and $nap_d,$nap_d,$i ; align to 64 bytes 232 233 mulld $t7,$a0,$t3 ; ap[0]*bp[0] 234 ; nap_d is off by 1, because it's used with stfdu/lfdu 235 addi $nap_d,$nap_d,-8 236 srwi $j,$num,`3+1` ; counter register, num/2 237 mulld $t7,$t7,$n0 ; tp[0]*n0 238 addi $j,$j,-1 239 addi $tp,$sp,`$FRAME+$TRANSFER-8` 240 li $carry,0 241 mtctr $j 242 243 ; transfer bp[0] to FPU as 4x16-bit values 244 extrdi $t0,$t3,16,48 245 extrdi $t1,$t3,16,32 246 extrdi $t2,$t3,16,16 247 extrdi $t3,$t3,16,0 248 std $t0,`$FRAME+0`($sp) 249 std $t1,`$FRAME+8`($sp) 250 std $t2,`$FRAME+16`($sp) 251 std $t3,`$FRAME+24`($sp) 252 ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values 253 extrdi $t4,$t7,16,48 254 extrdi $t5,$t7,16,32 255 extrdi $t6,$t7,16,16 256 extrdi $t7,$t7,16,0 257 std $t4,`$FRAME+32`($sp) 258 std $t5,`$FRAME+40`($sp) 259 std $t6,`$FRAME+48`($sp) 260 std $t7,`$FRAME+56`($sp) 261 lwz $t0,4($ap) ; load a[j] as 32-bit word pair 262 lwz $t1,0($ap) 263 lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair 264 lwz $t3,8($ap) 265 lwz $t4,4($np) ; load n[j] as 32-bit word pair 266 lwz $t5,0($np) 267 lwz $t6,12($np) ; load n[j+1] as 32-bit word pair 268 lwz $t7,8($np) 269 lfd $ba,`$FRAME+0`($sp) 270 lfd $bb,`$FRAME+8`($sp) 271 lfd $bc,`$FRAME+16`($sp) 272 lfd $bd,`$FRAME+24`($sp) 273 lfd $na,`$FRAME+32`($sp) 274 lfd $nb,`$FRAME+40`($sp) 275 lfd $nc,`$FRAME+48`($sp) 276 lfd $nd,`$FRAME+56`($sp) 277 std $t0,`$FRAME+64`($sp) 278 std $t1,`$FRAME+72`($sp) 279 std $t2,`$FRAME+80`($sp) 280 std $t3,`$FRAME+88`($sp) 281 std $t4,`$FRAME+96`($sp) 282 std $t5,`$FRAME+104`($sp) 283 std $t6,`$FRAME+112`($sp) 284 std $t7,`$FRAME+120`($sp) 285 fcfid $ba,$ba 286 fcfid $bb,$bb 287 fcfid $bc,$bc 288 fcfid $bd,$bd 289 fcfid $na,$na 290 fcfid $nb,$nb 291 fcfid $nc,$nc 292 fcfid $nd,$nd 293 294 lfd $A0,`$FRAME+64`($sp) 295 lfd $A1,`$FRAME+72`($sp) 296 lfd $A2,`$FRAME+80`($sp) 297 lfd $A3,`$FRAME+88`($sp) 298 lfd $N0,`$FRAME+96`($sp) 299 lfd $N1,`$FRAME+104`($sp) 300 lfd $N2,`$FRAME+112`($sp) 301 lfd $N3,`$FRAME+120`($sp) 302 fcfid $A0,$A0 303 fcfid $A1,$A1 304 fcfid $A2,$A2 305 fcfid $A3,$A3 306 fcfid $N0,$N0 307 fcfid $N1,$N1 308 fcfid $N2,$N2 309 fcfid $N3,$N3 310 addi $ap,$ap,16 311 addi $np,$np,16 312 313 fmul $T1a,$A1,$ba 314 fmul $T1b,$A1,$bb 315 stfd $A0,8($nap_d) ; save a[j] in double format 316 stfd $A1,16($nap_d) 317 fmul $T2a,$A2,$ba 318 fmul $T2b,$A2,$bb 319 stfd $A2,24($nap_d) ; save a[j+1] in double format 320 stfd $A3,32($nap_d) 321 fmul $T3a,$A3,$ba 322 fmul $T3b,$A3,$bb 323 stfd $N0,40($nap_d) ; save n[j] in double format 324 stfd $N1,48($nap_d) 325 fmul $T0a,$A0,$ba 326 fmul $T0b,$A0,$bb 327 stfd $N2,56($nap_d) ; save n[j+1] in double format 328 stfdu $N3,64($nap_d) 329 330 fmadd $T1a,$A0,$bc,$T1a 331 fmadd $T1b,$A0,$bd,$T1b 332 fmadd $T2a,$A1,$bc,$T2a 333 fmadd $T2b,$A1,$bd,$T2b 334 fmadd $T3a,$A2,$bc,$T3a 335 fmadd $T3b,$A2,$bd,$T3b 336 fmul $dota,$A3,$bc 337 fmul $dotb,$A3,$bd 338 339 fmadd $T1a,$N1,$na,$T1a 340 fmadd $T1b,$N1,$nb,$T1b 341 fmadd $T2a,$N2,$na,$T2a 342 fmadd $T2b,$N2,$nb,$T2b 343 fmadd $T3a,$N3,$na,$T3a 344 fmadd $T3b,$N3,$nb,$T3b 345 fmadd $T0a,$N0,$na,$T0a 346 fmadd $T0b,$N0,$nb,$T0b 347 348 fmadd $T1a,$N0,$nc,$T1a 349 fmadd $T1b,$N0,$nd,$T1b 350 fmadd $T2a,$N1,$nc,$T2a 351 fmadd $T2b,$N1,$nd,$T2b 352 fmadd $T3a,$N2,$nc,$T3a 353 fmadd $T3b,$N2,$nd,$T3b 354 fmadd $dota,$N3,$nc,$dota 355 fmadd $dotb,$N3,$nd,$dotb 356 357 fctid $T0a,$T0a 358 fctid $T0b,$T0b 359 fctid $T1a,$T1a 360 fctid $T1b,$T1b 361 fctid $T2a,$T2a 362 fctid $T2b,$T2b 363 fctid $T3a,$T3a 364 fctid $T3b,$T3b 365 366 stfd $T0a,`$FRAME+0`($sp) 367 stfd $T0b,`$FRAME+8`($sp) 368 stfd $T1a,`$FRAME+16`($sp) 369 stfd $T1b,`$FRAME+24`($sp) 370 stfd $T2a,`$FRAME+32`($sp) 371 stfd $T2b,`$FRAME+40`($sp) 372 stfd $T3a,`$FRAME+48`($sp) 373 stfd $T3b,`$FRAME+56`($sp) 374 375.align 5 376L1st: 377 lwz $t0,4($ap) ; load a[j] as 32-bit word pair 378 lwz $t1,0($ap) 379 lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair 380 lwz $t3,8($ap) 381 lwz $t4,4($np) ; load n[j] as 32-bit word pair 382 lwz $t5,0($np) 383 lwz $t6,12($np) ; load n[j+1] as 32-bit word pair 384 lwz $t7,8($np) 385 std $t0,`$FRAME+64`($sp) 386 std $t1,`$FRAME+72`($sp) 387 std $t2,`$FRAME+80`($sp) 388 std $t3,`$FRAME+88`($sp) 389 std $t4,`$FRAME+96`($sp) 390 std $t5,`$FRAME+104`($sp) 391 std $t6,`$FRAME+112`($sp) 392 std $t7,`$FRAME+120`($sp) 393 ld $t0,`$FRAME+0`($sp) 394 ld $t1,`$FRAME+8`($sp) 395 ld $t2,`$FRAME+16`($sp) 396 ld $t3,`$FRAME+24`($sp) 397 ld $t4,`$FRAME+32`($sp) 398 ld $t5,`$FRAME+40`($sp) 399 ld $t6,`$FRAME+48`($sp) 400 ld $t7,`$FRAME+56`($sp) 401 lfd $A0,`$FRAME+64`($sp) 402 lfd $A1,`$FRAME+72`($sp) 403 lfd $A2,`$FRAME+80`($sp) 404 lfd $A3,`$FRAME+88`($sp) 405 lfd $N0,`$FRAME+96`($sp) 406 lfd $N1,`$FRAME+104`($sp) 407 lfd $N2,`$FRAME+112`($sp) 408 lfd $N3,`$FRAME+120`($sp) 409 fcfid $A0,$A0 410 fcfid $A1,$A1 411 fcfid $A2,$A2 412 fcfid $A3,$A3 413 fcfid $N0,$N0 414 fcfid $N1,$N1 415 fcfid $N2,$N2 416 fcfid $N3,$N3 417 addi $ap,$ap,16 418 addi $np,$np,16 419 420 fmul $T1a,$A1,$ba 421 fmul $T1b,$A1,$bb 422 fmul $T2a,$A2,$ba 423 fmul $T2b,$A2,$bb 424 stfd $A0,8($nap_d) ; save a[j] in double format 425 stfd $A1,16($nap_d) 426 fmul $T3a,$A3,$ba 427 fmul $T3b,$A3,$bb 428 fmadd $T0a,$A0,$ba,$dota 429 fmadd $T0b,$A0,$bb,$dotb 430 stfd $A2,24($nap_d) ; save a[j+1] in double format 431 stfd $A3,32($nap_d) 432 433 fmadd $T1a,$A0,$bc,$T1a 434 fmadd $T1b,$A0,$bd,$T1b 435 fmadd $T2a,$A1,$bc,$T2a 436 fmadd $T2b,$A1,$bd,$T2b 437 stfd $N0,40($nap_d) ; save n[j] in double format 438 stfd $N1,48($nap_d) 439 fmadd $T3a,$A2,$bc,$T3a 440 fmadd $T3b,$A2,$bd,$T3b 441 add $t0,$t0,$carry ; can not overflow 442 fmul $dota,$A3,$bc 443 fmul $dotb,$A3,$bd 444 stfd $N2,56($nap_d) ; save n[j+1] in double format 445 stfdu $N3,64($nap_d) 446 srdi $carry,$t0,16 447 add $t1,$t1,$carry 448 srdi $carry,$t1,16 449 450 fmadd $T1a,$N1,$na,$T1a 451 fmadd $T1b,$N1,$nb,$T1b 452 insrdi $t0,$t1,16,32 453 fmadd $T2a,$N2,$na,$T2a 454 fmadd $T2b,$N2,$nb,$T2b 455 add $t2,$t2,$carry 456 fmadd $T3a,$N3,$na,$T3a 457 fmadd $T3b,$N3,$nb,$T3b 458 srdi $carry,$t2,16 459 fmadd $T0a,$N0,$na,$T0a 460 fmadd $T0b,$N0,$nb,$T0b 461 insrdi $t0,$t2,16,16 462 add $t3,$t3,$carry 463 srdi $carry,$t3,16 464 465 fmadd $T1a,$N0,$nc,$T1a 466 fmadd $T1b,$N0,$nd,$T1b 467 insrdi $t0,$t3,16,0 ; 0..63 bits 468 fmadd $T2a,$N1,$nc,$T2a 469 fmadd $T2b,$N1,$nd,$T2b 470 add $t4,$t4,$carry 471 fmadd $T3a,$N2,$nc,$T3a 472 fmadd $T3b,$N2,$nd,$T3b 473 srdi $carry,$t4,16 474 fmadd $dota,$N3,$nc,$dota 475 fmadd $dotb,$N3,$nd,$dotb 476 add $t5,$t5,$carry 477 srdi $carry,$t5,16 478 insrdi $t4,$t5,16,32 479 480 fctid $T0a,$T0a 481 fctid $T0b,$T0b 482 add $t6,$t6,$carry 483 fctid $T1a,$T1a 484 fctid $T1b,$T1b 485 srdi $carry,$t6,16 486 fctid $T2a,$T2a 487 fctid $T2b,$T2b 488 insrdi $t4,$t6,16,16 489 fctid $T3a,$T3a 490 fctid $T3b,$T3b 491 add $t7,$t7,$carry 492 insrdi $t4,$t7,16,0 ; 64..127 bits 493 srdi $carry,$t7,16 ; upper 33 bits 494 495 stfd $T0a,`$FRAME+0`($sp) 496 stfd $T0b,`$FRAME+8`($sp) 497 stfd $T1a,`$FRAME+16`($sp) 498 stfd $T1b,`$FRAME+24`($sp) 499 stfd $T2a,`$FRAME+32`($sp) 500 stfd $T2b,`$FRAME+40`($sp) 501 stfd $T3a,`$FRAME+48`($sp) 502 stfd $T3b,`$FRAME+56`($sp) 503 std $t0,8($tp) ; tp[j-1] 504 stdu $t4,16($tp) ; tp[j] 505 bdnz- L1st 506 507 fctid $dota,$dota 508 fctid $dotb,$dotb 509 510 ld $t0,`$FRAME+0`($sp) 511 ld $t1,`$FRAME+8`($sp) 512 ld $t2,`$FRAME+16`($sp) 513 ld $t3,`$FRAME+24`($sp) 514 ld $t4,`$FRAME+32`($sp) 515 ld $t5,`$FRAME+40`($sp) 516 ld $t6,`$FRAME+48`($sp) 517 ld $t7,`$FRAME+56`($sp) 518 stfd $dota,`$FRAME+64`($sp) 519 stfd $dotb,`$FRAME+72`($sp) 520 521 add $t0,$t0,$carry ; can not overflow 522 srdi $carry,$t0,16 523 add $t1,$t1,$carry 524 srdi $carry,$t1,16 525 insrdi $t0,$t1,16,32 526 add $t2,$t2,$carry 527 srdi $carry,$t2,16 528 insrdi $t0,$t2,16,16 529 add $t3,$t3,$carry 530 srdi $carry,$t3,16 531 insrdi $t0,$t3,16,0 ; 0..63 bits 532 add $t4,$t4,$carry 533 srdi $carry,$t4,16 534 add $t5,$t5,$carry 535 srdi $carry,$t5,16 536 insrdi $t4,$t5,16,32 537 add $t6,$t6,$carry 538 srdi $carry,$t6,16 539 insrdi $t4,$t6,16,16 540 add $t7,$t7,$carry 541 insrdi $t4,$t7,16,0 ; 64..127 bits 542 srdi $carry,$t7,16 ; upper 33 bits 543 ld $t6,`$FRAME+64`($sp) 544 ld $t7,`$FRAME+72`($sp) 545 546 std $t0,8($tp) ; tp[j-1] 547 stdu $t4,16($tp) ; tp[j] 548 549 add $t6,$t6,$carry ; can not overflow 550 srdi $carry,$t6,16 551 add $t7,$t7,$carry 552 insrdi $t6,$t7,48,0 553 srdi $ovf,$t7,48 554 std $t6,8($tp) ; tp[num-1] 555 556 slwi $t7,$num,2 557 subf $nap_d,$t7,$nap_d ; rewind pointer 558 559 li $i,8 ; i=1 560.align 5 561Louter: 562 ldx $t3,$bp,$i ; bp[i] 563 ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0] 564 mulld $t7,$a0,$t3 ; ap[0]*bp[i] 565 566 addi $tp,$sp,`$FRAME+$TRANSFER` 567 add $t7,$t7,$t6 ; ap[0]*bp[i]+tp[0] 568 li $carry,0 569 mulld $t7,$t7,$n0 ; tp[0]*n0 570 mtctr $j 571 572 ; transfer bp[i] to FPU as 4x16-bit values 573 extrdi $t0,$t3,16,48 574 extrdi $t1,$t3,16,32 575 extrdi $t2,$t3,16,16 576 extrdi $t3,$t3,16,0 577 std $t0,`$FRAME+0`($sp) 578 std $t1,`$FRAME+8`($sp) 579 std $t2,`$FRAME+16`($sp) 580 std $t3,`$FRAME+24`($sp) 581 ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values 582 extrdi $t4,$t7,16,48 583 extrdi $t5,$t7,16,32 584 extrdi $t6,$t7,16,16 585 extrdi $t7,$t7,16,0 586 std $t4,`$FRAME+32`($sp) 587 std $t5,`$FRAME+40`($sp) 588 std $t6,`$FRAME+48`($sp) 589 std $t7,`$FRAME+56`($sp) 590 591 lfd $A0,8($nap_d) ; load a[j] in double format 592 lfd $A1,16($nap_d) 593 lfd $A2,24($nap_d) ; load a[j+1] in double format 594 lfd $A3,32($nap_d) 595 lfd $N0,40($nap_d) ; load n[j] in double format 596 lfd $N1,48($nap_d) 597 lfd $N2,56($nap_d) ; load n[j+1] in double format 598 lfdu $N3,64($nap_d) 599 600 lfd $ba,`$FRAME+0`($sp) 601 lfd $bb,`$FRAME+8`($sp) 602 lfd $bc,`$FRAME+16`($sp) 603 lfd $bd,`$FRAME+24`($sp) 604 lfd $na,`$FRAME+32`($sp) 605 lfd $nb,`$FRAME+40`($sp) 606 lfd $nc,`$FRAME+48`($sp) 607 lfd $nd,`$FRAME+56`($sp) 608 609 fcfid $ba,$ba 610 fcfid $bb,$bb 611 fcfid $bc,$bc 612 fcfid $bd,$bd 613 fcfid $na,$na 614 fcfid $nb,$nb 615 fcfid $nc,$nc 616 fcfid $nd,$nd 617 618 fmul $T1a,$A1,$ba 619 fmul $T1b,$A1,$bb 620 fmul $T2a,$A2,$ba 621 fmul $T2b,$A2,$bb 622 fmul $T3a,$A3,$ba 623 fmul $T3b,$A3,$bb 624 fmul $T0a,$A0,$ba 625 fmul $T0b,$A0,$bb 626 627 fmadd $T1a,$A0,$bc,$T1a 628 fmadd $T1b,$A0,$bd,$T1b 629 fmadd $T2a,$A1,$bc,$T2a 630 fmadd $T2b,$A1,$bd,$T2b 631 fmadd $T3a,$A2,$bc,$T3a 632 fmadd $T3b,$A2,$bd,$T3b 633 fmul $dota,$A3,$bc 634 fmul $dotb,$A3,$bd 635 636 fmadd $T1a,$N1,$na,$T1a 637 fmadd $T1b,$N1,$nb,$T1b 638 lfd $A0,8($nap_d) ; load a[j] in double format 639 lfd $A1,16($nap_d) 640 fmadd $T2a,$N2,$na,$T2a 641 fmadd $T2b,$N2,$nb,$T2b 642 lfd $A2,24($nap_d) ; load a[j+1] in double format 643 lfd $A3,32($nap_d) 644 fmadd $T3a,$N3,$na,$T3a 645 fmadd $T3b,$N3,$nb,$T3b 646 fmadd $T0a,$N0,$na,$T0a 647 fmadd $T0b,$N0,$nb,$T0b 648 649 fmadd $T1a,$N0,$nc,$T1a 650 fmadd $T1b,$N0,$nd,$T1b 651 fmadd $T2a,$N1,$nc,$T2a 652 fmadd $T2b,$N1,$nd,$T2b 653 fmadd $T3a,$N2,$nc,$T3a 654 fmadd $T3b,$N2,$nd,$T3b 655 fmadd $dota,$N3,$nc,$dota 656 fmadd $dotb,$N3,$nd,$dotb 657 658 fctid $T0a,$T0a 659 fctid $T0b,$T0b 660 fctid $T1a,$T1a 661 fctid $T1b,$T1b 662 fctid $T2a,$T2a 663 fctid $T2b,$T2b 664 fctid $T3a,$T3a 665 fctid $T3b,$T3b 666 667 stfd $T0a,`$FRAME+0`($sp) 668 stfd $T0b,`$FRAME+8`($sp) 669 stfd $T1a,`$FRAME+16`($sp) 670 stfd $T1b,`$FRAME+24`($sp) 671 stfd $T2a,`$FRAME+32`($sp) 672 stfd $T2b,`$FRAME+40`($sp) 673 stfd $T3a,`$FRAME+48`($sp) 674 stfd $T3b,`$FRAME+56`($sp) 675 676.align 5 677Linner: 678 fmul $T1a,$A1,$ba 679 fmul $T1b,$A1,$bb 680 fmul $T2a,$A2,$ba 681 fmul $T2b,$A2,$bb 682 lfd $N0,40($nap_d) ; load n[j] in double format 683 lfd $N1,48($nap_d) 684 fmul $T3a,$A3,$ba 685 fmul $T3b,$A3,$bb 686 fmadd $T0a,$A0,$ba,$dota 687 fmadd $T0b,$A0,$bb,$dotb 688 lfd $N2,56($nap_d) ; load n[j+1] in double format 689 lfdu $N3,64($nap_d) 690 691 fmadd $T1a,$A0,$bc,$T1a 692 fmadd $T1b,$A0,$bd,$T1b 693 fmadd $T2a,$A1,$bc,$T2a 694 fmadd $T2b,$A1,$bd,$T2b 695 lfd $A0,8($nap_d) ; load a[j] in double format 696 lfd $A1,16($nap_d) 697 fmadd $T3a,$A2,$bc,$T3a 698 fmadd $T3b,$A2,$bd,$T3b 699 fmul $dota,$A3,$bc 700 fmul $dotb,$A3,$bd 701 lfd $A2,24($nap_d) ; load a[j+1] in double format 702 lfd $A3,32($nap_d) 703 704 fmadd $T1a,$N1,$na,$T1a 705 fmadd $T1b,$N1,$nb,$T1b 706 ld $t0,`$FRAME+0`($sp) 707 ld $t1,`$FRAME+8`($sp) 708 fmadd $T2a,$N2,$na,$T2a 709 fmadd $T2b,$N2,$nb,$T2b 710 ld $t2,`$FRAME+16`($sp) 711 ld $t3,`$FRAME+24`($sp) 712 fmadd $T3a,$N3,$na,$T3a 713 fmadd $T3b,$N3,$nb,$T3b 714 add $t0,$t0,$carry ; can not overflow 715 ld $t4,`$FRAME+32`($sp) 716 ld $t5,`$FRAME+40`($sp) 717 fmadd $T0a,$N0,$na,$T0a 718 fmadd $T0b,$N0,$nb,$T0b 719 srdi $carry,$t0,16 720 add $t1,$t1,$carry 721 srdi $carry,$t1,16 722 ld $t6,`$FRAME+48`($sp) 723 ld $t7,`$FRAME+56`($sp) 724 725 fmadd $T1a,$N0,$nc,$T1a 726 fmadd $T1b,$N0,$nd,$T1b 727 insrdi $t0,$t1,16,32 728 ld $t1,8($tp) ; tp[j] 729 fmadd $T2a,$N1,$nc,$T2a 730 fmadd $T2b,$N1,$nd,$T2b 731 add $t2,$t2,$carry 732 fmadd $T3a,$N2,$nc,$T3a 733 fmadd $T3b,$N2,$nd,$T3b 734 srdi $carry,$t2,16 735 insrdi $t0,$t2,16,16 736 fmadd $dota,$N3,$nc,$dota 737 fmadd $dotb,$N3,$nd,$dotb 738 add $t3,$t3,$carry 739 ldu $t2,16($tp) ; tp[j+1] 740 srdi $carry,$t3,16 741 insrdi $t0,$t3,16,0 ; 0..63 bits 742 add $t4,$t4,$carry 743 744 fctid $T0a,$T0a 745 fctid $T0b,$T0b 746 srdi $carry,$t4,16 747 fctid $T1a,$T1a 748 fctid $T1b,$T1b 749 add $t5,$t5,$carry 750 fctid $T2a,$T2a 751 fctid $T2b,$T2b 752 srdi $carry,$t5,16 753 insrdi $t4,$t5,16,32 754 fctid $T3a,$T3a 755 fctid $T3b,$T3b 756 add $t6,$t6,$carry 757 srdi $carry,$t6,16 758 insrdi $t4,$t6,16,16 759 760 stfd $T0a,`$FRAME+0`($sp) 761 stfd $T0b,`$FRAME+8`($sp) 762 add $t7,$t7,$carry 763 addc $t3,$t0,$t1 764 stfd $T1a,`$FRAME+16`($sp) 765 stfd $T1b,`$FRAME+24`($sp) 766 insrdi $t4,$t7,16,0 ; 64..127 bits 767 srdi $carry,$t7,16 ; upper 33 bits 768 stfd $T2a,`$FRAME+32`($sp) 769 stfd $T2b,`$FRAME+40`($sp) 770 adde $t5,$t4,$t2 771 stfd $T3a,`$FRAME+48`($sp) 772 stfd $T3b,`$FRAME+56`($sp) 773 addze $carry,$carry 774 std $t3,-16($tp) ; tp[j-1] 775 std $t5,-8($tp) ; tp[j] 776 bdnz- Linner 777 778 fctid $dota,$dota 779 fctid $dotb,$dotb 780 ld $t0,`$FRAME+0`($sp) 781 ld $t1,`$FRAME+8`($sp) 782 ld $t2,`$FRAME+16`($sp) 783 ld $t3,`$FRAME+24`($sp) 784 ld $t4,`$FRAME+32`($sp) 785 ld $t5,`$FRAME+40`($sp) 786 ld $t6,`$FRAME+48`($sp) 787 ld $t7,`$FRAME+56`($sp) 788 stfd $dota,`$FRAME+64`($sp) 789 stfd $dotb,`$FRAME+72`($sp) 790 791 add $t0,$t0,$carry ; can not overflow 792 srdi $carry,$t0,16 793 add $t1,$t1,$carry 794 srdi $carry,$t1,16 795 insrdi $t0,$t1,16,32 796 add $t2,$t2,$carry 797 ld $t1,8($tp) ; tp[j] 798 srdi $carry,$t2,16 799 insrdi $t0,$t2,16,16 800 add $t3,$t3,$carry 801 ldu $t2,16($tp) ; tp[j+1] 802 srdi $carry,$t3,16 803 insrdi $t0,$t3,16,0 ; 0..63 bits 804 add $t4,$t4,$carry 805 srdi $carry,$t4,16 806 add $t5,$t5,$carry 807 srdi $carry,$t5,16 808 insrdi $t4,$t5,16,32 809 add $t6,$t6,$carry 810 srdi $carry,$t6,16 811 insrdi $t4,$t6,16,16 812 add $t7,$t7,$carry 813 insrdi $t4,$t7,16,0 ; 64..127 bits 814 srdi $carry,$t7,16 ; upper 33 bits 815 ld $t6,`$FRAME+64`($sp) 816 ld $t7,`$FRAME+72`($sp) 817 818 addc $t3,$t0,$t1 819 adde $t5,$t4,$t2 820 addze $carry,$carry 821 822 std $t3,-16($tp) ; tp[j-1] 823 std $t5,-8($tp) ; tp[j] 824 825 add $carry,$carry,$ovf ; comsume upmost overflow 826 add $t6,$t6,$carry ; can not overflow 827 srdi $carry,$t6,16 828 add $t7,$t7,$carry 829 insrdi $t6,$t7,48,0 830 srdi $ovf,$t7,48 831 std $t6,0($tp) ; tp[num-1] 832 833 slwi $t7,$num,2 834 addi $i,$i,8 835 subf $nap_d,$t7,$nap_d ; rewind pointer 836 cmpw $i,$num 837 blt- Louter 838 839 subf $np,$num,$np ; rewind np 840 addi $j,$j,1 ; restore counter 841 subfc $i,$i,$i ; j=0 and "clear" XER[CA] 842 addi $tp,$sp,`$FRAME+$TRANSFER+8` 843 addi $t4,$sp,`$FRAME+$TRANSFER+16` 844 addi $t5,$np,8 845 addi $t6,$rp,8 846 mtctr $j 847 848.align 4 849Lsub: ldx $t0,$tp,$i 850 ldx $t1,$np,$i 851 ldx $t2,$t4,$i 852 ldx $t3,$t5,$i 853 subfe $t0,$t1,$t0 ; tp[j]-np[j] 854 subfe $t2,$t3,$t2 ; tp[j+1]-np[j+1] 855 stdx $t0,$rp,$i 856 stdx $t2,$t6,$i 857 addi $i,$i,16 858 bdnz- Lsub 859 860 li $i,0 861 subfe $ovf,$i,$ovf ; handle upmost overflow bit 862 and $ap,$tp,$ovf 863 andc $np,$rp,$ovf 864 or $ap,$ap,$np ; ap=borrow?tp:rp 865 addi $t7,$ap,8 866 mtctr $j 867 868.align 4 869Lcopy: ; copy or in-place refresh 870 ldx $t0,$ap,$i 871 ldx $t1,$t7,$i 872 std $i,8($nap_d) ; zap nap_d 873 std $i,16($nap_d) 874 std $i,24($nap_d) 875 std $i,32($nap_d) 876 std $i,40($nap_d) 877 std $i,48($nap_d) 878 std $i,56($nap_d) 879 stdu $i,64($nap_d) 880 stdx $t0,$rp,$i 881 stdx $t1,$t6,$i 882 stdx $i,$tp,$i ; zap tp at once 883 stdx $i,$t4,$i 884 addi $i,$i,16 885 bdnz- Lcopy 886 887 $POP r14,`2*$SIZE_T`($sp) 888 $POP r15,`3*$SIZE_T`($sp) 889 $POP r16,`4*$SIZE_T`($sp) 890 $POP r17,`5*$SIZE_T`($sp) 891 $POP r18,`6*$SIZE_T`($sp) 892 $POP r19,`7*$SIZE_T`($sp) 893 $POP r20,`8*$SIZE_T`($sp) 894 $POP r21,`9*$SIZE_T`($sp) 895 $POP r22,`10*$SIZE_T`($sp) 896 $POP r23,`11*$SIZE_T`($sp) 897 lfd f14,`12*$SIZE_T+0`($sp) 898 lfd f15,`12*$SIZE_T+8`($sp) 899 lfd f16,`12*$SIZE_T+16`($sp) 900 lfd f17,`12*$SIZE_T+24`($sp) 901 lfd f18,`12*$SIZE_T+32`($sp) 902 lfd f19,`12*$SIZE_T+40`($sp) 903 lfd f20,`12*$SIZE_T+48`($sp) 904 lfd f21,`12*$SIZE_T+56`($sp) 905 lfd f22,`12*$SIZE_T+64`($sp) 906 lfd f23,`12*$SIZE_T+72`($sp) 907 lfd f24,`12*$SIZE_T+80`($sp) 908 lfd f25,`12*$SIZE_T+88`($sp) 909 $POP $sp,0($sp) 910 li r3,1 ; signal "handled" 911 blr 912 .long 0 913.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>" 914___ 915 916$code =~ s/\`([^\`]*)\`/eval $1/gem; 917print $code; 918close STDOUT; 919