# ppc64-mont.pl revision 337982
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# December 2007

# The reason for undertaken effort is basically following. Even though
# Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI
# performance was observed to be less than impressive, essentially as
# fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope.
# Well, it's not surprising that IBM had to make some sacrifices to
# boost the clock frequency that much, but no overall improvement?
# Having observed how much difference did switching to FPU make on
# UltraSPARC, playing same stunt on Power 6 appeared appropriate...
# Unfortunately the resulting performance improvement is not as
# impressive, ~30%, and in absolute terms is still very far from what
# one would expect from 4.7GHz CPU. There is a chance that I'm doing
# something wrong, but in the lack of assembler level micro-profiling
# data or at least decent platform guide I can't tell... Or better
# results might be achieved with VMX... Anyway, this module provides
# *worse* performance on other PowerPC implementations, ~40-15% slower
# on PPC970 depending on key length and ~40% slower on Power 5 for all
# key lengths. As it's obviously inappropriate as "best all-round"
# alternative, it has to be complemented with run-time CPU family
# detection. Oh! It should also be noted that unlike other PowerPC
# implementation IALU ppc-mont.pl module performs *suboptimaly* on
# >=1024-bit key lengths on Power 6. It should also be noted that
# *everything* said so far applies to 64-bit builds!
# As far as 32-bit
# application executed on 64-bit CPU goes, this module is likely to
# become preferred choice, because it's easy to adapt it for such
# case and *is* faster than 32-bit ppc-mont.pl on *all* processors.

# February 2008

# Micro-profiling assisted optimization results in ~15% improvement
# over original ppc64-mont.pl version, or overall ~50% improvement
# over ppc.pl module on Power 6. If compared to ppc-mont.pl on same
# Power 6 CPU, this module is 5-150% faster depending on key length,
# [hereafter] more for longer keys. But if compared to ppc-mont.pl
# on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive
# in absolute terms, but it's apparently the way Power 6 is...

# December 2009

# Adapted for 32-bit build this module delivers 25-120%, yes, more
# than *twice* for longer keys, performance improvement over 32-bit
# ppc-mont.pl on 1.8GHz PPC970. However! This implementation utilizes
# even 64-bit integer operations and the trouble is that most PPC
# operating systems don't preserve upper halves of general purpose
# registers upon 32-bit signal delivery. They do preserve them upon
# context switch, but not signalling:-( This means that asynchronous
# signals have to be blocked upon entry to this subroutine. Signal
# masking (and of course complementary unmasking) has quite an impact
# on performance, naturally larger for shorter keys. It's so severe
# that 512-bit key performance can be as low as 1/3 of expected one.
# This is why this routine can be engaged for longer key operations
# only on these OSes, see crypto/ppccap.c for further details. MacOS X
# is an exception from this and doesn't require signal masking, and
# that's where above improvement coefficients were collected. For
# others alternative would be to break dependence on upper halves of
# GPRs by sticking to 32-bit integer operations...
# December 2012

# Remove above mentioned dependence on GPRs' upper halves in 32-bit
# build. No signal masking overhead, but integer instructions are
# *more* numerous... It's still "universally" faster than 32-bit
# ppc-mont.pl, but improvement coefficient is not as impressive
# for longer keys...

# First argument selects the ABI flavour (e.g. "linux32", "linux64le");
# it controls pointer size, red-zone size and load/store mnemonics used
# by the generated assembly.
$flavour = shift;

if ($flavour =~ /32/) {
	$SIZE_T=4;
	$RZONE=	224;
	$fname=	"bn_mul_mont_fpu64";

	$STUX=	"stwux";	# store indexed and update
	$PUSH=	"stw";
	$POP=	"lwz";
} elsif ($flavour =~ /64/) {
	$SIZE_T=8;
	$RZONE=	288;
	$fname=	"bn_mul_mont_fpu64";

	# same as above, but 64-bit mnemonics...
	$STUX=	"stdux";	# store indexed and update
	$PUSH=	"std";
	$POP=	"ld";
} else { die "nonsense $flavour"; }

# non-zero value doubles as the byte-swap XOR for 32-bit lane loads
$LITTLE_ENDIAN = ($flavour=~/le$/) ? 4 : 0;

# locate the ppc-xlate.pl post-processor next to this script or in
# ../../perlasm
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Pipe all generated code through the xlate post-processor.
# NOTE(review): must use low-precedence "or" here; with "||" the check
# binds to shift()'s argument instead of open()'s return value, so a
# failed open was silently ignored.
open STDOUT,"| $^X $xlate $flavour ".shift
	or die "can't call $xlate: $!";

$FRAME=64;	# padded frame header
$TRANSFER=16*8;	# gpr<->fpr transfer zone, 16 8-byte slots

# volatile registers
$carry="r0";
$sp="r1";
$toc="r2";
$rp="r3";	$ovf="r3";
$ap="r4";
$bp="r5";
$np="r6";
$n0="r7";
$num="r8";
$rp="r9";	# $rp is reassigned
$tp="r10";
$j="r11";
$i="r12";
# non-volatile registers
$c1="r19";
$n1="r20";
$a1="r21";
$nap_d="r22";	# interleaved ap and np in double format
$a0="r23";	# ap[0]
$t0="r24";	# temporary registers
$t1="r25";
$t2="r26";
$t3="r27";
$t4="r28";
$t5="r29";
$t6="r30";
$t7="r31";

# PPC offers enough register bank capacity to unroll inner loops twice
#
#	..A3A2A1A0
#	      dcba
#	-----------
#	       A0a
#	      A0b
#	     A0c
#	    A0d
#	      A1a
#	     A1b
#	    A1c
#	   A1d
#	     A2a
#	    A2b
#	   A2c
#	  A2d
#	    A3a
#	   A3b
#	  A3c
#	 A3d
#	       ..a
#	      ..b
#
$ba="f0";	$bb="f1";	$bc="f2";	$bd="f3";
$na="f4";
$nb="f5"; $nc="f6"; $nd="f7"; 163$dota="f8"; $dotb="f9"; 164$A0="f10"; $A1="f11"; $A2="f12"; $A3="f13"; 165$N0="f20"; $N1="f21"; $N2="f22"; $N3="f23"; 166$T0a="f24"; $T0b="f25"; 167$T1a="f26"; $T1b="f27"; 168$T2a="f28"; $T2b="f29"; 169$T3a="f30"; $T3b="f31"; 170 171# sp----------->+-------------------------------+ 172# | saved sp | 173# +-------------------------------+ 174# . . 175# +64 +-------------------------------+ 176# | 16 gpr<->fpr transfer zone | 177# . . 178# . . 179# +16*8 +-------------------------------+ 180# | __int64 tmp[-1] | 181# +-------------------------------+ 182# | __int64 tmp[num] | 183# . . 184# . . 185# . . 186# +(num+1)*8 +-------------------------------+ 187# | padding to 64 byte boundary | 188# . . 189# +X +-------------------------------+ 190# | double nap_d[4*num] | 191# . . 192# . . 193# . . 194# +-------------------------------+ 195# . . 196# -13*size_t +-------------------------------+ 197# | 13 saved gpr, r19-r31 | 198# . . 199# . . 200# -12*8 +-------------------------------+ 201# | 12 saved fpr, f20-f31 | 202# . . 203# . . 204# +-------------------------------+ 205 206$code=<<___; 207.machine "any" 208.text 209 210.globl .$fname 211.align 5 212.$fname: 213 cmpwi $num,`3*8/$SIZE_T` 214 mr $rp,r3 ; $rp is reassigned 215 li r3,0 ; possible "not handled" return code 216 bltlr- 217 andi. r0,$num,`16/$SIZE_T-1` ; $num has to be "even" 218 bnelr- 219 220 slwi $num,$num,`log($SIZE_T)/log(2)` ; num*=sizeof(BN_LONG) 221 li $i,-4096 222 slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 
4*num 223 add $tp,$tp,$num ; place for tp[num+1] 224 addi $tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE` 225 subf $tp,$tp,$sp ; $sp-$tp 226 and $tp,$tp,$i ; minimize TLB usage 227 subf $tp,$sp,$tp ; $tp-$sp 228 mr $i,$sp 229 $STUX $sp,$sp,$tp ; alloca 230 231 $PUSH r19,`-12*8-13*$SIZE_T`($i) 232 $PUSH r20,`-12*8-12*$SIZE_T`($i) 233 $PUSH r21,`-12*8-11*$SIZE_T`($i) 234 $PUSH r22,`-12*8-10*$SIZE_T`($i) 235 $PUSH r23,`-12*8-9*$SIZE_T`($i) 236 $PUSH r24,`-12*8-8*$SIZE_T`($i) 237 $PUSH r25,`-12*8-7*$SIZE_T`($i) 238 $PUSH r26,`-12*8-6*$SIZE_T`($i) 239 $PUSH r27,`-12*8-5*$SIZE_T`($i) 240 $PUSH r28,`-12*8-4*$SIZE_T`($i) 241 $PUSH r29,`-12*8-3*$SIZE_T`($i) 242 $PUSH r30,`-12*8-2*$SIZE_T`($i) 243 $PUSH r31,`-12*8-1*$SIZE_T`($i) 244 stfd f20,`-12*8`($i) 245 stfd f21,`-11*8`($i) 246 stfd f22,`-10*8`($i) 247 stfd f23,`-9*8`($i) 248 stfd f24,`-8*8`($i) 249 stfd f25,`-7*8`($i) 250 stfd f26,`-6*8`($i) 251 stfd f27,`-5*8`($i) 252 stfd f28,`-4*8`($i) 253 stfd f29,`-3*8`($i) 254 stfd f30,`-2*8`($i) 255 stfd f31,`-1*8`($i) 256 257 addi $tp,$sp,`$FRAME+$TRANSFER+8+64` 258 li $i,-64 259 add $nap_d,$tp,$num 260 and $nap_d,$nap_d,$i ; align to 64 bytes 261 ; nap_d is off by 1, because it's used with stfdu/lfdu 262 addi $nap_d,$nap_d,-8 263 srwi $j,$num,`3+1` ; counter register, num/2 264 addi $j,$j,-1 265 addi $tp,$sp,`$FRAME+$TRANSFER-8` 266 li $carry,0 267 mtctr $j 268___ 269 270$code.=<<___ if ($SIZE_T==8); 271 ld $a0,0($ap) ; pull ap[0] value 272 ld $t3,0($bp) ; bp[0] 273 ld $n0,0($n0) ; pull n0[0] value 274 275 mulld $t7,$a0,$t3 ; ap[0]*bp[0] 276 ; transfer bp[0] to FPU as 4x16-bit values 277 extrdi $t0,$t3,16,48 278 extrdi $t1,$t3,16,32 279 extrdi $t2,$t3,16,16 280 extrdi $t3,$t3,16,0 281 std $t0,`$FRAME+0`($sp) 282 std $t1,`$FRAME+8`($sp) 283 std $t2,`$FRAME+16`($sp) 284 std $t3,`$FRAME+24`($sp) 285 286 mulld $t7,$t7,$n0 ; tp[0]*n0 287 ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values 288 extrdi $t4,$t7,16,48 289 extrdi $t5,$t7,16,32 290 extrdi $t6,$t7,16,16 291 extrdi $t7,$t7,16,0 292 
std $t4,`$FRAME+32`($sp) 293 std $t5,`$FRAME+40`($sp) 294 std $t6,`$FRAME+48`($sp) 295 std $t7,`$FRAME+56`($sp) 296 297 extrdi $t0,$a0,32,32 ; lwz $t0,4($ap) 298 extrdi $t1,$a0,32,0 ; lwz $t1,0($ap) 299 lwz $t2,`12^$LITTLE_ENDIAN`($ap) ; load a[1] as 32-bit word pair 300 lwz $t3,`8^$LITTLE_ENDIAN`($ap) 301 lwz $t4,`4^$LITTLE_ENDIAN`($np) ; load n[0] as 32-bit word pair 302 lwz $t5,`0^$LITTLE_ENDIAN`($np) 303 lwz $t6,`12^$LITTLE_ENDIAN`($np) ; load n[1] as 32-bit word pair 304 lwz $t7,`8^$LITTLE_ENDIAN`($np) 305___ 306$code.=<<___ if ($SIZE_T==4); 307 lwz $a0,0($ap) ; pull ap[0,1] value 308 mr $n1,$n0 309 lwz $a1,4($ap) 310 li $c1,0 311 lwz $t1,0($bp) ; bp[0,1] 312 lwz $t3,4($bp) 313 lwz $n0,0($n1) ; pull n0[0,1] value 314 lwz $n1,4($n1) 315 316 mullw $t4,$a0,$t1 ; mulld ap[0]*bp[0] 317 mulhwu $t5,$a0,$t1 318 mullw $t6,$a1,$t1 319 mullw $t7,$a0,$t3 320 add $t5,$t5,$t6 321 add $t5,$t5,$t7 322 ; transfer bp[0] to FPU as 4x16-bit values 323 extrwi $t0,$t1,16,16 324 extrwi $t1,$t1,16,0 325 extrwi $t2,$t3,16,16 326 extrwi $t3,$t3,16,0 327 std $t0,`$FRAME+0`($sp) ; yes, std in 32-bit build 328 std $t1,`$FRAME+8`($sp) 329 std $t2,`$FRAME+16`($sp) 330 std $t3,`$FRAME+24`($sp) 331 332 mullw $t0,$t4,$n0 ; mulld tp[0]*n0 333 mulhwu $t1,$t4,$n0 334 mullw $t2,$t5,$n0 335 mullw $t3,$t4,$n1 336 add $t1,$t1,$t2 337 add $t1,$t1,$t3 338 ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values 339 extrwi $t4,$t0,16,16 340 extrwi $t5,$t0,16,0 341 extrwi $t6,$t1,16,16 342 extrwi $t7,$t1,16,0 343 std $t4,`$FRAME+32`($sp) ; yes, std in 32-bit build 344 std $t5,`$FRAME+40`($sp) 345 std $t6,`$FRAME+48`($sp) 346 std $t7,`$FRAME+56`($sp) 347 348 mr $t0,$a0 ; lwz $t0,0($ap) 349 mr $t1,$a1 ; lwz $t1,4($ap) 350 lwz $t2,8($ap) ; load a[j..j+3] as 32-bit word pairs 351 lwz $t3,12($ap) 352 lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs 353 lwz $t5,4($np) 354 lwz $t6,8($np) 355 lwz $t7,12($np) 356___ 357$code.=<<___; 358 lfd $ba,`$FRAME+0`($sp) 359 lfd $bb,`$FRAME+8`($sp) 360 lfd 
$bc,`$FRAME+16`($sp) 361 lfd $bd,`$FRAME+24`($sp) 362 lfd $na,`$FRAME+32`($sp) 363 lfd $nb,`$FRAME+40`($sp) 364 lfd $nc,`$FRAME+48`($sp) 365 lfd $nd,`$FRAME+56`($sp) 366 std $t0,`$FRAME+64`($sp) ; yes, std even in 32-bit build 367 std $t1,`$FRAME+72`($sp) 368 std $t2,`$FRAME+80`($sp) 369 std $t3,`$FRAME+88`($sp) 370 std $t4,`$FRAME+96`($sp) 371 std $t5,`$FRAME+104`($sp) 372 std $t6,`$FRAME+112`($sp) 373 std $t7,`$FRAME+120`($sp) 374 fcfid $ba,$ba 375 fcfid $bb,$bb 376 fcfid $bc,$bc 377 fcfid $bd,$bd 378 fcfid $na,$na 379 fcfid $nb,$nb 380 fcfid $nc,$nc 381 fcfid $nd,$nd 382 383 lfd $A0,`$FRAME+64`($sp) 384 lfd $A1,`$FRAME+72`($sp) 385 lfd $A2,`$FRAME+80`($sp) 386 lfd $A3,`$FRAME+88`($sp) 387 lfd $N0,`$FRAME+96`($sp) 388 lfd $N1,`$FRAME+104`($sp) 389 lfd $N2,`$FRAME+112`($sp) 390 lfd $N3,`$FRAME+120`($sp) 391 fcfid $A0,$A0 392 fcfid $A1,$A1 393 fcfid $A2,$A2 394 fcfid $A3,$A3 395 fcfid $N0,$N0 396 fcfid $N1,$N1 397 fcfid $N2,$N2 398 fcfid $N3,$N3 399 addi $ap,$ap,16 400 addi $np,$np,16 401 402 fmul $T1a,$A1,$ba 403 fmul $T1b,$A1,$bb 404 stfd $A0,8($nap_d) ; save a[j] in double format 405 stfd $A1,16($nap_d) 406 fmul $T2a,$A2,$ba 407 fmul $T2b,$A2,$bb 408 stfd $A2,24($nap_d) ; save a[j+1] in double format 409 stfd $A3,32($nap_d) 410 fmul $T3a,$A3,$ba 411 fmul $T3b,$A3,$bb 412 stfd $N0,40($nap_d) ; save n[j] in double format 413 stfd $N1,48($nap_d) 414 fmul $T0a,$A0,$ba 415 fmul $T0b,$A0,$bb 416 stfd $N2,56($nap_d) ; save n[j+1] in double format 417 stfdu $N3,64($nap_d) 418 419 fmadd $T1a,$A0,$bc,$T1a 420 fmadd $T1b,$A0,$bd,$T1b 421 fmadd $T2a,$A1,$bc,$T2a 422 fmadd $T2b,$A1,$bd,$T2b 423 fmadd $T3a,$A2,$bc,$T3a 424 fmadd $T3b,$A2,$bd,$T3b 425 fmul $dota,$A3,$bc 426 fmul $dotb,$A3,$bd 427 428 fmadd $T1a,$N1,$na,$T1a 429 fmadd $T1b,$N1,$nb,$T1b 430 fmadd $T2a,$N2,$na,$T2a 431 fmadd $T2b,$N2,$nb,$T2b 432 fmadd $T3a,$N3,$na,$T3a 433 fmadd $T3b,$N3,$nb,$T3b 434 fmadd $T0a,$N0,$na,$T0a 435 fmadd $T0b,$N0,$nb,$T0b 436 437 fmadd $T1a,$N0,$nc,$T1a 438 fmadd $T1b,$N0,$nd,$T1b 
439 fmadd $T2a,$N1,$nc,$T2a 440 fmadd $T2b,$N1,$nd,$T2b 441 fmadd $T3a,$N2,$nc,$T3a 442 fmadd $T3b,$N2,$nd,$T3b 443 fmadd $dota,$N3,$nc,$dota 444 fmadd $dotb,$N3,$nd,$dotb 445 446 fctid $T0a,$T0a 447 fctid $T0b,$T0b 448 fctid $T1a,$T1a 449 fctid $T1b,$T1b 450 fctid $T2a,$T2a 451 fctid $T2b,$T2b 452 fctid $T3a,$T3a 453 fctid $T3b,$T3b 454 455 stfd $T0a,`$FRAME+0`($sp) 456 stfd $T0b,`$FRAME+8`($sp) 457 stfd $T1a,`$FRAME+16`($sp) 458 stfd $T1b,`$FRAME+24`($sp) 459 stfd $T2a,`$FRAME+32`($sp) 460 stfd $T2b,`$FRAME+40`($sp) 461 stfd $T3a,`$FRAME+48`($sp) 462 stfd $T3b,`$FRAME+56`($sp) 463 464.align 5 465L1st: 466___ 467$code.=<<___ if ($SIZE_T==8); 468 lwz $t0,`4^$LITTLE_ENDIAN`($ap) ; load a[j] as 32-bit word pair 469 lwz $t1,`0^$LITTLE_ENDIAN`($ap) 470 lwz $t2,`12^$LITTLE_ENDIAN`($ap) ; load a[j+1] as 32-bit word pair 471 lwz $t3,`8^$LITTLE_ENDIAN`($ap) 472 lwz $t4,`4^$LITTLE_ENDIAN`($np) ; load n[j] as 32-bit word pair 473 lwz $t5,`0^$LITTLE_ENDIAN`($np) 474 lwz $t6,`12^$LITTLE_ENDIAN`($np) ; load n[j+1] as 32-bit word pair 475 lwz $t7,`8^$LITTLE_ENDIAN`($np) 476___ 477$code.=<<___ if ($SIZE_T==4); 478 lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs 479 lwz $t1,4($ap) 480 lwz $t2,8($ap) 481 lwz $t3,12($ap) 482 lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs 483 lwz $t5,4($np) 484 lwz $t6,8($np) 485 lwz $t7,12($np) 486___ 487$code.=<<___; 488 std $t0,`$FRAME+64`($sp) ; yes, std even in 32-bit build 489 std $t1,`$FRAME+72`($sp) 490 std $t2,`$FRAME+80`($sp) 491 std $t3,`$FRAME+88`($sp) 492 std $t4,`$FRAME+96`($sp) 493 std $t5,`$FRAME+104`($sp) 494 std $t6,`$FRAME+112`($sp) 495 std $t7,`$FRAME+120`($sp) 496___ 497if ($SIZE_T==8 or $flavour =~ /osx/) { 498$code.=<<___; 499 ld $t0,`$FRAME+0`($sp) 500 ld $t1,`$FRAME+8`($sp) 501 ld $t2,`$FRAME+16`($sp) 502 ld $t3,`$FRAME+24`($sp) 503 ld $t4,`$FRAME+32`($sp) 504 ld $t5,`$FRAME+40`($sp) 505 ld $t6,`$FRAME+48`($sp) 506 ld $t7,`$FRAME+56`($sp) 507___ 508} else { 509$code.=<<___; 510 lwz 
$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp) 511 lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp) 512 lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp) 513 lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp) 514 lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp) 515 lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp) 516 lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp) 517 lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp) 518___ 519} 520$code.=<<___; 521 lfd $A0,`$FRAME+64`($sp) 522 lfd $A1,`$FRAME+72`($sp) 523 lfd $A2,`$FRAME+80`($sp) 524 lfd $A3,`$FRAME+88`($sp) 525 lfd $N0,`$FRAME+96`($sp) 526 lfd $N1,`$FRAME+104`($sp) 527 lfd $N2,`$FRAME+112`($sp) 528 lfd $N3,`$FRAME+120`($sp) 529 fcfid $A0,$A0 530 fcfid $A1,$A1 531 fcfid $A2,$A2 532 fcfid $A3,$A3 533 fcfid $N0,$N0 534 fcfid $N1,$N1 535 fcfid $N2,$N2 536 fcfid $N3,$N3 537 addi $ap,$ap,16 538 addi $np,$np,16 539 540 fmul $T1a,$A1,$ba 541 fmul $T1b,$A1,$bb 542 fmul $T2a,$A2,$ba 543 fmul $T2b,$A2,$bb 544 stfd $A0,8($nap_d) ; save a[j] in double format 545 stfd $A1,16($nap_d) 546 fmul $T3a,$A3,$ba 547 fmul $T3b,$A3,$bb 548 fmadd $T0a,$A0,$ba,$dota 549 fmadd $T0b,$A0,$bb,$dotb 550 stfd $A2,24($nap_d) ; save a[j+1] in double format 551 stfd $A3,32($nap_d) 552___ 553if ($SIZE_T==8 or $flavour =~ /osx/) { 554$code.=<<___; 555 fmadd $T1a,$A0,$bc,$T1a 556 fmadd $T1b,$A0,$bd,$T1b 557 fmadd $T2a,$A1,$bc,$T2a 558 fmadd $T2b,$A1,$bd,$T2b 559 stfd $N0,40($nap_d) ; save n[j] in double format 560 stfd $N1,48($nap_d) 561 fmadd $T3a,$A2,$bc,$T3a 562 fmadd $T3b,$A2,$bd,$T3b 563 add $t0,$t0,$carry ; can not overflow 564 fmul $dota,$A3,$bc 565 fmul $dotb,$A3,$bd 566 stfd $N2,56($nap_d) ; save n[j+1] in double format 567 stfdu $N3,64($nap_d) 568 srdi $carry,$t0,16 569 add $t1,$t1,$carry 570 srdi $carry,$t1,16 571 572 fmadd $T1a,$N1,$na,$T1a 573 fmadd $T1b,$N1,$nb,$T1b 574 insrdi $t0,$t1,16,32 575 fmadd $T2a,$N2,$na,$T2a 576 fmadd $T2b,$N2,$nb,$T2b 577 add $t2,$t2,$carry 578 fmadd $T3a,$N3,$na,$T3a 579 fmadd $T3b,$N3,$nb,$T3b 580 srdi $carry,$t2,16 581 fmadd $T0a,$N0,$na,$T0a 582 fmadd $T0b,$N0,$nb,$T0b 583 insrdi 
$t0,$t2,16,16 584 add $t3,$t3,$carry 585 srdi $carry,$t3,16 586 587 fmadd $T1a,$N0,$nc,$T1a 588 fmadd $T1b,$N0,$nd,$T1b 589 insrdi $t0,$t3,16,0 ; 0..63 bits 590 fmadd $T2a,$N1,$nc,$T2a 591 fmadd $T2b,$N1,$nd,$T2b 592 add $t4,$t4,$carry 593 fmadd $T3a,$N2,$nc,$T3a 594 fmadd $T3b,$N2,$nd,$T3b 595 srdi $carry,$t4,16 596 fmadd $dota,$N3,$nc,$dota 597 fmadd $dotb,$N3,$nd,$dotb 598 add $t5,$t5,$carry 599 srdi $carry,$t5,16 600 insrdi $t4,$t5,16,32 601 602 fctid $T0a,$T0a 603 fctid $T0b,$T0b 604 add $t6,$t6,$carry 605 fctid $T1a,$T1a 606 fctid $T1b,$T1b 607 srdi $carry,$t6,16 608 fctid $T2a,$T2a 609 fctid $T2b,$T2b 610 insrdi $t4,$t6,16,16 611 fctid $T3a,$T3a 612 fctid $T3b,$T3b 613 add $t7,$t7,$carry 614 insrdi $t4,$t7,16,0 ; 64..127 bits 615 srdi $carry,$t7,16 ; upper 33 bits 616 617 stfd $T0a,`$FRAME+0`($sp) 618 stfd $T0b,`$FRAME+8`($sp) 619 stfd $T1a,`$FRAME+16`($sp) 620 stfd $T1b,`$FRAME+24`($sp) 621 stfd $T2a,`$FRAME+32`($sp) 622 stfd $T2b,`$FRAME+40`($sp) 623 stfd $T3a,`$FRAME+48`($sp) 624 stfd $T3b,`$FRAME+56`($sp) 625 std $t0,8($tp) ; tp[j-1] 626 stdu $t4,16($tp) ; tp[j] 627___ 628} else { 629$code.=<<___; 630 fmadd $T1a,$A0,$bc,$T1a 631 fmadd $T1b,$A0,$bd,$T1b 632 addc $t0,$t0,$carry 633 adde $t1,$t1,$c1 634 srwi $carry,$t0,16 635 fmadd $T2a,$A1,$bc,$T2a 636 fmadd $T2b,$A1,$bd,$T2b 637 stfd $N0,40($nap_d) ; save n[j] in double format 638 stfd $N1,48($nap_d) 639 srwi $c1,$t1,16 640 insrwi $carry,$t1,16,0 641 fmadd $T3a,$A2,$bc,$T3a 642 fmadd $T3b,$A2,$bd,$T3b 643 addc $t2,$t2,$carry 644 adde $t3,$t3,$c1 645 srwi $carry,$t2,16 646 fmul $dota,$A3,$bc 647 fmul $dotb,$A3,$bd 648 stfd $N2,56($nap_d) ; save n[j+1] in double format 649 stfdu $N3,64($nap_d) 650 insrwi $t0,$t2,16,0 ; 0..31 bits 651 srwi $c1,$t3,16 652 insrwi $carry,$t3,16,0 653 654 fmadd $T1a,$N1,$na,$T1a 655 fmadd $T1b,$N1,$nb,$T1b 656 lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1 657 lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0 658 addc $t4,$t4,$carry 659 adde $t5,$t5,$c1 660 srwi 
$carry,$t4,16 661 fmadd $T2a,$N2,$na,$T2a 662 fmadd $T2b,$N2,$nb,$T2b 663 srwi $c1,$t5,16 664 insrwi $carry,$t5,16,0 665 fmadd $T3a,$N3,$na,$T3a 666 fmadd $T3b,$N3,$nb,$T3b 667 addc $t6,$t6,$carry 668 adde $t7,$t7,$c1 669 srwi $carry,$t6,16 670 fmadd $T0a,$N0,$na,$T0a 671 fmadd $T0b,$N0,$nb,$T0b 672 insrwi $t4,$t6,16,0 ; 32..63 bits 673 srwi $c1,$t7,16 674 insrwi $carry,$t7,16,0 675 676 fmadd $T1a,$N0,$nc,$T1a 677 fmadd $T1b,$N0,$nd,$T1b 678 lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3 679 lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2 680 addc $t2,$t2,$carry 681 adde $t3,$t3,$c1 682 srwi $carry,$t2,16 683 fmadd $T2a,$N1,$nc,$T2a 684 fmadd $T2b,$N1,$nd,$T2b 685 stw $t0,12($tp) ; tp[j-1] 686 stw $t4,8($tp) 687 srwi $c1,$t3,16 688 insrwi $carry,$t3,16,0 689 fmadd $T3a,$N2,$nc,$T3a 690 fmadd $T3b,$N2,$nd,$T3b 691 lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5 692 lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4 693 addc $t6,$t6,$carry 694 adde $t7,$t7,$c1 695 srwi $carry,$t6,16 696 fmadd $dota,$N3,$nc,$dota 697 fmadd $dotb,$N3,$nd,$dotb 698 insrwi $t2,$t6,16,0 ; 64..95 bits 699 srwi $c1,$t7,16 700 insrwi $carry,$t7,16,0 701 702 fctid $T0a,$T0a 703 fctid $T0b,$T0b 704 lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7 705 lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6 706 addc $t0,$t0,$carry 707 adde $t1,$t1,$c1 708 srwi $carry,$t0,16 709 fctid $T1a,$T1a 710 fctid $T1b,$T1b 711 srwi $c1,$t1,16 712 insrwi $carry,$t1,16,0 713 fctid $T2a,$T2a 714 fctid $T2b,$T2b 715 addc $t4,$t4,$carry 716 adde $t5,$t5,$c1 717 srwi $carry,$t4,16 718 fctid $T3a,$T3a 719 fctid $T3b,$T3b 720 insrwi $t0,$t4,16,0 ; 96..127 bits 721 srwi $c1,$t5,16 722 insrwi $carry,$t5,16,0 723 724 stfd $T0a,`$FRAME+0`($sp) 725 stfd $T0b,`$FRAME+8`($sp) 726 stfd $T1a,`$FRAME+16`($sp) 727 stfd $T1b,`$FRAME+24`($sp) 728 stfd $T2a,`$FRAME+32`($sp) 729 stfd $T2b,`$FRAME+40`($sp) 730 stfd $T3a,`$FRAME+48`($sp) 731 stfd $T3b,`$FRAME+56`($sp) 732 stw $t2,20($tp) ; tp[j] 733 
stwu $t0,16($tp) 734___ 735} 736$code.=<<___; 737 bdnz L1st 738 739 fctid $dota,$dota 740 fctid $dotb,$dotb 741___ 742if ($SIZE_T==8 or $flavour =~ /osx/) { 743$code.=<<___; 744 ld $t0,`$FRAME+0`($sp) 745 ld $t1,`$FRAME+8`($sp) 746 ld $t2,`$FRAME+16`($sp) 747 ld $t3,`$FRAME+24`($sp) 748 ld $t4,`$FRAME+32`($sp) 749 ld $t5,`$FRAME+40`($sp) 750 ld $t6,`$FRAME+48`($sp) 751 ld $t7,`$FRAME+56`($sp) 752 stfd $dota,`$FRAME+64`($sp) 753 stfd $dotb,`$FRAME+72`($sp) 754 755 add $t0,$t0,$carry ; can not overflow 756 srdi $carry,$t0,16 757 add $t1,$t1,$carry 758 srdi $carry,$t1,16 759 insrdi $t0,$t1,16,32 760 add $t2,$t2,$carry 761 srdi $carry,$t2,16 762 insrdi $t0,$t2,16,16 763 add $t3,$t3,$carry 764 srdi $carry,$t3,16 765 insrdi $t0,$t3,16,0 ; 0..63 bits 766 add $t4,$t4,$carry 767 srdi $carry,$t4,16 768 add $t5,$t5,$carry 769 srdi $carry,$t5,16 770 insrdi $t4,$t5,16,32 771 add $t6,$t6,$carry 772 srdi $carry,$t6,16 773 insrdi $t4,$t6,16,16 774 add $t7,$t7,$carry 775 insrdi $t4,$t7,16,0 ; 64..127 bits 776 srdi $carry,$t7,16 ; upper 33 bits 777 ld $t6,`$FRAME+64`($sp) 778 ld $t7,`$FRAME+72`($sp) 779 780 std $t0,8($tp) ; tp[j-1] 781 stdu $t4,16($tp) ; tp[j] 782 783 add $t6,$t6,$carry ; can not overflow 784 srdi $carry,$t6,16 785 add $t7,$t7,$carry 786 insrdi $t6,$t7,48,0 787 srdi $ovf,$t7,48 788 std $t6,8($tp) ; tp[num-1] 789___ 790} else { 791$code.=<<___; 792 lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp) 793 lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp) 794 lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp) 795 lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp) 796 lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp) 797 lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp) 798 lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp) 799 lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp) 800 stfd $dota,`$FRAME+64`($sp) 801 stfd $dotb,`$FRAME+72`($sp) 802 803 addc $t0,$t0,$carry 804 adde $t1,$t1,$c1 805 srwi $carry,$t0,16 806 insrwi $carry,$t1,16,0 807 srwi $c1,$t1,16 808 addc $t2,$t2,$carry 809 adde $t3,$t3,$c1 810 srwi $carry,$t2,16 811 insrwi $t0,$t2,16,0 ; 0..31 
bits 812 insrwi $carry,$t3,16,0 813 srwi $c1,$t3,16 814 addc $t4,$t4,$carry 815 adde $t5,$t5,$c1 816 srwi $carry,$t4,16 817 insrwi $carry,$t5,16,0 818 srwi $c1,$t5,16 819 addc $t6,$t6,$carry 820 adde $t7,$t7,$c1 821 srwi $carry,$t6,16 822 insrwi $t4,$t6,16,0 ; 32..63 bits 823 insrwi $carry,$t7,16,0 824 srwi $c1,$t7,16 825 stw $t0,12($tp) ; tp[j-1] 826 stw $t4,8($tp) 827 828 lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1 829 lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0 830 lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3 831 lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2 832 lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5 833 lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4 834 lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7 835 lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6 836 837 addc $t2,$t2,$carry 838 adde $t3,$t3,$c1 839 srwi $carry,$t2,16 840 insrwi $carry,$t3,16,0 841 srwi $c1,$t3,16 842 addc $t6,$t6,$carry 843 adde $t7,$t7,$c1 844 srwi $carry,$t6,16 845 insrwi $t2,$t6,16,0 ; 64..95 bits 846 insrwi $carry,$t7,16,0 847 srwi $c1,$t7,16 848 addc $t0,$t0,$carry 849 adde $t1,$t1,$c1 850 srwi $carry,$t0,16 851 insrwi $carry,$t1,16,0 852 srwi $c1,$t1,16 853 addc $t4,$t4,$carry 854 adde $t5,$t5,$c1 855 srwi $carry,$t4,16 856 insrwi $t0,$t4,16,0 ; 96..127 bits 857 insrwi $carry,$t5,16,0 858 srwi $c1,$t5,16 859 stw $t2,20($tp) ; tp[j] 860 stwu $t0,16($tp) 861 862 lwz $t7,`$FRAME+64^$LITTLE_ENDIAN`($sp) 863 lwz $t6,`$FRAME+68^$LITTLE_ENDIAN`($sp) 864 lwz $t5,`$FRAME+72^$LITTLE_ENDIAN`($sp) 865 lwz $t4,`$FRAME+76^$LITTLE_ENDIAN`($sp) 866 867 addc $t6,$t6,$carry 868 adde $t7,$t7,$c1 869 srwi $carry,$t6,16 870 insrwi $carry,$t7,16,0 871 srwi $c1,$t7,16 872 addc $t4,$t4,$carry 873 adde $t5,$t5,$c1 874 875 insrwi $t6,$t4,16,0 876 srwi $t4,$t4,16 877 insrwi $t4,$t5,16,0 878 srwi $ovf,$t5,16 879 stw $t6,12($tp) ; tp[num-1] 880 stw $t4,8($tp) 881___ 882} 883$code.=<<___; 884 slwi $t7,$num,2 885 subf $nap_d,$t7,$nap_d 
; rewind pointer 886 887 li $i,8 ; i=1 888.align 5 889Louter: 890 addi $tp,$sp,`$FRAME+$TRANSFER` 891 li $carry,0 892 mtctr $j 893___ 894$code.=<<___ if ($SIZE_T==8); 895 ldx $t3,$bp,$i ; bp[i] 896 897 ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0] 898 mulld $t7,$a0,$t3 ; ap[0]*bp[i] 899 add $t7,$t7,$t6 ; ap[0]*bp[i]+tp[0] 900 ; transfer bp[i] to FPU as 4x16-bit values 901 extrdi $t0,$t3,16,48 902 extrdi $t1,$t3,16,32 903 extrdi $t2,$t3,16,16 904 extrdi $t3,$t3,16,0 905 std $t0,`$FRAME+0`($sp) 906 std $t1,`$FRAME+8`($sp) 907 std $t2,`$FRAME+16`($sp) 908 std $t3,`$FRAME+24`($sp) 909 910 mulld $t7,$t7,$n0 ; tp[0]*n0 911 ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values 912 extrdi $t4,$t7,16,48 913 extrdi $t5,$t7,16,32 914 extrdi $t6,$t7,16,16 915 extrdi $t7,$t7,16,0 916 std $t4,`$FRAME+32`($sp) 917 std $t5,`$FRAME+40`($sp) 918 std $t6,`$FRAME+48`($sp) 919 std $t7,`$FRAME+56`($sp) 920___ 921$code.=<<___ if ($SIZE_T==4); 922 add $t0,$bp,$i 923 li $c1,0 924 lwz $t1,0($t0) ; bp[i,i+1] 925 lwz $t3,4($t0) 926 927 mullw $t4,$a0,$t1 ; ap[0]*bp[i] 928 lwz $t0,`$FRAME+$TRANSFER+8+4`($sp) ; tp[0] 929 mulhwu $t5,$a0,$t1 930 lwz $t2,`$FRAME+$TRANSFER+8`($sp) ; tp[0] 931 mullw $t6,$a1,$t1 932 mullw $t7,$a0,$t3 933 add $t5,$t5,$t6 934 add $t5,$t5,$t7 935 addc $t4,$t4,$t0 ; ap[0]*bp[i]+tp[0] 936 adde $t5,$t5,$t2 937 ; transfer bp[i] to FPU as 4x16-bit values 938 extrwi $t0,$t1,16,16 939 extrwi $t1,$t1,16,0 940 extrwi $t2,$t3,16,16 941 extrwi $t3,$t3,16,0 942 std $t0,`$FRAME+0`($sp) ; yes, std in 32-bit build 943 std $t1,`$FRAME+8`($sp) 944 std $t2,`$FRAME+16`($sp) 945 std $t3,`$FRAME+24`($sp) 946 947 mullw $t0,$t4,$n0 ; mulld tp[0]*n0 948 mulhwu $t1,$t4,$n0 949 mullw $t2,$t5,$n0 950 mullw $t3,$t4,$n1 951 add $t1,$t1,$t2 952 add $t1,$t1,$t3 953 ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values 954 extrwi $t4,$t0,16,16 955 extrwi $t5,$t0,16,0 956 extrwi $t6,$t1,16,16 957 extrwi $t7,$t1,16,0 958 std $t4,`$FRAME+32`($sp) ; yes, std in 32-bit build 959 std 
$t5,`$FRAME+40`($sp) 960 std $t6,`$FRAME+48`($sp) 961 std $t7,`$FRAME+56`($sp) 962___ 963$code.=<<___; 964 lfd $A0,8($nap_d) ; load a[j] in double format 965 lfd $A1,16($nap_d) 966 lfd $A2,24($nap_d) ; load a[j+1] in double format 967 lfd $A3,32($nap_d) 968 lfd $N0,40($nap_d) ; load n[j] in double format 969 lfd $N1,48($nap_d) 970 lfd $N2,56($nap_d) ; load n[j+1] in double format 971 lfdu $N3,64($nap_d) 972 973 lfd $ba,`$FRAME+0`($sp) 974 lfd $bb,`$FRAME+8`($sp) 975 lfd $bc,`$FRAME+16`($sp) 976 lfd $bd,`$FRAME+24`($sp) 977 lfd $na,`$FRAME+32`($sp) 978 lfd $nb,`$FRAME+40`($sp) 979 lfd $nc,`$FRAME+48`($sp) 980 lfd $nd,`$FRAME+56`($sp) 981 982 fcfid $ba,$ba 983 fcfid $bb,$bb 984 fcfid $bc,$bc 985 fcfid $bd,$bd 986 fcfid $na,$na 987 fcfid $nb,$nb 988 fcfid $nc,$nc 989 fcfid $nd,$nd 990 991 fmul $T1a,$A1,$ba 992 fmul $T1b,$A1,$bb 993 fmul $T2a,$A2,$ba 994 fmul $T2b,$A2,$bb 995 fmul $T3a,$A3,$ba 996 fmul $T3b,$A3,$bb 997 fmul $T0a,$A0,$ba 998 fmul $T0b,$A0,$bb 999 1000 fmadd $T1a,$A0,$bc,$T1a 1001 fmadd $T1b,$A0,$bd,$T1b 1002 fmadd $T2a,$A1,$bc,$T2a 1003 fmadd $T2b,$A1,$bd,$T2b 1004 fmadd $T3a,$A2,$bc,$T3a 1005 fmadd $T3b,$A2,$bd,$T3b 1006 fmul $dota,$A3,$bc 1007 fmul $dotb,$A3,$bd 1008 1009 fmadd $T1a,$N1,$na,$T1a 1010 fmadd $T1b,$N1,$nb,$T1b 1011 lfd $A0,8($nap_d) ; load a[j] in double format 1012 lfd $A1,16($nap_d) 1013 fmadd $T2a,$N2,$na,$T2a 1014 fmadd $T2b,$N2,$nb,$T2b 1015 lfd $A2,24($nap_d) ; load a[j+1] in double format 1016 lfd $A3,32($nap_d) 1017 fmadd $T3a,$N3,$na,$T3a 1018 fmadd $T3b,$N3,$nb,$T3b 1019 fmadd $T0a,$N0,$na,$T0a 1020 fmadd $T0b,$N0,$nb,$T0b 1021 1022 fmadd $T1a,$N0,$nc,$T1a 1023 fmadd $T1b,$N0,$nd,$T1b 1024 fmadd $T2a,$N1,$nc,$T2a 1025 fmadd $T2b,$N1,$nd,$T2b 1026 fmadd $T3a,$N2,$nc,$T3a 1027 fmadd $T3b,$N2,$nd,$T3b 1028 fmadd $dota,$N3,$nc,$dota 1029 fmadd $dotb,$N3,$nd,$dotb 1030 1031 fctid $T0a,$T0a 1032 fctid $T0b,$T0b 1033 fctid $T1a,$T1a 1034 fctid $T1b,$T1b 1035 fctid $T2a,$T2a 1036 fctid $T2b,$T2b 1037 fctid $T3a,$T3a 1038 fctid 
$T3b,$T3b

	stfd	$T0a,`$FRAME+0`($sp)
	stfd	$T0b,`$FRAME+8`($sp)
	stfd	$T1a,`$FRAME+16`($sp)
	stfd	$T1b,`$FRAME+24`($sp)
	stfd	$T2a,`$FRAME+32`($sp)
	stfd	$T2b,`$FRAME+40`($sp)
	stfd	$T3a,`$FRAME+48`($sp)
	stfd	$T3b,`$FRAME+56`($sp)

.align	5
Linner:
	fmul	$T1a,$A1,$ba
	fmul	$T1b,$A1,$bb
	fmul	$T2a,$A2,$ba
	fmul	$T2b,$A2,$bb
	lfd	$N0,40($nap_d)		; load n[j] in double format
	lfd	$N1,48($nap_d)
	fmul	$T3a,$A3,$ba
	fmul	$T3b,$A3,$bb
	fmadd	$T0a,$A0,$ba,$dota
	fmadd	$T0b,$A0,$bb,$dotb
	lfd	$N2,56($nap_d)		; load n[j+1] in double format
	lfdu	$N3,64($nap_d)

	fmadd	$T1a,$A0,$bc,$T1a
	fmadd	$T1b,$A0,$bd,$T1b
	fmadd	$T2a,$A1,$bc,$T2a
	fmadd	$T2b,$A1,$bd,$T2b
	lfd	$A0,8($nap_d)		; load a[j] in double format
	lfd	$A1,16($nap_d)
	fmadd	$T3a,$A2,$bc,$T3a
	fmadd	$T3b,$A2,$bd,$T3b
	fmul	$dota,$A3,$bc
	fmul	$dotb,$A3,$bd
	lfd	$A2,24($nap_d)		; load a[j+1] in double format
	lfd	$A3,32($nap_d)
___
# Integer stage of the inner loop: fold the 16-bit limbs the FPU dumped
# at $FRAME..$FRAME+56 on the previous pass (note the srdi/srwi-by-16
# carry chain) while the FPU keeps multiplying for the current pass.
# Two flavours follow: a 64-bit IALU one (also used by the 32-bit OS X
# ABI) and a plain 32-bit one.
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	ld	$t0,`$FRAME+0`($sp)
	ld	$t1,`$FRAME+8`($sp)
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	ld	$t2,`$FRAME+16`($sp)
	ld	$t3,`$FRAME+24`($sp)
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	add	$t0,$t0,$carry		; can not overflow
	ld	$t4,`$FRAME+32`($sp)
	ld	$t5,`$FRAME+40`($sp)
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b
	srdi	$carry,$t0,16
	add	$t1,$t1,$carry
	srdi	$carry,$t1,16
	ld	$t6,`$FRAME+48`($sp)
	ld	$t7,`$FRAME+56`($sp)

	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	insrdi	$t0,$t1,16,32
	ld	$t1,8($tp)		; tp[j]
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	add	$t2,$t2,$carry
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	srdi	$carry,$t2,16
	insrdi	$t0,$t2,16,16
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb
	add	$t3,$t3,$carry
	ldu	$t2,16($tp)		; tp[j+1]
	srdi	$carry,$t3,16
	insrdi	$t0,$t3,16,0		; 0..63 bits
	add	$t4,$t4,$carry

	fctid	$T0a,$T0a
	fctid	$T0b,$T0b
	srdi	$carry,$t4,16
	fctid	$T1a,$T1a
	fctid	$T1b,$T1b
	add	$t5,$t5,$carry
	fctid	$T2a,$T2a
	fctid	$T2b,$T2b
	srdi	$carry,$t5,16
	insrdi	$t4,$t5,16,32
	fctid	$T3a,$T3a
	fctid	$T3b,$T3b
	add	$t6,$t6,$carry
	srdi	$carry,$t6,16
	insrdi	$t4,$t6,16,16

	stfd	$T0a,`$FRAME+0`($sp)
	stfd	$T0b,`$FRAME+8`($sp)
	add	$t7,$t7,$carry
	addc	$t3,$t0,$t1
___
$code.=<<___ if ($SIZE_T==4);	# adjust XER[CA]
	extrdi	$t0,$t0,32,0
	extrdi	$t1,$t1,32,0
	adde	$t0,$t0,$t1
___
$code.=<<___;
	stfd	$T1a,`$FRAME+16`($sp)
	stfd	$T1b,`$FRAME+24`($sp)
	insrdi	$t4,$t7,16,0		; 64..127 bits
	srdi	$carry,$t7,16		; upper 33 bits
	stfd	$T2a,`$FRAME+32`($sp)
	stfd	$T2b,`$FRAME+40`($sp)
	adde	$t5,$t4,$t2
___
$code.=<<___ if ($SIZE_T==4);	# adjust XER[CA]
	extrdi	$t4,$t4,32,0
	extrdi	$t2,$t2,32,0
	adde	$t4,$t4,$t2
___
$code.=<<___;
	stfd	$T3a,`$FRAME+48`($sp)
	stfd	$T3b,`$FRAME+56`($sp)
	addze	$carry,$carry
	std	$t3,-16($tp)		; tp[j-1]
	std	$t5,-8($tp)		; tp[j]
___
} else {
# 32-bit flavour: same limb folding with lwz/addc/adde pairs; the
# `^$LITTLE_ENDIAN` offset XOR selects the correct word half of each
# stored double depending on target endianness.
$code.=<<___;
	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
	lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
	lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
	lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b
	lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
	lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
	srwi	$c1,$t1,16
	insrwi	$carry,$t1,16,0

	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	insrwi	$t0,$t2,16,0		; 0..31 bits
	srwi	$c1,$t3,16
	insrwi	$carry,$t3,16,0
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	lwz	$t2,12($tp)		; tp[j]
	lwz	$t3,8($tp)
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb
	srwi	$c1,$t5,16
	insrwi	$carry,$t5,16,0

	fctid	$T0a,$T0a
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	fctid	$T0b,$T0b
	insrwi	$t4,$t6,16,0		; 32..63 bits
	srwi	$c1,$t7,16
	insrwi	$carry,$t7,16,0
	fctid	$T1a,$T1a
	addc	$t0,$t0,$t2
	adde	$t4,$t4,$t3
	lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
	lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
	fctid	$T1b,$T1b
	addze	$carry,$carry
	addze	$c1,$c1
	stw	$t0,4($tp)		; tp[j-1]
	stw	$t4,0($tp)
	fctid	$T2a,$T2a
	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
	lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
	fctid	$T2b,$T2b
	srwi	$c1,$t3,16
	insrwi	$carry,$t3,16,0
	lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
	lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
	fctid	$T3a,$T3a
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
	lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6
	fctid	$T3b,$T3b

	insrwi	$t2,$t6,16,0		; 64..95 bits
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	lwz	$t6,20($tp)
	lwzu	$t7,16($tp)
	addc	$t0,$t0,$carry
	stfd	$T0a,`$FRAME+0`($sp)
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	stfd	$T0b,`$FRAME+8`($sp)
	insrwi	$carry,$t1,16,0
	srwi	$c1,$t1,16
	addc	$t4,$t4,$carry
	stfd	$T1a,`$FRAME+16`($sp)
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	insrwi	$t0,$t4,16,0		; 96..127 bits
	stfd	$T1b,`$FRAME+24`($sp)
	insrwi	$carry,$t5,16,0
	srwi	$c1,$t5,16

	addc	$t2,$t2,$t6
	stfd	$T2a,`$FRAME+32`($sp)
	adde	$t0,$t0,$t7
	stfd	$T2b,`$FRAME+40`($sp)
	addze	$carry,$carry
	stfd	$T3a,`$FRAME+48`($sp)
	addze	$c1,$c1
	stfd	$T3b,`$FRAME+56`($sp)
	stw	$t2,-4($tp)		; tp[j]
	stw	$t0,-8($tp)
___
}
# End of inner loop body; close the loop and convert the two dangling
# dot products to integer form.
$code.=<<___;
	bdnz	Linner

	fctid	$dota,$dota
	fctid	$dotb,$dotb
___
# Drain the pipeline: fold the last eight limbs plus the dota/dotb
# overflow words (stashed at $FRAME+64/$FRAME+72) and store the top of
# tp[].  Again one 64-bit and one 32-bit flavour.
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
	ld	$t0,`$FRAME+0`($sp)
	ld	$t1,`$FRAME+8`($sp)
	ld	$t2,`$FRAME+16`($sp)
	ld	$t3,`$FRAME+24`($sp)
	ld	$t4,`$FRAME+32`($sp)
	ld	$t5,`$FRAME+40`($sp)
	ld	$t6,`$FRAME+48`($sp)
	ld	$t7,`$FRAME+56`($sp)
	stfd	$dota,`$FRAME+64`($sp)
	stfd	$dotb,`$FRAME+72`($sp)

	add	$t0,$t0,$carry		; can not overflow
	srdi	$carry,$t0,16
	add	$t1,$t1,$carry
	srdi	$carry,$t1,16
	insrdi	$t0,$t1,16,32
	add	$t2,$t2,$carry
	ld	$t1,8($tp)		; tp[j]
	srdi	$carry,$t2,16
	insrdi	$t0,$t2,16,16
	add	$t3,$t3,$carry
	ldu	$t2,16($tp)		; tp[j+1]
	srdi	$carry,$t3,16
	insrdi	$t0,$t3,16,0		; 0..63 bits
	add	$t4,$t4,$carry
	srdi	$carry,$t4,16
	add	$t5,$t5,$carry
	srdi	$carry,$t5,16
	insrdi	$t4,$t5,16,32
	add	$t6,$t6,$carry
	srdi	$carry,$t6,16
	insrdi	$t4,$t6,16,16
	add	$t7,$t7,$carry
	insrdi	$t4,$t7,16,0		; 64..127 bits
	srdi	$carry,$t7,16		; upper 33 bits
	ld	$t6,`$FRAME+64`($sp)
	ld	$t7,`$FRAME+72`($sp)

	addc	$t3,$t0,$t1
___
$code.=<<___ if ($SIZE_T==4);	# adjust XER[CA]
	extrdi	$t0,$t0,32,0
	extrdi	$t1,$t1,32,0
	adde	$t0,$t0,$t1
___
$code.=<<___;
	adde	$t5,$t4,$t2
___
$code.=<<___ if ($SIZE_T==4);	# adjust XER[CA]
	extrdi	$t4,$t4,32,0
	extrdi	$t2,$t2,32,0
	adde	$t4,$t4,$t2
___
$code.=<<___;
	addze	$carry,$carry

	std	$t3,-16($tp)		; tp[j-1]
	std	$t5,-8($tp)		; tp[j]

	add	$carry,$carry,$ovf	; comsume upmost overflow
	add	$t6,$t6,$carry		; can not overflow
	srdi	$carry,$t6,16
	add	$t7,$t7,$carry
	insrdi	$t6,$t7,48,0
	srdi	$ovf,$t7,48
	std	$t6,0($tp)		; tp[num-1]
___
} else {
# 32-bit variant of the drain code.
$code.=<<___;
	lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
	lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
	lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
	lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
	lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
	lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
	lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
	lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
	stfd	$dota,`$FRAME+64`($sp)
	stfd	$dotb,`$FRAME+72`($sp)

	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	insrwi	$carry,$t1,16,0
	srwi	$c1,$t1,16
	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	insrwi	$t0,$t2,16,0		; 0..31 bits
	lwz	$t2,12($tp)		; tp[j]
	insrwi	$carry,$t3,16,0
	srwi	$c1,$t3,16
	lwz	$t3,8($tp)
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	insrwi	$carry,$t5,16,0
	srwi	$c1,$t5,16
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	insrwi	$t4,$t6,16,0		; 32..63 bits
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16

	addc	$t0,$t0,$t2
	adde	$t4,$t4,$t3
	addze	$carry,$carry
	addze	$c1,$c1
	stw	$t0,4($tp)		; tp[j-1]
	stw	$t4,0($tp)

	lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
	lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
	lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
	lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
	lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
	lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
	lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
	lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6

	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	insrwi	$carry,$t3,16,0
	srwi	$c1,$t3,16
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	insrwi	$t2,$t6,16,0		; 64..95 bits
	lwz	$t6,20($tp)
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	lwzu	$t7,16($tp)
	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	insrwi	$carry,$t1,16,0
	srwi	$c1,$t1,16
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	insrwi	$t0,$t4,16,0		; 96..127 bits
	insrwi	$carry,$t5,16,0
	srwi	$c1,$t5,16

	addc	$t2,$t2,$t6
	adde	$t0,$t0,$t7
	lwz	$t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
	lwz	$t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
	addze	$carry,$carry
	addze	$c1,$c1
	lwz	$t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
	lwz	$t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)

	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	stw	$t2,-4($tp)		; tp[j]
	stw	$t0,-8($tp)
	addc	$t6,$t6,$ovf
	addze	$t7,$t7
	srwi	$carry,$t6,16
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1

	insrwi	$t6,$t4,16,0
	srwi	$t4,$t4,16
	insrwi	$t4,$t5,16,0
	srwi	$ovf,$t5,16
	stw	$t6,4($tp)		; tp[num-1]
	stw	$t4,0($tp)
___
}
# Step the outer-loop index, rewind $nap_d and loop back to Louter
# while $i < $num.
$code.=<<___;
	slwi	$t7,$num,2
	addi	$i,$i,8
	subf	$nap_d,$t7,$nap_d	; rewind pointer
	cmpw	$i,$num
	blt-	Louter
___

# Post-processing (64-bit build): Lsub computes tp - np with borrow
# propagation; the matching Lcopy (further below) then selects tp or
# the difference through the $ovf mask without branching.
$code.=<<___ if ($SIZE_T==8);
	subf	$np,$num,$np	; rewind np
	addi	$j,$j,1		; restore counter
	subfc	$i,$i,$i	; j=0 and "clear" XER[CA]
	addi	$tp,$sp,`$FRAME+$TRANSFER+8`
	addi	$t4,$sp,`$FRAME+$TRANSFER+16`
	addi	$t5,$np,8
	addi	$t6,$rp,8
	mtctr	$j

.align	4
Lsub:	ldx	$t0,$tp,$i
	ldx	$t1,$np,$i
ldx	$t2,$t4,$i
	ldx	$t3,$t5,$i
	subfe	$t0,$t1,$t0	; tp[j]-np[j]
	subfe	$t2,$t3,$t2	; tp[j+1]-np[j+1]
	stdx	$t0,$rp,$i
	stdx	$t2,$t6,$i
	addi	$i,$i,16
	bdnz	Lsub

	li	$i,0
	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
	mtctr	$j

.align	4
Lcopy:				; conditional copy
	ldx	$t0,$tp,$i
	ldx	$t1,$t4,$i
	ldx	$t2,$rp,$i
	ldx	$t3,$t6,$i
	std	$i,8($nap_d)	; zap nap_d
	std	$i,16($nap_d)
	std	$i,24($nap_d)
	std	$i,32($nap_d)
	std	$i,40($nap_d)
	std	$i,48($nap_d)
	std	$i,56($nap_d)
	stdu	$i,64($nap_d)
	and	$t0,$t0,$ovf
	and	$t1,$t1,$ovf
	andc	$t2,$t2,$ovf
	andc	$t3,$t3,$ovf
	or	$t0,$t0,$t2
	or	$t1,$t1,$t3
	stdx	$t0,$rp,$i
	stdx	$t1,$t6,$i
	stdx	$i,$tp,$i	; zap tp at once
	stdx	$i,$t4,$i
	addi	$i,$i,16
	bdnz	Lcopy
___
# 32-bit build: same conditional-subtract (Lsub) and branch-free
# select (Lcopy) sequence, done on 32-bit words kept in permuted order.
$code.=<<___ if ($SIZE_T==4);
	subf	$np,$num,$np	; rewind np
	addi	$j,$j,1		; restore counter
	subfc	$i,$i,$i	; j=0 and "clear" XER[CA]
	addi	$tp,$sp,`$FRAME+$TRANSFER`
	addi	$np,$np,-4
	addi	$rp,$rp,-4
	addi	$ap,$sp,`$FRAME+$TRANSFER+4`
	mtctr	$j

.align	4
Lsub:	lwz	$t0,12($tp)	; load tp[j..j+3] in 64-bit word order
	lwz	$t1,8($tp)
	lwz	$t2,20($tp)
	lwzu	$t3,16($tp)
	lwz	$t4,4($np)	; load np[j..j+3] in 32-bit word order
	lwz	$t5,8($np)
	lwz	$t6,12($np)
	lwzu	$t7,16($np)
	subfe	$t4,$t4,$t0	; tp[j]-np[j]
	stw	$t0,4($ap)	; save tp[j..j+3] in 32-bit word order
	subfe	$t5,$t5,$t1	; tp[j+1]-np[j+1]
	stw	$t1,8($ap)
	subfe	$t6,$t6,$t2	; tp[j+2]-np[j+2]
	stw	$t2,12($ap)
	subfe	$t7,$t7,$t3	; tp[j+3]-np[j+3]
	stwu	$t3,16($ap)
	stw	$t4,4($rp)
	stw	$t5,8($rp)
	stw	$t6,12($rp)
	stwu	$t7,16($rp)
	bdnz	Lsub

	li	$i,0
	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
	addi	$ap,$sp,`$FRAME+$TRANSFER+4`
	subf	$rp,$num,$rp	; rewind rp
	addi	$tp,$sp,`$FRAME+$TRANSFER`
	mtctr	$j

.align	4
Lcopy:				; conditional copy
	lwz	$t0,4($ap)
	lwz	$t1,8($ap)
	lwz	$t2,12($ap)
	lwzu	$t3,16($ap)
	lwz	$t4,4($rp)
	lwz	$t5,8($rp)
	lwz	$t6,12($rp)
	lwz	$t7,16($rp)
	std	$i,8($nap_d)	; zap nap_d
	std	$i,16($nap_d)
	std	$i,24($nap_d)
	std	$i,32($nap_d)
	std	$i,40($nap_d)
	std	$i,48($nap_d)
	std	$i,56($nap_d)
	stdu	$i,64($nap_d)
	and	$t0,$t0,$ovf
	and	$t1,$t1,$ovf
	and	$t2,$t2,$ovf
	and	$t3,$t3,$ovf
	andc	$t4,$t4,$ovf
	andc	$t5,$t5,$ovf
	andc	$t6,$t6,$ovf
	andc	$t7,$t7,$ovf
	or	$t0,$t0,$t4
	or	$t1,$t1,$t5
	or	$t2,$t2,$t6
	or	$t3,$t3,$t7
	stw	$t0,4($rp)
	stw	$t1,8($rp)
	stw	$t2,12($rp)
	stwu	$t3,16($rp)
	std	$i,8($tp)	; zap tp at once
	stdu	$i,16($tp)
	bdnz	Lcopy
___

# Common epilogue: restore callee-saved GPRs r19-r31 and FPRs f20-f31
# from the caller's frame, signal "handled" in r3 and return.
$code.=<<___;
	$POP	$i,0($sp)
	li	r3,1		; signal "handled"
	$POP	r19,`-12*8-13*$SIZE_T`($i)
	$POP	r20,`-12*8-12*$SIZE_T`($i)
	$POP	r21,`-12*8-11*$SIZE_T`($i)
	$POP	r22,`-12*8-10*$SIZE_T`($i)
	$POP	r23,`-12*8-9*$SIZE_T`($i)
	$POP	r24,`-12*8-8*$SIZE_T`($i)
	$POP	r25,`-12*8-7*$SIZE_T`($i)
	$POP	r26,`-12*8-6*$SIZE_T`($i)
	$POP	r27,`-12*8-5*$SIZE_T`($i)
	$POP	r28,`-12*8-4*$SIZE_T`($i)
	$POP	r29,`-12*8-3*$SIZE_T`($i)
	$POP	r30,`-12*8-2*$SIZE_T`($i)
	$POP	r31,`-12*8-1*$SIZE_T`($i)
	lfd	f20,`-12*8`($i)
	lfd	f21,`-11*8`($i)
	lfd	f22,`-10*8`($i)
	lfd	f23,`-9*8`($i)
	lfd	f24,`-8*8`($i)
	lfd	f25,`-7*8`($i)
	lfd	f26,`-6*8`($i)
	lfd	f27,`-5*8`($i)
	lfd	f28,`-4*8`($i)
	lfd	f29,`-3*8`($i)
	lfd	f30,`-2*8`($i)
	lfd	f31,`-1*8`($i)
	mr	$sp,$i
	blr
	.long	0
	.byte	0,12,4,0,0x8c,13,6,0
	.long	0
.size	.$fname,.-.$fname

.asciz	"Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
___

# Evaluate the `...` arithmetic embedded in the assembly template and
# emit the result.  close() is checked: buffered write errors (e.g.
# full disk while STDOUT is redirected to the .s file) only surface at
# close, and an unchecked close would silently leave truncated
# assembler output.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";