#!/usr/bin/env perl
#
# Implemented as a Perl wrapper as we want to support several different
# architectures with a single file. We pick up the target based on the
# file name we are asked to generate.
#
# It should be noted though that this perl code is nothing like
# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
# as pre-processor to cover for platform differences in name decoration,
# linker tables, 32-/64-bit instruction sets...
#
# As you might know there are several PowerPC ABIs in use. Most notably
# Linux and AIX use different 32-bit ABIs. The good news is that these ABIs
# are similar enough to implement leaf(!) functions, which would be ABI
# neutral. And that's what you find here: ABI neutral leaf functions.
# In case you wonder what that is...
#
#       AIX performance
#
#	MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
#
#	The following is the performance of 32-bit compiler
#	generated code:
#
#	OpenSSL 0.9.6c 21 dec 2001
#	built on: Tue Jun 11 11:06:51 EDT 2002
#	options:bn(64,32) ...
#compiler: cc -DTHREADS -DAIX -DB_ENDIAN -DBN_LLONG -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0098s   0.0009s    102.0   1170.6
#rsa 1024 bits   0.0507s   0.0026s     19.7    387.5
#rsa 2048 bits   0.3036s   0.0085s      3.3    117.1
#rsa 4096 bits   2.0040s   0.0299s      0.5     33.4
#dsa  512 bits   0.0087s   0.0106s    114.3     94.5
#dsa 1024 bits   0.0256s   0.0313s     39.0     32.0
#
#	Same benchmark with this assembler code:
#
#rsa  512 bits   0.0056s   0.0005s    178.6   2049.2
#rsa 1024 bits   0.0283s   0.0015s     35.3    674.1
#rsa 2048 bits   0.1744s   0.0050s      5.7    201.2
#rsa 4096 bits   1.1644s   0.0179s      0.9     55.7
#dsa  512 bits   0.0052s   0.0062s    191.6    162.0
#dsa 1024 bits   0.0149s   0.0180s     67.0     55.5
#
#	Number of operations increases by almost 75%
#
#	Here are performance numbers for 64-bit compiler
#	generated code:
#
#	OpenSSL 0.9.6g [engine] 9 Aug 2002
#	built on: Fri Apr 18 16:59:20 EDT 2003
#	options:bn(64,64) ...
#	compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0028s   0.0003s    357.1   3844.4
#rsa 1024 bits   0.0148s   0.0008s     67.5   1239.7
#rsa 2048 bits   0.0963s   0.0028s     10.4    353.0
#rsa 4096 bits   0.6538s   0.0102s      1.5     98.1
#dsa  512 bits   0.0026s   0.0032s    382.5    313.7
#dsa 1024 bits   0.0081s   0.0099s    122.8    100.6
#
#	Same benchmark with this assembler code:
#
#rsa  512 bits   0.0020s   0.0002s    510.4   6273.7
#rsa 1024 bits   0.0088s   0.0005s    114.1   2128.3
#rsa 2048 bits   0.0540s   0.0016s     18.5    622.5
#rsa 4096 bits   0.3700s   0.0058s      2.7    171.0
#dsa  512 bits   0.0016s   0.0020s    610.7    507.1
#dsa 1024 bits   0.0047s   0.0058s    212.5    173.2
#
#	Again, performance increases by about 75%
#
#       Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
#       OpenSSL 0.9.7c 30 Sep 2003
#
#       Original code.
#
#rsa  512 bits   0.0011s   0.0001s    906.1  11012.5
#rsa 1024 bits   0.0060s   0.0003s    166.6   3363.1
#rsa 2048 bits   0.0370s   0.0010s     27.1    982.4
#rsa 4096 bits   0.2426s   0.0036s      4.1    280.4
#dsa  512 bits   0.0010s   0.0012s   1038.1    841.5
#dsa 1024 bits   0.0030s   0.0037s    329.6    269.7
#dsa 2048 bits   0.0101s   0.0127s     98.9     78.6
#
#       Same benchmark with this assembler code:
#
#rsa  512 bits   0.0007s   0.0001s   1416.2  16645.9
#rsa 1024 bits   0.0036s   0.0002s    274.4   5380.6
#rsa 2048 bits   0.0222s   0.0006s     45.1   1589.5
#rsa 4096 bits   0.1469s   0.0022s      6.8    449.6
#dsa  512 bits   0.0006s   0.0007s   1664.2   1376.2
#dsa 1024 bits   0.0018s   0.0023s    545.0    442.2
#dsa 2048 bits   0.0061s   0.0075s    163.5    132.8
#
#        Performance increase of ~60%
#
#	If you have comments or suggestions to improve code send
#	me a note at schari@us.ibm.com
#

# Command line:  <this script> <output-file> [extra-argument]
#
#   <output-file>     its name selects the word size: a name containing
#                     "32.s" generates 32-bit code, one containing "64.s"
#                     generates 64-bit code; anything else is rejected.
#   [extra-argument]  if any second argument is present the generated text
#                     is left on the current STDOUT instead of being
#                     written to <output-file>.
#
# The settings below are intentionally package globals: they are read by
# the target routines and interpolated into the assembler template that
# follow later in this file.
$opf = shift;
defined $opf
	or die "usage: $0 <output-file containing 32.s or 64.s> [extra-argument]\n";

if ($opf =~ /32\.s/) {
	$BITS=	32;
	$BNSZ=	$BITS/8;		# size of one BN_ULONG in bytes
	$ISA=	"\"ppc\"";

	$LD=	"lwz";		# load
	$LDU=	"lwzu";		# load and update
	$ST=	"stw";		# store
	$STU=	"stwu";		# store and update
	$UMULL=	"mullw";	# unsigned multiply low
	$UMULH=	"mulhwu";	# unsigned multiply high
	$UDIV=	"divwu";	# unsigned divide
	$UCMPI=	"cmplwi";	# unsigned compare with immediate
	$UCMP=	"cmplw";	# unsigned compare
	$COUNTZ="cntlzw";	# count leading zeros
	$SHL=	"slw";		# shift left
	$SHR=	"srw";		# unsigned shift right
	$SHRI=	"srwi";		# unsigned shift right by immediate
	$SHLI=	"slwi";		# shift left by immediate
	$CLRU=	"clrlwi";	# clear upper bits
	$INSR=	"insrwi";	# insert right
	$ROTL=	"rotlwi";	# rotate left by immediate
} elsif ($opf =~ /64\.s/) {
	$BITS=	64;
	$BNSZ=	$BITS/8;		# size of one BN_ULONG in bytes
	$ISA=	"\"ppc64\"";

	# same as above, but 64-bit mnemonics...
	$LD=	"ld";		# load
	$LDU=	"ldu";		# load and update
	$ST=	"std";		# store
	$STU=	"stdu";		# store and update
	$UMULL=	"mulld";	# unsigned multiply low
	$UMULH=	"mulhdu";	# unsigned multiply high
	$UDIV=	"divdu";	# unsigned divide
	$UCMPI=	"cmpldi";	# unsigned compare with immediate
	$UCMP=	"cmpld";	# unsigned compare
	$COUNTZ="cntlzd";	# count leading zeros
	$SHL=	"sld";		# shift left
	$SHR=	"srd";		# unsigned shift right
	$SHRI=	"srdi";		# unsigned shift right by immediate
	$SHLI=	"sldi";		# shift left by immediate
	$CLRU=	"clrldi";	# clear upper bits
	$INSR=	"insrdi";	# insert right
	$ROTL=	"rotldi";	# rotate left by immediate
} else { die "nonsense $opf"; }

# Redirect STDOUT into the requested file unless a second argument was
# given. The three-argument form of open keeps the file name from being
# interpreted as part of the open mode.
unless (defined shift) {
	open(STDOUT, '>', $opf) || die "can't open $opf: $!";
}

# function entry points from the AIX code
#
# There are other, more elegant, ways to handle this. We (IBM) chose
# this approach as it plays well with scripts we run to 'namespace'
# OpenSSL, i.e. we add a prefix to all the public symbols so we can
# co-exist in the same process with other implementations of OpenSSL.
# 'cleverer' ways of doing these substitutions tend to hide data we
# need to be obvious.
#
# Names of the entry points found in the assembler template returned by
# data(). The template spells each of them with a leading dot; the
# do_*() routines below rewrite that decoration per target.
my @items = ("bn_sqr_comba4",
	     "bn_sqr_comba8",
	     "bn_mul_comba4",
	     "bn_mul_comba8",
	     "bn_sub_words",
	     "bn_add_words",
	     "bn_div_words",
	     "bn_sqr_words",
	     "bn_mul_words",
	     "bn_mul_add_words");

# The target flavour is taken from the output file name; anything that is
# not recognisably linux, aix or osx gets the BSD treatment.
if    ($opf =~ /linux/)	{	do_linux();	}
elsif ($opf =~ /aix/)	{	do_aix();	}
elsif ($opf =~ /osx/)	{	do_osx();	}
else			{	do_bsd();	}

# Linux: print the template with the symbol decoration adjusted.
#   64-bit: every ".name:" label is preceded by an ".opd" entry for "name"
#           (entry address, TOC base, 0) plus .type/.globl directives for
#           the dotted code symbol.
#   32-bit: the leading dot is simply removed from every entry point.
sub do_linux {
    my $d = data();

    if ($BITS==64) {
      foreach my $t (@items) {
	# Text that replaces the bare ".name:" label. The empty strings
	# produce the leading newline and the blank line after .previous.
	my $opd = join("\n",
	    "",
	    "\t.section\t\".opd\",\"aw\"",
	    "\t.align\t3",
	    "\t.globl\t${t}",
	    "${t}:",
	    "\t.quad\t.${t},.TOC.\@tocbase,0",
	    "\t.size\t${t},24",
	    "\t.previous",
	    "",
	    "\t.type\t.${t},\@function",
	    "\t.globl\t.${t}",
	    ".${t}:");
	$d =~ s/\.$t:/$opd/g;
      }
    }
    else {
      foreach my $t (@items) {
	$d =~ s/\.$t/$t/g;
      }
    }
    # hide internal labels to avoid pollution of name table...
    $d =~ s/Lppcasm_/.Lppcasm_/gm;
    print $d;
}

# AIX: the template is printed exactly as data() returns it.
sub do_aix {
    # AIX assembler is smart enough to please the linker without
    # making us do something special...
    print data();
}

# MacOSX 32 bit
sub do_osx {
    my $d = data();
    # Change the bn symbol prefix from '.' to '_'
    foreach my $t (@items) {
      $d =~ s/\.$t/_$t/g;
    }
    # Change .machine to something OS X asm will accept
    $d =~ s/\.machine.*/.text/g;
    $d =~ s/\#/;/g; # change comment from '#' to ';'
    print $d;
}

# BSD (Untested)
# Same symbol rewrite as OS X ('.' prefix becomes '_'), nothing else.
sub do_bsd {
    my $d = data();
    foreach my $t (@items) {
      $d =~ s/\.$t/_$t/g;
    }
    print $d;
}

sub data {
	local($data)=<<EOF;
#--------------------------------------------------------------------
#
#
#
#
#	File:		ppc32.s
#
#	Created by:	Suresh Chari
#			IBM Thomas J. Watson Research Library
#			Hawthorne, NY
#
#
#	Description:	Optimized assembly routines for OpenSSL crypto
#			on the 32 bitPowerPC platform.
#
#
#	Version History
#
#	2. Fixed bn_add,bn_sub and bn_div_words, added comments,
#	   cleaned up code.
Also made a single version which can 257# be used for both the AIX and Linux compilers. See NOTE 258# below. 259# 12/05/03 Suresh Chari 260# (with lots of help from) Andy Polyakov 261## 262# 1. Initial version 10/20/02 Suresh Chari 263# 264# 265# The following file works for the xlc,cc 266# and gcc compilers. 267# 268# NOTE: To get the file to link correctly with the gcc compiler 269# you have to change the names of the routines and remove 270# the first .(dot) character. This should automatically 271# be done in the build process. 272# 273# Hand optimized assembly code for the following routines 274# 275# bn_sqr_comba4 276# bn_sqr_comba8 277# bn_mul_comba4 278# bn_mul_comba8 279# bn_sub_words 280# bn_add_words 281# bn_div_words 282# bn_sqr_words 283# bn_mul_words 284# bn_mul_add_words 285# 286# NOTE: It is possible to optimize this code more for 287# specific PowerPC or Power architectures. On the Northstar 288# architecture the optimizations in this file do 289# NOT provide much improvement. 290# 291# If you have comments or suggestions to improve code send 292# me a note at schari\@us.ibm.com 293# 294#-------------------------------------------------------------------------- 295# 296# Defines to be used in the assembly code. 297# 298.set r0,0 # we use it as storage for value of 0 299.set SP,1 # preserved 300.set RTOC,2 # preserved 301.set r3,3 # 1st argument/return value 302.set r4,4 # 2nd argument/volatile register 303.set r5,5 # 3rd argument/volatile register 304.set r6,6 # ... 305.set r7,7 306.set r8,8 307.set r9,9 308.set r10,10 309.set r11,11 310.set r12,12 311.set r13,13 # not used, nor any other "below" it... 312 313.set BO_IF_NOT,4 314.set BO_IF,12 315.set BO_dCTR_NZERO,16 316.set BO_dCTR_ZERO,18 317.set BO_ALWAYS,20 318.set CR0_LT,0; 319.set CR0_GT,1; 320.set CR0_EQ,2 321.set CR1_FX,4; 322.set CR1_FEX,5; 323.set CR1_VX,6 324.set LR,8 325 326# Declare function names to be global 327# NOTE: For gcc these names MUST be changed to remove 328# the first . 
i.e. for example change ".bn_sqr_comba4" 329# to "bn_sqr_comba4". This should be automatically done 330# in the build. 331 332 .globl .bn_sqr_comba4 333 .globl .bn_sqr_comba8 334 .globl .bn_mul_comba4 335 .globl .bn_mul_comba8 336 .globl .bn_sub_words 337 .globl .bn_add_words 338 .globl .bn_div_words 339 .globl .bn_sqr_words 340 .globl .bn_mul_words 341 .globl .bn_mul_add_words 342 343# .text section 344 345 .machine $ISA 346 347# 348# NOTE: The following label name should be changed to 349# "bn_sqr_comba4" i.e. remove the first dot 350# for the gcc compiler. This should be automatically 351# done in the build 352# 353 354.align 4 355.bn_sqr_comba4: 356# 357# Optimized version of bn_sqr_comba4. 358# 359# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a) 360# r3 contains r 361# r4 contains a 362# 363# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows: 364# 365# r5,r6 are the two BN_ULONGs being multiplied. 366# r7,r8 are the results of the 32x32 giving 64 bit multiply. 367# r9,r10, r11 are the equivalents of c1,c2, c3. 368# Here's the assembly 369# 370# 371 xor r0,r0,r0 # set r0 = 0. Used in the addze 372 # instructions below 373 374 #sqr_add_c(a,0,c1,c2,c3) 375 $LD r5,`0*$BNSZ`(r4) 376 $UMULL r9,r5,r5 377 $UMULH r10,r5,r5 #in first iteration. No need 378 #to add since c1=c2=c3=0. 379 # Note c3(r11) is NOT set to 0 380 # but will be. 381 382 $ST r9,`0*$BNSZ`(r3) # r[0]=c1; 383 # sqr_add_c2(a,1,0,c2,c3,c1); 384 $LD r6,`1*$BNSZ`(r4) 385 $UMULL r7,r5,r6 386 $UMULH r8,r5,r6 387 388 addc r7,r7,r7 # compute (r7,r8)=2*(r7,r8) 389 adde r8,r8,r8 390 addze r9,r0 # catch carry if any. 391 # r9= r0(=0) and carry 392 393 addc r10,r7,r10 # now add to temp result. 
394 addze r11,r8 # r8 added to r11 which is 0 395 addze r9,r9 396 397 $ST r10,`1*$BNSZ`(r3) #r[1]=c2; 398 #sqr_add_c(a,1,c3,c1,c2) 399 $UMULL r7,r6,r6 400 $UMULH r8,r6,r6 401 addc r11,r7,r11 402 adde r9,r8,r9 403 addze r10,r0 404 #sqr_add_c2(a,2,0,c3,c1,c2) 405 $LD r6,`2*$BNSZ`(r4) 406 $UMULL r7,r5,r6 407 $UMULH r8,r5,r6 408 409 addc r7,r7,r7 410 adde r8,r8,r8 411 addze r10,r10 412 413 addc r11,r7,r11 414 adde r9,r8,r9 415 addze r10,r10 416 $ST r11,`2*$BNSZ`(r3) #r[2]=c3 417 #sqr_add_c2(a,3,0,c1,c2,c3); 418 $LD r6,`3*$BNSZ`(r4) 419 $UMULL r7,r5,r6 420 $UMULH r8,r5,r6 421 addc r7,r7,r7 422 adde r8,r8,r8 423 addze r11,r0 424 425 addc r9,r7,r9 426 adde r10,r8,r10 427 addze r11,r11 428 #sqr_add_c2(a,2,1,c1,c2,c3); 429 $LD r5,`1*$BNSZ`(r4) 430 $LD r6,`2*$BNSZ`(r4) 431 $UMULL r7,r5,r6 432 $UMULH r8,r5,r6 433 434 addc r7,r7,r7 435 adde r8,r8,r8 436 addze r11,r11 437 addc r9,r7,r9 438 adde r10,r8,r10 439 addze r11,r11 440 $ST r9,`3*$BNSZ`(r3) #r[3]=c1 441 #sqr_add_c(a,2,c2,c3,c1); 442 $UMULL r7,r6,r6 443 $UMULH r8,r6,r6 444 addc r10,r7,r10 445 adde r11,r8,r11 446 addze r9,r0 447 #sqr_add_c2(a,3,1,c2,c3,c1); 448 $LD r6,`3*$BNSZ`(r4) 449 $UMULL r7,r5,r6 450 $UMULH r8,r5,r6 451 addc r7,r7,r7 452 adde r8,r8,r8 453 addze r9,r9 454 455 addc r10,r7,r10 456 adde r11,r8,r11 457 addze r9,r9 458 $ST r10,`4*$BNSZ`(r3) #r[4]=c2 459 #sqr_add_c2(a,3,2,c3,c1,c2); 460 $LD r5,`2*$BNSZ`(r4) 461 $UMULL r7,r5,r6 462 $UMULH r8,r5,r6 463 addc r7,r7,r7 464 adde r8,r8,r8 465 addze r10,r0 466 467 addc r11,r7,r11 468 adde r9,r8,r9 469 addze r10,r10 470 $ST r11,`5*$BNSZ`(r3) #r[5] = c3 471 #sqr_add_c(a,3,c1,c2,c3); 472 $UMULL r7,r6,r6 473 $UMULH r8,r6,r6 474 addc r9,r7,r9 475 adde r10,r8,r10 476 477 $ST r9,`6*$BNSZ`(r3) #r[6]=c1 478 $ST r10,`7*$BNSZ`(r3) #r[7]=c2 479 bclr BO_ALWAYS,CR0_LT 480 .long 0x00000000 481 482# 483# NOTE: The following label name should be changed to 484# "bn_sqr_comba8" i.e. remove the first dot 485# for the gcc compiler. 
This should be automatically 486# done in the build 487# 488 489.align 4 490.bn_sqr_comba8: 491# 492# This is an optimized version of the bn_sqr_comba8 routine. 493# Tightly uses the adde instruction 494# 495# 496# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) 497# r3 contains r 498# r4 contains a 499# 500# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows: 501# 502# r5,r6 are the two BN_ULONGs being multiplied. 503# r7,r8 are the results of the 32x32 giving 64 bit multiply. 504# r9,r10, r11 are the equivalents of c1,c2, c3. 505# 506# Possible optimization of loading all 8 longs of a into registers 507# doesnt provide any speedup 508# 509 510 xor r0,r0,r0 #set r0 = 0.Used in addze 511 #instructions below. 512 513 #sqr_add_c(a,0,c1,c2,c3); 514 $LD r5,`0*$BNSZ`(r4) 515 $UMULL r9,r5,r5 #1st iteration: no carries. 516 $UMULH r10,r5,r5 517 $ST r9,`0*$BNSZ`(r3) # r[0]=c1; 518 #sqr_add_c2(a,1,0,c2,c3,c1); 519 $LD r6,`1*$BNSZ`(r4) 520 $UMULL r7,r5,r6 521 $UMULH r8,r5,r6 522 523 addc r10,r7,r10 #add the two register number 524 adde r11,r8,r0 # (r8,r7) to the three register 525 addze r9,r0 # number (r9,r11,r10).NOTE:r0=0 526 527 addc r10,r7,r10 #add the two register number 528 adde r11,r8,r11 # (r8,r7) to the three register 529 addze r9,r9 # number (r9,r11,r10). 530 531 $ST r10,`1*$BNSZ`(r3) # r[1]=c2 532 533 #sqr_add_c(a,1,c3,c1,c2); 534 $UMULL r7,r6,r6 535 $UMULH r8,r6,r6 536 addc r11,r7,r11 537 adde r9,r8,r9 538 addze r10,r0 539 #sqr_add_c2(a,2,0,c3,c1,c2); 540 $LD r6,`2*$BNSZ`(r4) 541 $UMULL r7,r5,r6 542 $UMULH r8,r5,r6 543 544 addc r11,r7,r11 545 adde r9,r8,r9 546 addze r10,r10 547 548 addc r11,r7,r11 549 adde r9,r8,r9 550 addze r10,r10 551 552 $ST r11,`2*$BNSZ`(r3) #r[2]=c3 553 #sqr_add_c2(a,3,0,c1,c2,c3); 554 $LD r6,`3*$BNSZ`(r4) #r6 = a[3]. r5 is already a[0]. 
555 $UMULL r7,r5,r6 556 $UMULH r8,r5,r6 557 558 addc r9,r7,r9 559 adde r10,r8,r10 560 addze r11,r0 561 562 addc r9,r7,r9 563 adde r10,r8,r10 564 addze r11,r11 565 #sqr_add_c2(a,2,1,c1,c2,c3); 566 $LD r5,`1*$BNSZ`(r4) 567 $LD r6,`2*$BNSZ`(r4) 568 $UMULL r7,r5,r6 569 $UMULH r8,r5,r6 570 571 addc r9,r7,r9 572 adde r10,r8,r10 573 addze r11,r11 574 575 addc r9,r7,r9 576 adde r10,r8,r10 577 addze r11,r11 578 579 $ST r9,`3*$BNSZ`(r3) #r[3]=c1; 580 #sqr_add_c(a,2,c2,c3,c1); 581 $UMULL r7,r6,r6 582 $UMULH r8,r6,r6 583 584 addc r10,r7,r10 585 adde r11,r8,r11 586 addze r9,r0 587 #sqr_add_c2(a,3,1,c2,c3,c1); 588 $LD r6,`3*$BNSZ`(r4) 589 $UMULL r7,r5,r6 590 $UMULH r8,r5,r6 591 592 addc r10,r7,r10 593 adde r11,r8,r11 594 addze r9,r9 595 596 addc r10,r7,r10 597 adde r11,r8,r11 598 addze r9,r9 599 #sqr_add_c2(a,4,0,c2,c3,c1); 600 $LD r5,`0*$BNSZ`(r4) 601 $LD r6,`4*$BNSZ`(r4) 602 $UMULL r7,r5,r6 603 $UMULH r8,r5,r6 604 605 addc r10,r7,r10 606 adde r11,r8,r11 607 addze r9,r9 608 609 addc r10,r7,r10 610 adde r11,r8,r11 611 addze r9,r9 612 $ST r10,`4*$BNSZ`(r3) #r[4]=c2; 613 #sqr_add_c2(a,5,0,c3,c1,c2); 614 $LD r6,`5*$BNSZ`(r4) 615 $UMULL r7,r5,r6 616 $UMULH r8,r5,r6 617 618 addc r11,r7,r11 619 adde r9,r8,r9 620 addze r10,r0 621 622 addc r11,r7,r11 623 adde r9,r8,r9 624 addze r10,r10 625 #sqr_add_c2(a,4,1,c3,c1,c2); 626 $LD r5,`1*$BNSZ`(r4) 627 $LD r6,`4*$BNSZ`(r4) 628 $UMULL r7,r5,r6 629 $UMULH r8,r5,r6 630 631 addc r11,r7,r11 632 adde r9,r8,r9 633 addze r10,r10 634 635 addc r11,r7,r11 636 adde r9,r8,r9 637 addze r10,r10 638 #sqr_add_c2(a,3,2,c3,c1,c2); 639 $LD r5,`2*$BNSZ`(r4) 640 $LD r6,`3*$BNSZ`(r4) 641 $UMULL r7,r5,r6 642 $UMULH r8,r5,r6 643 644 addc r11,r7,r11 645 adde r9,r8,r9 646 addze r10,r10 647 648 addc r11,r7,r11 649 adde r9,r8,r9 650 addze r10,r10 651 $ST r11,`5*$BNSZ`(r3) #r[5]=c3; 652 #sqr_add_c(a,3,c1,c2,c3); 653 $UMULL r7,r6,r6 654 $UMULH r8,r6,r6 655 addc r9,r7,r9 656 adde r10,r8,r10 657 addze r11,r0 658 #sqr_add_c2(a,4,2,c1,c2,c3); 659 $LD r6,`4*$BNSZ`(r4) 660 
$UMULL r7,r5,r6 661 $UMULH r8,r5,r6 662 663 addc r9,r7,r9 664 adde r10,r8,r10 665 addze r11,r11 666 667 addc r9,r7,r9 668 adde r10,r8,r10 669 addze r11,r11 670 #sqr_add_c2(a,5,1,c1,c2,c3); 671 $LD r5,`1*$BNSZ`(r4) 672 $LD r6,`5*$BNSZ`(r4) 673 $UMULL r7,r5,r6 674 $UMULH r8,r5,r6 675 676 addc r9,r7,r9 677 adde r10,r8,r10 678 addze r11,r11 679 680 addc r9,r7,r9 681 adde r10,r8,r10 682 addze r11,r11 683 #sqr_add_c2(a,6,0,c1,c2,c3); 684 $LD r5,`0*$BNSZ`(r4) 685 $LD r6,`6*$BNSZ`(r4) 686 $UMULL r7,r5,r6 687 $UMULH r8,r5,r6 688 addc r9,r7,r9 689 adde r10,r8,r10 690 addze r11,r11 691 addc r9,r7,r9 692 adde r10,r8,r10 693 addze r11,r11 694 $ST r9,`6*$BNSZ`(r3) #r[6]=c1; 695 #sqr_add_c2(a,7,0,c2,c3,c1); 696 $LD r6,`7*$BNSZ`(r4) 697 $UMULL r7,r5,r6 698 $UMULH r8,r5,r6 699 700 addc r10,r7,r10 701 adde r11,r8,r11 702 addze r9,r0 703 addc r10,r7,r10 704 adde r11,r8,r11 705 addze r9,r9 706 #sqr_add_c2(a,6,1,c2,c3,c1); 707 $LD r5,`1*$BNSZ`(r4) 708 $LD r6,`6*$BNSZ`(r4) 709 $UMULL r7,r5,r6 710 $UMULH r8,r5,r6 711 712 addc r10,r7,r10 713 adde r11,r8,r11 714 addze r9,r9 715 addc r10,r7,r10 716 adde r11,r8,r11 717 addze r9,r9 718 #sqr_add_c2(a,5,2,c2,c3,c1); 719 $LD r5,`2*$BNSZ`(r4) 720 $LD r6,`5*$BNSZ`(r4) 721 $UMULL r7,r5,r6 722 $UMULH r8,r5,r6 723 addc r10,r7,r10 724 adde r11,r8,r11 725 addze r9,r9 726 addc r10,r7,r10 727 adde r11,r8,r11 728 addze r9,r9 729 #sqr_add_c2(a,4,3,c2,c3,c1); 730 $LD r5,`3*$BNSZ`(r4) 731 $LD r6,`4*$BNSZ`(r4) 732 $UMULL r7,r5,r6 733 $UMULH r8,r5,r6 734 735 addc r10,r7,r10 736 adde r11,r8,r11 737 addze r9,r9 738 addc r10,r7,r10 739 adde r11,r8,r11 740 addze r9,r9 741 $ST r10,`7*$BNSZ`(r3) #r[7]=c2; 742 #sqr_add_c(a,4,c3,c1,c2); 743 $UMULL r7,r6,r6 744 $UMULH r8,r6,r6 745 addc r11,r7,r11 746 adde r9,r8,r9 747 addze r10,r0 748 #sqr_add_c2(a,5,3,c3,c1,c2); 749 $LD r6,`5*$BNSZ`(r4) 750 $UMULL r7,r5,r6 751 $UMULH r8,r5,r6 752 addc r11,r7,r11 753 adde r9,r8,r9 754 addze r10,r10 755 addc r11,r7,r11 756 adde r9,r8,r9 757 addze r10,r10 758 #sqr_add_c2(a,6,2,c3,c1,c2); 
759 $LD r5,`2*$BNSZ`(r4) 760 $LD r6,`6*$BNSZ`(r4) 761 $UMULL r7,r5,r6 762 $UMULH r8,r5,r6 763 addc r11,r7,r11 764 adde r9,r8,r9 765 addze r10,r10 766 767 addc r11,r7,r11 768 adde r9,r8,r9 769 addze r10,r10 770 #sqr_add_c2(a,7,1,c3,c1,c2); 771 $LD r5,`1*$BNSZ`(r4) 772 $LD r6,`7*$BNSZ`(r4) 773 $UMULL r7,r5,r6 774 $UMULH r8,r5,r6 775 addc r11,r7,r11 776 adde r9,r8,r9 777 addze r10,r10 778 addc r11,r7,r11 779 adde r9,r8,r9 780 addze r10,r10 781 $ST r11,`8*$BNSZ`(r3) #r[8]=c3; 782 #sqr_add_c2(a,7,2,c1,c2,c3); 783 $LD r5,`2*$BNSZ`(r4) 784 $UMULL r7,r5,r6 785 $UMULH r8,r5,r6 786 787 addc r9,r7,r9 788 adde r10,r8,r10 789 addze r11,r0 790 addc r9,r7,r9 791 adde r10,r8,r10 792 addze r11,r11 793 #sqr_add_c2(a,6,3,c1,c2,c3); 794 $LD r5,`3*$BNSZ`(r4) 795 $LD r6,`6*$BNSZ`(r4) 796 $UMULL r7,r5,r6 797 $UMULH r8,r5,r6 798 addc r9,r7,r9 799 adde r10,r8,r10 800 addze r11,r11 801 addc r9,r7,r9 802 adde r10,r8,r10 803 addze r11,r11 804 #sqr_add_c2(a,5,4,c1,c2,c3); 805 $LD r5,`4*$BNSZ`(r4) 806 $LD r6,`5*$BNSZ`(r4) 807 $UMULL r7,r5,r6 808 $UMULH r8,r5,r6 809 addc r9,r7,r9 810 adde r10,r8,r10 811 addze r11,r11 812 addc r9,r7,r9 813 adde r10,r8,r10 814 addze r11,r11 815 $ST r9,`9*$BNSZ`(r3) #r[9]=c1; 816 #sqr_add_c(a,5,c2,c3,c1); 817 $UMULL r7,r6,r6 818 $UMULH r8,r6,r6 819 addc r10,r7,r10 820 adde r11,r8,r11 821 addze r9,r0 822 #sqr_add_c2(a,6,4,c2,c3,c1); 823 $LD r6,`6*$BNSZ`(r4) 824 $UMULL r7,r5,r6 825 $UMULH r8,r5,r6 826 addc r10,r7,r10 827 adde r11,r8,r11 828 addze r9,r9 829 addc r10,r7,r10 830 adde r11,r8,r11 831 addze r9,r9 832 #sqr_add_c2(a,7,3,c2,c3,c1); 833 $LD r5,`3*$BNSZ`(r4) 834 $LD r6,`7*$BNSZ`(r4) 835 $UMULL r7,r5,r6 836 $UMULH r8,r5,r6 837 addc r10,r7,r10 838 adde r11,r8,r11 839 addze r9,r9 840 addc r10,r7,r10 841 adde r11,r8,r11 842 addze r9,r9 843 $ST r10,`10*$BNSZ`(r3) #r[10]=c2; 844 #sqr_add_c2(a,7,4,c3,c1,c2); 845 $LD r5,`4*$BNSZ`(r4) 846 $UMULL r7,r5,r6 847 $UMULH r8,r5,r6 848 addc r11,r7,r11 849 adde r9,r8,r9 850 addze r10,r0 851 addc r11,r7,r11 852 adde r9,r8,r9 853 
addze	r10,r10
					#sqr_add_c2(a,6,5,c3,c1,c2);
	$LD	r5,`5*$BNSZ`(r4)
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	$ST	r11,`11*$BNSZ`(r3)	#r[11]=c3;
					#sqr_add_c(a,6,c1,c2,c3);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r0
					#sqr_add_c2(a,7,5,c1,c2,c3)
	$LD	r6,`7*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	$ST	r9,`12*$BNSZ`(r3)	#r[12]=c1;

					#sqr_add_c2(a,7,6,c2,c3,c1)
	$LD	r5,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	$ST	r10,`13*$BNSZ`(r3)	#r[13]=c2;
					#sqr_add_c(a,7,c3,c1,c2);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	$ST	r11,`14*$BNSZ`(r3)	#r[14]=c3;
	$ST	r9, `15*$BNSZ`(r3)	#r[15]=c1;


	bclr	BO_ALWAYS,CR0_LT

	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_comba4" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_comba4:
#
# This is an optimized version of the bn_mul_comba4 routine.
#
# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
# r3 contains r
# r4 contains a
# r5 contains b
# r6, r7 are the 2 BN_ULONGs being multiplied.
# r8, r9 are the results of the 32x32 giving 64 multiply.
# r10, r11, r12 are the equivalents of c1, c2, and c3.
#
	xor	r0,r0,r0		#r0=0. Used in addze below.
					#mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r10,r6,r7
	$UMULH	r11,r6,r7
	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1
					#mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r0
	addze	r10,r0
					#mul_add_c(a[1],b[0],c2,c3,c1);
	$LD	r6, `1*$BNSZ`(r4)
	$LD	r7, `0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2
					#mul_add_c(a[2],b[0],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r0
					#mul_add_c(a[1],b[1],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
					#mul_add_c(a[0],b[2],c3,c1,c2);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3
					#mul_add_c(a[0],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r0
					#mul_add_c(a[1],b[2],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
					#mul_add_c(a[2],b[1],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
					#mul_add_c(a[3],b[0],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1
					#mul_add_c(a[3],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r0
					#mul_add_c(a[2],b[2],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
					#mul_add_c(a[1],b[3],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2
					#mul_add_c(a[2],b[3],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r0
					#mul_add_c(a[3],b[2],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)	# b[2] is read through r5 (b), not r4 (a)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3
					#mul_add_c(a[3],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11

	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1
	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2
	bclr	BO_ALWAYS,CR0_LT
	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_comba8" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_comba8:
#
# Optimized version of the bn_mul_comba8 routine.
#
# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
# r3 contains r
# r4 contains a
# r5 contains b
# r6, r7 are the 2 BN_ULONGs being multiplied.
# r8, r9 are the results of the 32x32 giving 64 multiply.
# r10, r11, r12 are the equivalents of c1, c2, and c3.
#
	xor	r0,r0,r0		#r0=0. Used in addze below.
1080 1081 #mul_add_c(a[0],b[0],c1,c2,c3); 1082 $LD r6,`0*$BNSZ`(r4) #a[0] 1083 $LD r7,`0*$BNSZ`(r5) #b[0] 1084 $UMULL r10,r6,r7 1085 $UMULH r11,r6,r7 1086 $ST r10,`0*$BNSZ`(r3) #r[0]=c1; 1087 #mul_add_c(a[0],b[1],c2,c3,c1); 1088 $LD r7,`1*$BNSZ`(r5) 1089 $UMULL r8,r6,r7 1090 $UMULH r9,r6,r7 1091 addc r11,r11,r8 1092 addze r12,r9 # since we didnt set r12 to zero before. 1093 addze r10,r0 1094 #mul_add_c(a[1],b[0],c2,c3,c1); 1095 $LD r6,`1*$BNSZ`(r4) 1096 $LD r7,`0*$BNSZ`(r5) 1097 $UMULL r8,r6,r7 1098 $UMULH r9,r6,r7 1099 addc r11,r11,r8 1100 adde r12,r12,r9 1101 addze r10,r10 1102 $ST r11,`1*$BNSZ`(r3) #r[1]=c2; 1103 #mul_add_c(a[2],b[0],c3,c1,c2); 1104 $LD r6,`2*$BNSZ`(r4) 1105 $UMULL r8,r6,r7 1106 $UMULH r9,r6,r7 1107 addc r12,r12,r8 1108 adde r10,r10,r9 1109 addze r11,r0 1110 #mul_add_c(a[1],b[1],c3,c1,c2); 1111 $LD r6,`1*$BNSZ`(r4) 1112 $LD r7,`1*$BNSZ`(r5) 1113 $UMULL r8,r6,r7 1114 $UMULH r9,r6,r7 1115 addc r12,r12,r8 1116 adde r10,r10,r9 1117 addze r11,r11 1118 #mul_add_c(a[0],b[2],c3,c1,c2); 1119 $LD r6,`0*$BNSZ`(r4) 1120 $LD r7,`2*$BNSZ`(r5) 1121 $UMULL r8,r6,r7 1122 $UMULH r9,r6,r7 1123 addc r12,r12,r8 1124 adde r10,r10,r9 1125 addze r11,r11 1126 $ST r12,`2*$BNSZ`(r3) #r[2]=c3; 1127 #mul_add_c(a[0],b[3],c1,c2,c3); 1128 $LD r7,`3*$BNSZ`(r5) 1129 $UMULL r8,r6,r7 1130 $UMULH r9,r6,r7 1131 addc r10,r10,r8 1132 adde r11,r11,r9 1133 addze r12,r0 1134 #mul_add_c(a[1],b[2],c1,c2,c3); 1135 $LD r6,`1*$BNSZ`(r4) 1136 $LD r7,`2*$BNSZ`(r5) 1137 $UMULL r8,r6,r7 1138 $UMULH r9,r6,r7 1139 addc r10,r10,r8 1140 adde r11,r11,r9 1141 addze r12,r12 1142 1143 #mul_add_c(a[2],b[1],c1,c2,c3); 1144 $LD r6,`2*$BNSZ`(r4) 1145 $LD r7,`1*$BNSZ`(r5) 1146 $UMULL r8,r6,r7 1147 $UMULH r9,r6,r7 1148 addc r10,r10,r8 1149 adde r11,r11,r9 1150 addze r12,r12 1151 #mul_add_c(a[3],b[0],c1,c2,c3); 1152 $LD r6,`3*$BNSZ`(r4) 1153 $LD r7,`0*$BNSZ`(r5) 1154 $UMULL r8,r6,r7 1155 $UMULH r9,r6,r7 1156 addc r10,r10,r8 1157 adde r11,r11,r9 1158 addze r12,r12 1159 $ST r10,`3*$BNSZ`(r3) #r[3]=c1; 1160 
#mul_add_c(a[4],b[0],c2,c3,c1); 1161 $LD r6,`4*$BNSZ`(r4) 1162 $UMULL r8,r6,r7 1163 $UMULH r9,r6,r7 1164 addc r11,r11,r8 1165 adde r12,r12,r9 1166 addze r10,r0 1167 #mul_add_c(a[3],b[1],c2,c3,c1); 1168 $LD r6,`3*$BNSZ`(r4) 1169 $LD r7,`1*$BNSZ`(r5) 1170 $UMULL r8,r6,r7 1171 $UMULH r9,r6,r7 1172 addc r11,r11,r8 1173 adde r12,r12,r9 1174 addze r10,r10 1175 #mul_add_c(a[2],b[2],c2,c3,c1); 1176 $LD r6,`2*$BNSZ`(r4) 1177 $LD r7,`2*$BNSZ`(r5) 1178 $UMULL r8,r6,r7 1179 $UMULH r9,r6,r7 1180 addc r11,r11,r8 1181 adde r12,r12,r9 1182 addze r10,r10 1183 #mul_add_c(a[1],b[3],c2,c3,c1); 1184 $LD r6,`1*$BNSZ`(r4) 1185 $LD r7,`3*$BNSZ`(r5) 1186 $UMULL r8,r6,r7 1187 $UMULH r9,r6,r7 1188 addc r11,r11,r8 1189 adde r12,r12,r9 1190 addze r10,r10 1191 #mul_add_c(a[0],b[4],c2,c3,c1); 1192 $LD r6,`0*$BNSZ`(r4) 1193 $LD r7,`4*$BNSZ`(r5) 1194 $UMULL r8,r6,r7 1195 $UMULH r9,r6,r7 1196 addc r11,r11,r8 1197 adde r12,r12,r9 1198 addze r10,r10 1199 $ST r11,`4*$BNSZ`(r3) #r[4]=c2; 1200 #mul_add_c(a[0],b[5],c3,c1,c2); 1201 $LD r7,`5*$BNSZ`(r5) 1202 $UMULL r8,r6,r7 1203 $UMULH r9,r6,r7 1204 addc r12,r12,r8 1205 adde r10,r10,r9 1206 addze r11,r0 1207 #mul_add_c(a[1],b[4],c3,c1,c2); 1208 $LD r6,`1*$BNSZ`(r4) 1209 $LD r7,`4*$BNSZ`(r5) 1210 $UMULL r8,r6,r7 1211 $UMULH r9,r6,r7 1212 addc r12,r12,r8 1213 adde r10,r10,r9 1214 addze r11,r11 1215 #mul_add_c(a[2],b[3],c3,c1,c2); 1216 $LD r6,`2*$BNSZ`(r4) 1217 $LD r7,`3*$BNSZ`(r5) 1218 $UMULL r8,r6,r7 1219 $UMULH r9,r6,r7 1220 addc r12,r12,r8 1221 adde r10,r10,r9 1222 addze r11,r11 1223 #mul_add_c(a[3],b[2],c3,c1,c2); 1224 $LD r6,`3*$BNSZ`(r4) 1225 $LD r7,`2*$BNSZ`(r5) 1226 $UMULL r8,r6,r7 1227 $UMULH r9,r6,r7 1228 addc r12,r12,r8 1229 adde r10,r10,r9 1230 addze r11,r11 1231 #mul_add_c(a[4],b[1],c3,c1,c2); 1232 $LD r6,`4*$BNSZ`(r4) 1233 $LD r7,`1*$BNSZ`(r5) 1234 $UMULL r8,r6,r7 1235 $UMULH r9,r6,r7 1236 addc r12,r12,r8 1237 adde r10,r10,r9 1238 addze r11,r11 1239 #mul_add_c(a[5],b[0],c3,c1,c2); 1240 $LD r6,`5*$BNSZ`(r4) 1241 $LD r7,`0*$BNSZ`(r5) 1242 $UMULL 
r8,r6,r7 1243 $UMULH r9,r6,r7 1244 addc r12,r12,r8 1245 adde r10,r10,r9 1246 addze r11,r11 1247 $ST r12,`5*$BNSZ`(r3) #r[5]=c3; 1248 #mul_add_c(a[6],b[0],c1,c2,c3); 1249 $LD r6,`6*$BNSZ`(r4) 1250 $UMULL r8,r6,r7 1251 $UMULH r9,r6,r7 1252 addc r10,r10,r8 1253 adde r11,r11,r9 1254 addze r12,r0 1255 #mul_add_c(a[5],b[1],c1,c2,c3); 1256 $LD r6,`5*$BNSZ`(r4) 1257 $LD r7,`1*$BNSZ`(r5) 1258 $UMULL r8,r6,r7 1259 $UMULH r9,r6,r7 1260 addc r10,r10,r8 1261 adde r11,r11,r9 1262 addze r12,r12 1263 #mul_add_c(a[4],b[2],c1,c2,c3); 1264 $LD r6,`4*$BNSZ`(r4) 1265 $LD r7,`2*$BNSZ`(r5) 1266 $UMULL r8,r6,r7 1267 $UMULH r9,r6,r7 1268 addc r10,r10,r8 1269 adde r11,r11,r9 1270 addze r12,r12 1271 #mul_add_c(a[3],b[3],c1,c2,c3); 1272 $LD r6,`3*$BNSZ`(r4) 1273 $LD r7,`3*$BNSZ`(r5) 1274 $UMULL r8,r6,r7 1275 $UMULH r9,r6,r7 1276 addc r10,r10,r8 1277 adde r11,r11,r9 1278 addze r12,r12 1279 #mul_add_c(a[2],b[4],c1,c2,c3); 1280 $LD r6,`2*$BNSZ`(r4) 1281 $LD r7,`4*$BNSZ`(r5) 1282 $UMULL r8,r6,r7 1283 $UMULH r9,r6,r7 1284 addc r10,r10,r8 1285 adde r11,r11,r9 1286 addze r12,r12 1287 #mul_add_c(a[1],b[5],c1,c2,c3); 1288 $LD r6,`1*$BNSZ`(r4) 1289 $LD r7,`5*$BNSZ`(r5) 1290 $UMULL r8,r6,r7 1291 $UMULH r9,r6,r7 1292 addc r10,r10,r8 1293 adde r11,r11,r9 1294 addze r12,r12 1295 #mul_add_c(a[0],b[6],c1,c2,c3); 1296 $LD r6,`0*$BNSZ`(r4) 1297 $LD r7,`6*$BNSZ`(r5) 1298 $UMULL r8,r6,r7 1299 $UMULH r9,r6,r7 1300 addc r10,r10,r8 1301 adde r11,r11,r9 1302 addze r12,r12 1303 $ST r10,`6*$BNSZ`(r3) #r[6]=c1; 1304 #mul_add_c(a[0],b[7],c2,c3,c1); 1305 $LD r7,`7*$BNSZ`(r5) 1306 $UMULL r8,r6,r7 1307 $UMULH r9,r6,r7 1308 addc r11,r11,r8 1309 adde r12,r12,r9 1310 addze r10,r0 1311 #mul_add_c(a[1],b[6],c2,c3,c1); 1312 $LD r6,`1*$BNSZ`(r4) 1313 $LD r7,`6*$BNSZ`(r5) 1314 $UMULL r8,r6,r7 1315 $UMULH r9,r6,r7 1316 addc r11,r11,r8 1317 adde r12,r12,r9 1318 addze r10,r10 1319 #mul_add_c(a[2],b[5],c2,c3,c1); 1320 $LD r6,`2*$BNSZ`(r4) 1321 $LD r7,`5*$BNSZ`(r5) 1322 $UMULL r8,r6,r7 1323 $UMULH r9,r6,r7 1324 addc r11,r11,r8 1325 
adde r12,r12,r9 1326 addze r10,r10 1327 #mul_add_c(a[3],b[4],c2,c3,c1); 1328 $LD r6,`3*$BNSZ`(r4) 1329 $LD r7,`4*$BNSZ`(r5) 1330 $UMULL r8,r6,r7 1331 $UMULH r9,r6,r7 1332 addc r11,r11,r8 1333 adde r12,r12,r9 1334 addze r10,r10 1335 #mul_add_c(a[4],b[3],c2,c3,c1); 1336 $LD r6,`4*$BNSZ`(r4) 1337 $LD r7,`3*$BNSZ`(r5) 1338 $UMULL r8,r6,r7 1339 $UMULH r9,r6,r7 1340 addc r11,r11,r8 1341 adde r12,r12,r9 1342 addze r10,r10 1343 #mul_add_c(a[5],b[2],c2,c3,c1); 1344 $LD r6,`5*$BNSZ`(r4) 1345 $LD r7,`2*$BNSZ`(r5) 1346 $UMULL r8,r6,r7 1347 $UMULH r9,r6,r7 1348 addc r11,r11,r8 1349 adde r12,r12,r9 1350 addze r10,r10 1351 #mul_add_c(a[6],b[1],c2,c3,c1); 1352 $LD r6,`6*$BNSZ`(r4) 1353 $LD r7,`1*$BNSZ`(r5) 1354 $UMULL r8,r6,r7 1355 $UMULH r9,r6,r7 1356 addc r11,r11,r8 1357 adde r12,r12,r9 1358 addze r10,r10 1359 #mul_add_c(a[7],b[0],c2,c3,c1); 1360 $LD r6,`7*$BNSZ`(r4) 1361 $LD r7,`0*$BNSZ`(r5) 1362 $UMULL r8,r6,r7 1363 $UMULH r9,r6,r7 1364 addc r11,r11,r8 1365 adde r12,r12,r9 1366 addze r10,r10 1367 $ST r11,`7*$BNSZ`(r3) #r[7]=c2; 1368 #mul_add_c(a[7],b[1],c3,c1,c2); 1369 $LD r7,`1*$BNSZ`(r5) 1370 $UMULL r8,r6,r7 1371 $UMULH r9,r6,r7 1372 addc r12,r12,r8 1373 adde r10,r10,r9 1374 addze r11,r0 1375 #mul_add_c(a[6],b[2],c3,c1,c2); 1376 $LD r6,`6*$BNSZ`(r4) 1377 $LD r7,`2*$BNSZ`(r5) 1378 $UMULL r8,r6,r7 1379 $UMULH r9,r6,r7 1380 addc r12,r12,r8 1381 adde r10,r10,r9 1382 addze r11,r11 1383 #mul_add_c(a[5],b[3],c3,c1,c2); 1384 $LD r6,`5*$BNSZ`(r4) 1385 $LD r7,`3*$BNSZ`(r5) 1386 $UMULL r8,r6,r7 1387 $UMULH r9,r6,r7 1388 addc r12,r12,r8 1389 adde r10,r10,r9 1390 addze r11,r11 1391 #mul_add_c(a[4],b[4],c3,c1,c2); 1392 $LD r6,`4*$BNSZ`(r4) 1393 $LD r7,`4*$BNSZ`(r5) 1394 $UMULL r8,r6,r7 1395 $UMULH r9,r6,r7 1396 addc r12,r12,r8 1397 adde r10,r10,r9 1398 addze r11,r11 1399 #mul_add_c(a[3],b[5],c3,c1,c2); 1400 $LD r6,`3*$BNSZ`(r4) 1401 $LD r7,`5*$BNSZ`(r5) 1402 $UMULL r8,r6,r7 1403 $UMULH r9,r6,r7 1404 addc r12,r12,r8 1405 adde r10,r10,r9 1406 addze r11,r11 1407 
#mul_add_c(a[2],b[6],c3,c1,c2); 1408 $LD r6,`2*$BNSZ`(r4) 1409 $LD r7,`6*$BNSZ`(r5) 1410 $UMULL r8,r6,r7 1411 $UMULH r9,r6,r7 1412 addc r12,r12,r8 1413 adde r10,r10,r9 1414 addze r11,r11 1415 #mul_add_c(a[1],b[7],c3,c1,c2); 1416 $LD r6,`1*$BNSZ`(r4) 1417 $LD r7,`7*$BNSZ`(r5) 1418 $UMULL r8,r6,r7 1419 $UMULH r9,r6,r7 1420 addc r12,r12,r8 1421 adde r10,r10,r9 1422 addze r11,r11 1423 $ST r12,`8*$BNSZ`(r3) #r[8]=c3; 1424 #mul_add_c(a[2],b[7],c1,c2,c3); 1425 $LD r6,`2*$BNSZ`(r4) 1426 $UMULL r8,r6,r7 1427 $UMULH r9,r6,r7 1428 addc r10,r10,r8 1429 adde r11,r11,r9 1430 addze r12,r0 1431 #mul_add_c(a[3],b[6],c1,c2,c3); 1432 $LD r6,`3*$BNSZ`(r4) 1433 $LD r7,`6*$BNSZ`(r5) 1434 $UMULL r8,r6,r7 1435 $UMULH r9,r6,r7 1436 addc r10,r10,r8 1437 adde r11,r11,r9 1438 addze r12,r12 1439 #mul_add_c(a[4],b[5],c1,c2,c3); 1440 $LD r6,`4*$BNSZ`(r4) 1441 $LD r7,`5*$BNSZ`(r5) 1442 $UMULL r8,r6,r7 1443 $UMULH r9,r6,r7 1444 addc r10,r10,r8 1445 adde r11,r11,r9 1446 addze r12,r12 1447 #mul_add_c(a[5],b[4],c1,c2,c3); 1448 $LD r6,`5*$BNSZ`(r4) 1449 $LD r7,`4*$BNSZ`(r5) 1450 $UMULL r8,r6,r7 1451 $UMULH r9,r6,r7 1452 addc r10,r10,r8 1453 adde r11,r11,r9 1454 addze r12,r12 1455 #mul_add_c(a[6],b[3],c1,c2,c3); 1456 $LD r6,`6*$BNSZ`(r4) 1457 $LD r7,`3*$BNSZ`(r5) 1458 $UMULL r8,r6,r7 1459 $UMULH r9,r6,r7 1460 addc r10,r10,r8 1461 adde r11,r11,r9 1462 addze r12,r12 1463 #mul_add_c(a[7],b[2],c1,c2,c3); 1464 $LD r6,`7*$BNSZ`(r4) 1465 $LD r7,`2*$BNSZ`(r5) 1466 $UMULL r8,r6,r7 1467 $UMULH r9,r6,r7 1468 addc r10,r10,r8 1469 adde r11,r11,r9 1470 addze r12,r12 1471 $ST r10,`9*$BNSZ`(r3) #r[9]=c1; 1472 #mul_add_c(a[7],b[3],c2,c3,c1); 1473 $LD r7,`3*$BNSZ`(r5) 1474 $UMULL r8,r6,r7 1475 $UMULH r9,r6,r7 1476 addc r11,r11,r8 1477 adde r12,r12,r9 1478 addze r10,r0 1479 #mul_add_c(a[6],b[4],c2,c3,c1); 1480 $LD r6,`6*$BNSZ`(r4) 1481 $LD r7,`4*$BNSZ`(r5) 1482 $UMULL r8,r6,r7 1483 $UMULH r9,r6,r7 1484 addc r11,r11,r8 1485 adde r12,r12,r9 1486 addze r10,r10 1487 #mul_add_c(a[5],b[5],c2,c3,c1); 1488 $LD r6,`5*$BNSZ`(r4) 
1489 $LD r7,`5*$BNSZ`(r5) 1490 $UMULL r8,r6,r7 1491 $UMULH r9,r6,r7 1492 addc r11,r11,r8 1493 adde r12,r12,r9 1494 addze r10,r10 1495 #mul_add_c(a[4],b[6],c2,c3,c1); 1496 $LD r6,`4*$BNSZ`(r4) 1497 $LD r7,`6*$BNSZ`(r5) 1498 $UMULL r8,r6,r7 1499 $UMULH r9,r6,r7 1500 addc r11,r11,r8 1501 adde r12,r12,r9 1502 addze r10,r10 1503 #mul_add_c(a[3],b[7],c2,c3,c1); 1504 $LD r6,`3*$BNSZ`(r4) 1505 $LD r7,`7*$BNSZ`(r5) 1506 $UMULL r8,r6,r7 1507 $UMULH r9,r6,r7 1508 addc r11,r11,r8 1509 adde r12,r12,r9 1510 addze r10,r10 1511 $ST r11,`10*$BNSZ`(r3) #r[10]=c2; 1512 #mul_add_c(a[4],b[7],c3,c1,c2); 1513 $LD r6,`4*$BNSZ`(r4) 1514 $UMULL r8,r6,r7 1515 $UMULH r9,r6,r7 1516 addc r12,r12,r8 1517 adde r10,r10,r9 1518 addze r11,r0 1519 #mul_add_c(a[5],b[6],c3,c1,c2); 1520 $LD r6,`5*$BNSZ`(r4) 1521 $LD r7,`6*$BNSZ`(r5) 1522 $UMULL r8,r6,r7 1523 $UMULH r9,r6,r7 1524 addc r12,r12,r8 1525 adde r10,r10,r9 1526 addze r11,r11 1527 #mul_add_c(a[6],b[5],c3,c1,c2); 1528 $LD r6,`6*$BNSZ`(r4) 1529 $LD r7,`5*$BNSZ`(r5) 1530 $UMULL r8,r6,r7 1531 $UMULH r9,r6,r7 1532 addc r12,r12,r8 1533 adde r10,r10,r9 1534 addze r11,r11 1535 #mul_add_c(a[7],b[4],c3,c1,c2); 1536 $LD r6,`7*$BNSZ`(r4) 1537 $LD r7,`4*$BNSZ`(r5) 1538 $UMULL r8,r6,r7 1539 $UMULH r9,r6,r7 1540 addc r12,r12,r8 1541 adde r10,r10,r9 1542 addze r11,r11 1543 $ST r12,`11*$BNSZ`(r3) #r[11]=c3; 1544 #mul_add_c(a[7],b[5],c1,c2,c3); 1545 $LD r7,`5*$BNSZ`(r5) 1546 $UMULL r8,r6,r7 1547 $UMULH r9,r6,r7 1548 addc r10,r10,r8 1549 adde r11,r11,r9 1550 addze r12,r0 1551 #mul_add_c(a[6],b[6],c1,c2,c3); 1552 $LD r6,`6*$BNSZ`(r4) 1553 $LD r7,`6*$BNSZ`(r5) 1554 $UMULL r8,r6,r7 1555 $UMULH r9,r6,r7 1556 addc r10,r10,r8 1557 adde r11,r11,r9 1558 addze r12,r12 1559 #mul_add_c(a[5],b[7],c1,c2,c3); 1560 $LD r6,`5*$BNSZ`(r4) 1561 $LD r7,`7*$BNSZ`(r5) 1562 $UMULL r8,r6,r7 1563 $UMULH r9,r6,r7 1564 addc r10,r10,r8 1565 adde r11,r11,r9 1566 addze r12,r12 1567 $ST r10,`12*$BNSZ`(r3) #r[12]=c1; 1568 #mul_add_c(a[6],b[7],c2,c3,c1); 1569 $LD r6,`6*$BNSZ`(r4) 1570 $UMULL 
r8,r6,r7 1571 $UMULH r9,r6,r7 1572 addc r11,r11,r8 1573 adde r12,r12,r9 1574 addze r10,r0 1575 #mul_add_c(a[7],b[6],c2,c3,c1); 1576 $LD r6,`7*$BNSZ`(r4) 1577 $LD r7,`6*$BNSZ`(r5) 1578 $UMULL r8,r6,r7 1579 $UMULH r9,r6,r7 1580 addc r11,r11,r8 1581 adde r12,r12,r9 1582 addze r10,r10 1583 $ST r11,`13*$BNSZ`(r3) #r[13]=c2; 1584 #mul_add_c(a[7],b[7],c3,c1,c2); 1585 $LD r7,`7*$BNSZ`(r5) 1586 $UMULL r8,r6,r7 1587 $UMULH r9,r6,r7 1588 addc r12,r12,r8 1589 adde r10,r10,r9 1590 $ST r12,`14*$BNSZ`(r3) #r[14]=c3; 1591 $ST r10,`15*$BNSZ`(r3) #r[15]=c1; 1592 bclr BO_ALWAYS,CR0_LT 1593 .long 0x00000000 1594 1595# 1596# NOTE: The following label name should be changed to 1597# "bn_sub_words" i.e. remove the first dot 1598# for the gcc compiler. This should be automatically 1599# done in the build 1600# 1601# 1602.align 4 1603.bn_sub_words: 1604# 1605# Handcoded version of bn_sub_words 1606# 1607#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) 1608# 1609# r3 = r 1610# r4 = a 1611# r5 = b 1612# r6 = n 1613# 1614# Note: No loop unrolling done since this is not a performance 1615# critical loop. 1616 1617 xor r0,r0,r0 #set r0 = 0 1618# 1619# check for r6 = 0 AND set carry bit. 1620# 1621 subfc. r7,r0,r6 # If r6 is 0 then result is 0. 1622 # if r6 > 0 then result !=0 1623 # In either case carry bit is set. 1624 bc BO_IF,CR0_EQ,Lppcasm_sub_adios 1625 addi r4,r4,-$BNSZ 1626 addi r3,r3,-$BNSZ 1627 addi r5,r5,-$BNSZ 1628 mtctr r6 1629Lppcasm_sub_mainloop: 1630 $LDU r7,$BNSZ(r4) 1631 $LDU r8,$BNSZ(r5) 1632 subfe r6,r8,r7 # r6 = r7+carry bit + onescomplement(r8) 1633 # if carry = 1 this is r7-r8. Else it 1634 # is r7-r8 -1 as we need. 1635 $STU r6,$BNSZ(r3) 1636 bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_sub_mainloop 1637Lppcasm_sub_adios: 1638 subfze r3,r0 # if carry bit is set then r3 = 0 else -1 1639 andi. r3,r3,1 # keep only last bit. 
1640 bclr BO_ALWAYS,CR0_LT 1641 .long 0x00000000 1642 1643 1644# 1645# NOTE: The following label name should be changed to 1646# "bn_add_words" i.e. remove the first dot 1647# for the gcc compiler. This should be automatically 1648# done in the build 1649# 1650 1651.align 4 1652.bn_add_words: 1653# 1654# Handcoded version of bn_add_words 1655# 1656#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) 1657# 1658# r3 = r 1659# r4 = a 1660# r5 = b 1661# r6 = n 1662# 1663# Note: No loop unrolling done since this is not a performance 1664# critical loop. 1665 1666 xor r0,r0,r0 1667# 1668# check for r6 = 0. Is this needed? 1669# 1670 addic. r6,r6,0 #test r6 and clear carry bit. 1671 bc BO_IF,CR0_EQ,Lppcasm_add_adios 1672 addi r4,r4,-$BNSZ 1673 addi r3,r3,-$BNSZ 1674 addi r5,r5,-$BNSZ 1675 mtctr r6 1676Lppcasm_add_mainloop: 1677 $LDU r7,$BNSZ(r4) 1678 $LDU r8,$BNSZ(r5) 1679 adde r8,r7,r8 1680 $STU r8,$BNSZ(r3) 1681 bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_add_mainloop 1682Lppcasm_add_adios: 1683 addze r3,r0 #return carry bit. 1684 bclr BO_ALWAYS,CR0_LT 1685 .long 0x00000000 1686 1687# 1688# NOTE: The following label name should be changed to 1689# "bn_div_words" i.e. remove the first dot 1690# for the gcc compiler. This should be automatically 1691# done in the build 1692# 1693 1694.align 4 1695.bn_div_words: 1696# 1697# This is a cleaned up version of code generated by 1698# the AIX compiler. The only optimization is to use 1699# the PPC instruction to count leading zeros instead 1700# of call to num_bits_word. Since this was compiled 1701# only at level -O2 we can possibly squeeze it more? 1702# 1703# r3 = h 1704# r4 = l 1705# r5 = d 1706 1707 $UCMPI 0,r5,0 # compare r5 and 0 1708 bc BO_IF_NOT,CR0_EQ,Lppcasm_div1 # proceed if d!=0 1709 li r3,-1 # d=0 return -1 1710 bclr BO_ALWAYS,CR0_LT 1711Lppcasm_div1: 1712 xor r0,r0,r0 #r0=0 1713 $COUNTZ r7,r5 #r7 = num leading 0s in d. 
1714 subfic r8,r7,$BITS #r8 = BN_num_bits_word(d) 1715 cmpi 0,0,r8,$BITS # 1716 bc BO_IF,CR0_EQ,Lppcasm_div2 #proceed if (r8==$BITS) 1717 li r9,1 # r9=1 1718 $SHL r10,r9,r8 # r9<<=r8 1719 $UCMP 0,r3,r10 # 1720 bc BO_IF,CR0_GT,Lppcasm_div2 #or if (h > (1<<r8)) 1721 $UDIV r3,r3,r0 #if not assert(0) divide by 0! 1722 #that's how we signal overflow 1723 bclr BO_ALWAYS,CR0_LT #return. NEVER REACHED. 1724Lppcasm_div2: 1725 $UCMP 0,r3,r5 #h>=d? 1726 bc BO_IF,CR0_LT,Lppcasm_div3 #goto Lppcasm_div3 if not 1727 subf r3,r5,r3 #h-=d ; 1728Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i 1729 cmpi 0,0,r7,0 # is (i == 0)? 1730 bc BO_IF,CR0_EQ,Lppcasm_div4 1731 $SHL r3,r3,r7 # h = (h<< i) 1732 $SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i) 1733 $SHL r5,r5,r7 # d<<=i 1734 or r3,r3,r8 # h = (h<<i)|(l>>(BN_BITS2-i)) 1735 $SHL r4,r4,r7 # l <<=i 1736Lppcasm_div4: 1737 $SHRI r9,r5,`$BITS/2` # r9 = dh 1738 # dl will be computed when needed 1739 # as it saves registers. 1740 li r6,2 #r6=2 1741 mtctr r6 #counter will be in count. 1742Lppcasm_divouterloop: 1743 $SHRI r8,r3,`$BITS/2` #r8 = (h>>BN_BITS4) 1744 $SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4 1745 # compute here for innerloop. 1746 $UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh 1747 bc BO_IF_NOT,CR0_EQ,Lppcasm_div5 # goto Lppcasm_div5 if not 1748 1749 li r8,-1 1750 $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l 1751 b Lppcasm_div6 1752Lppcasm_div5: 1753 $UDIV r8,r3,r9 #q = h/dh 1754Lppcasm_div6: 1755 $UMULL r12,r9,r8 #th = q*dh 1756 $CLRU r10,r5,`$BITS/2` #r10=dl 1757 $UMULL r6,r8,r10 #tl = q*dl 1758 1759Lppcasm_divinnerloop: 1760 subf r10,r12,r3 #t = h -th 1761 $SHRI r7,r10,`$BITS/2` #r7= (t &BN_MASK2H), sort of... 1762 addic. r7,r7,0 #test if r7 == 0. used below. 
1763 # now want to compute 1764 # r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4) 1765 # the following 2 instructions do that 1766 $SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4) 1767 or r7,r7,r11 # r7|=((l&BN_MASK2h)>>BN_BITS4) 1768 $UCMP 1,r6,r7 # compare (tl <= r7) 1769 bc BO_IF_NOT,CR0_EQ,Lppcasm_divinnerexit 1770 bc BO_IF_NOT,CR1_FEX,Lppcasm_divinnerexit 1771 addi r8,r8,-1 #q-- 1772 subf r12,r9,r12 #th -=dh 1773 $CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop. 1774 subf r6,r10,r6 #tl -=dl 1775 b Lppcasm_divinnerloop 1776Lppcasm_divinnerexit: 1777 $SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4) 1778 $SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h; 1779 $UCMP 1,r4,r11 # compare l and tl 1780 add r12,r12,r10 # th+=t 1781 bc BO_IF_NOT,CR1_FX,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7 1782 addi r12,r12,1 # th++ 1783Lppcasm_div7: 1784 subf r11,r11,r4 #r11=l-tl 1785 $UCMP 1,r3,r12 #compare h and th 1786 bc BO_IF_NOT,CR1_FX,Lppcasm_div8 #if (h>=th) goto Lppcasm_div8 1787 addi r8,r8,-1 # q-- 1788 add r3,r5,r3 # h+=d 1789Lppcasm_div8: 1790 subf r12,r12,r3 #r12 = h-th 1791 $SHLI r4,r11,`$BITS/2` #l=(l&BN_MASK2l)<<BN_BITS4 1792 # want to compute 1793 # h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2 1794 # the following 2 instructions will do this. 1795 $INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2. 1796 $ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3 1797 bc BO_dCTR_ZERO,CR0_EQ,Lppcasm_div9#if (count==0) break ; 1798 $SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4 1799 b Lppcasm_divouterloop 1800Lppcasm_div9: 1801 or r3,r8,r0 1802 bclr BO_ALWAYS,CR0_LT 1803 .long 0x00000000 1804 1805# 1806# NOTE: The following label name should be changed to 1807# "bn_sqr_words" i.e. remove the first dot 1808# for the gcc compiler. 
This should be automatically 1809# done in the build 1810# 1811.align 4 1812.bn_sqr_words: 1813# 1814# Optimized version of bn_sqr_words 1815# 1816# void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n) 1817# 1818# r3 = r 1819# r4 = a 1820# r5 = n 1821# 1822# r6 = a[i]. 1823# r7,r8 = product. 1824# 1825# No unrolling done here. Not performance critical. 1826 1827 addic. r5,r5,0 #test r5. 1828 bc BO_IF,CR0_EQ,Lppcasm_sqr_adios 1829 addi r4,r4,-$BNSZ 1830 addi r3,r3,-$BNSZ 1831 mtctr r5 1832Lppcasm_sqr_mainloop: 1833 #sqr(r[0],r[1],a[0]); 1834 $LDU r6,$BNSZ(r4) 1835 $UMULL r7,r6,r6 1836 $UMULH r8,r6,r6 1837 $STU r7,$BNSZ(r3) 1838 $STU r8,$BNSZ(r3) 1839 bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_sqr_mainloop 1840Lppcasm_sqr_adios: 1841 bclr BO_ALWAYS,CR0_LT 1842 .long 0x00000000 1843 1844 1845# 1846# NOTE: The following label name should be changed to 1847# "bn_mul_words" i.e. remove the first dot 1848# for the gcc compiler. This should be automatically 1849# done in the build 1850# 1851 1852.align 4 1853.bn_mul_words: 1854# 1855# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) 1856# 1857# r3 = rp 1858# r4 = ap 1859# r5 = num 1860# r6 = w 1861 xor r0,r0,r0 1862 xor r12,r12,r12 # used for carry 1863 rlwinm. r7,r5,30,2,31 # num >> 2 1864 bc BO_IF,CR0_EQ,Lppcasm_mw_REM 1865 mtctr r7 1866Lppcasm_mw_LOOP: 1867 #mul(rp[0],ap[0],w,c1); 1868 $LD r8,`0*$BNSZ`(r4) 1869 $UMULL r9,r6,r8 1870 $UMULH r10,r6,r8 1871 addc r9,r9,r12 1872 #addze r10,r10 #carry is NOT ignored. 1873 #will be taken care of 1874 #in second spin below 1875 #using adde. 
1876 $ST r9,`0*$BNSZ`(r3) 1877 #mul(rp[1],ap[1],w,c1); 1878 $LD r8,`1*$BNSZ`(r4) 1879 $UMULL r11,r6,r8 1880 $UMULH r12,r6,r8 1881 adde r11,r11,r10 1882 #addze r12,r12 1883 $ST r11,`1*$BNSZ`(r3) 1884 #mul(rp[2],ap[2],w,c1); 1885 $LD r8,`2*$BNSZ`(r4) 1886 $UMULL r9,r6,r8 1887 $UMULH r10,r6,r8 1888 adde r9,r9,r12 1889 #addze r10,r10 1890 $ST r9,`2*$BNSZ`(r3) 1891 #mul_add(rp[3],ap[3],w,c1); 1892 $LD r8,`3*$BNSZ`(r4) 1893 $UMULL r11,r6,r8 1894 $UMULH r12,r6,r8 1895 adde r11,r11,r10 1896 addze r12,r12 #this spin we collect carry into 1897 #r12 1898 $ST r11,`3*$BNSZ`(r3) 1899 1900 addi r3,r3,`4*$BNSZ` 1901 addi r4,r4,`4*$BNSZ` 1902 bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_mw_LOOP 1903 1904Lppcasm_mw_REM: 1905 andi. r5,r5,0x3 1906 bc BO_IF,CR0_EQ,Lppcasm_mw_OVER 1907 #mul(rp[0],ap[0],w,c1); 1908 $LD r8,`0*$BNSZ`(r4) 1909 $UMULL r9,r6,r8 1910 $UMULH r10,r6,r8 1911 addc r9,r9,r12 1912 addze r10,r10 1913 $ST r9,`0*$BNSZ`(r3) 1914 addi r12,r10,0 1915 1916 addi r5,r5,-1 1917 cmpli 0,0,r5,0 1918 bc BO_IF,CR0_EQ,Lppcasm_mw_OVER 1919 1920 1921 #mul(rp[1],ap[1],w,c1); 1922 $LD r8,`1*$BNSZ`(r4) 1923 $UMULL r9,r6,r8 1924 $UMULH r10,r6,r8 1925 addc r9,r9,r12 1926 addze r10,r10 1927 $ST r9,`1*$BNSZ`(r3) 1928 addi r12,r10,0 1929 1930 addi r5,r5,-1 1931 cmpli 0,0,r5,0 1932 bc BO_IF,CR0_EQ,Lppcasm_mw_OVER 1933 1934 #mul_add(rp[2],ap[2],w,c1); 1935 $LD r8,`2*$BNSZ`(r4) 1936 $UMULL r9,r6,r8 1937 $UMULH r10,r6,r8 1938 addc r9,r9,r12 1939 addze r10,r10 1940 $ST r9,`2*$BNSZ`(r3) 1941 addi r12,r10,0 1942 1943Lppcasm_mw_OVER: 1944 addi r3,r12,0 1945 bclr BO_ALWAYS,CR0_LT 1946 .long 0x00000000 1947 1948# 1949# NOTE: The following label name should be changed to 1950# "bn_mul_add_words" i.e. remove the first dot 1951# for the gcc compiler. 
This should be automatically 1952# done in the build 1953# 1954 1955.align 4 1956.bn_mul_add_words: 1957# 1958# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) 1959# 1960# r3 = rp 1961# r4 = ap 1962# r5 = num 1963# r6 = w 1964# 1965# empirical evidence suggests that unrolled version performs best!! 1966# 1967 xor r0,r0,r0 #r0 = 0 1968 xor r12,r12,r12 #r12 = 0 . used for carry 1969 rlwinm. r7,r5,30,2,31 # num >> 2 1970 bc BO_IF,CR0_EQ,Lppcasm_maw_leftover # if (num < 4) go LPPCASM_maw_leftover 1971 mtctr r7 1972Lppcasm_maw_mainloop: 1973 #mul_add(rp[0],ap[0],w,c1); 1974 $LD r8,`0*$BNSZ`(r4) 1975 $LD r11,`0*$BNSZ`(r3) 1976 $UMULL r9,r6,r8 1977 $UMULH r10,r6,r8 1978 addc r9,r9,r12 #r12 is carry. 1979 addze r10,r10 1980 addc r9,r9,r11 1981 #addze r10,r10 1982 #the above instruction addze 1983 #is NOT needed. Carry will NOT 1984 #be ignored. It's not affected 1985 #by multiply and will be collected 1986 #in the next spin 1987 $ST r9,`0*$BNSZ`(r3) 1988 1989 #mul_add(rp[1],ap[1],w,c1); 1990 $LD r8,`1*$BNSZ`(r4) 1991 $LD r9,`1*$BNSZ`(r3) 1992 $UMULL r11,r6,r8 1993 $UMULH r12,r6,r8 1994 adde r11,r11,r10 #r10 is carry. 1995 addze r12,r12 1996 addc r11,r11,r9 1997 #addze r12,r12 1998 $ST r11,`1*$BNSZ`(r3) 1999 2000 #mul_add(rp[2],ap[2],w,c1); 2001 $LD r8,`2*$BNSZ`(r4) 2002 $UMULL r9,r6,r8 2003 $LD r11,`2*$BNSZ`(r3) 2004 $UMULH r10,r6,r8 2005 adde r9,r9,r12 2006 addze r10,r10 2007 addc r9,r9,r11 2008 #addze r10,r10 2009 $ST r9,`2*$BNSZ`(r3) 2010 2011 #mul_add(rp[3],ap[3],w,c1); 2012 $LD r8,`3*$BNSZ`(r4) 2013 $UMULL r11,r6,r8 2014 $LD r9,`3*$BNSZ`(r3) 2015 $UMULH r12,r6,r8 2016 adde r11,r11,r10 2017 addze r12,r12 2018 addc r11,r11,r9 2019 addze r12,r12 2020 $ST r11,`3*$BNSZ`(r3) 2021 addi r3,r3,`4*$BNSZ` 2022 addi r4,r4,`4*$BNSZ` 2023 bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_maw_mainloop 2024 2025Lppcasm_maw_leftover: 2026 andi. 
r5,r5,0x3 2027 bc BO_IF,CR0_EQ,Lppcasm_maw_adios 2028 addi r3,r3,-$BNSZ 2029 addi r4,r4,-$BNSZ 2030 #mul_add(rp[0],ap[0],w,c1); 2031 mtctr r5 2032 $LDU r8,$BNSZ(r4) 2033 $UMULL r9,r6,r8 2034 $UMULH r10,r6,r8 2035 $LDU r11,$BNSZ(r3) 2036 addc r9,r9,r11 2037 addze r10,r10 2038 addc r9,r9,r12 2039 addze r12,r10 2040 $ST r9,0(r3) 2041 2042 bc BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios 2043 #mul_add(rp[1],ap[1],w,c1); 2044 $LDU r8,$BNSZ(r4) 2045 $UMULL r9,r6,r8 2046 $UMULH r10,r6,r8 2047 $LDU r11,$BNSZ(r3) 2048 addc r9,r9,r11 2049 addze r10,r10 2050 addc r9,r9,r12 2051 addze r12,r10 2052 $ST r9,0(r3) 2053 2054 bc BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios 2055 #mul_add(rp[2],ap[2],w,c1); 2056 $LDU r8,$BNSZ(r4) 2057 $UMULL r9,r6,r8 2058 $UMULH r10,r6,r8 2059 $LDU r11,$BNSZ(r3) 2060 addc r9,r9,r11 2061 addze r10,r10 2062 addc r9,r9,r12 2063 addze r12,r10 2064 $ST r9,0(r3) 2065 2066Lppcasm_maw_adios: 2067 addi r3,r12,0 2068 bclr BO_ALWAYS,CR0_LT 2069 .long 0x00000000 2070 .align 4 2071EOF 2072 $data =~ s/\`([^\`]*)\`/eval $1/gem; 2073 2074 # if some assembler chokes on some simplified mnemonic, 2075 # this is the spot to fix it up, e.g.: 2076 # GNU as doesn't seem to accept cmplw, 32-bit unsigned compare 2077 $data =~ s/^(\s*)cmplw(\s+)([^,]+),(.*)/$1cmpl$2$3,0,$4/gm; 2078 # assembler X doesn't accept li, load immediate value 2079 #$data =~ s/^(\s*)li(\s+)([^,]+),(.*)/$1addi$2$3,0,$4/gm; 2080 return($data); 2081} 2082