1.ident "sparcv8plus.s, Version 1.4" 2.ident "SPARC v9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" 3 4/* 5 * ==================================================================== 6 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL 7 * project. 8 * 9 * Rights for redistribution and usage in source and binary forms are 10 * granted according to the OpenSSL license. Warranty of any kind is 11 * disclaimed. 12 * ==================================================================== 13 */ 14 15/* 16 * This is my modest contributon to OpenSSL project (see 17 * http://www.openssl.org/ for more information about it) and is 18 * a drop-in UltraSPARC ISA replacement for crypto/bn/bn_asm.c 19 * module. For updates see http://fy.chalmers.se/~appro/hpe/. 20 * 21 * Questions-n-answers. 22 * 23 * Q. How to compile? 24 * A. With SC4.x/SC5.x: 25 * 26 * cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o 27 * 28 * and with gcc: 29 * 30 * gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o 31 * 32 * or if above fails (it does if you have gas installed): 33 * 34 * gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o 35 * 36 * Quick-n-dirty way to fuse the module into the library. 37 * Provided that the library is already configured and built 38 * (in 0.9.2 case with no-asm option): 39 * 40 * # cd crypto/bn 41 * # cp /some/place/bn_asm.sparc.v8plus.S . 42 * # cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o 43 * # make 44 * # cd ../.. 45 * # make; make test 46 * 47 * Quick-n-dirty way to get rid of it: 48 * 49 * # cd crypto/bn 50 * # touch bn_asm.c 51 * # make 52 * # cd ../.. 53 * # make; make test 54 * 55 * Q. V8plus achitecture? What kind of beast is that? 56 * A. Well, it's rather a programming model than an architecture... 57 * It's actually v9-compliant, i.e. *any* UltraSPARC, CPU under 58 * special conditions, namely when kernel doesn't preserve upper 59 * 32 bits of otherwise 64-bit registers during a context switch. 60 * 61 * Q. Why just UltraSPARC? What about SuperSPARC? 62 * A. Original release did target UltraSPARC only. Now SuperSPARC 63 * version is provided along. Both version share bn_*comba[48] 64 * implementations (see comment later in code for explanation). 65 * But what's so special about this UltraSPARC implementation? 66 * Why didn't I let compiler do the job? Trouble is that most of 67 * available compilers (well, SC5.0 is the only exception) don't 68 * attempt to take advantage of UltraSPARC's 64-bitness under 69 * 32-bit kernels even though it's perfectly possible (see next 70 * question). 71 * 72 * Q. 64-bit registers under 32-bit kernels? Didn't you just say it 73 * doesn't work? 74 * A. You can't adress *all* registers as 64-bit wide:-( The catch is 75 * that you actually may rely upon %o0-%o5 and %g1-%g4 being fully 76 * preserved if you're in a leaf function, i.e. such never calling 77 * any other functions. All functions in this module are leaf and 78 * 10 registers is a handful. And as a matter of fact none-"comba" 79 * routines don't require even that much and I could even afford to 80 * not allocate own stack frame for 'em:-) 81 * 82 * Q. What about 64-bit kernels? 83 * A. What about 'em? Just kidding:-) Pure 64-bit version is currently 84 * under evaluation and development... 85 * 86 * Q. What about shared libraries? 87 * A. What about 'em? Kidding again:-) Code does *not* contain any 88 * code position dependencies and it's safe to include it into 89 * shared library as is. 90 * 91 * Q. How much faster does it go? 92 * A. 
/*
 * Revision history.
 *
 * 1.0	- initial release;
 * 1.1	- new loop unrolling model(*);
 *	- some more fine tuning;
 * 1.2	- made gas friendly;
 *	- updates to documentation concerning v9;
 *	- new performance comparison matrix;
 * 1.3	- fixed problem with /usr/ccs/lib/cpp;
 * 1.4	- native V9 bn_*_comba[48] implementation (15% more efficient)
 *	  resulting in a slight overall performance kick;
 *	- some retunes;
 *	- support for GNU as added;
 *
 * (*)	Originally the unrolled loop looked like this:
 *	for (;;) {
 *	    op(p+0); if (--n==0) break;
 *	    op(p+1); if (--n==0) break;
 *	    op(p+2); if (--n==0) break;
 *	    op(p+3); if (--n==0) break;
 *	    p+=4;
 *	}
 *	I unroll according to the following:
 *	while (n&~3) {
 *	    op(p+0); op(p+1); op(p+2); op(p+3);
 *	    p+=4; n-=4;
 *	}
 *	if (n) {
 *	    op(p+0); if (--n==0) return;
 *	    op(p+1); if (--n==0) return;
 *	    op(p+2); return;
 *	}
 */
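/*
 * Illustration only, not part of the original module: the unrolling
 * model above as a compilable C fragment.  'op' and the element type
 * are placeholders standing in for one iteration of any of the word
 * loops below.
 *
 *	static void walk(unsigned int *p, int n)
 *	{
 *		while (n & ~3) {
 *			op(p+0); op(p+1); op(p+2); op(p+3);
 *			p += 4; n -= 4;
 *		}
 *		if (n) {
 *			op(p+0); if (--n == 0) return;
 *			op(p+1); if (--n == 0) return;
 *			op(p+2);
 *		}
 *	}
 *
 * The point of the rewrite is that the hot path takes one branch per
 * four elements instead of four.
 */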
/*
 * GNU assembler can't stand stuw:-(
 */
#define stuw st

.section	".text",#alloc,#execinstr
.file		"bn_asm.sparc.v8plus.S"

.align	32

.global bn_mul_add_words
/*
 * BN_ULONG bn_mul_add_words(rp,ap,num,w)
 * BN_ULONG *rp,*ap;
 * int num;
 * BN_ULONG w;
 */
bn_mul_add_words:
	brgz,a	%o2,.L_bn_mul_add_words_proceed
	lduw	[%o1],%g2
	retl
	clr	%o0

.L_bn_mul_add_words_proceed:
	srl	%o3,%g0,%o3	! clruw	%o3
	andcc	%o2,-4,%g0
	bz,pn	%icc,.L_bn_mul_add_words_tail
	clr	%o5

.L_bn_mul_add_words_loop:	! wow! 32 aligned!
	lduw	[%o0],%g1
	lduw	[%o1+4],%g3
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	nop
	add	%o4,%g2,%o4
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5

	lduw	[%o0+4],%g1
	lduw	[%o1+8],%g2
	mulx	%o3,%g3,%g3
	add	%g1,%o5,%o4
	dec	4,%o2
	add	%o4,%g3,%o4
	stuw	%o4,[%o0+4]
	srlx	%o4,32,%o5

	lduw	[%o0+8],%g1
	lduw	[%o1+12],%g3
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	inc	16,%o1
	add	%o4,%g2,%o4
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5

	lduw	[%o0+12],%g1
	mulx	%o3,%g3,%g3
	add	%g1,%o5,%o4
	inc	16,%o0
	add	%o4,%g3,%o4
	andcc	%o2,-4,%g0
	stuw	%o4,[%o0-4]
	srlx	%o4,32,%o5
	bnz,a,pt	%icc,.L_bn_mul_add_words_loop
	lduw	[%o1],%g2

	brnz,a,pn	%o2,.L_bn_mul_add_words_tail
	lduw	[%o1],%g2
.L_bn_mul_add_words_return:
	retl
	mov	%o5,%o0

.L_bn_mul_add_words_tail:
	lduw	[%o0],%g1
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	dec	%o2
	add	%o4,%g2,%o4
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_add_words_return
	stuw	%o4,[%o0]

	lduw	[%o1+4],%g2
	lduw	[%o0+4],%g1
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	dec	%o2
	add	%o4,%g2,%o4
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_add_words_return
	stuw	%o4,[%o0+4]

	lduw	[%o1+8],%g2
	lduw	[%o0+8],%g1
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	add	%o4,%g2,%o4
	stuw	%o4,[%o0+8]
	retl
	srlx	%o4,32,%o0

.type	bn_mul_add_words,#function
.size	bn_mul_add_words,(.-bn_mul_add_words)

.align	32

.global bn_mul_words
/*
 * BN_ULONG bn_mul_words(rp,ap,num,w)
 * BN_ULONG *rp,*ap;
 * int num;
 * BN_ULONG w;
 */
bn_mul_words:
	brgz,a	%o2,.L_bn_mul_words_proceed
	lduw	[%o1],%g2
	retl
	clr	%o0

.L_bn_mul_words_proceed:
	srl	%o3,%g0,%o3	! clruw	%o3
	andcc	%o2,-4,%g0
	bz,pn	%icc,.L_bn_mul_words_tail
	clr	%o5

.L_bn_mul_words_loop:	! wow! 32 aligned!
	lduw	[%o1+4],%g3
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	nop
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5

	lduw	[%o1+8],%g2
	mulx	%o3,%g3,%g3
	add	%g3,%o5,%o4
	dec	4,%o2
	stuw	%o4,[%o0+4]
	srlx	%o4,32,%o5

	lduw	[%o1+12],%g3
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	inc	16,%o1
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5

	mulx	%o3,%g3,%g3
	add	%g3,%o5,%o4
	inc	16,%o0
	stuw	%o4,[%o0-4]
	srlx	%o4,32,%o5
	andcc	%o2,-4,%g0
	bnz,a,pt	%icc,.L_bn_mul_words_loop
	lduw	[%o1],%g2
	nop
	nop

	brnz,a,pn	%o2,.L_bn_mul_words_tail
	lduw	[%o1],%g2
.L_bn_mul_words_return:
	retl
	mov	%o5,%o0

.L_bn_mul_words_tail:
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	dec	%o2
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_words_return
	stuw	%o4,[%o0]

	lduw	[%o1+4],%g2
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	dec	%o2
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_words_return
	stuw	%o4,[%o0+4]

	lduw	[%o1+8],%g2
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	stuw	%o4,[%o0+8]
	retl
	srlx	%o4,32,%o0

.type	bn_mul_words,#function
.size	bn_mul_words,(.-bn_mul_words)
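/*
 * Reference model, not part of the original module: what the two
 * routines above compute, in portable C with a 32-bit BN_ULONG.
 * The _ref name is hypothetical.
 *
 *	typedef unsigned int BN_ULONG;
 *
 *	BN_ULONG bn_mul_add_words_ref(BN_ULONG *rp, const BN_ULONG *ap,
 *	                              int num, BN_ULONG w)
 *	{
 *		unsigned long long t, c = 0;
 *
 *		while (num-- > 0) {
 *			t = (unsigned long long)w * *ap++ + *rp + c;
 *			*rp++ = (BN_ULONG)t;
 *			c = t >> 32;
 *		}
 *		return (BN_ULONG)c;
 *	}
 *
 * bn_mul_words is the same loop without the '+ *rp' term.  The
 * assembler versions above process four words per iteration and keep
 * the 64-bit product and carry entirely in registers via mulx/srlx.
 */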
.align	32
.global bn_sqr_words
/*
 * void bn_sqr_words(r,a,n)
 * BN_ULONG *r,*a;
 * int n;
 */
bn_sqr_words:
	brgz,a	%o2,.L_bn_sqr_words_proceed
	lduw	[%o1],%g2
	retl
	clr	%o0

.L_bn_sqr_words_proceed:
	andcc	%o2,-4,%g0
	nop
	bz,pn	%icc,.L_bn_sqr_words_tail
	nop

.L_bn_sqr_words_loop:	! wow! 32 aligned!
	lduw	[%o1+4],%g3
	mulx	%g2,%g2,%o4
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5
	stuw	%o5,[%o0+4]
	nop

	lduw	[%o1+8],%g2
	mulx	%g3,%g3,%o4
	dec	4,%o2
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5
	stuw	%o5,[%o0+12]

	lduw	[%o1+12],%g3
	mulx	%g2,%g2,%o4
	srlx	%o4,32,%o5
	stuw	%o4,[%o0+16]
	inc	16,%o1
	stuw	%o5,[%o0+20]

	mulx	%g3,%g3,%o4
	inc	32,%o0
	stuw	%o4,[%o0-8]
	srlx	%o4,32,%o5
	andcc	%o2,-4,%g2
	stuw	%o5,[%o0-4]
	bnz,a,pt	%icc,.L_bn_sqr_words_loop
	lduw	[%o1],%g2
	nop

	brnz,a,pn	%o2,.L_bn_sqr_words_tail
	lduw	[%o1],%g2
.L_bn_sqr_words_return:
	retl
	clr	%o0

.L_bn_sqr_words_tail:
	mulx	%g2,%g2,%o4
	dec	%o2
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_sqr_words_return
	stuw	%o5,[%o0+4]

	lduw	[%o1+4],%g2
	mulx	%g2,%g2,%o4
	dec	%o2
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_sqr_words_return
	stuw	%o5,[%o0+12]

	lduw	[%o1+8],%g2
	mulx	%g2,%g2,%o4
	srlx	%o4,32,%o5
	stuw	%o4,[%o0+16]
	stuw	%o5,[%o0+20]
	retl
	clr	%o0

.type	bn_sqr_words,#function
.size	bn_sqr_words,(.-bn_sqr_words)

.align	32
.global bn_div_words
/*
 * BN_ULONG bn_div_words(h,l,d)
 * BN_ULONG h,l,d;
 */
bn_div_words:
	sllx	%o0,32,%o0
	or	%o0,%o1,%o0
	udivx	%o0,%o2,%o0
	retl
	srl	%o0,%g0,%o0	! clruw	%o0

.type	bn_div_words,#function
.size	bn_div_words,(.-bn_div_words)
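/*
 * In C terms bn_div_words is just a 64-by-32-bit division (sketch,
 * not part of the original module; the _ref name is hypothetical).
 * The final srl, like the cast below, truncates the quotient to
 * 32 bits; callers are expected to arrange h < d as usual in BN
 * division:
 *
 *	BN_ULONG bn_div_words_ref(BN_ULONG h, BN_ULONG l, BN_ULONG d)
 *	{
 *		return (BN_ULONG)((((unsigned long long)h << 32) | l) / d);
 *	}
 */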
.align	32

.global bn_add_words
/*
 * BN_ULONG bn_add_words(rp,ap,bp,n)
 * BN_ULONG *rp,*ap,*bp;
 * int n;
 */
bn_add_words:
	brgz,a	%o3,.L_bn_add_words_proceed
	lduw	[%o1],%o4
	retl
	clr	%o0

.L_bn_add_words_proceed:
	andcc	%o3,-4,%g0
	bz,pn	%icc,.L_bn_add_words_tail
	addcc	%g0,0,%g0	! clear carry flag
	nop

.L_bn_add_words_loop:	! wow! 32 aligned!
	dec	4,%o3
	lduw	[%o2],%o5
	lduw	[%o1+4],%g1
	lduw	[%o2+4],%g2
	lduw	[%o1+8],%g3
	lduw	[%o2+8],%g4
	addccc	%o5,%o4,%o5
	stuw	%o5,[%o0]

	lduw	[%o1+12],%o4
	lduw	[%o2+12],%o5
	inc	16,%o1
	addccc	%g1,%g2,%g1
	stuw	%g1,[%o0+4]

	inc	16,%o2
	addccc	%g3,%g4,%g3
	stuw	%g3,[%o0+8]

	inc	16,%o0
	addccc	%o5,%o4,%o5
	stuw	%o5,[%o0-4]
	and	%o3,-4,%g1
	brnz,a,pt	%g1,.L_bn_add_words_loop
	lduw	[%o1],%o4

	brnz,a,pn	%o3,.L_bn_add_words_tail
	lduw	[%o1],%o4
.L_bn_add_words_return:
	clr	%o0
	retl
	movcs	%icc,1,%o0
	nop

.L_bn_add_words_tail:
	lduw	[%o2],%o5
	dec	%o3
	addccc	%o5,%o4,%o5
	brz,pt	%o3,.L_bn_add_words_return
	stuw	%o5,[%o0]

	lduw	[%o1+4],%o4
	lduw	[%o2+4],%o5
	dec	%o3
	addccc	%o5,%o4,%o5
	brz,pt	%o3,.L_bn_add_words_return
	stuw	%o5,[%o0+4]

	lduw	[%o1+8],%o4
	lduw	[%o2+8],%o5
	addccc	%o5,%o4,%o5
	stuw	%o5,[%o0+8]
	clr	%o0
	retl
	movcs	%icc,1,%o0

.type	bn_add_words,#function
.size	bn_add_words,(.-bn_add_words)

.global bn_sub_words
/*
 * BN_ULONG bn_sub_words(rp,ap,bp,n)
 * BN_ULONG *rp,*ap,*bp;
 * int n;
 */
bn_sub_words:
	brgz,a	%o3,.L_bn_sub_words_proceed
	lduw	[%o1],%o4
	retl
	clr	%o0

.L_bn_sub_words_proceed:
	andcc	%o3,-4,%g0
	bz,pn	%icc,.L_bn_sub_words_tail
	addcc	%g0,0,%g0	! clear carry flag
	nop

.L_bn_sub_words_loop:	! wow! 32 aligned!
	dec	4,%o3
	lduw	[%o2],%o5
	lduw	[%o1+4],%g1
	lduw	[%o2+4],%g2
	lduw	[%o1+8],%g3
	lduw	[%o2+8],%g4
	subccc	%o4,%o5,%o5
	stuw	%o5,[%o0]

	lduw	[%o1+12],%o4
	lduw	[%o2+12],%o5
	inc	16,%o1
	subccc	%g1,%g2,%g2
	stuw	%g2,[%o0+4]

	inc	16,%o2
	subccc	%g3,%g4,%g4
	stuw	%g4,[%o0+8]

	inc	16,%o0
	subccc	%o4,%o5,%o5
	stuw	%o5,[%o0-4]
	and	%o3,-4,%g1
	brnz,a,pt	%g1,.L_bn_sub_words_loop
	lduw	[%o1],%o4

	brnz,a,pn	%o3,.L_bn_sub_words_tail
	lduw	[%o1],%o4
.L_bn_sub_words_return:
	clr	%o0
	retl
	movcs	%icc,1,%o0
	nop

.L_bn_sub_words_tail:	! wow! 32 aligned!
	lduw	[%o2],%o5
	dec	%o3
	subccc	%o4,%o5,%o5
	brz,pt	%o3,.L_bn_sub_words_return
	stuw	%o5,[%o0]

	lduw	[%o1+4],%o4
	lduw	[%o2+4],%o5
	dec	%o3
	subccc	%o4,%o5,%o5
	brz,pt	%o3,.L_bn_sub_words_return
	stuw	%o5,[%o0+4]

	lduw	[%o1+8],%o4
	lduw	[%o2+8],%o5
	subccc	%o4,%o5,%o5
	stuw	%o5,[%o0+8]
	clr	%o0
	retl
	movcs	%icc,1,%o0

.type	bn_sub_words,#function
.size	bn_sub_words,(.-bn_sub_words)

/*
 * The code below depends on the fact that the upper parts of %l0-%l7
 * and %i0-%i7 are zeroed by the kernel after a context switch. In
 * previous versions this comment stated that "the trouble is that
 * it's not feasible to implement the mumbo-jumbo in less V9
 * instructions:-(", which apparently isn't true thanks to the
 * 'bcs,a %xcc,.+8; inc %rd' pair. But the performance improvement
 * results not from the shorter code, but from the elimination of
 * multicycle non-pairable 'rd %y,%rd' instructions.
 *
 * Andy.
 */

#define FRAME_SIZE	-96

/*
 * Here is the register usage map for *all* the routines below.
 */
#define t_1	%o0
#define t_2	%o1
#define c_12	%o2
#define c_3	%o3

#define ap(I)	[%i1+4*I]
#define bp(I)	[%i2+4*I]
#define rp(I)	[%i0+4*I]

#define a_0	%l0
#define a_1	%l1
#define a_2	%l2
#define a_3	%l3
#define a_4	%l4
#define a_5	%l5
#define a_6	%l6
#define a_7	%l7

#define b_0	%i3
#define b_1	%i4
#define b_2	%i5
#define b_3	%o4
#define b_4	%o5
#define b_5	%o7
#define b_6	%g1
#define b_7	%g4
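/*
 * A C model of the carry scheme used throughout the comba routines
 * below (illustration only, not part of the original module).  c_12
 * accumulates the 64-bit column sum; every time addcc wraps around,
 * the 'bcs,a %xcc,.+8; add c_3,t_2,c_3' pair adds t_2 = 2^32 into
 * c_3.  For one partial product:
 *
 *	t_1 = (unsigned long long)a_i * b_j;
 *	c_12 += t_1;
 *	if (c_12 < t_1)
 *		c_3 += 1ULL << 32;
 *
 * and when a column is complete:
 *
 *	r[k] = (BN_ULONG)c_12;
 *	c_12 = (c_12 >> 32) | c_3;
 *	c_3 = 0;
 *
 * which is what the srlx/stuw/or epilogue of every column below does.
 */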
.align	32
.global bn_mul_comba8
/*
 * void bn_mul_comba8(r,a,b)
 * BN_ULONG *r,*a,*b;
 */
bn_mul_comba8:
	save	%sp,FRAME_SIZE,%sp
	mov	1,t_2
	lduw	ap(0),a_0
	sllx	t_2,32,t_2
	lduw	bp(0),b_0	!=
	lduw	bp(1),b_1
	mulx	a_0,b_0,t_1	!mul_add_c(a[0],b[0],c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!=!r[0]=c1;

	lduw	ap(1),a_1
	mulx	a_0,b_1,t_1	!mul_add_c(a[0],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(2),a_2
	mulx	a_1,b_0,t_1	!=!mul_add_c(a[1],b[0],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,b_0,t_1	!mul_add_c(a[2],b[0],c3,c1,c2);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(2),b_2	!=
	mulx	a_1,b_1,t_1	!mul_add_c(a[1],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(3),b_3
	mulx	a_0,b_2,t_1	!mul_add_c(a[0],b[2],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_3,t_1	!mul_add_c(a[0],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_2,t_1	!=!mul_add_c(a[1],b[2],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_2,b_1,t_1	!mul_add_c(a[2],b[1],c1,c2,c3);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(4),a_4
	mulx	a_3,b_0,t_1	!=!mul_add_c(a[3],b[0],c1,c2,c3);!=
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(3)	!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_4,b_0,t_1	!mul_add_c(a[4],b[0],c2,c3,c1);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_1,t_1	!=!mul_add_c(a[3],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,b_2,t_1	!=!mul_add_c(a[2],b[2],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(4),b_4	!=
	mulx	a_1,b_3,t_1	!mul_add_c(a[1],b[3],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(5),b_5
	mulx	a_0,b_4,t_1	!mul_add_c(a[0],b[4],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!r[4]=c2;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_5,t_1	!mul_add_c(a[0],b[5],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_4,t_1	!mul_add_c(a[1],b[4],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_2,b_3,t_1	!mul_add_c(a[2],b[3],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_3,b_2,t_1	!mul_add_c(a[3],b[2],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(5),a_5
	mulx	a_4,b_1,t_1	!mul_add_c(a[4],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(6),a_6
	mulx	a_5,b_0,t_1	!=!mul_add_c(a[5],b[0],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(5)	!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_6,b_0,t_1	!mul_add_c(a[6],b[0],c1,c2,c3);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,b_1,t_1	!=!mul_add_c(a[5],b[1],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,b_2,t_1	!=!mul_add_c(a[4],b[2],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_3,t_1	!=!mul_add_c(a[3],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,b_4,t_1	!=!mul_add_c(a[2],b[4],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(6),b_6	!=
	mulx	a_1,b_5,t_1	!mul_add_c(a[1],b[5],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(7),b_7
	mulx	a_0,b_6,t_1	!mul_add_c(a[0],b[6],c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(6)	!r[6]=c1;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_7,t_1	!mul_add_c(a[0],b[7],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_6,t_1	!mul_add_c(a[1],b[6],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_2,b_5,t_1	!mul_add_c(a[2],b[5],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_3,b_4,t_1	!mul_add_c(a[3],b[4],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_4,b_3,t_1	!mul_add_c(a[4],b[3],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_2,t_1	!mul_add_c(a[5],b[2],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(7),a_7
	mulx	a_6,b_1,t_1	!=!mul_add_c(a[6],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_7,b_0,t_1	!=!mul_add_c(a[7],b[0],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(7)	!r[7]=c2;
	or	c_12,c_3,c_12

	mulx	a_7,b_1,t_1	!=!mul_add_c(a[7],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_6,b_2,t_1	!mul_add_c(a[6],b[2],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_5,b_3,t_1	!mul_add_c(a[5],b[3],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_4,b_4,t_1	!mul_add_c(a[4],b[4],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_3,b_5,t_1	!mul_add_c(a[3],b[5],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_2,b_6,t_1	!mul_add_c(a[2],b[6],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_1,b_7,t_1	!mul_add_c(a[1],b[7],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	srlx	t_1,32,c_12
	stuw	t_1,rp(8)	!r[8]=c3;
	or	c_12,c_3,c_12

	mulx	a_2,b_7,t_1	!=!mul_add_c(a[2],b[7],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_3,b_6,t_1	!mul_add_c(a[3],b[6],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_4,b_5,t_1	!mul_add_c(a[4],b[5],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_4,t_1	!mul_add_c(a[5],b[4],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_3,t_1	!mul_add_c(a[6],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_7,b_2,t_1	!mul_add_c(a[7],b[2],c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(9)	!r[9]=c1;
	or	c_12,c_3,c_12	!=

	mulx	a_7,b_3,t_1	!mul_add_c(a[7],b[3],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_4,t_1	!mul_add_c(a[6],b[4],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_5,t_1	!mul_add_c(a[5],b[5],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_4,b_6,t_1	!mul_add_c(a[4],b[6],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_3,b_7,t_1	!mul_add_c(a[3],b[7],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(10)	!r[10]=c2;
	or	c_12,c_3,c_12	!=

	mulx	a_4,b_7,t_1	!mul_add_c(a[4],b[7],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_6,t_1	!mul_add_c(a[5],b[6],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_5,t_1	!mul_add_c(a[6],b[5],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_7,b_4,t_1	!mul_add_c(a[7],b[4],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(11)	!r[11]=c3;
	or	c_12,c_3,c_12	!=

	mulx	a_7,b_5,t_1	!mul_add_c(a[7],b[5],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_6,t_1	!mul_add_c(a[6],b[6],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_7,t_1	!mul_add_c(a[5],b[7],c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(12)	!r[12]=c1;
	or	c_12,c_3,c_12	!=

	mulx	a_6,b_7,t_1	!mul_add_c(a[6],b[7],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_7,b_6,t_1	!mul_add_c(a[7],b[6],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(13)	!r[13]=c2;
	or	c_12,c_3,c_12	!=

	mulx	a_7,b_7,t_1	!mul_add_c(a[7],b[7],c3,c1,c2);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(14)	!r[14]=c3;
	stuw	c_12,rp(15)	!r[15]=c1;

	ret
	restore	%g0,%g0,%o0	!=

.type	bn_mul_comba8,#function
.size	bn_mul_comba8,(.-bn_mul_comba8)
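/*
 * Portable reference for the routine above (sketch, not part of the
 * original module; the _ref name is hypothetical).  Each of the 15
 * columns above is one k-iteration of this loop:
 *
 *	void bn_mul_comba8_ref(BN_ULONG *r, const BN_ULONG *a,
 *	                       const BN_ULONG *b)
 *	{
 *		unsigned long long lo = 0, hi = 0, t;
 *		int i, k;
 *
 *		for (k = 0; k < 15; k++) {
 *			for (i = (k > 7 ? k - 7 : 0);
 *			     i <= (k < 7 ? k : 7); i++) {
 *				t = (unsigned long long)a[i] * b[k-i];
 *				lo += t;
 *				if (lo < t)
 *					hi++;
 *			}
 *			r[k] = (BN_ULONG)lo;
 *			lo = (lo >> 32) | (hi << 32);
 *			hi = 0;
 *		}
 *		r[15] = (BN_ULONG)lo;
 *	}
 *
 * bn_mul_comba4 below is the same scheme with 7 columns.
 */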
.align	32

.global bn_mul_comba4
/*
 * void bn_mul_comba4(r,a,b)
 * BN_ULONG *r,*a,*b;
 */
bn_mul_comba4:
	save	%sp,FRAME_SIZE,%sp
	lduw	ap(0),a_0
	mov	1,t_2
	lduw	bp(0),b_0
	sllx	t_2,32,t_2	!=
	lduw	bp(1),b_1
	mulx	a_0,b_0,t_1	!mul_add_c(a[0],b[0],c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!=!r[0]=c1;

	lduw	ap(1),a_1
	mulx	a_0,b_1,t_1	!mul_add_c(a[0],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(2),a_2
	mulx	a_1,b_0,t_1	!=!mul_add_c(a[1],b[0],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,b_0,t_1	!mul_add_c(a[2],b[0],c3,c1,c2);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(2),b_2	!=
	mulx	a_1,b_1,t_1	!mul_add_c(a[1],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(3),b_3
	mulx	a_0,b_2,t_1	!mul_add_c(a[0],b[2],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_3,t_1	!mul_add_c(a[0],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_2,t_1	!mul_add_c(a[1],b[2],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_2,b_1,t_1	!mul_add_c(a[2],b[1],c1,c2,c3);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_0,t_1	!mul_add_c(a[3],b[0],c1,c2,c3);!=
	addcc	c_12,t_1,t_1	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(3)	!=!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_3,b_1,t_1	!mul_add_c(a[3],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,b_2,t_1	!mul_add_c(a[2],b[2],c2,c3,c1);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,b_3,t_1	!mul_add_c(a[1],b[3],c2,c3,c1);
	addcc	c_12,t_1,t_1	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!=!r[4]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,b_3,t_1	!mul_add_c(a[2],b[3],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_2,t_1	!mul_add_c(a[3],b[2],c3,c1,c2);
	addcc	c_12,t_1,t_1	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(5)	!=!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_3,b_3,t_1	!mul_add_c(a[3],b[3],c1,c2,c3);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(6)	!r[6]=c1;
	stuw	c_12,rp(7)	!r[7]=c2;

	ret
	restore	%g0,%g0,%o0

.type	bn_mul_comba4,#function
.size	bn_mul_comba4,(.-bn_mul_comba4)
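/*
 * The squaring routines below reuse the same column scheme.  The only
 * new element is sqr_add_c2: the cross product a[i]*a[j] (i != j)
 * occurs twice in a square, and doubling t_1 with a shift could lose
 * the carry out of bit 63, so the code adds t_1 twice instead, with a
 * separate bcs check each time.  Model (illustration only, not part
 * of the original module):
 *
 *	t_1 = (unsigned long long)a_i * a_j;
 *	c_12 += t_1; if (c_12 < t_1) c_3 += 1ULL << 32;
 *	c_12 += t_1; if (c_12 < t_1) c_3 += 1ULL << 32;
 */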
.align	32

.global bn_sqr_comba8
/*
 * void bn_sqr_comba8(r,a)
 * BN_ULONG *r,*a;
 */
bn_sqr_comba8:
	save	%sp,FRAME_SIZE,%sp
	mov	1,t_2
	lduw	ap(0),a_0
	sllx	t_2,32,t_2
	lduw	ap(1),a_1
	mulx	a_0,a_0,t_1	!sqr_add_c(a,0,c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!r[0]=c1;

	lduw	ap(2),a_2
	mulx	a_0,a_1,t_1	!=!sqr_add_c2(a,1,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,a_0,t_1	!sqr_add_c2(a,2,0,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_1,a_1,t_1	!sqr_add_c(a,1,c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12

	mulx	a_0,a_3,t_1	!sqr_add_c2(a,3,0,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(4),a_4
	mulx	a_1,a_2,t_1	!sqr_add_c2(a,2,1,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(3)	!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_4,a_0,t_1	!sqr_add_c2(a,4,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,a_1,t_1	!sqr_add_c2(a,3,1,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(5),a_5
	mulx	a_2,a_2,t_1	!sqr_add_c(a,2,c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!r[4]=c2;
	or	c_12,c_3,c_12

	mulx	a_0,a_5,t_1	!sqr_add_c2(a,5,0,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,a_4,t_1	!sqr_add_c2(a,4,1,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(6),a_6
	mulx	a_2,a_3,t_1	!sqr_add_c2(a,3,2,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(5)	!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_6,a_0,t_1	!sqr_add_c2(a,6,0,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_1,t_1	!sqr_add_c2(a,5,1,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,a_2,t_1	!sqr_add_c2(a,4,2,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(7),a_7
	mulx	a_3,a_3,t_1	!=!sqr_add_c(a,3,c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(6)	!r[6]=c1;
	or	c_12,c_3,c_12

	mulx	a_0,a_7,t_1	!sqr_add_c2(a,7,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,a_6,t_1	!sqr_add_c2(a,6,1,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,a_5,t_1	!sqr_add_c2(a,5,2,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,a_4,t_1	!sqr_add_c2(a,4,3,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(7)	!r[7]=c2;
	or	c_12,c_3,c_12

	mulx	a_7,a_1,t_1	!sqr_add_c2(a,7,1,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_6,a_2,t_1	!sqr_add_c2(a,6,2,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_3,t_1	!sqr_add_c2(a,5,3,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,a_4,t_1	!sqr_add_c(a,4,c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(8)	!r[8]=c3;
	or	c_12,c_3,c_12

	mulx	a_2,a_7,t_1	!sqr_add_c2(a,7,2,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,a_6,t_1	!sqr_add_c2(a,6,3,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,a_5,t_1	!sqr_add_c2(a,5,4,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(9)	!r[9]=c1;
	or	c_12,c_3,c_12

	mulx	a_7,a_3,t_1	!sqr_add_c2(a,7,3,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_6,a_4,t_1	!sqr_add_c2(a,6,4,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_5,t_1	!sqr_add_c(a,5,c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(10)	!r[10]=c2;
	or	c_12,c_3,c_12

	mulx	a_4,a_7,t_1	!sqr_add_c2(a,7,4,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_6,t_1	!sqr_add_c2(a,6,5,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(11)	!r[11]=c3;
	or	c_12,c_3,c_12

	mulx	a_7,a_5,t_1	!sqr_add_c2(a,7,5,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_6,a_6,t_1	!sqr_add_c(a,6,c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(12)	!r[12]=c1;
	or	c_12,c_3,c_12

	mulx	a_6,a_7,t_1	!sqr_add_c2(a,7,6,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(13)	!r[13]=c2;
	or	c_12,c_3,c_12

	mulx	a_7,a_7,t_1	!sqr_add_c(a,7,c3,c1,c2);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12
	stuw	t_1,rp(14)	!r[14]=c3;
	stuw	c_12,rp(15)	!r[15]=c1;

	ret
	restore	%g0,%g0,%o0

.type	bn_sqr_comba8,#function
.size	bn_sqr_comba8,(.-bn_sqr_comba8)
.align	32

.global bn_sqr_comba4
/*
 * void bn_sqr_comba4(r,a)
 * BN_ULONG *r,*a;
 */
bn_sqr_comba4:
	save	%sp,FRAME_SIZE,%sp
	mov	1,t_2
	lduw	ap(0),a_0
	sllx	t_2,32,t_2
	lduw	ap(1),a_1
	mulx	a_0,a_0,t_1	!sqr_add_c(a,0,c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!r[0]=c1;

	lduw	ap(2),a_2
	mulx	a_0,a_1,t_1	!sqr_add_c2(a,1,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,a_0,t_1	!sqr_add_c2(a,2,0,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_1,a_1,t_1	!sqr_add_c(a,1,c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12

	mulx	a_0,a_3,t_1	!sqr_add_c2(a,3,0,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,a_2,t_1	!sqr_add_c2(a,2,1,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(3)	!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_3,a_1,t_1	!sqr_add_c2(a,3,1,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,a_2,t_1	!sqr_add_c(a,2,c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!r[4]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,a_3,t_1	!sqr_add_c2(a,3,2,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(5)	!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_3,a_3,t_1	!sqr_add_c(a,3,c1,c2,c3);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12
	stuw	t_1,rp(6)	!r[6]=c1;
	stuw	c_12,rp(7)	!r[7]=c2;

	ret
	restore	%g0,%g0,%o0

.type	bn_sqr_comba4,#function
.size	bn_sqr_comba4,(.-bn_sqr_comba4)

.align	32
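/*
 * Quick self-check sketch, not part of the original module: compare
 * bn_mul_comba4 against a portable schoolbook multiply.  Assumes a
 * 32-bit BN_ULONG and that this file has been assembled into the same
 * program, e.g. (hypothetical test.c):
 *
 *	cc -xarch=v8plus test.c bn_asm.o
 *
 *	typedef unsigned int BN_ULONG;
 *	extern void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b);
 *
 *	int main(void)
 *	{
 *		BN_ULONG a[4] = { 0xffffffff, 0x12345678, 0, 0xdeadbeef };
 *		BN_ULONG b[4] = { 0x87654321, 0xffffffff, 1, 2 };
 *		BN_ULONG r[8], ref[8] = { 0 };
 *		unsigned long long t, c;
 *		int i, j;
 *
 *		for (i = 0; i < 4; i++) {
 *			for (c = 0, j = 0; j < 4; j++) {
 *				t = (unsigned long long)a[i]*b[j]
 *				    + ref[i+j] + c;
 *				ref[i+j] = (BN_ULONG)t;
 *				c = t >> 32;
 *			}
 *			ref[i+4] = (BN_ULONG)c;
 *		}
 *		bn_mul_comba4(r, a, b);
 *		for (i = 0; i < 8; i++)
 *			if (r[i] != ref[i])
 *				return 1;
 *		return 0;
 *	}
 */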