.ident	"sparcv8plus.s, Version 1.4"
.ident	"SPARC v9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"

/*
 * ====================================================================
 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 * project.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted according to the OpenSSL license. Warranty of any kind is
 * disclaimed.
 * ====================================================================
 */

/*
 * This is my modest contribution to the OpenSSL project (see
 * http://www.openssl.org/ for more information about it) and is
 * a drop-in UltraSPARC ISA replacement for the crypto/bn/bn_asm.c
 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
 *
 * Questions-n-answers.
 *
 * Q.	How to compile?
 * A.	With SC4.x/SC5.x:
 *
 *	cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
 *
 *	and with gcc:
 *
 *	gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o
 *
 *	or if the above fails (it does if you have gas installed):
 *
 *	gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o
 *
 *	Quick-n-dirty way to fuse the module into the library.
 *	Provided that the library is already configured and built
 *	(in the 0.9.2 case with the no-asm option):
 *
 *	# cd crypto/bn
 *	# cp /some/place/bn_asm.sparc.v8plus.S .
 *	# cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
 *	# make
 *	# cd ../..
 *	# make; make test
 *
 *	Quick-n-dirty way to get rid of it:
 *
 *	# cd crypto/bn
 *	# touch bn_asm.c
 *	# make
 *	# cd ../..
 *	# make; make test
 *
 * Q.	V8plus architecture? What kind of beast is that?
 * A.	Well, it's rather a programming model than an architecture...
 *	It's actually v9 code run on a v9-compliant CPU, i.e. *any*
 *	UltraSPARC, under special conditions, namely when the kernel
 *	doesn't preserve the upper 32 bits of the otherwise 64-bit
 *	registers during a context switch.
 *
 * Q.	Why just UltraSPARC? What about SuperSPARC?
 * A.	The original release did target UltraSPARC only. Now a
 *	SuperSPARC version is provided alongside it. Both versions
 *	share the bn_*_comba[48] implementations (see the comment
 *	later in the code for an explanation). But what's so special
 *	about this UltraSPARC implementation? Why didn't I let the
 *	compiler do the job? The trouble is that most of the available
 *	compilers (well, SC5.0 is the only exception) don't even
 *	attempt to take advantage of UltraSPARC's 64-bitness under
 *	32-bit kernels, even though it's perfectly possible (see the
 *	next question).
 *
 * Q.	64-bit registers under 32-bit kernels? Didn't you just say it
 *	doesn't work?
 * A.	You can't address *all* registers as 64-bit wide:-( The catch
 *	is that you may actually rely upon %o0-%o5 and %g1-%g4 being
 *	fully preserved if you're in a leaf function, i.e. one that
 *	never calls any other function. All functions in this module
 *	are leaf, and 10 registers is a handful. As a matter of fact,
 *	the non-"comba" routines don't even require that much, and I
 *	could afford not to allocate a stack frame of their own for
 *	'em:-)
 *
 * Q.	What about 64-bit kernels?
 * A.	What about 'em? Just kidding:-) A pure 64-bit version is
 *	currently under evaluation and development...
 *
 * Q.	What about shared libraries?
 * A.	What about 'em? Kidding again:-) The code does *not* contain
 *	any position-dependent references and it's safe to include it
 *	in a shared library as is.
 *
 * Q.	How much faster does it go?
 * A.	Do you have a good benchmark? In either case, below is what I
 *	experience with the crypto/bn/expspeed.c test program:
 *
 *	v8plus module on U10/300MHz against bn_asm.c compiled with:
 *
 *	cc-5.0 -xarch=v8plus -xO5 -xdepend	+7-12%
 *	cc-4.2 -xarch=v8plus -xO5 -xdepend	+25-35%
 *	egcs-1.1.2 -mcpu=ultrasparc -O3		+35-45%
 *
 *	v8 module on SS10/60MHz against bn_asm.c compiled with:
 *
 *	cc-5.0 -xarch=v8 -xO5 -xdepend		+7-10%
 *	cc-4.2 -xarch=v8 -xO5 -xdepend		+10%
 *	egcs-1.1.2 -mv8 -O3			+35-45%
 *
 *	As you can see, it's damn hard to beat the new Sun C compiler,
 *	and it's first and foremost GNU C users who will appreciate
 *	this assembler implementation:-)
 */

/*
 * Revision history.
 *
 * 1.0	- initial release;
 * 1.1	- new loop unrolling model(*);
 *	- some more fine tuning;
 * 1.2	- made gas friendly;
 *	- updates to documentation concerning v9;
 *	- new performance comparison matrix;
 * 1.3	- fixed problem with /usr/ccs/lib/cpp;
 * 1.4	- native V9 bn_*_comba[48] implementation (15% more efficient)
 *	  resulting in a slight overall performance kick;
 *	- some retunes;
 *	- support for GNU as added;
 *
 * (*)	Originally the unrolled loop looked like this:
 *	for (;;) {
 *		op(p+0); if (--n==0) break;
 *		op(p+1); if (--n==0) break;
 *		op(p+2); if (--n==0) break;
 *		op(p+3); if (--n==0) break;
 *		p+=4;
 *	}
 *	I unroll according to the following:
 *	while (n&~3) {
 *		op(p+0); op(p+1); op(p+2); op(p+3);
 *		p+=4; n-=4;
 *	}
 *	if (n) {
 *		op(p+0); if (--n==0) return;
 *		op(p+1); if (--n==0) return;
 *		op(p+2); return;
 *	}
 */

/*
 * GNU assembler can't stand stuw:-(
 */
#define stuw st

.section	".text",#alloc,#execinstr
.file		"bn_asm.sparc.v8plus.S"

.align	32

.global bn_mul_add_words
/*
 * BN_ULONG bn_mul_add_words(rp,ap,num,w)
 * BN_ULONG *rp,*ap;
 * int num;
 * BN_ULONG w;
 */
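/*
 * For reference, a C-level sketch of what bn_mul_add_words computes
 * (my illustration, not part of the build; it assumes BN_ULONG is a
 * 32-bit unsigned type and uses a 64-bit accumulator): each ap[] word
 * is multiplied by w, added into rp[] with carry propagation, and the
 * final carry word is returned.
 *
 *	BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap,
 *				  int num, BN_ULONG w)
 *	{
 *		unsigned long long acc = 0;
 *		while (num-- > 0) {
 *			acc += (unsigned long long)*(ap++) * w + *rp;
 *			*(rp++) = (BN_ULONG)acc;	// low word
 *			acc >>= 32;			// carry
 *		}
 *		return (BN_ULONG)acc;
 *	}
 */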
bn_mul_add_words:
	sra	%o2,%g0,%o2	! signx %o2
	brgz,a	%o2,.L_bn_mul_add_words_proceed
	lduw	[%o1],%g2
	retl
	clr	%o0
	nop
	nop
	nop

.L_bn_mul_add_words_proceed:
	srl	%o3,%g0,%o3	! clruw %o3
	andcc	%o2,-4,%g0
	bz,pn	%icc,.L_bn_mul_add_words_tail
	clr	%o5

.L_bn_mul_add_words_loop:	! wow! 32 aligned!
	lduw	[%o0],%g1
	lduw	[%o1+4],%g3
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	nop
	add	%o4,%g2,%o4
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5

	lduw	[%o0+4],%g1
	lduw	[%o1+8],%g2
	mulx	%o3,%g3,%g3
	add	%g1,%o5,%o4
	dec	4,%o2
	add	%o4,%g3,%o4
	stuw	%o4,[%o0+4]
	srlx	%o4,32,%o5

	lduw	[%o0+8],%g1
	lduw	[%o1+12],%g3
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	inc	16,%o1
	add	%o4,%g2,%o4
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5

	lduw	[%o0+12],%g1
	mulx	%o3,%g3,%g3
	add	%g1,%o5,%o4
	inc	16,%o0
	add	%o4,%g3,%o4
	andcc	%o2,-4,%g0
	stuw	%o4,[%o0-4]
	srlx	%o4,32,%o5
	bnz,a,pt	%icc,.L_bn_mul_add_words_loop
	lduw	[%o1],%g2

	brnz,a,pn	%o2,.L_bn_mul_add_words_tail
	lduw	[%o1],%g2
.L_bn_mul_add_words_return:
	retl
	mov	%o5,%o0

.L_bn_mul_add_words_tail:
	lduw	[%o0],%g1
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	dec	%o2
	add	%o4,%g2,%o4
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_add_words_return
	stuw	%o4,[%o0]

	lduw	[%o1+4],%g2
	lduw	[%o0+4],%g1
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	dec	%o2
	add	%o4,%g2,%o4
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_add_words_return
	stuw	%o4,[%o0+4]

	lduw	[%o1+8],%g2
	lduw	[%o0+8],%g1
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	add	%o4,%g2,%o4
	stuw	%o4,[%o0+8]
	retl
	srlx	%o4,32,%o0

.type	bn_mul_add_words,#function
.size	bn_mul_add_words,(.-bn_mul_add_words)

.align	32

.global bn_mul_words
/*
 * BN_ULONG bn_mul_words(rp,ap,num,w)
 * BN_ULONG *rp,*ap;
 * int num;
 * BN_ULONG w;
 */
bn_mul_words:
	sra	%o2,%g0,%o2	! signx %o2
	brgz,a	%o2,.L_bn_mul_words_proceed
	lduw	[%o1],%g2
	retl
	clr	%o0
	nop
	nop
	nop

.L_bn_mul_words_proceed:
	srl	%o3,%g0,%o3	! clruw %o3
	andcc	%o2,-4,%g0
	bz,pn	%icc,.L_bn_mul_words_tail
	clr	%o5

.L_bn_mul_words_loop:		! wow! 32 aligned!
	lduw	[%o1+4],%g3
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	nop
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5

	lduw	[%o1+8],%g2
	mulx	%o3,%g3,%g3
	add	%g3,%o5,%o4
	dec	4,%o2
	stuw	%o4,[%o0+4]
	srlx	%o4,32,%o5

	lduw	[%o1+12],%g3
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	inc	16,%o1
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5

	mulx	%o3,%g3,%g3
	add	%g3,%o5,%o4
	inc	16,%o0
	stuw	%o4,[%o0-4]
	srlx	%o4,32,%o5
	andcc	%o2,-4,%g0
	bnz,a,pt	%icc,.L_bn_mul_words_loop
	lduw	[%o1],%g2
	nop
	nop

	brnz,a,pn	%o2,.L_bn_mul_words_tail
	lduw	[%o1],%g2
.L_bn_mul_words_return:
	retl
	mov	%o5,%o0

.L_bn_mul_words_tail:
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	dec	%o2
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_words_return
	stuw	%o4,[%o0]

	lduw	[%o1+4],%g2
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	dec	%o2
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_words_return
	stuw	%o4,[%o0+4]

	lduw	[%o1+8],%g2
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	stuw	%o4,[%o0+8]
	retl
	srlx	%o4,32,%o0

.type	bn_mul_words,#function
.size	bn_mul_words,(.-bn_mul_words)

.align	32
.global bn_sqr_words
/*
 * void bn_sqr_words(r,a,n)
 * BN_ULONG *r,*a;
 * int n;
 */
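/*
 * A C-level sketch of bn_sqr_words, under the same assumptions as
 * above (32-bit BN_ULONG; illustration only): each input word is
 * squared into a 64-bit product whose halves land in two consecutive
 * result words, so r[] holds 2*n words on return.
 *
 *	void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
 *	{
 *		while (n-- > 0) {
 *			unsigned long long sq =
 *			    (unsigned long long)a[0] * a[0];
 *			a++;
 *			*(r++) = (BN_ULONG)sq;		// low half
 *			*(r++) = (BN_ULONG)(sq >> 32);	// high half
 *		}
 *	}
 */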
bn_sqr_words:
	sra	%o2,%g0,%o2	! signx %o2
	brgz,a	%o2,.L_bn_sqr_words_proceed
	lduw	[%o1],%g2
	retl
	clr	%o0
	nop
	nop
	nop

.L_bn_sqr_words_proceed:
	andcc	%o2,-4,%g0
	nop
	bz,pn	%icc,.L_bn_sqr_words_tail
	nop

.L_bn_sqr_words_loop:		! wow! 32 aligned!
	lduw	[%o1+4],%g3
	mulx	%g2,%g2,%o4
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5
	stuw	%o5,[%o0+4]
	nop

	lduw	[%o1+8],%g2
	mulx	%g3,%g3,%o4
	dec	4,%o2
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5
	stuw	%o5,[%o0+12]

	lduw	[%o1+12],%g3
	mulx	%g2,%g2,%o4
	srlx	%o4,32,%o5
	stuw	%o4,[%o0+16]
	inc	16,%o1
	stuw	%o5,[%o0+20]

	mulx	%g3,%g3,%o4
	inc	32,%o0
	stuw	%o4,[%o0-8]
	srlx	%o4,32,%o5
	andcc	%o2,-4,%g2
	stuw	%o5,[%o0-4]
	bnz,a,pt	%icc,.L_bn_sqr_words_loop
	lduw	[%o1],%g2
	nop

	brnz,a,pn	%o2,.L_bn_sqr_words_tail
	lduw	[%o1],%g2
.L_bn_sqr_words_return:
	retl
	clr	%o0

.L_bn_sqr_words_tail:
	mulx	%g2,%g2,%o4
	dec	%o2
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_sqr_words_return
	stuw	%o5,[%o0+4]

	lduw	[%o1+4],%g2
	mulx	%g2,%g2,%o4
	dec	%o2
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_sqr_words_return
	stuw	%o5,[%o0+12]

	lduw	[%o1+8],%g2
	mulx	%g2,%g2,%o4
	srlx	%o4,32,%o5
	stuw	%o4,[%o0+16]
	stuw	%o5,[%o0+20]
	retl
	clr	%o0

.type	bn_sqr_words,#function
.size	bn_sqr_words,(.-bn_sqr_words)

.align	32
.global bn_div_words
/*
 * BN_ULONG bn_div_words(h,l,d)
 * BN_ULONG h,l,d;
 */
bn_div_words:
	sllx	%o0,32,%o0
	or	%o0,%o1,%o0
	udivx	%o0,%o2,%o0
	retl
	srl	%o0,%g0,%o0	! clruw %o0

.type	bn_div_words,#function
.size	bn_div_words,(.-bn_div_words)

.align	32

.global bn_add_words
/*
 * BN_ULONG bn_add_words(rp,ap,bp,n)
 * BN_ULONG *rp,*ap,*bp;
 * int n;
 */
bn_add_words:
	sra	%o3,%g0,%o3	! signx %o3
	brgz,a	%o3,.L_bn_add_words_proceed
	lduw	[%o1],%o4
	retl
	clr	%o0

.L_bn_add_words_proceed:
	andcc	%o3,-4,%g0
	bz,pn	%icc,.L_bn_add_words_tail
	addcc	%g0,0,%g0	! clear carry flag

.L_bn_add_words_loop:		! wow! 32 aligned!
	dec	4,%o3
	lduw	[%o2],%o5
	lduw	[%o1+4],%g1
	lduw	[%o2+4],%g2
	lduw	[%o1+8],%g3
	lduw	[%o2+8],%g4
	addccc	%o5,%o4,%o5
	stuw	%o5,[%o0]

	lduw	[%o1+12],%o4
	lduw	[%o2+12],%o5
	inc	16,%o1
	addccc	%g1,%g2,%g1
	stuw	%g1,[%o0+4]

	inc	16,%o2
	addccc	%g3,%g4,%g3
	stuw	%g3,[%o0+8]

	inc	16,%o0
	addccc	%o5,%o4,%o5
	stuw	%o5,[%o0-4]
	and	%o3,-4,%g1
	brnz,a,pt	%g1,.L_bn_add_words_loop
	lduw	[%o1],%o4

	brnz,a,pn	%o3,.L_bn_add_words_tail
	lduw	[%o1],%o4
.L_bn_add_words_return:
	clr	%o0
	retl
	movcs	%icc,1,%o0
	nop

.L_bn_add_words_tail:
	lduw	[%o2],%o5
	dec	%o3
	addccc	%o5,%o4,%o5
	brz,pt	%o3,.L_bn_add_words_return
	stuw	%o5,[%o0]

	lduw	[%o1+4],%o4
	lduw	[%o2+4],%o5
	dec	%o3
	addccc	%o5,%o4,%o5
	brz,pt	%o3,.L_bn_add_words_return
	stuw	%o5,[%o0+4]

	lduw	[%o1+8],%o4
	lduw	[%o2+8],%o5
	addccc	%o5,%o4,%o5
	stuw	%o5,[%o0+8]
	clr	%o0
	retl
	movcs	%icc,1,%o0

.type	bn_add_words,#function
.size	bn_add_words,(.-bn_add_words)
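/*
 * A C-level sketch of bn_add_words (same assumptions as above;
 * illustration only): words are added with carry propagation and the
 * final carry is returned. bn_sub_words below is the mirror image,
 * subtracting with borrow and returning the final borrow.
 *
 *	BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap,
 *			      BN_ULONG *bp, int n)
 *	{
 *		unsigned long long acc = 0;
 *		while (n-- > 0) {
 *			acc += (unsigned long long)*(ap++) + *(bp++);
 *			*(rp++) = (BN_ULONG)acc;
 *			acc >>= 32;		// carry: 0 or 1
 *		}
 *		return (BN_ULONG)acc;
 *	}
 */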
.global bn_sub_words
/*
 * BN_ULONG bn_sub_words(rp,ap,bp,n)
 * BN_ULONG *rp,*ap,*bp;
 * int n;
 */
bn_sub_words:
	sra	%o3,%g0,%o3	! signx %o3
	brgz,a	%o3,.L_bn_sub_words_proceed
	lduw	[%o1],%o4
	retl
	clr	%o0

.L_bn_sub_words_proceed:
	andcc	%o3,-4,%g0
	bz,pn	%icc,.L_bn_sub_words_tail
	addcc	%g0,0,%g0	! clear carry flag

.L_bn_sub_words_loop:		! wow! 32 aligned!
	dec	4,%o3
	lduw	[%o2],%o5
	lduw	[%o1+4],%g1
	lduw	[%o2+4],%g2
	lduw	[%o1+8],%g3
	lduw	[%o2+8],%g4
	subccc	%o4,%o5,%o5
	stuw	%o5,[%o0]

	lduw	[%o1+12],%o4
	lduw	[%o2+12],%o5
	inc	16,%o1
	subccc	%g1,%g2,%g2
	stuw	%g2,[%o0+4]

	inc	16,%o2
	subccc	%g3,%g4,%g4
	stuw	%g4,[%o0+8]

	inc	16,%o0
	subccc	%o4,%o5,%o5
	stuw	%o5,[%o0-4]
	and	%o3,-4,%g1
	brnz,a,pt	%g1,.L_bn_sub_words_loop
	lduw	[%o1],%o4

	brnz,a,pn	%o3,.L_bn_sub_words_tail
	lduw	[%o1],%o4
.L_bn_sub_words_return:
	clr	%o0
	retl
	movcs	%icc,1,%o0
	nop

.L_bn_sub_words_tail:		! wow! 32 aligned!
	lduw	[%o2],%o5
	dec	%o3
	subccc	%o4,%o5,%o5
	brz,pt	%o3,.L_bn_sub_words_return
	stuw	%o5,[%o0]

	lduw	[%o1+4],%o4
	lduw	[%o2+4],%o5
	dec	%o3
	subccc	%o4,%o5,%o5
	brz,pt	%o3,.L_bn_sub_words_return
	stuw	%o5,[%o0+4]

	lduw	[%o1+8],%o4
	lduw	[%o2+8],%o5
	subccc	%o4,%o5,%o5
	stuw	%o5,[%o0+8]
	clr	%o0
	retl
	movcs	%icc,1,%o0

.type	bn_sub_words,#function
.size	bn_sub_words,(.-bn_sub_words)

/*
 * The code below depends on the fact that the upper parts of %l0-%l7
 * and %i0-%i7 are zeroed by the kernel after a context switch. In
 * previous versions this comment stated that "the trouble is that
 * it's not feasible to implement the mumbo-jumbo in fewer V9
 * instructions:-(" which apparently isn't true thanks to the
 * 'bcs,a %xcc,.+8; inc %rd' pair. But the performance improvement
 * results not from the shorter code, but from the elimination of
 * multicycle non-pairable 'rd %y,%rd' instructions.
 *
 * Andy.
 */
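/*
 * A C-level sketch of the mul_add_c(a,b,c1,c2,c3) primitive that the
 * annotations below refer to (my illustration, same assumptions as
 * above): the 64-bit product a*b is added into the three-word
 * accumulator c1:c2:c3, with the carry out of the high word going
 * into c3. In the code below c1 and c2 live together in the 64-bit
 * register c_12, and each 'bcs,a %xcc,.+8; add c_3,t_2,c_3' pair is
 * the v9 rendition of the carry propagation into c3.
 *
 *	void mul_add_c(BN_ULONG a, BN_ULONG b,
 *		       BN_ULONG *c1, BN_ULONG *c2, BN_ULONG *c3)
 *	{
 *		unsigned long long t = (unsigned long long)a * b;
 *		t += *c1;			// add into low word
 *		*c1 = (BN_ULONG)t;
 *		t = (t >> 32) + *c2;		// propagate into c2
 *		*c2 = (BN_ULONG)t;
 *		*c3 += (BN_ULONG)(t >> 32);	// carry into c3
 *	}
 */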
#define FRAME_SIZE	-96

/*
 * Here is the register usage map for *all* routines below.
 */
#define t_1	%o0
#define	t_2	%o1
#define c_12	%o2
#define c_3	%o3

#define ap(I)	[%i1+4*I]
#define bp(I)	[%i2+4*I]
#define rp(I)	[%i0+4*I]

#define	a_0	%l0
#define	a_1	%l1
#define	a_2	%l2
#define	a_3	%l3
#define	a_4	%l4
#define	a_5	%l5
#define	a_6	%l6
#define	a_7	%l7

#define	b_0	%i3
#define	b_1	%i4
#define	b_2	%i5
#define	b_3	%o4
#define	b_4	%o5
#define	b_5	%o7
#define	b_6	%g1
#define	b_7	%g4

.align	32
.global bn_mul_comba8
/*
 * void bn_mul_comba8(r,a,b)
 * BN_ULONG *r,*a,*b;
 */
bn_mul_comba8:
	save	%sp,FRAME_SIZE,%sp
	mov	1,t_2
	lduw	ap(0),a_0
	sllx	t_2,32,t_2
	lduw	bp(0),b_0	!=
	lduw	bp(1),b_1
	mulx	a_0,b_0,t_1	!mul_add_c(a[0],b[0],c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!=!r[0]=c1;

	lduw	ap(1),a_1
	mulx	a_0,b_1,t_1	!mul_add_c(a[0],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(2),a_2
	mulx	a_1,b_0,t_1	!=!mul_add_c(a[1],b[0],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,b_0,t_1	!mul_add_c(a[2],b[0],c3,c1,c2);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(2),b_2	!=
	mulx	a_1,b_1,t_1	!mul_add_c(a[1],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(3),b_3
	mulx	a_0,b_2,t_1	!mul_add_c(a[0],b[2],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_3,t_1	!mul_add_c(a[0],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_2,t_1	!=!mul_add_c(a[1],b[2],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_2,b_1,t_1	!mul_add_c(a[2],b[1],c1,c2,c3);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(4),a_4
	mulx	a_3,b_0,t_1	!=!mul_add_c(a[3],b[0],c1,c2,c3);!=
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(3)	!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_4,b_0,t_1	!mul_add_c(a[4],b[0],c2,c3,c1);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_1,t_1	!=!mul_add_c(a[3],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,b_2,t_1	!=!mul_add_c(a[2],b[2],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(4),b_4	!=
	mulx	a_1,b_3,t_1	!mul_add_c(a[1],b[3],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(5),b_5
	mulx	a_0,b_4,t_1	!mul_add_c(a[0],b[4],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!r[4]=c2;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_5,t_1	!mul_add_c(a[0],b[5],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_4,t_1	!mul_add_c(a[1],b[4],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_2,b_3,t_1	!mul_add_c(a[2],b[3],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_3,b_2,t_1	!mul_add_c(a[3],b[2],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(5),a_5
	mulx	a_4,b_1,t_1	!mul_add_c(a[4],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(6),a_6
	mulx	a_5,b_0,t_1	!=!mul_add_c(a[5],b[0],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(5)	!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_6,b_0,t_1	!mul_add_c(a[6],b[0],c1,c2,c3);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,b_1,t_1	!=!mul_add_c(a[5],b[1],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,b_2,t_1	!=!mul_add_c(a[4],b[2],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_3,t_1	!=!mul_add_c(a[3],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,b_4,t_1	!=!mul_add_c(a[2],b[4],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(6),b_6	!=
	mulx	a_1,b_5,t_1	!mul_add_c(a[1],b[5],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(7),b_7
	mulx	a_0,b_6,t_1	!mul_add_c(a[0],b[6],c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(6)	!r[6]=c1;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_7,t_1	!mul_add_c(a[0],b[7],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_6,t_1	!mul_add_c(a[1],b[6],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_2,b_5,t_1	!mul_add_c(a[2],b[5],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_3,b_4,t_1	!mul_add_c(a[3],b[4],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_4,b_3,t_1	!mul_add_c(a[4],b[3],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_2,t_1	!mul_add_c(a[5],b[2],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(7),a_7
	mulx	a_6,b_1,t_1	!=!mul_add_c(a[6],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_7,b_0,t_1	!=!mul_add_c(a[7],b[0],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(7)	!r[7]=c2;
	or	c_12,c_3,c_12

	mulx	a_7,b_1,t_1	!=!mul_add_c(a[7],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_6,b_2,t_1	!mul_add_c(a[6],b[2],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_5,b_3,t_1	!mul_add_c(a[5],b[3],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_4,b_4,t_1	!mul_add_c(a[4],b[4],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_3,b_5,t_1	!mul_add_c(a[3],b[5],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_2,b_6,t_1	!mul_add_c(a[2],b[6],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_1,b_7,t_1	!mul_add_c(a[1],b[7],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	srlx	t_1,32,c_12
	stuw	t_1,rp(8)	!r[8]=c3;
	or	c_12,c_3,c_12

	mulx	a_2,b_7,t_1	!=!mul_add_c(a[2],b[7],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_3,b_6,t_1	!mul_add_c(a[3],b[6],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_4,b_5,t_1	!mul_add_c(a[4],b[5],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_4,t_1	!mul_add_c(a[5],b[4],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_3,t_1	!mul_add_c(a[6],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_7,b_2,t_1	!mul_add_c(a[7],b[2],c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(9)	!r[9]=c1;
	or	c_12,c_3,c_12	!=

	mulx	a_7,b_3,t_1	!mul_add_c(a[7],b[3],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_4,t_1	!mul_add_c(a[6],b[4],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_5,t_1	!mul_add_c(a[5],b[5],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_4,b_6,t_1	!mul_add_c(a[4],b[6],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_3,b_7,t_1	!mul_add_c(a[3],b[7],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(10)	!r[10]=c2;
	or	c_12,c_3,c_12	!=

	mulx	a_4,b_7,t_1	!mul_add_c(a[4],b[7],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_6,t_1	!mul_add_c(a[5],b[6],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_5,t_1	!mul_add_c(a[6],b[5],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_7,b_4,t_1	!mul_add_c(a[7],b[4],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(11)	!r[11]=c3;
	or	c_12,c_3,c_12	!=

	mulx	a_7,b_5,t_1	!mul_add_c(a[7],b[5],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_6,t_1	!mul_add_c(a[6],b[6],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_7,t_1	!mul_add_c(a[5],b[7],c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(12)	!r[12]=c1;
	or	c_12,c_3,c_12	!=

	mulx	a_6,b_7,t_1	!mul_add_c(a[6],b[7],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_7,b_6,t_1	!mul_add_c(a[7],b[6],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(13)	!r[13]=c2;
	or	c_12,c_3,c_12	!=

	mulx	a_7,b_7,t_1	!mul_add_c(a[7],b[7],c3,c1,c2);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(14)	!r[14]=c3;
	stuw	c_12,rp(15)	!r[15]=c1;

	ret
	restore	%g0,%g0,%o0	!=

.type	bn_mul_comba8,#function
.size	bn_mul_comba8,(.-bn_mul_comba8)

.align	32

.global bn_mul_comba4
/*
 * void bn_mul_comba4(r,a,b)
 * BN_ULONG *r,*a,*b;
 */
bn_mul_comba4:
	save	%sp,FRAME_SIZE,%sp
	lduw	ap(0),a_0
	mov	1,t_2
	lduw	bp(0),b_0
	sllx	t_2,32,t_2	!=
	lduw	bp(1),b_1
	mulx	a_0,b_0,t_1	!mul_add_c(a[0],b[0],c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!=!r[0]=c1;

	lduw	ap(1),a_1
	mulx	a_0,b_1,t_1	!mul_add_c(a[0],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(2),a_2
	mulx	a_1,b_0,t_1	!=!mul_add_c(a[1],b[0],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,b_0,t_1	!mul_add_c(a[2],b[0],c3,c1,c2);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(2),b_2	!=
	mulx	a_1,b_1,t_1	!mul_add_c(a[1],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(3),b_3
	mulx	a_0,b_2,t_1	!mul_add_c(a[0],b[2],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_3,t_1	!mul_add_c(a[0],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_2,t_1	!mul_add_c(a[1],b[2],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_2,b_1,t_1	!mul_add_c(a[2],b[1],c1,c2,c3);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_0,t_1	!mul_add_c(a[3],b[0],c1,c2,c3);!=
	addcc	c_12,t_1,t_1	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(3)	!=!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_3,b_1,t_1	!mul_add_c(a[3],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,b_2,t_1	!mul_add_c(a[2],b[2],c2,c3,c1);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,b_3,t_1	!mul_add_c(a[1],b[3],c2,c3,c1);
	addcc	c_12,t_1,t_1	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!=!r[4]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,b_3,t_1	!mul_add_c(a[2],b[3],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_2,t_1	!mul_add_c(a[3],b[2],c3,c1,c2);
	addcc	c_12,t_1,t_1	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(5)	!=!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_3,b_3,t_1	!mul_add_c(a[3],b[3],c1,c2,c3);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(6)	!r[6]=c1;
	stuw	c_12,rp(7)	!r[7]=c2;

	ret
	restore	%g0,%g0,%o0

.type	bn_mul_comba4,#function
.size	bn_mul_comba4,(.-bn_mul_comba4)

.align	32

.global bn_sqr_comba8
/*
 * void bn_sqr_comba8(r,a)
 * BN_ULONG *r,*a;
 */
bn_sqr_comba8:
	save	%sp,FRAME_SIZE,%sp
	mov	1,t_2
	lduw	ap(0),a_0
	sllx	t_2,32,t_2
	lduw	ap(1),a_1
	mulx	a_0,a_0,t_1	!sqr_add_c(a,0,c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!r[0]=c1;

	lduw	ap(2),a_2
	mulx	a_0,a_1,t_1	!=!sqr_add_c2(a,1,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,a_0,t_1	!sqr_add_c2(a,2,0,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_1,a_1,t_1	!sqr_add_c(a,1,c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12

	mulx	a_0,a_3,t_1	!sqr_add_c2(a,3,0,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(4),a_4
	mulx	a_1,a_2,t_1	!sqr_add_c2(a,2,1,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(3)	!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_4,a_0,t_1	!sqr_add_c2(a,4,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,a_1,t_1	!sqr_add_c2(a,3,1,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(5),a_5
	mulx	a_2,a_2,t_1	!sqr_add_c(a,2,c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!r[4]=c2;
	or	c_12,c_3,c_12

	mulx	a_0,a_5,t_1	!sqr_add_c2(a,5,0,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,a_4,t_1	!sqr_add_c2(a,4,1,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(6),a_6
	mulx	a_2,a_3,t_1	!sqr_add_c2(a,3,2,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(5)	!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_6,a_0,t_1	!sqr_add_c2(a,6,0,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_1,t_1	!sqr_add_c2(a,5,1,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,a_2,t_1	!sqr_add_c2(a,4,2,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(7),a_7
	mulx	a_3,a_3,t_1	!=!sqr_add_c(a,3,c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(6)	!r[6]=c1;
	or	c_12,c_3,c_12

	mulx	a_0,a_7,t_1	!sqr_add_c2(a,7,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,a_6,t_1	!sqr_add_c2(a,6,1,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,a_5,t_1	!sqr_add_c2(a,5,2,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,a_4,t_1	!sqr_add_c2(a,4,3,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(7)	!r[7]=c2;
	or	c_12,c_3,c_12

	mulx	a_7,a_1,t_1	!sqr_add_c2(a,7,1,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_6,a_2,t_1	!sqr_add_c2(a,6,2,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_3,t_1	!sqr_add_c2(a,5,3,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,a_4,t_1	!sqr_add_c(a,4,c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(8)	!r[8]=c3;
	or	c_12,c_3,c_12

	mulx	a_2,a_7,t_1	!sqr_add_c2(a,7,2,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,a_6,t_1	!sqr_add_c2(a,6,3,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,a_5,t_1	!sqr_add_c2(a,5,4,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(9)	!r[9]=c1;
	or	c_12,c_3,c_12

	mulx	a_7,a_3,t_1	!sqr_add_c2(a,7,3,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_6,a_4,t_1	!sqr_add_c2(a,6,4,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_5,t_1	!sqr_add_c(a,5,c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(10)	!r[10]=c2;
	or	c_12,c_3,c_12

	mulx	a_4,a_7,t_1	!sqr_add_c2(a,7,4,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_6,t_1	!sqr_add_c2(a,6,5,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(11)	!r[11]=c3;
	or	c_12,c_3,c_12

	mulx	a_7,a_5,t_1	!sqr_add_c2(a,7,5,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_6,a_6,t_1	!sqr_add_c(a,6,c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(12)	!r[12]=c1;
	or	c_12,c_3,c_12

	mulx	a_6,a_7,t_1	!sqr_add_c2(a,7,6,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(13)	!r[13]=c2;
	or	c_12,c_3,c_12

	mulx	a_7,a_7,t_1	!sqr_add_c(a,7,c3,c1,c2);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12
	stuw	t_1,rp(14)	!r[14]=c3;
	stuw	c_12,rp(15)	!r[15]=c1;

	ret
	restore	%g0,%g0,%o0

.type	bn_sqr_comba8,#function
.size	bn_sqr_comba8,(.-bn_sqr_comba8)

.align	32

.global bn_sqr_comba4
/*
 * void bn_sqr_comba4(r,a)
 * BN_ULONG *r,*a;
 */
bn_sqr_comba4:
	save	%sp,FRAME_SIZE,%sp
	mov	1,t_2
	lduw	ap(0),a_0
	sllx	t_2,32,t_2
	lduw	ap(1),a_1
	mulx	a_0,a_0,t_1	!sqr_add_c(a,0,c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!r[0]=c1;

	lduw	ap(2),a_2
	mulx	a_0,a_1,t_1	!sqr_add_c2(a,1,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,a_0,t_1	!sqr_add_c2(a,2,0,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_1,a_1,t_1	!sqr_add_c(a,1,c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12

	mulx	a_0,a_3,t_1	!sqr_add_c2(a,3,0,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,a_2,t_1	!sqr_add_c2(a,2,1,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(3)	!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_3,a_1,t_1	!sqr_add_c2(a,3,1,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,a_2,t_1	!sqr_add_c(a,2,c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!r[4]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,a_3,t_1	!sqr_add_c2(a,3,2,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(5)	!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_3,a_3,t_1	!sqr_add_c(a,3,c1,c2,c3);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12
	stuw	t_1,rp(6)	!r[6]=c1;
	stuw	c_12,rp(7)	!r[7]=c2;

	ret
	restore	%g0,%g0,%o0

.type	bn_sqr_comba4,#function
.size	bn_sqr_comba4,(.-bn_sqr_comba4)

.align	32