dnl  PowerPC-64 mpn_mul_basecase.

dnl  Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008 Free Software
dnl  Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C		cycles/limb
C POWER3/PPC630:     6-18
C POWER4/PPC970:      8
C POWER5:             8

C INPUT PARAMETERS
C rp, up, un, vp, vn arrive in r3..r7 per the mpn_mul_basecase convention:
C   {rp, un+vn} = {up, un} * {vp, vn}
C NOTE(review): the code only handles vn <= 2 when un <= 2, so it assumes
C the usual mpn contract un >= vn >= 1 -- confirm against callers.
define(`rp', `r3')	C result (product) pointer
define(`up', `r4')	C first operand pointer
define(`un', `r5')	C first operand size in limbs
define(`vp', `r6')	C second operand pointer
define(`vn', `r7')	C second operand size in limbs

define(`v0', `r25')	C current v limb, multiplied against all of {up,un}
define(`outer_rp', `r22')	C rp anchor for the outer (per-v-limb) loop
define(`outer_up', `r23')	C up anchor for the outer (per-v-limb) loop

ASM_START()
PROLOGUE(mpn_mul_basecase)

C Structure: a special straight-line path for un <= 2, then a general path
C that dispatches on un mod 4 to one of four first-pass loops L(b0)..L(b3)
C (plain mul_1 of {up,un} by vp[0]), each followed by a matching outer loop
C L(outer_lo_*) that performs an addmul_1 pass per remaining v limb.  All
C inner loops are 4-way unrolled and software-pipelined: operand limbs are
C loaded one-or-two iterations ahead, and the limb carry is propagated
C through the CA bit with an unbroken addc/adde/addze chain.

C Special code for un <= 2, for efficiency of these important cases,
C and since it simplifies the default code.
	cmpdi	cr0, un, 2	C cr0: un vs 2 (also tested by beq below)
	bgt	cr0, L(un_gt2)
	cmpdi	cr6, vn, 1	C cr6: vn vs 1, consumed at L(2x)
	ld	r7, 0(vp)	C r7 = v0 (overwrites vn, no longer needed here)
	ld	r5, 0(up)	C r5 = u0 (overwrites un)
	mulld	r8, r5, r7	C weight 0
	mulhdu	r9, r5, r7	C weight 1
	std	r8, 0(rp)
	beq	cr0, L(2x)	C un == 2?
	std	r9, 8(rp)	C un == 1 (and thus vn == 1): 2-limb product done
	blr
	ALIGN(16)
L(2x):	ld	r0, 8(up)	C un == 2: r0 = u1
	mulld	r8, r0, r7	C weight 1
	mulhdu	r10, r0, r7	C weight 2
	addc	r9, r9, r8
	addze	r10, r10
	bne	cr6, L(2x2)	C vn == 2?
	std	r9, 8(rp)	C vn == 1: store remaining two limbs
	std	r10, 16(rp)
	blr
	ALIGN(16)
L(2x2):	ld	r6, 8(vp)	C r6 = v1 (overwrites vp, no longer needed)
	nop
	mulld	r8, r5, r6	C weight 1
	mulhdu	r11, r5, r6	C weight 2
	addc	r9, r9, r8
	std	r9, 8(rp)
	adde	r11, r11, r10
	mulld	r12, r0, r6	C weight 2
	mulhdu	r0, r0, r6	C weight 3
	addze	r0, r0
	addc	r11, r11, r12
	addze	r0, r0
	std	r11, 16(rp)
	std	r0, 24(rp)
	blr

L(un_gt2):
C Save callee-saved r22..r31 below r1 without allocating a frame.
C NOTE(review): relies on the ABI's guaranteed area below the stack
C pointer (PPC64 ELF protected zone) -- confirm for the target ABI.
	std	r31, -8(r1)
	std	r30, -16(r1)
	std	r29, -24(r1)
	std	r28, -32(r1)
	std	r27, -40(r1)
	std	r26, -48(r1)
	std	r25, -56(r1)
	std	r24, -64(r1)
	std	r23, -72(r1)
	std	r22, -80(r1)

	mr	outer_rp, rp	C remember rp/up for the outer loop restarts
	mr	outer_up, up

	ld	v0, 0(vp)	C new v limb
	addi	vp, vp, 8
	ld	r26, 0(up)	C prefetch u0 for the first-pass loops

	rldicl.	r0, un, 0,62	C r0 = un & 3, set cr0
	cmpdi	cr6, r0, 2
	addi	un, un, 1	C compute count...
	srdi	un, un, 2	C ...for ctr: un = floor((un+1)/4)
	mtctr	un	C copy inner loop count into ctr
	beq	cr0, L(b0)	C un mod 4 == 0
	blt	cr6, L(b1)	C un mod 4 == 1
	beq	cr6, L(b2)	C un mod 4 == 2
			C fall through: un mod 4 == 3


C ---- un mod 4 == 3: first pass (mul_1), 3 limbs handled before the loop ----
	ALIGN(16)
L(b3):	mulld	r0, r26, v0
	mulhdu	r12, r26, v0	C r12 carries weight-1 part into the loop
	addic	r0, r0, 0	C clear CA so the loop's adde chain starts clean
	std	r0, 0(rp)
	ld	r26, 8(up)	C pipeline: preload next two u limbs
	ld	r27, 16(up)
	bdz	L(end_m_3)

	ALIGN(16)
L(lo_m_3):	C 4-way unrolled mul_1 kernel; CA + r12 carry between iterations
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	ld	r26, 24(up)
	nop
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	ld	r27, 32(up)
	nop
	adde	r0, r0, r12
	adde	r24, r24, r31
	mulld	r9, r26, v0
	mulhdu	r10, r26, v0
	ld	r26, 40(up)
	nop
	mulld	r11, r27, v0
	mulhdu	r12, r27, v0
	ld	r27, 48(up)
	std	r0, 8(rp)
	adde	r9, r9, r8
	std	r24, 16(rp)
	adde	r11, r11, r10
	std	r9, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)
	addi	rp, rp, 32
	bdnz	L(lo_m_3)

	ALIGN(16)
L(end_m_3):	C drain the pipeline: last two limbs + high limb
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0

	mulld	r24, r27, v0
	mulhdu	r8, r27, v0

	adde	r0, r0, r12
	adde	r24, r24, r31

	std	r0, 8(rp)
	std	r24, 16(rp)
	addze	r8, r8	C final carry into the high product limb
	std	r8, 24(rp)
	addic.	vn, vn, -1	C count off this v limb (CA clobber is harmless here)
	beq	L(ret)

	ALIGN(16)
L(outer_lo_3):	C per-v-limb addmul_1 pass, un mod 4 == 3 alignment
	mtctr	un	C copy inner loop count into ctr
	addi	rp, outer_rp, 8
	mr	up, outer_up
	addi	outer_rp, outer_rp, 8	C each pass starts one limb higher
	ld	v0, 0(vp)	C new v limb
	addi	vp, vp, 8
	ld	r26, 0(up)
	ld	r28, 0(rp)	C existing rp limb to accumulate into
	mulld	r0, r26, v0
	mulhdu	r12, r26, v0
	addc	r0, r0, r28	C start the CA chain for this pass
	std	r0, 0(rp)
	ld	r26, 8(up)
	ld	r27, 16(up)
	bdz	L(end_3)

	ALIGN(16)		C registers dying
L(lo_3):	C addmul_1 kernel: mul chain (adde) then accumulate chain (addc/adde)
	mulld	r0, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	ld	r26, 24(up)	C
	ld	r28, 8(rp)	C
	mulld	r24, r27, v0	C
	mulhdu	r8, r27, v0	C 27
	ld	r27, 32(up)	C
	ld	r29, 16(rp)	C
	adde	r0, r0, r12	C 0 12
	adde	r24, r24, r10	C 24 10
	mulld	r9, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	ld	r26, 40(up)	C
	ld	r30, 24(rp)	C
	mulld	r11, r27, v0	C
	mulhdu	r12, r27, v0	C 27
	ld	r27, 48(up)	C
	ld	r31, 32(rp)	C
	adde	r9, r9, r8	C 8 9
	adde	r11, r11, r10	C 10 11
	addze	r12, r12	C 12
	addc	r0, r0, r28	C 0 28
	std	r0, 8(rp)	C 0
	adde	r24, r24, r29	C 7 29
	std	r24, 16(rp)	C 7
	adde	r9, r9, r30	C 9 30
	std	r9, 24(rp)	C 9
	adde	r11, r11, r31	C 11 31
	std	r11, 32(rp)	C 11
	addi	up, up, 32	C
	addi	rp, rp, 32	C
	bdnz	L(lo_3)	C

	ALIGN(16)
L(end_3):	C drain: two limbs plus high limb, both carry chains closed
	mulld	r0, r26, v0
	mulhdu	r10, r26, v0
	ld	r28, 8(rp)
	nop
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	ld	r29, 16(rp)
	nop
	adde	r0, r0, r12
	adde	r24, r24, r10
	addze	r8, r8	C close the mul chain
	addc	r0, r0, r28	C open the accumulate chain
	std	r0, 8(rp)
	adde	r24, r24, r29
	std	r24, 16(rp)
	addze	r8, r8
	std	r8, 24(rp)

	addic.	vn, vn, -1	C next v limb?
	bne	L(outer_lo_3)
	b	L(ret)


C ---- un mod 4 == 0: first pass, 2 limbs handled before the loop ----
	ALIGN(16)
L(b0):	ld	r27, 8(up)
	addi	up, up, 8
	mulld	r0, r26, v0
	mulhdu	r10, r26, v0
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	addc	r24, r24, r10
	addze	r12, r8	C r12 = carry limb into the loop
	std	r0, 0(rp)
	std	r24, 8(rp)
	addi	rp, rp, 8
	ld	r26, 8(up)
	ld	r27, 16(up)
	bdz	L(end_m_0)

	ALIGN(16)
L(lo_m_0):	C same mul_1 kernel as L(lo_m_3)
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	ld	r26, 24(up)
	nop
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	ld	r27, 32(up)
	nop
	adde	r0, r0, r12
	adde	r24, r24, r31
	mulld	r9, r26, v0
	mulhdu	r10, r26, v0
	ld	r26, 40(up)
	nop
	mulld	r11, r27, v0
	mulhdu	r12, r27, v0
	ld	r27, 48(up)
	std	r0, 8(rp)
	adde	r9, r9, r8
	std	r24, 16(rp)
	adde	r11, r11, r10
	std	r9, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)
	addi	rp, rp, 32
	bdnz	L(lo_m_0)

	ALIGN(16)
L(end_m_0):
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0

	mulld	r24, r27, v0
	mulhdu	r8, r27, v0

	adde	r0, r0, r12
	adde	r24, r24, r31

	std	r0, 8(rp)
	addze	r8, r8
	std	r24, 16(rp)
	addic.	vn, vn, -1	C count off this v limb
	std	r8, 24(rp)
	nop
	beq	L(ret)

	ALIGN(16)
L(outer_lo_0):	C per-v-limb addmul_1 pass, un mod 4 == 0 alignment
	mtctr	un	C copy inner loop count into ctr
	addi	rp, outer_rp, 16	C offsets chosen so the kernel's 8(rp)..
	addi	up, outer_up, 8	C ..addressing lines up for this residue
	addi	outer_rp, outer_rp, 8
	ld	v0, 0(vp)	C new v limb
	addi	vp, vp, 8
	ld	r26, -8(up)
	ld	r27, 0(up)
	ld	r28, -8(rp)
	ld	r29, 0(rp)
	nop
	nop
	mulld	r0, r26, v0
	mulhdu	r10, r26, v0
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	addc	r24, r24, r10
	addze	r12, r8
	addc	r0, r0, r28	C accumulate the two pre-loop limbs
	std	r0, -8(rp)
	adde	r24, r24, r29
	std	r24, 0(rp)
	ld	r26, 8(up)
	ld	r27, 16(up)
	bdz	L(end_0)

	ALIGN(16)		C registers dying
L(lo_0):	C same addmul_1 kernel as L(lo_3)
	mulld	r0, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	ld	r26, 24(up)	C
	ld	r28, 8(rp)	C
	mulld	r24, r27, v0	C
	mulhdu	r8, r27, v0	C 27
	ld	r27, 32(up)	C
	ld	r29, 16(rp)	C
	adde	r0, r0, r12	C 0 12
	adde	r24, r24, r10	C 24 10
	mulld	r9, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	ld	r26, 40(up)	C
	ld	r30, 24(rp)	C
	mulld	r11, r27, v0	C
	mulhdu	r12, r27, v0	C 27
	ld	r27, 48(up)	C
	ld	r31, 32(rp)	C
	adde	r9, r9, r8	C 8 9
	adde	r11, r11, r10	C 10 11
	addze	r12, r12	C 12
	addc	r0, r0, r28	C 0 28
	std	r0, 8(rp)	C 0
	adde	r24, r24, r29	C 7 29
	std	r24, 16(rp)	C 7
	adde	r9, r9, r30	C 9 30
	std	r9, 24(rp)	C 9
	adde	r11, r11, r31	C 11 31
	std	r11, 32(rp)	C 11
	addi	up, up, 32	C
	addi	rp, rp, 32	C
	bdnz	L(lo_0)	C

	ALIGN(16)
L(end_0):
	mulld	r0, r26, v0
	mulhdu	r10, r26, v0
	ld	r28, 8(rp)
	nop
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	ld	r29, 16(rp)
	nop
	adde	r0, r0, r12
	adde	r24, r24, r10
	addze	r8, r8	C mul chain closed; CA now dead...
	addic.	vn, vn, -1	C ...so the CA clobber here is harmless
	addc	r0, r0, r28	C addc restarts the chain, ignores incoming CA
	std	r0, 8(rp)
	adde	r24, r24, r29
	std	r24, 16(rp)
	addze	r8, r8
	std	r8, 24(rp)
	bne	L(outer_lo_0)
	b	L(ret)


C ---- un mod 4 == 1: first pass, 3 limbs handled before the loop ----
	ALIGN(16)
L(b1):	ld	r27, 8(up)
	nop
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	ld	r26, 16(up)
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	mulld	r9, r26, v0
	mulhdu	r10, r26, v0
	addc	r24, r24, r31
	adde	r9, r9, r8
	addze	r12, r10	C r12 = carry limb into the loop
	std	r0, 0(rp)
	std	r24, 8(rp)
	std	r9, 16(rp)
	addi	up, up, 16
	addi	rp, rp, 16
	ld	r26, 8(up)
	ld	r27, 16(up)
	bdz	L(end_m_1)

	ALIGN(16)
L(lo_m_1):	C same mul_1 kernel as L(lo_m_3)
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	ld	r26, 24(up)
	nop
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	ld	r27, 32(up)
	nop
	adde	r0, r0, r12
	adde	r24, r24, r31
	mulld	r9, r26, v0
	mulhdu	r10, r26, v0
	ld	r26, 40(up)
	nop
	mulld	r11, r27, v0
	mulhdu	r12, r27, v0
	ld	r27, 48(up)
	std	r0, 8(rp)
	adde	r9, r9, r8
	std	r24, 16(rp)
	adde	r11, r11, r10
	std	r9, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)
	addi	rp, rp, 32
	bdnz	L(lo_m_1)

	ALIGN(16)
L(end_m_1):
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0

	mulld	r24, r27, v0
	mulhdu	r8, r27, v0

	adde	r0, r0, r12
	adde	r24, r24, r31

	std	r0, 8(rp)
	addze	r8, r8
	std	r24, 16(rp)
	addic.	vn, vn, -1	C count off this v limb
	std	r8, 24(rp)
	nop
	beq	L(ret)

	ALIGN(16)
L(outer_lo_1):	C per-v-limb addmul_1 pass, un mod 4 == 1 alignment
	mtctr	un	C copy inner loop count into ctr
	addi	rp, outer_rp, 24
	addi	up, outer_up, 16
	addi	outer_rp, outer_rp, 8
	ld	v0, 0(vp)	C new v limb
	addi	vp, vp, 8
	ld	r26, -16(up)
	ld	r27, -8(up)
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	ld	r26, 0(up)
	ld	r28, -16(rp)
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	ld	r29, -8(rp)
	ld	r30, 0(rp)
	mulld	r9, r26, v0
	mulhdu	r10, r26, v0
	addc	r24, r24, r31
	adde	r9, r9, r8
	addze	r12, r10
	addc	r0, r0, r28	C accumulate the three pre-loop limbs
	std	r0, -16(rp)
	adde	r24, r24, r29
	std	r24, -8(rp)
	adde	r9, r9, r30
	std	r9, 0(rp)
	ld	r26, 8(up)
	ld	r27, 16(up)
	bdz	L(end_1)

	ALIGN(16)		C registers dying
L(lo_1):	C same addmul_1 kernel as L(lo_3)
	mulld	r0, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	ld	r26, 24(up)	C
	ld	r28, 8(rp)	C
	mulld	r24, r27, v0	C
	mulhdu	r8, r27, v0	C 27
	ld	r27, 32(up)	C
	ld	r29, 16(rp)	C
	adde	r0, r0, r12	C 0 12
	adde	r24, r24, r10	C 24 10
	mulld	r9, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	ld	r26, 40(up)	C
	ld	r30, 24(rp)	C
	mulld	r11, r27, v0	C
	mulhdu	r12, r27, v0	C 27
	ld	r27, 48(up)	C
	ld	r31, 32(rp)	C
	adde	r9, r9, r8	C 8 9
	adde	r11, r11, r10	C 10 11
	addze	r12, r12	C 12
	addc	r0, r0, r28	C 0 28
	std	r0, 8(rp)	C 0
	adde	r24, r24, r29	C 7 29
	std	r24, 16(rp)	C 7
	adde	r9, r9, r30	C 9 30
	std	r9, 24(rp)	C 9
	adde	r11, r11, r31	C 11 31
	std	r11, 32(rp)	C 11
	addi	up, up, 32	C
	addi	rp, rp, 32	C
	bdnz	L(lo_1)	C

	ALIGN(16)
L(end_1):
	mulld	r0, r26, v0
	mulhdu	r10, r26, v0
	ld	r28, 8(rp)
	nop
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	ld	r29, 16(rp)
	nop
	adde	r0, r0, r12
	adde	r24, r24, r10
	addze	r8, r8	C mul chain closed; CA dead before addic.
	addic.	vn, vn, -1
	addc	r0, r0, r28
	std	r0, 8(rp)
	adde	r24, r24, r29
	std	r24, 16(rp)
	addze	r8, r8
	std	r8, 24(rp)
	bne	L(outer_lo_1)
	b	L(ret)


C ---- un mod 4 == 2: no pre-loop limbs; bias pointers back one limb and ----
C ---- enter the kernel directly with r12 = 0 and CA cleared.            ----
	ALIGN(16)
L(b2):	ld	r27, 8(up)
	addi	up, up, -8
	addi	rp, rp, -8
	li	r12, 0	C no carry limb yet
	addic	r12, r12, 0	C clear CA for the adde chain

	ALIGN(16)
L(lo_m_2):	C same mul_1 kernel as L(lo_m_3)
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	ld	r26, 24(up)
	nop
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	ld	r27, 32(up)
	nop
	adde	r0, r0, r12
	adde	r24, r24, r31
	mulld	r9, r26, v0
	mulhdu	r10, r26, v0
	ld	r26, 40(up)
	nop
	mulld	r11, r27, v0
	mulhdu	r12, r27, v0
	ld	r27, 48(up)
	std	r0, 8(rp)
	adde	r9, r9, r8
	std	r24, 16(rp)
	adde	r11, r11, r10
	std	r9, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)

	addi	rp, rp, 32
	bdnz	L(lo_m_2)

	ALIGN(16)
L(end_m_2):
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0

	mulld	r24, r27, v0
	mulhdu	r8, r27, v0

	adde	r0, r0, r12
	adde	r24, r24, r31

	std	r0, 8(rp)
	addze	r8, r8
	std	r24, 16(rp)
	addic.	vn, vn, -1	C count off this v limb
	std	r8, 24(rp)
	nop
	beq	L(ret)

	ALIGN(16)
L(outer_lo_2):	C per-v-limb addmul_1 pass, un mod 4 == 2 alignment
	mtctr	un	C copy inner loop count into ctr
	addi	rp, outer_rp, 0
	addi	up, outer_up, -8
	addi	outer_rp, outer_rp, 8
	ld	v0, 0(vp)	C new v limb
	addi	vp, vp, 8
	ld	r26, 8(up)
	ld	r27, 16(up)
	li	r12, 0	C as in L(b2): empty carry limb, CA cleared
	addic	r12, r12, 0

	ALIGN(16)		C registers dying
L(lo_2):	C same addmul_1 kernel as L(lo_3)
	mulld	r0, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	ld	r26, 24(up)	C
	ld	r28, 8(rp)	C
	mulld	r24, r27, v0	C
	mulhdu	r8, r27, v0	C 27
	ld	r27, 32(up)	C
	ld	r29, 16(rp)	C
	adde	r0, r0, r12	C 0 12
	adde	r24, r24, r10	C 24 10
	mulld	r9, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	ld	r26, 40(up)	C
	ld	r30, 24(rp)	C
	mulld	r11, r27, v0	C
	mulhdu	r12, r27, v0	C 27
	ld	r27, 48(up)	C
	ld	r31, 32(rp)	C
	adde	r9, r9, r8	C 8 9
	adde	r11, r11, r10	C 10 11
	addze	r12, r12	C 12
	addc	r0, r0, r28	C 0 28
	std	r0, 8(rp)	C 0
	adde	r24, r24, r29	C 7 29
	std	r24, 16(rp)	C 7
	adde	r9, r9, r30	C 9 30
	std	r9, 24(rp)	C 9
	adde	r11, r11, r31	C 11 31
	std	r11, 32(rp)	C 11
	addi	up, up, 32	C
	addi	rp, rp, 32	C
	bdnz	L(lo_2)	C

	ALIGN(16)
L(end_2):
	mulld	r0, r26, v0
	mulhdu	r10, r26, v0
	ld	r28, 8(rp)
	nop
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	ld	r29, 16(rp)
	nop
	adde	r0, r0, r12
	adde	r24, r24, r10
	addze	r8, r8	C mul chain closed; CA dead before addic.
	addic.	vn, vn, -1
	addc	r0, r0, r28
	std	r0, 8(rp)
	adde	r24, r24, r29
	std	r24, 16(rp)
	addze	r8, r8
	std	r8, 24(rp)
	bne	L(outer_lo_2)
	b	L(ret)


C Restore callee-saved registers and return.
L(ret):	ld	r31, -8(r1)
	ld	r30, -16(r1)
	ld	r29, -24(r1)
	ld	r28, -32(r1)
	ld	r27, -40(r1)
	ld	r26, -48(r1)
	ld	r25, -56(r1)
	ld	r24, -64(r1)
	ld	r23, -72(r1)
	ld	r22, -80(r1)
	blr
EPILOGUE()