dnl  AMD K6 mpn_sqr_basecase -- square an mpn number.

dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular
C     product (measured on the speed difference between 17 and 33 limbs,
C     which is roughly the Karatsuba recursing range).


dnl  SQR_TOOM2_THRESHOLD_MAX is the maximum SQR_TOOM2_THRESHOLD this
dnl  code supports.  This value is used only by the tune program to know
dnl  what it can go up to.  (An attempt to compile with a bigger value will
dnl  trigger some m4_assert()s in the code, making the build fail.)
dnl
dnl  The value is determined by requiring the displacements in the unrolled
dnl  addmul to fit in single bytes.  This means a maximum UNROLL_COUNT of
dnl  63, giving a maximum SQR_TOOM2_THRESHOLD of 66.

deflit(SQR_TOOM2_THRESHOLD_MAX, 66)


dnl  Allow a value from the tune program to override config.m4.

ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')


dnl  UNROLL_COUNT is the number of code chunks in the unrolled addmul.  The
dnl  number required is determined by SQR_TOOM2_THRESHOLD, since
dnl  mpn_sqr_basecase only needs to handle sizes < SQR_TOOM2_THRESHOLD.
dnl
dnl  The first addmul is the biggest, and this takes the second least
dnl  significant limb and multiplies it by the third least significant and
dnl  up.  Hence for a maximum operand size of SQR_TOOM2_THRESHOLD-1
dnl  limbs, UNROLL_COUNT needs to be SQR_TOOM2_THRESHOLD-3.

m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))


C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C The algorithm is essentially the same as mpn/generic/sqr_basecase.c, but a
C lot of function call overheads are avoided, especially when the given size
C is small.
C
C The code size might look a bit excessive, but not all of it is executed
C and so won't fill up the code cache.  The 1x1, 2x2 and 3x3 special cases
C clearly apply only to those sizes; mid sizes like 10x10 only need part of
C the unrolled addmul; and big sizes like 35x35 that do need all of it will
C at least be getting value for money, because 35x35 spends something like
C 5780 cycles here.
C
C Different values of UNROLL_COUNT give slightly different speeds, between
C 9.0 and 9.2 c/tri-prod measured on the difference between 17 and 33 limbs.
C This isn't a big difference, but it's presumably some alignment effect
C which if understood could give a simple speedup.
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	TEXT
	ALIGN(32)
PROLOGUE(mpn_sqr_basecase)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	PARAM_SRC, %eax

	cmpl	$2, %ecx
	je	L(two_limbs)

	movl	PARAM_DST, %edx
	ja	L(three_or_more)


C -----------------------------------------------------------------------------
C one limb only
	C eax	src
	C ebx
	C ecx	size
	C edx	dst

	movl	(%eax), %eax
	movl	%edx, %ecx

	mull	%eax

	movl	%eax, (%ecx)
	movl	%edx, 4(%ecx)
	ret


C -----------------------------------------------------------------------------
	ALIGN(16)
L(two_limbs):
	C eax	src
	C ebx
	C ecx	size
	C edx	dst

	pushl	%ebx
	movl	%eax, %ebx	C src
deflit(`FRAME',4)

	movl	(%ebx), %eax
	movl	PARAM_DST, %ecx

	mull	%eax		C src[0]^2

	movl	%eax, (%ecx)
	movl	4(%ebx), %eax

	movl	%edx, 4(%ecx)

	mull	%eax		C src[1]^2

	movl	%eax, 8(%ecx)
	movl	(%ebx), %eax

	movl	%edx, 12(%ecx)
	movl	4(%ebx), %edx

	mull	%edx		C src[0]*src[1]

	addl	%eax, 4(%ecx)

	adcl	%edx, 8(%ecx)
	adcl	$0, 12(%ecx)

	popl	%ebx
	C The cross product is deliberately added a second time: the square
	C needs 2*src[0]*src[1], and two adds are cheaper here than a
	C shift-with-carry of the 64-bit product.
	addl	%eax, 4(%ecx)

	adcl	%edx, 8(%ecx)
	adcl	$0, 12(%ecx)

	ret


C -----------------------------------------------------------------------------
L(three_or_more):
deflit(`FRAME',0)
	cmpl	$4, %ecx
	jae	L(four_or_more)


C -----------------------------------------------------------------------------
C three limbs
	C eax	src
	C ecx	size
	C edx	dst

	pushl	%ebx
	movl	%eax, %ebx	C src

	movl	(%ebx), %eax
	movl	%edx, %ecx	C dst

	mull	%eax		C src[0] ^ 2

	movl	%eax, (%ecx)
	movl	4(%ebx), %eax

	movl	%edx, 4(%ecx)
	pushl	%esi

	mull	%eax		C src[1] ^ 2

	movl	%eax, 8(%ecx)
	movl	8(%ebx), %eax

	movl	%edx, 12(%ecx)
	pushl	%edi

	mull	%eax		C src[2] ^ 2

	movl	%eax, 16(%ecx)
	movl	(%ebx), %eax

	movl	%edx, 20(%ecx)
	movl	4(%ebx), %edx

	mull	%edx		C src[0] * src[1]

	movl	%eax, %esi
	movl	(%ebx), %eax

	movl	%edx, %edi
	movl	8(%ebx), %edx

	pushl	%ebp
	xorl	%ebp, %ebp

	mull	%edx		C src[0] * src[2]

	addl	%eax, %edi
	movl	4(%ebx), %eax

	adcl	%edx, %ebp

	movl	8(%ebx), %edx

	mull	%edx		C src[1] * src[2]

	addl	%eax, %ebp

	adcl	$0, %edx


	C eax	will be dst[5]
	C ebx
	C ecx	dst
	C edx	dst[4]
	C esi	dst[1]
	C edi	dst[2]
	C ebp	dst[3]

	C Double the accumulated cross products (each appears twice in the
	C square), then add them into the squares already stored at dst.
	xorl	%eax, %eax
	addl	%esi, %esi
	adcl	%edi, %edi
	adcl	%ebp, %ebp
	adcl	%edx, %edx
	adcl	$0, %eax

	addl	%esi, 4(%ecx)
	adcl	%edi, 8(%ecx)
	adcl	%ebp, 12(%ecx)

	popl	%ebp
	popl	%edi

	adcl	%edx, 16(%ecx)

	popl	%esi
	popl	%ebx

	adcl	%eax, 20(%ecx)
	ASSERT(nc)

	ret


C -----------------------------------------------------------------------------

defframe(SAVE_EBX,    -4)
defframe(SAVE_ESI,    -8)
defframe(SAVE_EDI,   -12)
defframe(SAVE_EBP,   -16)
defframe(VAR_COUNTER,-20)
defframe(VAR_JMP,    -24)
deflit(STACK_SPACE, 24)

	ALIGN(16)
L(four_or_more):

	C eax	src
	C ebx
	C ecx	size
	C edx	dst
	C esi
	C edi
	C ebp

C First multiply src[0]*src[1..size-1] and store at dst[1..size].
C
C A test was done calling mpn_mul_1 here to get the benefit of its unrolled
C loop, but this was only a tiny speedup; at 35 limbs it took 24 cycles off
C a 5780 cycle operation, which is not surprising since the loop here is 8
C c/l and mpn_mul_1 is 6.25 c/l.

	subl	$STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

	movl	%edi, SAVE_EDI
	leal	4(%edx), %edi

	movl	%ebx, SAVE_EBX
	leal	4(%eax), %ebx

	movl	%esi, SAVE_ESI
	xorl	%esi, %esi

	movl	%ebp, SAVE_EBP

	C eax
	C ebx	src+4
	C ecx	size
	C edx
	C esi
	C edi	dst+4
	C ebp

	movl	(%eax), %ebp	C multiplier
	leal	-1(%ecx), %ecx	C size-1, and pad to a 16 byte boundary


	ALIGN(16)
L(mul_1):
	C eax	scratch
	C ebx	src ptr
	C ecx	counter
	C edx	scratch
	C esi	carry
	C edi	dst ptr
	C ebp	multiplier

	movl	(%ebx), %eax
	addl	$4, %ebx

	mull	%ebp

	addl	%esi, %eax
	movl	$0, %esi

	adcl	%edx, %esi

	movl	%eax, (%edi)
	addl	$4, %edi

	loop	L(mul_1)


C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
C
C The last two addmuls, which are the bottom right corner of the product
C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
C cases that need to be done.
C
C The unrolled code is the same as mpn_addmul_1(), see that routine for some
C comments.
C
C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
C
C VAR_JMP is the computed jump into the unrolled code, stepped by one code
C chunk each outer loop.
C
C K6 doesn't do any branch prediction on indirect jumps, which is good
C actually because it's a different target each time.  The unrolled addmul
C is about 3 cycles/limb faster than a simple loop, so the 6 cycle cost of
C the indirect jump is quickly recovered.


dnl  This value is also implicitly encoded in a shift and add.
dnl
deflit(CODE_BYTES_PER_LIMB, 15)

dnl  With the unmodified &src[size] and &dst[size] pointers, the
dnl  displacements in the unrolled code fit in a byte for UNROLL_COUNT
dnl  values up to 31.  Above that an offset must be added to them.
dnl
deflit(OFFSET,
ifelse(eval(UNROLL_COUNT>31),1,
eval((UNROLL_COUNT-31)*4),
0))

	C eax
	C ebx	&src[size]
	C ecx
	C edx
	C esi	carry
	C edi	&dst[size]
	C ebp

	movl	PARAM_SIZE, %ecx
	movl	%esi, (%edi)

	subl	$4, %ecx
	jz	L(corner)

	movl	%ecx, %edx
ifelse(OFFSET,0,,
`	subl	$OFFSET, %ebx')

	shll	$4, %ecx
ifelse(OFFSET,0,,
`	subl	$OFFSET, %edi')

	negl	%ecx

ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
')
	negl	%edx


	C The calculated jump mustn't be before the start of the available
	C code.  This is the limitation UNROLL_COUNT puts on the src operand
	C size, but checked here using the jump address directly.
	C
	ASSERT(ae,`
	movl_text_address( L(unroll_inner_start), %eax)
	cmpl	%eax, %ecx
	')


C -----------------------------------------------------------------------------
	ALIGN(16)
L(unroll_outer_top):
	C eax
	C ebx	&src[size], constant
	C ecx	VAR_JMP
	C edx	VAR_COUNTER, limbs, negative
	C esi	high limb to store
	C edi	dst ptr, high of last addmul
	C ebp

	movl	-12+OFFSET(%ebx,%edx,4), %ebp	C multiplier
	movl	%edx, VAR_COUNTER

	movl	-8+OFFSET(%ebx,%edx,4), %eax	C first limb of multiplicand

	mull	%ebp

	testb	$1, %cl

	movl	%edx, %esi	C high carry
	movl	%ecx, %edx	C jump

	movl	%eax, %ecx	C low carry
	leal	CODE_BYTES_PER_LIMB(%edx), %edx

	movl	%edx, VAR_JMP
	leal	4(%edi), %edi

	C A branch-free version of this using some xors was found to be a
	C touch slower than just a conditional jump, despite the jump
	C switching between taken and not taken on every loop.

ifelse(eval(UNROLL_COUNT%2),0,
	jz,jnz)	L(unroll_noswap)
	movl	%esi, %eax	C high,low carry other way around

	movl	%ecx, %esi
	movl	%eax, %ecx
L(unroll_noswap):

	jmp	*%edx


	C Must be on an even address here so the low bit of the jump address
	C will indicate which way around ecx/esi should start.
	C
	C An attempt was made at padding here to get the end of the unrolled
	C code to come out on a good alignment, to save padding before
	C L(corner).  This worked, but turned out to run slower than just an
	C ALIGN(2).  The reason for this is not clear, it might be related
	C to the different speeds on different UNROLL_COUNTs noted above.

	ALIGN(2)

L(unroll_inner_start):
	C eax	scratch
	C ebx	src
	C ecx	carry low
	C edx	scratch
	C esi	carry high
	C edi	dst
	C ebp	multiplier
	C
	C 15 code bytes each limb
	C ecx/esi swapped on each chunk

forloop(`i', UNROLL_COUNT, 1, `
	deflit(`disp_src', eval(-i*4 + OFFSET))
	deflit(`disp_dst', eval(disp_src - 4))

	m4_assert(`disp_src>=-128 && disp_src<128')
	m4_assert(`disp_dst>=-128 && disp_dst<128')

ifelse(eval(i%2),0,`
Zdisp(	movl,	disp_src,(%ebx), %eax)
	mull	%ebp
Zdisp(	addl,	%esi, disp_dst,(%edi))
	adcl	%eax, %ecx
	movl	%edx, %esi
	jadcl0(	%esi)
',`
	dnl  this one comes out last
Zdisp(	movl,	disp_src,(%ebx), %eax)
	mull	%ebp
Zdisp(	addl,	%ecx, disp_dst,(%edi))
	adcl	%eax, %esi
	movl	%edx, %ecx
	jadcl0(	%ecx)
')
')
L(unroll_inner_end):

	addl	%esi, -4+OFFSET(%edi)

	movl	VAR_COUNTER, %edx
	jadcl0(	%ecx)

	movl	%ecx, m4_empty_if_zero(OFFSET)(%edi)
	movl	VAR_JMP, %ecx

	incl	%edx
	jnz	L(unroll_outer_top)


ifelse(OFFSET,0,,`
	addl	$OFFSET, %ebx
	addl	$OFFSET, %edi
')


C -----------------------------------------------------------------------------
C The bottom-right corner of the product triangle: the last two addmuls,
C src[size-3]*src[size-2..size-1] and src[size-2]*src[size-1].
	ALIGN(16)
L(corner):
	C ebx	&src[size]
	C edi	&dst[2*size-5]

	movl	-12(%ebx), %ebp

	movl	-8(%ebx), %eax
	movl	%eax, %ecx

	mull	%ebp		C src[size-3]*src[size-2]

	addl	%eax, -4(%edi)
	adcl	$0, %edx

	movl	-4(%ebx), %eax
	movl	%edx, %esi
	movl	%eax, %ebx

	mull	%ebp		C src[size-3]*src[size-1]

	addl	%esi, %eax
	adcl	$0, %edx

	addl	%eax, (%edi)
	adcl	$0, %edx

	movl	%edx, %esi
	movl	%ebx, %eax

	mull	%ecx		C src[size-2]*src[size-1]

	addl	%esi, %eax
	movl	%eax, 4(%edi)

	adcl	$0, %edx

	movl	%edx, 8(%edi)


C -----------------------------------------------------------------------------
C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
C This doubles the cross products accumulated so far, ready for the diagonal
C squares to be added in below.
C The loop measures about 6 cycles/iteration, though it looks like it should
C decode in 5.

L(lshift_start):
	movl	PARAM_SIZE, %ecx

	movl	PARAM_DST, %edi
	subl	$1, %ecx		C size-1 and clear carry

	movl	PARAM_SRC, %ebx
	movl	%ecx, %edx

	xorl	%eax, %eax		C ready for adcl


	ALIGN(16)
L(lshift):
	C eax
	C ebx	src (for later use)
	C ecx	counter, decrementing
	C edx	size-1 (for later use)
	C esi
	C edi	dst, incrementing
	C ebp

	rcll	4(%edi)
	rcll	8(%edi)
	leal	8(%edi), %edi
	loop	L(lshift)


	adcl	%eax, %eax

	movl	%eax, 4(%edi)		C dst most significant limb
	movl	(%ebx), %eax		C src[0]

	leal	4(%ebx,%edx,4), %ebx	C &src[size]
	subl	%edx, %ecx		C -(size-1)


C -----------------------------------------------------------------------------
C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
C src[size-1]^2.  dst[0] hasn't yet been set at all yet, and just gets the
C low limb of src[0]^2.


	mull	%eax

	movl	%eax, (%edi,%ecx,8)	C dst[0]


	ALIGN(16)
L(diag):
	C eax	scratch
	C ebx	&src[size]
	C ecx	counter, negative
	C edx	carry
	C esi	scratch
	C edi	dst[2*size-2]
	C ebp

	movl	(%ebx,%ecx,4), %eax
	movl	%edx, %esi

	mull	%eax

	addl	%esi, 4(%edi,%ecx,8)
	adcl	%eax, 8(%edi,%ecx,8)
	adcl	$0, %edx

	incl	%ecx
	jnz	L(diag)


	movl	SAVE_EBX, %ebx
	movl	SAVE_ESI, %esi

	addl	%edx, 4(%edi)		C dst most significant limb

	movl	SAVE_EDI, %edi
	movl	SAVE_EBP, %ebp
	addl	$FRAME, %esp
	ret



C -----------------------------------------------------------------------------
ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	addl	(%esp), %ecx
	addl	$L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
	addl	%edx, %ecx
	ret_internal
')


EPILOGUE()