dnl  AMD K7 mpn_sqr_basecase -- square an mpn number.

dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C K7: approx 2.3 cycles/crossproduct, or 4.55 cycles/triangular product
C     (measured on the speed difference between 25 and 50 limbs, which is
C     roughly the Karatsuba recursing range).


dnl  These are the same as mpn/x86/k6/sqr_basecase.asm, see that code for
dnl  some comments.

dnl  Hard upper bound on the threshold; the unrolled code below cannot
dnl  handle larger operands (see the UNROLL_COUNT displacement limits).
deflit(SQR_TOOM2_THRESHOLD_MAX, 66)

ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')

m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))


C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C With a SQR_TOOM2_THRESHOLD around 50 this code is about 1500 bytes,
C which is quite a bit, but is considered good value since squares big
C enough to use most of the code will be spending quite a few cycles in it.

defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	TEXT
	ALIGN(32)
PROLOGUE(mpn_sqr_basecase)
deflit(`FRAME',0)

	C Dispatch on size: 1 falls through, 2 and >=3 branch.
	movl	PARAM_SIZE, %ecx
	movl	PARAM_SRC, %eax
	cmpl	$2, %ecx

	movl	PARAM_DST, %edx
	je	L(two_limbs)
	ja	L(three_or_more)


C------------------------------------------------------------------------------
C one limb only
	C eax	src
	C ecx	size
	C edx	dst

	C dst[1:0] = src[0]^2
	movl	(%eax), %eax
	movl	%edx, %ecx

	mull	%eax

	movl	%edx, 4(%ecx)
	movl	%eax, (%ecx)
	ret


C------------------------------------------------------------------------------
C
C Using the read/modify/write "add"s seems to be faster than saving and
C restoring registers.  Perhaps the loads for the first set hide under the
C mul latency and the second gets store to load forwarding.

	ALIGN(16)
L(two_limbs):
	C eax	src
	C ebx
	C ecx	size
	C edx	dst
deflit(`FRAME',0)

	pushl	%ebx	FRAME_pushl()
	movl	%eax, %ebx	C src
	movl	(%eax), %eax

	movl	%edx, %ecx	C dst

	mull	%eax		C src[0]^2

	movl	%eax, (%ecx)	C dst[0]
	movl	4(%ebx), %eax

	movl	%edx, 4(%ecx)	C dst[1]

	mull	%eax		C src[1]^2

	movl	%eax, 8(%ecx)	C dst[2]
	movl	(%ebx), %eax

	movl	%edx, 12(%ecx)	C dst[3]

	mull	4(%ebx)		C src[0]*src[1]

	popl	%ebx

	C The cross product src[0]*src[1] belongs in the square twice
	C (once for each of the two off-diagonal positions), hence the
	C same add/adc sequence applied twice below.
	addl	%eax, 4(%ecx)
	adcl	%edx, 8(%ecx)
	adcl	$0, 12(%ecx)
	ASSERT(nc)

	addl	%eax, 4(%ecx)
	adcl	%edx, 8(%ecx)
	adcl	$0, 12(%ecx)
	ASSERT(nc)

	ret


C------------------------------------------------------------------------------
defframe(SAVE_EBX,  -4)
defframe(SAVE_ESI,  -8)
defframe(SAVE_EDI, -12)
defframe(SAVE_EBP, -16)
deflit(STACK_SPACE, 16)

L(three_or_more):
	subl	$STACK_SPACE, %esp
	cmpl	$4, %ecx
	jae	L(four_or_more)
deflit(`FRAME',STACK_SPACE)


C------------------------------------------------------------------------------
C Three limbs
C
C Writing out the loads and stores separately at the end of this code comes
C out about 10 cycles faster than using adcls to memory.

	C eax	src
	C ecx	size
	C edx	dst

	movl	%ebx, SAVE_EBX
	movl	%eax, %ebx	C src
	movl	(%eax), %eax

	movl	%edx, %ecx	C dst
	movl	%esi, SAVE_ESI
	movl	%edi, SAVE_EDI

	C Squares on the diagonal go straight to their final dst slots.
	mull	%eax		C src[0] ^ 2

	movl	%eax, (%ecx)
	movl	4(%ebx), %eax
	movl	%edx, 4(%ecx)

	mull	%eax		C src[1] ^ 2

	movl	%eax, 8(%ecx)
	movl	8(%ebx), %eax
	movl	%edx, 12(%ecx)

	mull	%eax		C src[2] ^ 2

	movl	%eax, 16(%ecx)
	movl	(%ebx), %eax
	movl	%edx, 20(%ecx)

	C Cross products accumulate in registers (esi,edi,ebp,edx).
	mull	4(%ebx)		C src[0] * src[1]

	movl	%eax, %esi
	movl	(%ebx), %eax
	movl	%edx, %edi

	mull	8(%ebx)		C src[0] * src[2]

	addl	%eax, %edi
	movl	%ebp, SAVE_EBP
	movl	$0, %ebp

	movl	4(%ebx), %eax
	adcl	%edx, %ebp

	mull	8(%ebx)		C src[1] * src[2]

	xorl	%ebx, %ebx
	addl	%eax, %ebp

	adcl	$0, %edx

	C eax
	C ebx	zero, will be dst[5]
	C ecx	dst
	C edx	dst[4]
	C esi	dst[1]
	C edi	dst[2]
	C ebp	dst[3]

	C Double the cross products (each belongs in the square twice),
	C then add in the diagonal squares already sitting in dst.
	adcl	$0, %edx
	addl	%esi, %esi

	adcl	%edi, %edi
	movl	4(%ecx), %eax

	adcl	%ebp, %ebp

	adcl	%edx, %edx

	adcl	$0, %ebx
	addl	%eax, %esi
	movl	8(%ecx), %eax

	adcl	%eax, %edi
	movl	12(%ecx), %eax
	movl	%esi, 4(%ecx)

	adcl	%eax, %ebp
	movl	16(%ecx), %eax
	movl	%edi, 8(%ecx)

	movl	SAVE_ESI, %esi
	movl	SAVE_EDI, %edi

	adcl	%eax, %edx
	movl	20(%ecx), %eax
	movl	%ebp, 12(%ecx)

	adcl	%ebx, %eax
	ASSERT(nc)
	movl	SAVE_EBX, %ebx
	movl	SAVE_EBP, %ebp

	movl	%edx, 16(%ecx)
	movl	%eax, 20(%ecx)
	addl	$FRAME, %esp

	ret


C------------------------------------------------------------------------------
L(four_or_more):

C First multiply src[0]*src[1..size-1] and store at dst[1..size].
C Further products are added in rather than stored.

	C eax	src
	C ebx
	C ecx	size
	C edx	dst
	C esi
	C edi
	C ebp

defframe(`VAR_COUNTER',-20)
defframe(`VAR_JMP',    -24)
deflit(EXTRA_STACK_SPACE, 8)

	movl	%ebx, SAVE_EBX
	movl	%edi, SAVE_EDI
	leal	(%edx,%ecx,4), %edi	C &dst[size]

	movl	%esi, SAVE_ESI
	movl	%ebp, SAVE_EBP
	leal	(%eax,%ecx,4), %esi	C &src[size]

	movl	(%eax), %ebp	C multiplier
	movl	$0, %ebx
	decl	%ecx

	C Counter runs from -(size-1) up to 0, indexing off the
	C just-past-the-end pointers esi and edi.
	negl	%ecx
	subl	$EXTRA_STACK_SPACE, %esp
FRAME_subl_esp(EXTRA_STACK_SPACE)

L(mul_1):
	C eax	scratch
	C ebx	carry
	C ecx	counter
	C edx	scratch
	C esi	&src[size]
	C edi	&dst[size]
	C ebp	multiplier

	movl	(%esi,%ecx,4), %eax

	mull	%ebp

	C movl doesn't touch flags, so the adcl still sees the carry
	C from this addl.
	addl	%ebx, %eax
	movl	%eax, (%edi,%ecx,4)
	movl	$0, %ebx

	adcl	%edx, %ebx
	incl	%ecx
	jnz	L(mul_1)


C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
C
C The last two products, which are the bottom right corner of the product
C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
C cases that need to be done.
C
C The unrolled code is the same as in mpn_addmul_1, see that routine for
C some comments.
C
C VAR_COUNTER is the outer loop, running from -size+4 to -1, inclusive.
C
C VAR_JMP is the computed jump into the unrolled code, stepped by one code
C chunk each outer loop.
C
C K7 does branch prediction on indirect jumps, which is bad since it's a
C different target each time.  There seems no way to avoid this.

dnl  This value also hard coded in some shifts and adds
deflit(CODE_BYTES_PER_LIMB, 17)

dnl  With the unmodified &src[size] and &dst[size] pointers, the
dnl  displacements in the unrolled code fit in a byte for UNROLL_COUNT
dnl  values up to 31, but above that an offset must be added to them.

deflit(OFFSET,
ifelse(eval(UNROLL_COUNT>31),1,
eval((UNROLL_COUNT-31)*4),
0))

dnl  Because the last chunk of code is generated differently, a label placed
dnl  at the end doesn't work.  Instead calculate the implied end using the
dnl  start and how many chunks of code there are.

deflit(UNROLL_INNER_END,
`L(unroll_inner_start)+eval(UNROLL_COUNT*CODE_BYTES_PER_LIMB)')

	C eax
	C ebx	carry
	C ecx
	C edx
	C esi	&src[size]
	C edi	&dst[size]
	C ebp

	movl	PARAM_SIZE, %ecx
	movl	%ebx, (%edi)	C mul_1 carry becomes dst[size]

	subl	$4, %ecx
	jz	L(corner)	C size==4: only the corner products remain

	negl	%ecx
ifelse(OFFSET,0,,`subl	$OFFSET, %edi')
ifelse(OFFSET,0,,`subl	$OFFSET, %esi')

	C ecx*16 + edx gives counter*17 = counter*CODE_BYTES_PER_LIMB,
	C the byte step into the unrolled code.
	movl	%ecx, %edx
	shll	$4, %ecx

ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
')


	C The calculated jump mustn't come out to before the start of the
	C code available.  This is the limit UNROLL_COUNT puts on the src
	C operand size, but checked here directly using the jump address.
	ASSERT(ae,
	`movl_text_address(L(unroll_inner_start), %eax)
	cmpl	%eax, %ecx')


C------------------------------------------------------------------------------
	ALIGN(16)
L(unroll_outer_top):
	C eax
	C ebx	high limb to store
	C ecx	VAR_JMP
	C edx	VAR_COUNTER, limbs, negative
	C esi	&src[size], constant
	C edi	dst ptr, high of last addmul
	C ebp

	movl	-12+OFFSET(%esi,%edx,4), %ebp	C next multiplier
	movl	-8+OFFSET(%esi,%edx,4), %eax	C first of multiplicand

	movl	%edx, VAR_COUNTER

	mull	%ebp

dnl  The unrolled chunks alternate which of ecx/ebx holds the low carry;
dnl  which one is correct at entry depends on UNROLL_COUNT's parity.
define(cmovX,`ifelse(eval(UNROLL_COUNT%2),0,`cmovz($@)',`cmovnz($@)')')

	C Low bit of the jump address selects the initial ecx/ebx roles.
	testb	$1, %cl
	movl	%edx, %ebx	C high carry
	movl	%ecx, %edx	C jump

	movl	%eax, %ecx	C low carry
	cmovX(	%ebx, %ecx)	C high carry reverse
	cmovX(	%eax, %ebx)	C low carry reverse

	C Step the entry point one chunk later for the next outer pass.
	leal	CODE_BYTES_PER_LIMB(%edx), %eax
	xorl	%edx, %edx
	leal	4(%edi), %edi

	movl	%eax, VAR_JMP

	jmp	*%eax


ifdef(`PIC',`
L(pic_calc):
	C PIC version of the non-PIC leal above: the return address on the
	C stack supplies the runtime address of L(here).
	addl	(%esp), %ecx
	addl	$UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)-L(here), %ecx
	addl	%edx, %ecx
	ret_internal
')


	C Must be an even address to preserve the significance of the low
	C bit of the jump address indicating which way around ecx/ebx should
	C start.
	ALIGN(2)

L(unroll_inner_start):
	C eax	next limb
	C ebx	carry high
	C ecx	carry low
	C edx	scratch
	C esi	src
	C edi	dst
	C ebp	multiplier

dnl  Each generated chunk must assemble to exactly CODE_BYTES_PER_LIMB
dnl  bytes, since the computed jump steps by that amount.
forloop(`i', UNROLL_COUNT, 1, `
	deflit(`disp_src', eval(-i*4 + OFFSET))
	deflit(`disp_dst', eval(disp_src - 4))

	m4_assert(`disp_src>=-128 && disp_src<128')
	m4_assert(`disp_dst>=-128 && disp_dst<128')

ifelse(eval(i%2),0,`
Zdisp(	movl,	disp_src,(%esi), %eax)
	adcl	%edx, %ebx

	mull	%ebp

Zdisp(	addl,	%ecx, disp_dst,(%edi))
	movl	$0, %ecx

	adcl	%eax, %ebx

',`
	dnl  this bit comes out last
Zdisp(	movl,	disp_src,(%esi), %eax)
	adcl	%edx, %ecx

	mull	%ebp

Zdisp(	addl,	%ebx, disp_dst,(%edi))

ifelse(forloop_last,0,
`	movl	$0, %ebx')

	adcl	%eax, %ecx
')
')

	C eax	next limb
	C ebx	carry high
	C ecx	carry low
	C edx	scratch
	C esi	src
	C edi	dst
	C ebp	multiplier

	adcl	$0, %edx
	addl	%ecx, -4+OFFSET(%edi)
	movl	VAR_JMP, %ecx

	adcl	$0, %edx

	movl	%edx, m4_empty_if_zero(OFFSET) (%edi)
	movl	VAR_COUNTER, %edx

	incl	%edx
	jnz	L(unroll_outer_top)


ifelse(OFFSET,0,,`
	addl	$OFFSET, %esi
	addl	$OFFSET, %edi
')


C------------------------------------------------------------------------------
L(corner):
	C esi	&src[size]
	C edi	&dst[2*size-5]

	C Bottom-right corner of the product triangle:
	C src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1].
	movl	-12(%esi), %ebp
	movl	-8(%esi), %eax
	movl	%eax, %ecx

	mull	%ebp

	addl	%eax, -4(%edi)
	movl	-4(%esi), %eax

	adcl	$0, %edx
	movl	%edx, %ebx
	movl	%eax, %esi

	mull	%ebp

	addl	%ebx, %eax

	adcl	$0, %edx
	addl	%eax, (%edi)
	movl	%esi, %eax

	adcl	$0, %edx
	movl	%edx, %ebx

	mull	%ecx

	addl	%ebx, %eax
	movl	%eax, 4(%edi)

	adcl	$0, %edx
	movl	%edx, 8(%edi)



C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].

L(lshift_start):
	movl	PARAM_SIZE, %eax
	movl	PARAM_DST, %edi
	xorl	%ecx, %ecx	C clear carry

	leal	(%edi,%eax,8), %edi
	notl	%eax		C -size-1, preserve carry

	leal	2(%eax), %eax	C -(size-1)

L(lshift):
	C eax	counter, negative
	C ebx
	C ecx
	C edx
	C esi
	C edi	dst, pointing just after last limb
	C ebp

	C Two limbs shifted per iteration, counter scaled by 8 bytes;
	C rcl propagates the carry between them.
	rcll	-4(%edi,%eax,8)
	rcll	(%edi,%eax,8)
	incl	%eax
	jnz	L(lshift)

	setc	%al

	movl	PARAM_SRC, %esi
	movl	%eax, -4(%edi)	C dst most significant limb

	movl	PARAM_SIZE, %ecx


C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
C src[size-1]^2.  dst[0] hasn't yet been set at all yet, and just gets the
C low limb of src[0]^2.

	movl	(%esi), %eax	C src[0]

	mull	%eax

	leal	(%esi,%ecx,4), %esi	C src point just after last limb
	negl	%ecx

	movl	%eax, (%edi,%ecx,8)	C dst[0]
	incl	%ecx

L(diag):
	C eax	scratch
	C ebx	scratch
	C ecx	counter, negative
	C edx	carry
	C esi	src just after last limb
	C edi	dst just after last limb
	C ebp

	C edx enters holding the high limb of the previous square.
	movl	(%esi,%ecx,4), %eax
	movl	%edx, %ebx

	mull	%eax

	addl	%ebx, -4(%edi,%ecx,8)
	adcl	%eax, (%edi,%ecx,8)
	adcl	$0, %edx

	incl	%ecx
	jnz	L(diag)


	movl	SAVE_ESI, %esi
	movl	SAVE_EBX, %ebx

	addl	%edx, -4(%edi)	C dst most significant limb
	movl	SAVE_EDI, %edi

	movl	SAVE_EBP, %ebp
	addl	$FRAME, %esp

	ret

EPILOGUE()