dnl  AMD K6 mpn_mul_basecase -- multiply two mpn numbers.

dnl  Copyright 1999-2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C K6: approx 9.0 cycles per cross product on 30x30 limbs (with 16 limbs/loop
C unrolling).


dnl  K6: UNROLL_COUNT  cycles/product (approx)
dnl          8             9.75
dnl         16             9.3
dnl         32             9.3
dnl  Maximum possible with the current code is 32.
dnl
dnl  With 16 the inner unrolled loop fits exactly in a 256 byte block, which
dnl  might explain its good performance.

deflit(UNROLL_COUNT, 16)


C void mpn_mul_basecase (mp_ptr wp,
C                        mp_srcptr xp, mp_size_t xsize,
C                        mp_srcptr yp, mp_size_t ysize);
C
C Calculate xp,xsize multiplied by yp,ysize, storing the result in
C wp,xsize+ysize.
C
C This routine is essentially the same as mpn/generic/mul_basecase.c, but
C it's faster because it does most of the mpn_addmul_1() entry code only
C once.
C The saving is about 10-20% on typical sizes coming from the
C Karatsuba multiply code.
C
C Enhancements:
C
C The mul_1 loop is about 8.5 c/l, which is slower than mpn_mul_1 at 6.25
C c/l.  Could call mpn_mul_1 when ysize is big enough to make it worthwhile.
C
C The main unrolled addmul loop could be shared by mpn_addmul_1, using some
C extra stack setups and maybe 2 or 3 wasted cycles at the end.  Code saving
C would be 256 bytes.

C Minimum xsize for the unrolled addmul loop; below this the simple addmul
C loop is used.  (Same value either way; the PIC/non-PIC split is kept so
C the threshold can be tuned separately if desired.)
ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 8)
',`
deflit(UNROLL_THRESHOLD, 8)
')

C Stack frame offsets of the five arguments (FRAME-relative, see defframe).
defframe(PARAM_YSIZE,20)
defframe(PARAM_YP,   16)
defframe(PARAM_XSIZE,12)
defframe(PARAM_XP,   8)
defframe(PARAM_WP,   4)

	TEXT
	ALIGN(32)
PROLOGUE(mpn_mul_basecase)
deflit(`FRAME',0)

	C Dispatch on xsize: fall through for xsize==1, special-case
	C xsize==2, general loops for xsize>2.
	movl	PARAM_XSIZE, %ecx
	movl	PARAM_YP, %eax

	movl	PARAM_XP, %edx
	movl	(%eax), %eax	C yp low limb

	cmpl	$2, %ecx
	ja	L(xsize_more_than_two_limbs)
	je	L(two_by_something)


	C one limb by one limb

	movl	(%edx), %edx	C xp low limb
	movl	PARAM_WP, %ecx

	mull	%edx		C edx:eax = xp[0]*yp[0]

	movl	%eax, (%ecx)
	movl	%edx, 4(%ecx)
	ret


C -----------------------------------------------------------------------------
L(two_by_something):
	C xsize==2; distinguish ysize==1 from ysize==2 via the decl below.
	decl	PARAM_YSIZE
	pushl	%ebx
deflit(`FRAME',4)

	movl	PARAM_WP, %ebx
	pushl	%esi
deflit(`FRAME',8)

	movl	%eax, %ecx	C yp low limb
	movl	(%edx), %eax	C xp low limb

	movl	%edx, %esi	C xp
	C jnz tests the decl PARAM_YSIZE result above; the intervening
	C pushl/movl instructions don't alter the flags.
	jnz	L(two_by_two)


	C two limbs by one limb

	mull	%ecx		C edx:eax = xp[0]*yp[0]

	movl	%eax, (%ebx)
	movl	4(%esi), %eax

	movl	%edx, %esi	C carry

	mull	%ecx		C edx:eax = xp[1]*yp[0]

	addl	%eax, %esi
	movl	%esi, 4(%ebx)

	adcl	$0, %edx

	movl	%edx, 8(%ebx)
	popl	%esi

	popl	%ebx
	ret



C -----------------------------------------------------------------------------
	ALIGN(16)
L(two_by_two):
	C eax	xp low limb
	C ebx	wp
	C ecx	yp low limb
	C edx
	C esi	xp
	C edi
	C ebp
deflit(`FRAME',8)

	mull	%ecx		C xp[0] * yp[0]

	push	%edi
deflit(`FRAME',12)
	movl	%eax, (%ebx)	C wp[0]

	movl	4(%esi), %eax
	movl	%edx, %edi	C carry, for wp[1]

	mull	%ecx		C xp[1] * yp[0]

	addl	%eax, %edi
	movl	PARAM_YP, %ecx

	adcl	$0, %edx

	movl	%edi, 4(%ebx)
	movl	4(%ecx), %ecx	C yp[1]

	movl	4(%esi), %eax	C xp[1]
	movl	%edx, %edi	C carry, for wp[2]

	mull	%ecx		C xp[1] * yp[1]

	addl	%eax, %edi

	adcl	$0, %edx

	movl	(%esi), %eax	C xp[0]
	movl	%edx, %esi	C carry, for wp[3]

	mull	%ecx		C xp[0] * yp[1]

	C Fold xp[0]*yp[1] into wp[1] and propagate the carry chain up
	C through wp[2] (edi) and wp[3] (esi).
	addl	%eax, 4(%ebx)
	adcl	%edx, %edi
	adcl	$0, %esi

	movl	%edi, 8(%ebx)
	popl	%edi

	movl	%esi, 12(%ebx)
	popl	%esi

	popl	%ebx
	ret


C -----------------------------------------------------------------------------
	ALIGN(16)
L(xsize_more_than_two_limbs):

C The first limb of yp is processed with a simple mpn_mul_1 style loop
C inline.  Unrolling this doesn't seem worthwhile since it's only run once
C (whereas the addmul below is run ysize-1 many times).  A call to the
C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
C popping, and doesn't seem likely to be worthwhile on the typical 10-20
C limb operations the Karatsuba code calls here with.

	C eax	yp[0]
	C ebx
	C ecx	xsize
	C edx	xp
	C esi
	C edi
	C ebp
deflit(`FRAME',0)

	pushl	%edi		defframe_pushl(SAVE_EDI)
	pushl	%ebp		defframe_pushl(SAVE_EBP)

	movl	PARAM_WP, %edi
	pushl	%esi		defframe_pushl(SAVE_ESI)

	movl	%eax, %ebp
	pushl	%ebx		defframe_pushl(SAVE_EBX)

	C Point ebx/edi one past the ends so the loop can count a negative
	C index up to zero.
	leal	(%edx,%ecx,4), %ebx	C xp end
	xorl	%esi, %esi

	leal	(%edi,%ecx,4), %edi	C wp end of mul1
	negl	%ecx


L(mul1):
	C eax	scratch
	C ebx	xp end
	C ecx	counter, negative
	C edx	scratch
	C esi	carry
	C edi	wp end of mul1
	C ebp	multiplier

	movl	(%ebx,%ecx,4), %eax

	mull	%ebp

	addl	%esi, %eax
	C movl (not xorl) so the carry flag from the addl above survives
	C into the adcl below.
	movl	$0, %esi

	adcl	%edx, %esi

	movl	%eax, (%edi,%ecx,4)
	incl	%ecx

	jnz	L(mul1)


	movl	PARAM_YSIZE, %edx
	movl	%esi, (%edi)		C final carry

	movl	PARAM_XSIZE, %ecx
	decl	%edx

	jnz	L(ysize_more_than_one_limb)

	popl	%ebx
	popl	%esi
	popl	%ebp
	popl	%edi
	ret


L(ysize_more_than_one_limb):
	cmpl	$UNROLL_THRESHOLD, %ecx
	movl	PARAM_YP, %eax

	jae	L(unroll)


C -----------------------------------------------------------------------------
C Simple addmul loop.
C
C Using ebx and edi pointing at the ends of their respective locations saves
C a couple of instructions in the outer loop.  The inner loop is still 11
C cycles, the same as the simple loop in aorsmul_1.asm.

	C eax	yp
	C ebx	xp end
	C ecx	xsize
	C edx	ysize-1
	C esi
	C edi	wp end of mul1
	C ebp

	movl	4(%eax), %ebp		C multiplier
	negl	%ecx

	C The (no longer needed) parameter slots are reused as loop state so
	C the outer loop can reload them each pass.
	movl	%ecx, PARAM_XSIZE	C -xsize
	xorl	%esi, %esi		C initial carry

	leal	4(%eax,%edx,4), %eax	C yp end
	negl	%edx

	movl	%eax, PARAM_YP
	movl	%edx, PARAM_YSIZE

	jmp	L(simple_outer_entry)


	C aligning here saves a couple of cycles
	ALIGN(16)
L(simple_outer_top):
	C edx	ysize counter, negative

	movl	PARAM_YP, %eax		C yp end
	xorl	%esi, %esi		C carry

	movl	PARAM_XSIZE, %ecx	C -xsize
	movl	%edx, PARAM_YSIZE

	movl	(%eax,%edx,4), %ebp	C yp limb multiplier
L(simple_outer_entry):
	C Each yp limb's addmul starts one limb further up the destination.
	addl	$4, %edi


L(simple_inner):
	C eax	scratch
	C ebx	xp end
	C ecx	counter, negative
	C edx	scratch
	C esi	carry
	C edi	wp end of this addmul
	C ebp	multiplier

	movl	(%ebx,%ecx,4), %eax

	mull	%ebp

	addl	%esi, %eax
	C movl (not xorl) to preserve the carry flag for the adcl below.
	movl	$0, %esi

	adcl	$0, %edx
	addl	%eax, (%edi,%ecx,4)
	adcl	%edx, %esi

	incl	%ecx
	jnz	L(simple_inner)


	movl	PARAM_YSIZE, %edx
	movl	%esi, (%edi)

	incl	%edx
	jnz	L(simple_outer_top)


	popl	%ebx
	popl	%esi
	popl	%ebp
	popl	%edi
	ret


C -----------------------------------------------------------------------------
C Unrolled loop.
C
C The unrolled inner loop is the same as in aorsmul_1.asm, see that code for
C some comments.
C
C VAR_COUNTER is for the inner loop, running from VAR_COUNTER_INIT down to
C 0, inclusive.
C
C VAR_JMP is the computed jump into the unrolled loop.
C
C PARAM_XP and PARAM_WP get offset appropriately for where the unrolled loop
C is entered.
C
C VAR_XP_LOW is the least significant limb of xp, which is needed at the
C start of the unrolled loop.  This can't just be fetched through the xp
C pointer because of the offset applied to it.
C
C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
C inclusive.
C
C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
C added to give the location of the next limb of yp, which is the multiplier
C in the unrolled loop.
C
C PARAM_WP is similarly offset so that the PARAM_YSIZE counter can be added
C to give the starting point in the destination for each unrolled loop (this
C point is one limb upwards for each limb of yp processed).
C
C Having PARAM_YSIZE count negative to zero means it's not necessary to
C store new values of PARAM_YP and PARAM_WP on each loop.  Those values on
C the stack remain constant and on each loop an leal adjusts them with the
C PARAM_YSIZE counter value.


C Locals below the four saved registers on the stack.
defframe(VAR_COUNTER,      -20)
defframe(VAR_COUNTER_INIT, -24)
defframe(VAR_JMP,          -28)
defframe(VAR_XP_LOW,       -32)
deflit(VAR_STACK_SPACE, 16)

dnl  For some strange reason using (%esp) instead of 0(%esp) is a touch
dnl  slower in this code, hence the defframe empty-if-zero feature is
dnl  disabled.
dnl
dnl  If VAR_COUNTER is at (%esp), the effect is worse.  In this case the
dnl  unrolled loop is 255 instead of 256 bytes, but quite how this affects
dnl  anything isn't clear.
dnl
define(`defframe_empty_if_zero_disabled',1)

L(unroll):
	C eax	yp (not used)
	C ebx	xp end (not used)
	C ecx	xsize
	C edx	ysize-1
	C esi
	C edi	wp end of mul1 (not used)
	C ebp
deflit(`FRAME', 16)	C the four pushl'd registers above

	leal	-2(%ecx), %ebp	C one limb processed at start,
	decl	%ecx		C and ebp is one less

	C UNROLL_LOG2/UNROLL_MASK presumably derive from UNROLL_COUNT in the
	C shared x86 asm headers -- confirm in mpn/x86/x86-defs.m4.
	shrl	$UNROLL_LOG2, %ebp
	negl	%ecx

	subl	$VAR_STACK_SPACE, %esp
deflit(`FRAME', 16+VAR_STACK_SPACE)
	andl	$UNROLL_MASK, %ecx

	movl	%ecx, %esi
	shll	$4, %ecx

	movl	%ebp, VAR_COUNTER_INIT
	negl	%esi

	C 15 code bytes per limb
	C Entry offset into the unrolled code: ecx*16 - esi = remainder*15.
ifdef(`PIC',`
	call	L(pic_calc)
L(unroll_here):
',`
	leal	L(unroll_entry) (%ecx,%esi,1), %ecx
')

	movl	PARAM_XP, %ebx
	movl	%ebp, VAR_COUNTER

	movl	PARAM_WP, %edi
	movl	%ecx, VAR_JMP

	movl	(%ebx), %eax
	leal	4(%edi,%esi,4), %edi	C wp adjust for unrolling and mul1

	leal	(%ebx,%esi,4), %ebx	C xp adjust for unrolling

	movl	%eax, VAR_XP_LOW

	movl	%ebx, PARAM_XP
	movl	PARAM_YP, %ebx

	leal	(%edi,%edx,4), %ecx	C wp adjust for ysize indexing
	movl	4(%ebx), %ebp		C multiplier (yp second limb)

	leal	4(%ebx,%edx,4), %ebx	C yp adjust for ysize indexing

	movl	%ecx, PARAM_WP

	leal	1(%esi), %ecx	C adjust parity for decl %ecx above

	movl	%ebx, PARAM_YP
	negl	%edx

	movl	%edx, PARAM_YSIZE
	jmp	L(unroll_outer_entry)


ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	leal	(%ecx,%esi,1), %ecx
	addl	$L(unroll_entry)-L(unroll_here), %ecx
	addl	(%esp), %ecx	C add return address = address of unroll_here
	ret_internal
')


C -----------------------------------------------------------------------------
	C Aligning here saves a couple of cycles per loop.  Using 32 doesn't
	C cost any extra space, since the inner unrolled loop below is
	C aligned to 32.
	ALIGN(32)
L(unroll_outer_top):
	C edx	ysize

	movl	PARAM_YP, %eax
	movl	%edx, PARAM_YSIZE	C incremented ysize counter

	movl	PARAM_WP, %edi

	movl	VAR_COUNTER_INIT, %ebx
	movl	(%eax,%edx,4), %ebp	C next multiplier

	movl	PARAM_XSIZE, %ecx
	leal	(%edi,%edx,4), %edi	C adjust wp for where we are in yp

	movl	VAR_XP_LOW, %eax
	movl	%ebx, VAR_COUNTER

L(unroll_outer_entry):
	mull	%ebp		C first product: xp low limb * multiplier

	C using testb is a tiny bit faster than testl
	C Odd/even entry parity decides which register starts as low carry.
	testb	$1, %cl

	movl	%eax, %ecx	C low carry
	movl	VAR_JMP, %eax

	movl	%edx, %esi	C high carry
	movl	PARAM_XP, %ebx

	jnz	L(unroll_noswap)
	movl	%ecx, %esi	C high,low carry other way around

	movl	%edx, %ecx
L(unroll_noswap):

	jmp	*%eax		C computed jump into the unrolled code



C -----------------------------------------------------------------------------
	ALIGN(32)
L(unroll_top):
	C eax	scratch
	C ebx	xp
	C ecx	carry low
	C edx	scratch
	C esi	carry high
	C edi	wp
	C ebp	multiplier
	C VAR_COUNTER loop counter
	C
	C 15 code bytes each limb

	leal	UNROLL_BYTES(%edi), %edi

L(unroll_entry):
deflit(CHUNK_COUNT,2)
C Two limbs per chunk, with the low/high carry registers alternating so no
C moves are needed between chunks.  Zdisp/jadcl0 are helper macros from the
C shared x86 headers -- see mpn/x86/x86-defs.m4.
forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(i*CHUNK_COUNT*4))
	deflit(`disp1', eval(disp0 + 4))
	deflit(`disp2', eval(disp1 + 4))

	movl	disp1(%ebx), %eax
	mull	%ebp
Zdisp(	addl,	%ecx, disp0,(%edi))
	adcl	%eax, %esi
	movl	%edx, %ecx
	jadcl0(	%ecx)

	movl	disp2(%ebx), %eax
	mull	%ebp
	addl	%esi, disp1(%edi)
	adcl	%eax, %ecx
	movl	%edx, %esi
	jadcl0(	%esi)
')

	decl	VAR_COUNTER
	leal	UNROLL_BYTES(%ebx), %ebx

	jns	L(unroll_top)


	C Store the final two carry limbs of this addmul pass.
	movl	PARAM_YSIZE, %edx
	addl	%ecx, UNROLL_BYTES(%edi)

	adcl	$0, %esi

	incl	%edx
	movl	%esi, UNROLL_BYTES+4(%edi)

	jnz	L(unroll_outer_top)


	movl	SAVE_ESI, %esi
	movl	SAVE_EBP, %ebp
	movl	SAVE_EDI, %edi
	movl	SAVE_EBX, %ebx

	addl	$FRAME, %esp
	ret

EPILOGUE()