dnl  divrem_1.asm revision 1.1.1.1
dnl  Intel Pentium-4 mpn_divrem_1 -- mpn by limb division.

dnl  Copyright 1999, 2000, 2001, 2002, 2003, 2004 Free Software Foundation,
dnl  Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C P4: 32 cycles/limb integer part, 30 cycles/limb fraction part.


C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
C                         mp_srcptr src, mp_size_t size,
C                         mp_limb_t divisor);
C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
C                          mp_srcptr src, mp_size_t size,
C                          mp_limb_t divisor, mp_limb_t carry);
C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
C                                mp_srcptr src, mp_size_t size,
C                                mp_limb_t divisor, mp_limb_t inverse,
C                                unsigned shift);
C
C Algorithm:
C
C The method and nomenclature follow part 8 of "Division by Invariant
C Integers using Multiplication" by Granlund and Montgomery, reference in
C gmp.texi.
C
C "m" is written for what is m' in the paper, and "d" for d_norm, which
C won't cause any confusion since it's only the normalized divisor that's of
C any use in the code.  "b" is written for 2^N, the size of a limb, N being
C 32 here.
C
C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as
C "n-d - q1*d".  This rearrangement gives the same two-limb answer but lets
C us have just a psubq on the dependent chain.
C
C For reference, the way the k7 code uses "n-(q1+1)*d" would not suit here,
C detecting an overflow of q1+1 when q1=0xFFFFFFFF would cost too much.
C
C Notes:
C
C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
C limb is less than the divisor.  mpn_divrem_1c doesn't check for a zero
C carry, since in normal circumstances that will be a very rare event.
C
C The test for skipping a division is branch free (once size>=1 is tested).
C The store to the destination high limb is 0 when a divide is skipped, or
C if it's not skipped then a copy of the src high limb is stored.  The
C latter is in case src==dst.
C
C There's a small bias towards expecting xsize==0, by having code for
C xsize==0 in a straight line and xsize!=0 under forward jumps.
C
C Enhancements:
C
C The loop measures 32 cycles, but the dependent chain would suggest it
C could be done with 30.  Not sure where to start looking for the extras.
C
C Alternatives:
C
C If the divisor is normalized (high bit set) then a division step can
C always be skipped, since the high destination limb is always 0 or 1 in
C that case.  It doesn't seem worth checking for this though, since it
C probably occurs infrequently.


dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
dnl
dnl  The inverse takes about 80-90 cycles to calculate, but after that the
dnl  multiply is 32 c/l versus division at about 58 c/l.
dnl
dnl  At 4 limbs the div is a touch faster than the mul (and of course
dnl  simpler), so start the mul from 5 limbs.
deflit(MUL_THRESHOLD, 5)


dnl  Parameter stack offsets.  Note PARAM_PREINV_INVERSE and PARAM_CARRY
dnl  share offset 24: they belong to different entry points.
defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1
defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1
defframe(PARAM_CARRY,  24)          dnl mpn_divrem_1c
defframe(PARAM_DIVISOR,20)
defframe(PARAM_SIZE,   16)
defframe(PARAM_SRC,    12)
defframe(PARAM_XSIZE,  8)
defframe(PARAM_DST,    4)

dnl  re-use parameter space (each param is fully loaded before its slot is
dnl  overwritten with the saved callee register)
define(SAVE_ESI,`PARAM_SIZE')
define(SAVE_EBP,`PARAM_SRC')
define(SAVE_EDI,`PARAM_DIVISOR')
define(SAVE_EBX,`PARAM_DST')

	TEXT

C mpn_preinv_divrem_1: the caller supplies the pre-computed inverse and
C shift, so this entry sets up the MMX registers directly and jumps
C straight to L(start_preinv), skipping the inverse calculation.
	ALIGN(16)
PROLOGUE(mpn_preinv_divrem_1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	xorl	%edx, %edx		C carry if can't skip a div

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	-4(%esi,%ecx,4), %eax	C src high limb

	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx

	movd	PARAM_PREINV_INVERSE, %mm4

	movd	PARAM_PREINV_SHIFT, %mm7	C l
	cmpl	%ebp, %eax		C high cmp divisor

	cmovc(	%eax, %edx)		C high is carry if high<divisor
	movd	%edx, %mm0		C carry

	movd	%edx, %mm1		C carry
	movl	$0, %edx

	movd	%ebp, %mm5		C d
	cmovnc(	%eax, %edx)		C 0 if skip div, src high if not
					C (the latter in case src==dst)
	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]

	movl	%edx, (%edi,%ecx,4)	C dst high limb
	sbbl	$0, %ecx		C skip one division if high<divisor
	movl	$32, %eax

	subl	PARAM_PREINV_SHIFT, %eax
	psllq	%mm7, %mm5		C d normalized
	leal	(%edi,%ecx,4), %edi	C &dst[xsize+size-1]
	leal	-4(%esi,%ecx,4), %esi	C &src[size-1]

	movd	%eax, %mm6		C 32-l
	jmp	L(start_preinv)

EPILOGUE()


C mpn_divrem_1c: entry with an initial carry limb.  The carry is loaded
C into edx and control falls through the common path at L(start_1c)
C without attempting to skip a division.
	ALIGN(16)
PROLOGUE(mpn_divrem_1c)
deflit(`FRAME',0)

	movl	PARAM_CARRY, %edx

	movl	PARAM_SIZE, %ecx

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx

	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
	jmp	L(start_1c)

EPILOGUE()


C mpn_divrem_1: plain entry, no initial carry.  If the src high limb is
C less than the divisor one division step is skipped (branch free once
C size>=1 has been established).
	ALIGN(16)
PROLOGUE(mpn_divrem_1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	xorl	%edx, %edx		C initial carry (if can't skip a div)

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx
	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]

	orl	%ecx, %ecx		C size
	jz	L(no_skip_div)		C if size==0
	movl	-4(%esi,%ecx,4), %eax	C src high limb

	cmpl	%ebp, %eax		C high cmp divisor

	cmovnc(	%eax, %edx)		C 0 if skip div, src high if not
	movl	%edx, (%edi,%ecx,4)	C dst high limb

	movl	$0, %edx
	cmovc(	%eax, %edx)		C high is carry if high<divisor

	sbbl	$0, %ecx		C size-1 if high<divisor
L(no_skip_div):


L(start_1c):
	C eax
	C ebx	xsize
	C ecx	size
	C edx	carry
	C esi	src
	C edi	&dst[xsize-1]
	C ebp	divisor

	leal	(%ebx,%ecx), %eax	C size+xsize
	leal	-4(%esi,%ecx,4), %esi	C &src[size-1]
	leal	(%edi,%ecx,4), %edi	C &dst[size+xsize-1]

	cmpl	$MUL_THRESHOLD, %eax
	jae	L(mul_by_inverse)

	C small case: use plain divl for each limb

	orl	%ecx, %ecx
	jz	L(divide_no_integer)	C if size==0

L(divide_integer):
	C eax	scratch (quotient)
	C ebx	xsize
	C ecx	counter
	C edx	carry
	C esi	src, decrementing
	C edi	dst, decrementing
	C ebp	divisor

	movl	(%esi), %eax
	subl	$4, %esi

	divl	%ebp			C edx:eax / divisor, remainder in edx

	movl	%eax, (%edi)
	subl	$4, %edi

	subl	$1, %ecx
	jnz	L(divide_integer)


L(divide_no_integer):
	orl	%ebx, %ebx
	jnz	L(divide_fraction)	C if xsize!=0

L(divide_done):
	movl	SAVE_ESI, %esi
	movl	SAVE_EDI, %edi
	movl	SAVE_EBX, %ebx
	movl	SAVE_EBP, %ebp
	movl	%edx, %eax		C return the remainder
	ret
C Fraction limbs for the small (divl) case: each step divides carry:0 by
C the divisor, developing one quotient limb below the radix point.
L(divide_fraction):
	C eax	scratch (quotient)
	C ebx	counter
	C ecx
	C edx	carry
	C esi
	C edi	dst, decrementing
	C ebp	divisor

	movl	$0, %eax

	divl	%ebp			C edx:0 / divisor

	movl	%eax, (%edi)
	subl	$4, %edi

	subl	$1, %ebx
	jnz	L(divide_fraction)

	jmp	L(divide_done)



C -----------------------------------------------------------------------------

C Compute l = count of leading zeros on d, normalize d, and calculate the
C inverse m = floor((b*(b-d)-1)/d) with a single divl, then fall through
C into the multiply-by-inverse loop setup.
L(mul_by_inverse):
	C eax
	C ebx	xsize
	C ecx	size
	C edx	carry
	C esi	&src[size-1]
	C edi	&dst[size+xsize-1]
	C ebp	divisor

	bsrl	%ebp, %eax		C 31-l
	movd	%edx, %mm0		C carry
	movd	%edx, %mm1		C carry
	movl	%ecx, %edx		C size
	movl	$31, %ecx

	C

	xorl	%eax, %ecx		C l = leading zeros on d
	addl	$1, %eax

	shll	%cl, %ebp		C d normalized
	movd	%ecx, %mm7		C l
	movl	%edx, %ecx		C size

	movd	%eax, %mm6		C 32-l
	movl	$-1, %edx
	movl	$-1, %eax

	C

	subl	%ebp, %edx		C (b-d)-1 so edx:eax = b*(b-d)-1

	divl	%ebp			C floor (b*(b-d)-1 / d)
	movd	%ebp, %mm5		C d

	C

	movd	%eax, %mm4		C m


L(start_preinv):
	C eax	inverse
	C ebx	xsize
	C ecx	size
	C edx
	C esi	&src[size-1]
	C edi	&dst[size+xsize-1]
	C ebp
	C
	C mm0	carry
	C mm1	carry
	C mm2
	C mm4	m
	C mm5	d
	C mm6	32-l
	C mm7	l

	psllq	%mm7, %mm0		C n2 = carry << l, for size==0

	subl	$1, %ecx
	jb	L(integer_none)

	movd	(%esi), %mm0		C src high limb
	punpckldq %mm1, %mm0
	psrlq	%mm6, %mm0		C n2 = high (carry:srchigh << l)
	jz	L(integer_last)


C The dependent chain here consists of
C
C	2   paddd	n1+n2
C	8   pmuludq	m*(n1+n2)
C	2   paddq	n2:nadj + m*(n1+n2)
C	2   psrlq	q1
C	8   pmuludq	d*q1
C	2   psubq	(n-d)-q1*d
C	2   psrlq	high n-(q1+1)*d mask
C	2   pand	d masked
C	2   paddd	n2+d addback
C	--
C	30
C
C But it seems to run at 32 cycles, so presumably there's something else
C going on.
C Main multiply-by-inverse loop: one source limb consumed and one quotient
C limb produced per iteration, remainder carried in mm0 (always < d).
	ALIGN(16)
L(integer_top):
	C eax
	C ebx
	C ecx	counter, size-1 to 0
	C edx
	C esi	src, decrementing
	C edi	dst, decrementing
	C
	C mm0	n2
	C mm4	m
	C mm5	d
	C mm6	32-l
	C mm7	l

	ASSERT(b,`C n2<d
	movd	%mm0, %eax
	movd	%mm5, %edx
	cmpl	%edx, %eax')

	movd	-4(%esi), %mm1		C next src limbs
	movd	(%esi), %mm2
	leal	-4(%esi), %esi

	punpckldq %mm2, %mm1
	psrlq	%mm6, %mm1		C n10

	movq	%mm1, %mm2		C n10
	movq	%mm1, %mm3		C n10
	psrad	$31, %mm1		C -n1
	pand	%mm5, %mm1		C -n1 & d
	paddd	%mm2, %mm1		C nadj = n10+(-n1&d), ignore overflow

	psrld	$31, %mm2		C n1
	paddd	%mm0, %mm2		C n2+n1
	punpckldq %mm0, %mm1		C n2:nadj

	pmuludq	%mm4, %mm2		C m*(n2+n1)

	C

	paddq	%mm2, %mm1		C n2:nadj + m*(n2+n1)
	pxor	%mm2, %mm2		C break dependency, saves 4 cycles
	pcmpeqd	%mm2, %mm2		C FF...FF
	psrlq	$63, %mm2		C 1

	psrlq	$32, %mm1		C q1 = high(n2:nadj + m*(n2+n1))

	paddd	%mm1, %mm2		C q1+1
	pmuludq	%mm5, %mm1		C q1*d

	punpckldq %mm0, %mm3		C n = n2:n10
	pxor	%mm0, %mm0

	psubq	%mm5, %mm3		C n - d

	C

	psubq	%mm1, %mm3		C n - (q1+1)*d

	por	%mm3, %mm0		C copy remainder -> new n2
	psrlq	$32, %mm3		C high n - (q1+1)*d, 0 or -1

	ASSERT(be,`C 0 or -1
	movd	%mm3, %eax
	addl	$1, %eax
	cmpl	$1, %eax')

	paddd	%mm3, %mm2		C q
	pand	%mm5, %mm3		C mask & d

	paddd	%mm3, %mm0		C addback if necessary
	movd	%mm2, (%edi)
	leal	-4(%edi), %edi

	subl	$1, %ecx
	ja	L(integer_top)


C Last integer limb: same quotient step as L(integer_top), but the low
C source limb is shifted up by l with zeros below (no following limb).
L(integer_last):
	C eax
	C ebx	xsize
	C ecx
	C edx
	C esi	&src[0]
	C edi	&dst[xsize]
	C
	C mm0	n2
	C mm4	m
	C mm5	d
	C mm6
	C mm7	l

	ASSERT(b,`C n2<d
	movd	%mm0, %eax
	movd	%mm5, %edx
	cmpl	%edx, %eax')

	movd	(%esi), %mm1		C src[0]
	psllq	%mm7, %mm1		C n10

	movq	%mm1, %mm2		C n10
	movq	%mm1, %mm3		C n10
	psrad	$31, %mm1		C -n1
	pand	%mm5, %mm1		C -n1 & d
	paddd	%mm2, %mm1		C nadj = n10+(-n1&d), ignore overflow

	psrld	$31, %mm2		C n1
	paddd	%mm0, %mm2		C n2+n1
	punpckldq %mm0, %mm1		C n2:nadj

	pmuludq	%mm4, %mm2		C m*(n2+n1)

	C

	paddq	%mm2, %mm1		C n2:nadj + m*(n2+n1)
	pcmpeqd	%mm2, %mm2		C FF...FF
	psrlq	$63, %mm2		C 1

	psrlq	$32, %mm1		C q1 = high(n2:nadj + m*(n2+n1))
	paddd	%mm1, %mm2		C q1+1

	pmuludq	%mm5, %mm1		C q1*d
	punpckldq %mm0, %mm3		C n
	psubq	%mm5, %mm3		C n - d
	pxor	%mm0, %mm0

	C

	psubq	%mm1, %mm3		C n - (q1+1)*d

	por	%mm3, %mm0		C remainder -> n2
	psrlq	$32, %mm3		C high n - (q1+1)*d, 0 or -1

	ASSERT(be,`C 0 or -1
	movd	%mm3, %eax
	addl	$1, %eax
	cmpl	$1, %eax')

	paddd	%mm3, %mm2		C q
	pand	%mm5, %mm3		C mask & d

	paddd	%mm3, %mm0		C addback if necessary
	movd	%mm2, (%edi)
	leal	-4(%edi), %edi


L(integer_none):
	C eax
	C ebx	xsize

	orl	%ebx, %ebx
	jnz	L(fraction_some)	C if xsize!=0


L(fraction_done):
	movl	SAVE_EBP, %ebp
	psrld	%mm7, %mm0		C remainder (un-normalize by l)

	movl	SAVE_EDI, %edi
	movd	%mm0, %eax		C return the remainder

	movl	SAVE_ESI, %esi
	movl	SAVE_EBX, %ebx
	emms
	ret



C -----------------------------------------------------------------------------
C

L(fraction_some):
	C eax
	C ebx	xsize
	C ecx
	C edx
	C esi
	C edi	&dst[xsize-1]
	C ebp


C Fraction limbs by inverse: n10 is zero here, so the nadj calculation
C drops out and the quotient step is a shorter form of L(integer_top).
L(fraction_top):
	C eax
	C ebx	counter, xsize iterations
	C ecx
	C edx
	C esi	src, decrementing
	C edi	dst, decrementing
	C
	C mm0	n2
	C mm4	m
	C mm5	d
	C mm6	32-l
	C mm7	l

	ASSERT(b,`C n2<d
	movd	%mm0, %eax
	movd	%mm5, %edx
	cmpl	%edx, %eax')

	movq	%mm0, %mm1		C n2
	pmuludq	%mm4, %mm0		C m*n2

	pcmpeqd	%mm2, %mm2		C FF...FF
	psrlq	$63, %mm2		C 1

	C

	psrlq	$32, %mm0		C high(m*n2)

	paddd	%mm1, %mm0		C q1 = high(n2:0 + m*n2)

	paddd	%mm0, %mm2		C q1+1
	pmuludq	%mm5, %mm0		C q1*d

	psllq	$32, %mm1		C n = n2:0
	psubq	%mm5, %mm1		C n - d

	C

	psubq	%mm0, %mm1		C r = n - (q1+1)*d
	pxor	%mm0, %mm0

	por	%mm1, %mm0		C r -> n2
	psrlq	$32, %mm1		C high n - (q1+1)*d, 0 or -1

	ASSERT(be,`C 0 or -1
	movd	%mm1, %eax
	addl	$1, %eax
	cmpl	$1, %eax')

	paddd	%mm1, %mm2		C q
	pand	%mm5, %mm1		C mask & d

	paddd	%mm1, %mm0		C addback if necessary
	movd	%mm2, (%edi)
	leal	-4(%edi), %edi

	subl	$1, %ebx
	jne	L(fraction_top)


	jmp	L(fraction_done)

EPILOGUE()