1dnl AMD K7 mpn_divrem_1, mpn_divrem_1c, mpn_preinv_divrem_1 -- mpn by limb 2dnl division. 3 4dnl Copyright 1999, 2000, 2001, 2002, 2004 Free Software Foundation, Inc. 5dnl 6dnl This file is part of the GNU MP Library. 7dnl 8dnl The GNU MP Library is free software; you can redistribute it and/or 9dnl modify it under the terms of the GNU Lesser General Public License as 10dnl published by the Free Software Foundation; either version 3 of the 11dnl License, or (at your option) any later version. 12dnl 13dnl The GNU MP Library is distributed in the hope that it will be useful, 14dnl but WITHOUT ANY WARRANTY; without even the implied warranty of 15dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16dnl Lesser General Public License for more details. 17dnl 18dnl You should have received a copy of the GNU Lesser General Public License 19dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 20 21include(`../config.m4') 22 23 24C K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part. 25 26 27C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, 28C mp_srcptr src, mp_size_t size, 29C mp_limb_t divisor); 30C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, 31C mp_srcptr src, mp_size_t size, 32C mp_limb_t divisor, mp_limb_t carry); 33C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize, 34C mp_srcptr src, mp_size_t size, 35C mp_limb_t divisor, mp_limb_t inverse, 36C unsigned shift); 37C 38C Algorithm: 39C 40C The method and nomenclature follow part 8 of "Division by Invariant 41C Integers using Multiplication" by Granlund and Montgomery, reference in 42C gmp.texi. 43C 44C The "and"s shown in the paper are done here with "cmov"s. "m" is written 45C for m', and "d" for d_norm, which won't cause any confusion since it's 46C only the normalized divisor that's of any use in the code. "b" is written 47C for 2^N, the size of a limb, N being 32 here. 
48C 49C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as 50C "n-(q1+1)*d"; this rearrangement gives the same two-limb answer. If 51C q1==0xFFFFFFFF, then q1+1 would overflow. We branch to a special case 52C "q1_ff" if this occurs. Since the true quotient is either q1 or q1+1 then 53C if q1==0xFFFFFFFF that must be the right value. 54C 55C For the last and second last steps q1==0xFFFFFFFF is instead handled by an 56C sbbl to go back to 0xFFFFFFFF if an overflow occurs when adding 1. This 57C then goes through as normal, and finding no addback required. sbbl costs 58C an extra cycle over what the main loop code does, but it keeps code size 59C and complexity down. 60C 61C Notes: 62C 63C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high 64C limb is less than the divisor. mpn_divrem_1c doesn't check for a zero 65C carry, since in normal circumstances that will be a very rare event. 66C 67C The test for skipping a division is branch free (once size>=1 is tested). 68C The store to the destination high limb is 0 when a divide is skipped, or 69C if it's not skipped then a copy of the src high limb is used. The latter 70C is in case src==dst. 71C 72C There's a small bias towards expecting xsize==0, by having code for 73C xsize==0 in a straight line and xsize!=0 under forward jumps. 74C 75C Alternatives: 76C 77C If the divisor is normalized (high bit set) then a division step can 78C always be skipped, since the high destination limb is always 0 or 1 in 79C that case. It doesn't seem worth checking for this though, since it 80C probably occurs infrequently, in particular note that big_base for a 81C decimal mpn_get_str is not normalized in a 32-bit limb. 82 83 84dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by 85dnl inverse method is used, rather than plain "divl"s. Minimum value 1. 86dnl 87dnl The inverse takes about 50 cycles to calculate, but after that the 88dnl multiply is 17 c/l versus division at 42 c/l. 
dnl
dnl  At 3 limbs the mul is a touch faster than div on the integer part, and
dnl  even more so on the fractional part.

deflit(MUL_THRESHOLD, 3)


dnl  Stack frame layout: incoming parameters at positive offsets from the
dnl  entry %esp, saved registers and locals at negative offsets after the
dnl  subl $STACK_SPACE.  PARAM_CARRY and PARAM_PREINV_INVERSE share offset
dnl  24 because they belong to different entry points.

defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1
defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1
defframe(PARAM_CARRY,  24)  dnl mpn_divrem_1c
defframe(PARAM_DIVISOR,20)
defframe(PARAM_SIZE,   16)
defframe(PARAM_SRC,    12)
defframe(PARAM_XSIZE,  8)
defframe(PARAM_DST,    4)

defframe(SAVE_EBX,    -4)
defframe(SAVE_ESI,    -8)
defframe(SAVE_EDI,    -12)
defframe(SAVE_EBP,    -16)

defframe(VAR_NORM,    -20)
defframe(VAR_INVERSE, -24)
defframe(VAR_SRC,     -28)
defframe(VAR_DST,     -32)
defframe(VAR_DST_STOP,-36)

deflit(STACK_SPACE, 36)

	TEXT
	ALIGN(32)

C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
C                                mp_srcptr src, mp_size_t size,
C                                mp_limb_t divisor, mp_limb_t inverse,
C                                unsigned shift);
C
C Entry point when the caller has already computed the inverse and the
C normalization shift.  Sets up the same register state as the main code
C and jumps straight to L(start_preinv), skipping the inverse calculation.

PROLOGUE(mpn_preinv_divrem_1)
deflit(`FRAME',0)
	movl	PARAM_XSIZE, %ecx
	movl	PARAM_DST, %edx
	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%ebx, SAVE_EBX
	movl	PARAM_SIZE, %ebx

	leal	8(%edx,%ecx,4), %edx	C &dst[xsize+2]
	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%edx, VAR_DST_STOP	C &dst[xsize+2]
	movl	%edi, SAVE_EDI
	xorl	%edi, %edi		C carry

	movl	-4(%esi,%ebx,4), %eax	C src high limb
	xor	%ecx, %ecx

	C

	C

	cmpl	%ebp, %eax		C high cmp divisor

	C Carry flag from the cmpl above is consumed by all three of the
	C following: the two cmovs and the sbbl.
	cmovc(	%eax, %edi)		C high is carry if high<divisor
	cmovnc(	%eax, %ecx)		C 0 if skip div, src high if not
					C (the latter in case src==dst)

	movl	%ecx, -12(%edx,%ebx,4)	C dst high limb
	sbbl	$0, %ebx		C skip one division if high<divisor
	movl	PARAM_PREINV_SHIFT, %ecx

	leal	-8(%edx,%ebx,4), %edx	C &dst[xsize+size]
	movl	$32, %eax

	movl	%edx, VAR_DST		C &dst[xsize+size]

	shll	%cl, %ebp		C d normalized
	subl	%ecx, %eax
	movl	%ecx, VAR_NORM

	movd	%eax, %mm7		C rshift
	movl	PARAM_PREINV_INVERSE, %eax
	jmp	L(start_preinv)

EPILOGUE()
	ALIGN(16)

C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
C                          mp_srcptr src, mp_size_t size,
C                          mp_limb_t divisor, mp_limb_t carry);
C
C Same as mpn_divrem_1 but with an initial carry limb; no check is made
C for the high<divisor division skip, and control joins the common code
C at L(start_1c).

PROLOGUE(mpn_divrem_1c)
deflit(`FRAME',0)
	movl	PARAM_CARRY, %edx
	movl	PARAM_SIZE, %ecx
	subl	$STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
	jmp	L(start_1c)

EPILOGUE()


C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
C                         mp_srcptr src, mp_size_t size,
C                         mp_limb_t divisor);
C
C Main entry point.  If the src high limb is less than the divisor one
C division can be skipped, handled branch-free with cmovs below.

	C offset 0xa1, close enough to aligned
PROLOGUE(mpn_divrem_1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	$0, %edx		C initial carry (if can't skip a div)
	subl	$STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp
	orl	%ecx, %ecx		C size

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi
	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]

	jz	L(no_skip_div)		C if size==0
	movl	-4(%esi,%ecx,4), %eax	C src high limb
	xorl	%esi, %esi

	cmpl	%ebp, %eax		C high cmp divisor

	C Carry from the cmpl feeds the two cmovs and the sbbl below.
	cmovc(	%eax, %edx)		C high is carry if high<divisor
	cmovnc(	%eax, %esi)		C 0 if skip div, src high if not

	movl	%esi, (%edi,%ecx,4)	C dst high limb
	sbbl	$0, %ecx		C size-1 if high<divisor
	movl	PARAM_SRC, %esi		C reload
L(no_skip_div):


L(start_1c):
	C eax
	C ebx	xsize
	C ecx	size
	C edx	carry
	C esi	src
	C edi	&dst[xsize-1]
	C ebp	divisor

	leal	(%ebx,%ecx), %eax	C size+xsize
	cmpl	$MUL_THRESHOLD, %eax
	jae	L(mul_by_inverse)


C With MUL_THRESHOLD set to 3, the simple loops here only do 0 to 2 limbs.
C It'd be possible to write them out without the looping, but no speedup
C would be expected.
C
C Using PARAM_DIVISOR instead of %ebp measures 1 cycle/loop faster on the
C integer part, but curiously not on the fractional part, where %ebp is a
C (fixed) couple of cycles faster.

	orl	%ecx, %ecx
	jz	L(divide_no_integer)

L(divide_integer):
	C eax	scratch (quotient)
	C ebx	xsize
	C ecx	counter
	C edx	scratch (remainder)
	C esi	src
	C edi	&dst[xsize-1]
	C ebp	divisor

	movl	-4(%esi,%ecx,4), %eax

	divl	PARAM_DIVISOR		C edx:eax / divisor, remainder left in edx

	movl	%eax, (%edi,%ecx,4)
	decl	%ecx
	jnz	L(divide_integer)


L(divide_no_integer):
	movl	PARAM_DST, %edi
	orl	%ebx, %ebx
	jnz	L(divide_fraction)

L(divide_done):
	movl	SAVE_ESI, %esi
	movl	SAVE_EDI, %edi
	movl	%edx, %eax		C return the remainder

	movl	SAVE_EBX, %ebx
	movl	SAVE_EBP, %ebp
	addl	$STACK_SPACE, %esp

	ret


L(divide_fraction):
	C eax	scratch (quotient)
	C ebx	counter
	C ecx
	C edx	scratch (remainder)
	C esi
	C edi	dst
	C ebp	divisor

	movl	$0, %eax		C fraction limbs are zero

	divl	%ebp

	movl	%eax, -4(%edi,%ebx,4)
	decl	%ebx
	jnz	L(divide_fraction)

	jmp	L(divide_done)



C -----------------------------------------------------------------------------

L(mul_by_inverse):
	C eax
	C ebx	xsize
	C ecx	size
	C edx	carry
	C esi	src
	C edi	&dst[xsize-1]
	C ebp	divisor

	bsrl	%ebp, %eax		C 31-l

	leal	12(%edi), %ebx		C &dst[xsize+2], loop dst stop
	leal	4(%edi,%ecx,4), %edi	C &dst[xsize+size]

	movl	%edi, VAR_DST
	movl	%ebx, VAR_DST_STOP

	movl	%ecx, %ebx		C size
	movl	$31, %ecx

	movl	%edx, %edi		C carry
	movl	$-1, %edx

	C

	xorl	%eax, %ecx		C l
	incl	%eax			C 32-l

	shll	%cl, %ebp		C d normalized
	movl	%ecx, VAR_NORM

	movd	%eax, %mm7

	movl	$-1, %eax
	subl	%ebp, %edx		C (b-d)-1 giving edx:eax = b*(b-d)-1

	divl	%ebp			C floor (b*(b-d)-1) / d

L(start_preinv):
	C eax	inverse
	C ebx	size
	C ecx	shift
	C edx
	C esi	src
	C edi	carry
	C ebp	divisor
	C
	C mm7	rshift

	orl	%ebx, %ebx		C size
	movl	%eax, VAR_INVERSE
	leal	-12(%esi,%ebx,4), %eax	C &src[size-3]

	jz	L(start_zero)
	movl	%eax, VAR_SRC
	cmpl	$1, %ebx

	movl	8(%eax), %esi		C src high limb
	jz	L(start_one)

L(start_two_or_more):
	movl	4(%eax), %edx		C src second highest limb

	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l

	shldl(	%cl, %edx, %esi)	C n10 = high,second << l

	cmpl	$2, %ebx
	je	L(integer_two_left)
	jmp	L(integer_top)


L(start_one):
	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l

	shll	%cl, %esi		C n10 = high << l
	movl	%eax, VAR_SRC
	jmp	L(integer_one_left)


L(start_zero):
	C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and
	C skipped a division.

	shll	%cl, %edi		C n2 = carry << l
	movl	%edi, %eax		C return value for zero_done
	cmpl	$0, PARAM_XSIZE

	je	L(zero_done)
	jmp	L(fraction_some)



C -----------------------------------------------------------------------------
C
C The multiply by inverse loop is 17 cycles, and relies on some out-of-order
C execution.  The instruction scheduling is important, with various
C apparently equivalent forms running 1 to 5 cycles slower.
C
C A lower bound for the time would seem to be 16 cycles, based on the
C following successive dependencies.
C
C		      cycles
C	n2+n1	1
C	mul	6
C	q1+1	1
C	mul	6
C	sub	1
C	addback	1
C	       ---
C	       16
C
C This chain is what the loop has already, but 16 cycles isn't achieved.
C K7 has enough decode, and probably enough execute (depending maybe on what
C a mul actually consumes), but nothing running under 17 has been found.
C
C In theory n2+n1 could be done in the sub and addback stages (by
C calculating both n2 and n2+n1 there), but lack of registers makes this an
C unlikely proposition.
C
C The jz in the loop keeps the q1+1 stage to 1 cycle.  Handling an overflow
C from q1+1 with an "sbbl $0, %ebx" would add a cycle to the dependent
C chain, and nothing better than 18 cycles has been found when using it.
C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will
C be an extremely rare event.
C
C Branch mispredictions will hit random occurrences of q1==0xFFFFFFFF, but
C if some special data is coming out with this always, the q1_ff special
C case actually runs at 15 c/l.  0x2FFF...FFFD divided by 3 is a good way to
C induce the q1_ff case, for speed measurements or testing.  Note that
C 0xFFF...FFF divided by 1 or 2 doesn't induce it.
C
C The instruction groupings and empty comments show the cycles for a naive
C in-order view of the code (conveniently ignoring the load latency on
C VAR_INVERSE).  This shows some of where the time is going, but is nonsense
C to the extent that out-of-order execution rearranges it.  In this case
C there's 19 cycles shown, but it executes at 17.

	ALIGN(16)
L(integer_top):
	C eax	scratch
	C ebx	scratch (nadj, q1)
	C ecx	scratch (src, dst)
	C edx	scratch
	C esi	n10
	C edi	n2
	C ebp	divisor
	C
	C mm0	scratch (src qword)
	C mm7	rshift for normalization

	cmpl	$0x80000000, %esi  C n1 as 0=c, 1=nc
	movl	%edi, %eax         C n2
	movl	VAR_SRC, %ecx

	leal	(%ebp,%esi), %ebx
	cmovc(	%esi, %ebx)	   C nadj = n10 + (-n1 & d), ignoring overflow
	sbbl	$-1, %eax          C n2+n1

	mull	VAR_INVERSE        C m*(n2+n1)

	movq	(%ecx), %mm0       C next limb and the one below it
	subl	$4, %ecx

	movl	%ecx, VAR_SRC

	C

	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
	leal	1(%edi), %ebx      C n2+1
	movl	%ebp, %eax	   C d

	C

	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
	jz	L(q1_ff)
	movl	VAR_DST, %ecx

	mull	%ebx		   C (q1+1)*d

	psrlq	%mm7, %mm0

	leal	-4(%ecx), %ecx

	C

	subl	%eax, %esi
	movl	VAR_DST_STOP, %eax

	C

	sbbl	%edx, %edi	   C n - (q1+1)*d
	movl	%esi, %edi	   C remainder -> n2
	leal	(%ebp,%esi), %edx

	movd	%mm0, %esi

	cmovc(	%edx, %edi)	   C n - q1*d if underflow from using q1+1
	sbbl	$0, %ebx	   C q
	cmpl	%eax, %ecx

	movl	%ebx, (%ecx)
	movl	%ecx, VAR_DST
	jne	L(integer_top)


L(integer_loop_done):


C -----------------------------------------------------------------------------
C
C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
C q1_ff special case.  This makes the code a bit smaller and simpler, and
C costs only 1 cycle (each).

L(integer_two_left):
	C eax	scratch
	C ebx	scratch (nadj, q1)
	C ecx	scratch (src, dst)
	C edx	scratch
	C esi	n10
	C edi	n2
	C ebp	divisor
	C
	C mm7	rshift

	cmpl	$0x80000000, %esi  C n1 as 0=c, 1=nc
	movl	%edi, %eax         C n2
	movl	PARAM_SRC, %ecx

	leal	(%ebp,%esi), %ebx
	cmovc(	%esi, %ebx)	   C nadj = n10 + (-n1 & d), ignoring overflow
	sbbl	$-1, %eax          C n2+n1

	mull	VAR_INVERSE        C m*(n2+n1)

	movd	(%ecx), %mm0	   C src low limb

	movl	VAR_DST_STOP, %ecx

	C

	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
	leal	1(%edi), %ebx      C n2+1
	movl	%ebp, %eax	   C d

	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1

	sbbl	$0, %ebx	   C q1 if q1+1 overflowed

	mull	%ebx		   C (q1+1)*d

	psllq	$32, %mm0

	psrlq	%mm7, %mm0

	C

	subl	%eax, %esi

	C

	sbbl	%edx, %edi	   C n - (q1+1)*d
	movl	%esi, %edi	   C remainder -> n2
	leal	(%ebp,%esi), %edx

	movd	%mm0, %esi

	cmovc(	%edx, %edi)	   C n - q1*d if underflow from using q1+1
	sbbl	$0, %ebx	   C q

	movl	%ebx, -4(%ecx)


C -----------------------------------------------------------------------------
L(integer_one_left):
	C eax	scratch
	C ebx	scratch (nadj, q1)
	C ecx	dst
	C edx	scratch
	C esi	n10
	C edi	n2
	C ebp	divisor
	C
	C mm7	rshift

	movl	VAR_DST_STOP, %ecx
	cmpl	$0x80000000, %esi  C n1 as 0=c, 1=nc
	movl	%edi, %eax         C n2

	leal	(%ebp,%esi), %ebx
	cmovc(	%esi, %ebx)	   C nadj = n10 + (-n1 & d), ignoring overflow
	sbbl	$-1, %eax          C n2+n1

	mull	VAR_INVERSE        C m*(n2+n1)

	C

	C

	C

	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
	leal	1(%edi), %ebx      C n2+1
	movl	%ebp, %eax	   C d

	C

	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1

	sbbl	$0, %ebx           C q1 if q1+1 overflowed

	mull	%ebx

	C

	C

	C

	subl	%eax, %esi

	C

	sbbl	%edx, %edi	   C n - (q1+1)*d
	movl	%esi, %edi	   C remainder -> n2
	leal	(%ebp,%esi), %edx

	cmovc(	%edx, %edi)	   C n - q1*d if underflow from using q1+1
	sbbl	$0, %ebx	   C q

	movl	%ebx, -8(%ecx)
	subl	$8, %ecx



L(integer_none):
	cmpl	$0, PARAM_XSIZE
	jne	L(fraction_some)

	movl	%edi, %eax
L(fraction_done):
	movl	VAR_NORM, %ecx
L(zero_done):
	movl	SAVE_EBP, %ebp

	movl	SAVE_EDI, %edi
	movl	SAVE_ESI, %esi

	movl	SAVE_EBX, %ebx
	addl	$STACK_SPACE, %esp

	shrl	%cl, %eax		C un-normalize the remainder
	emms

	ret


C -----------------------------------------------------------------------------
C
C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
C of q*d is simply -d and the remainder n-q*d = n10+d

L(q1_ff):
	C eax	(divisor)
	C ebx	(q1+1 == 0)
	C ecx
	C edx
	C esi	n10
	C edi	n2
	C ebp	divisor

	movl	VAR_DST, %ecx
	movl	VAR_DST_STOP, %edx
	subl	$4, %ecx

	psrlq	%mm7, %mm0
	leal	(%ebp,%esi), %edi	C n-q*d remainder -> next n2
	movl	%ecx, VAR_DST

	movd	%mm0, %esi		C next n10

	movl	$-1, (%ecx)		C q = 0xFFFFFFFF
	cmpl	%ecx, %edx
	jne	L(integer_top)

	jmp	L(integer_loop_done)



C -----------------------------------------------------------------------------
C
C Being the fractional part, the "source" limbs are all zero, meaning
C n10=0, n1=0, and hence nadj=0, leading to many instructions eliminated.
C
C The loop runs at 15 cycles.  The dependent chain is the same as the
C general case above, but without the n2+n1 stage (due to n1==0), so 15
C would seem to be the lower bound.
C
C A not entirely obvious simplification is that q1+1 never overflows a limb,
C and so there's no need for the sbbl $0 or jz q1_ff from the general case.
C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always.
C rnd() means rounding down to a multiple of d.
C
C	m*n2 + b*n2 <= m*(d-1) + b*(d-1)
C		     = m*d + b*d - m - b
C		     = floor((b(b-d)-1)/d)*d + b*d - m - b
C		     = rnd(b(b-d)-1) + b*d - m - b
C		     = rnd(b(b-d)-1 + b*d) - m - b
C		     = rnd(b*b-1) - m - b
C		     <= (b-2)*b
C
C Unchanged from the general case is that the final quotient limb q can be
C either q1 or q1+1, and the q1+1 case occurs often.  This can be seen from
C equation 8.4 of the paper which simplifies as follows when n1==0 and
C n0==0.
C
C	n-q1*d = (n2*k+q0*d)/b <= d + (d*d-2d)/b
C
C As before, the instruction groupings and empty comments show a naive
C in-order view of the code, which is made a nonsense by out of order
C execution.  There's 17 cycles shown, but it executes at 15.
C
C Rotating the store q and remainder->n2 instructions up to the top of the
C loop gets the run time down from 16 to 15.

	ALIGN(16)
L(fraction_some):
	C eax
	C ebx
	C ecx
	C edx
	C esi
	C edi	carry
	C ebp	divisor

	movl	PARAM_DST, %esi
	movl	VAR_DST_STOP, %ecx	C &dst[xsize+2]
	movl	%edi, %eax

	subl	$8, %ecx		C &dst[xsize]
	jmp	L(fraction_entry)


	ALIGN(16)
L(fraction_top):
	C eax	n2 carry, then scratch
	C ebx	scratch (nadj, q1)
	C ecx	dst, decrementing
	C edx	scratch
	C esi	dst stop point
	C edi	(will be n2)
	C ebp	divisor

	movl	%ebx, (%ecx)	C previous q
	movl	%eax, %edi	C remainder->n2

L(fraction_entry):
	mull	VAR_INVERSE	C m*n2

	movl	%ebp, %eax	C d
	subl	$4, %ecx	C dst
	leal	1(%edi), %ebx

	C

	C

	C

	C

	addl	%edx, %ebx	C 1 + high(n2<<32 + m*n2) = q1+1

	mull	%ebx		C (q1+1)*d

	C

	C

	C

	negl	%eax		C low of n - (q1+1)*d

	C

	sbbl	%edx, %edi	C high of n - (q1+1)*d, caring only about carry
	leal	(%ebp,%eax), %edx

	cmovc(	%edx, %eax)	C n - q1*d if underflow from using q1+1
	sbbl	$0, %ebx	C q
	cmpl	%esi, %ecx

	jne	L(fraction_top)


	movl	%ebx, (%ecx)	C store the final quotient limb
	jmp	L(fraction_done)

EPILOGUE()