1dnl AMD K7 mpn_divrem_1, mpn_divrem_1c, mpn_preinv_divrem_1 -- mpn by limb 2dnl division. 3 4dnl Copyright 1999-2002, 2004 Free Software Foundation, Inc. 5 6dnl This file is part of the GNU MP Library. 7dnl 8dnl The GNU MP Library is free software; you can redistribute it and/or modify 9dnl it under the terms of either: 10dnl 11dnl * the GNU Lesser General Public License as published by the Free 12dnl Software Foundation; either version 3 of the License, or (at your 13dnl option) any later version. 14dnl 15dnl or 16dnl 17dnl * the GNU General Public License as published by the Free Software 18dnl Foundation; either version 2 of the License, or (at your option) any 19dnl later version. 20dnl 21dnl or both in parallel, as here. 22dnl 23dnl The GNU MP Library is distributed in the hope that it will be useful, but 24dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 25dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 26dnl for more details. 27dnl 28dnl You should have received copies of the GNU General Public License and the 29dnl GNU Lesser General Public License along with the GNU MP Library. If not, 30dnl see https://www.gnu.org/licenses/. 31 32include(`../config.m4') 33 34 35C K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part. 36 37 38C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, 39C mp_srcptr src, mp_size_t size, 40C mp_limb_t divisor); 41C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, 42C mp_srcptr src, mp_size_t size, 43C mp_limb_t divisor, mp_limb_t carry); 44C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize, 45C mp_srcptr src, mp_size_t size, 46C mp_limb_t divisor, mp_limb_t inverse, 47C unsigned shift); 48C 49C Algorithm: 50C 51C The method and nomenclature follow part 8 of "Division by Invariant 52C Integers using Multiplication" by Granlund and Montgomery, reference in 53C gmp.texi. 54C 55C The "and"s shown in the paper are done here with "cmov"s. "m" is written 56C for m', and "d" for d_norm, which won't cause any confusion since it's 57C only the normalized divisor that's of any use in the code. "b" is written 58C for 2^N, the size of a limb, N being 32 here. 59C 60C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as 61C "n-(q1+1)*d"; this rearrangement gives the same two-limb answer. If 62C q1==0xFFFFFFFF, then q1+1 would overflow. We branch to a special case 63C "q1_ff" if this occurs. Since the true quotient is either q1 or q1+1 then 64C if q1==0xFFFFFFFF that must be the right value. 65C 66C For the last and second last steps q1==0xFFFFFFFF is instead handled by an 67C sbbl to go back to 0xFFFFFFFF if an overflow occurs when adding 1. This 68C then goes through as normal, and finding no addback required. sbbl costs 69C an extra cycle over what the main loop code does, but it keeps code size 70C and complexity down. 71C 72C Notes: 73C 74C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high 75C limb is less than the divisor. mpn_divrem_1c doesn't check for a zero 76C carry, since in normal circumstances that will be a very rare event. 77C 78C The test for skipping a division is branch free (once size>=1 is tested). 79C The store to the destination high limb is 0 when a divide is skipped, or 80C if it's not skipped then a copy of the src high limb is used. The latter 81C is in case src==dst. 82C 83C There's a small bias towards expecting xsize==0, by having code for 84C xsize==0 in a straight line and xsize!=0 under forward jumps. 85C 86C Alternatives: 87C 88C If the divisor is normalized (high bit set) then a division step can 89C always be skipped, since the high destination limb is always 0 or 1 in 90C that case. It doesn't seem worth checking for this though, since it 91C probably occurs infrequently, in particular note that big_base for a 92C decimal mpn_get_str is not normalized in a 32-bit limb. 93 94 95dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by 96dnl inverse method is used, rather than plain "divl"s. Minimum value 1. 97dnl 98dnl The inverse takes about 50 cycles to calculate, but after that the 99dnl multiply is 17 c/l versus division at 42 c/l. 100dnl 101dnl At 3 limbs the mul is a touch faster than div on the integer part, and 102dnl even more so on the fractional part. 103 104deflit(MUL_THRESHOLD, 3) 105 106 107defframe(PARAM_PREINV_SHIFT, 28) dnl mpn_preinv_divrem_1 108defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1 109defframe(PARAM_CARRY, 24) dnl mpn_divrem_1c 110defframe(PARAM_DIVISOR,20) 111defframe(PARAM_SIZE, 16) 112defframe(PARAM_SRC, 12) 113defframe(PARAM_XSIZE, 8) 114defframe(PARAM_DST, 4) 115 116defframe(SAVE_EBX, -4) 117defframe(SAVE_ESI, -8) 118defframe(SAVE_EDI, -12) 119defframe(SAVE_EBP, -16) 120 121defframe(VAR_NORM, -20) 122defframe(VAR_INVERSE, -24) 123defframe(VAR_SRC, -28) 124defframe(VAR_DST, -32) 125defframe(VAR_DST_STOP,-36) 126 127deflit(STACK_SPACE, 36) 128 129 TEXT 130 ALIGN(32) 131 132PROLOGUE(mpn_preinv_divrem_1) 133deflit(`FRAME',0) 134 movl PARAM_XSIZE, %ecx 135 movl PARAM_DST, %edx 136 subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE) 137 138 movl %esi, SAVE_ESI 139 movl PARAM_SRC, %esi 140 141 movl %ebx, SAVE_EBX 142 movl PARAM_SIZE, %ebx 143 144 leal 8(%edx,%ecx,4), %edx C &dst[xsize+2] 145 movl %ebp, SAVE_EBP 146 movl PARAM_DIVISOR, %ebp 147 148 movl %edx, VAR_DST_STOP C &dst[xsize+2] 149 movl %edi, SAVE_EDI 150 xorl %edi, %edi C carry 151 152 movl -4(%esi,%ebx,4), %eax C src high limb 153 xor %ecx, %ecx 154 155 C 156 157 C 158 159 cmpl %ebp, %eax C high cmp divisor 160 161 cmovc( %eax, %edi) C high is carry if high<divisor 162 cmovnc( %eax, %ecx) C 0 if skip div, src high if not 163 C (the latter in case src==dst) 164 165 movl %ecx, -12(%edx,%ebx,4) C dst high limb 166 sbbl $0, %ebx C skip one division if high<divisor 167 movl PARAM_PREINV_SHIFT, %ecx 168 169 leal -8(%edx,%ebx,4), %edx C &dst[xsize+size] 170 movl $32, %eax 171 172 movl %edx, VAR_DST C &dst[xsize+size] 173 174 shll %cl, %ebp C d normalized 175 subl %ecx, %eax 176 movl %ecx, VAR_NORM 177 178 movd %eax, %mm7 C rshift 179 movl PARAM_PREINV_INVERSE, %eax 180 jmp L(start_preinv) 181 182EPILOGUE() 183 184 185 ALIGN(16) 186 187PROLOGUE(mpn_divrem_1c) 188deflit(`FRAME',0) 189 movl PARAM_CARRY, %edx 190 movl PARAM_SIZE, %ecx 191 subl $STACK_SPACE, %esp 192deflit(`FRAME',STACK_SPACE) 193 194 movl %ebx, SAVE_EBX 195 movl PARAM_XSIZE, %ebx 196 197 movl %edi, SAVE_EDI 198 movl PARAM_DST, %edi 199 200 movl %ebp, SAVE_EBP 201 movl PARAM_DIVISOR, %ebp 202 203 movl %esi, SAVE_ESI 204 movl PARAM_SRC, %esi 205 206 leal -4(%edi,%ebx,4), %edi C &dst[xsize-1] 207 jmp L(start_1c) 208 209EPILOGUE() 210 211 212 C offset 0xa1, close enough to aligned 213PROLOGUE(mpn_divrem_1) 214deflit(`FRAME',0) 215 216 movl PARAM_SIZE, %ecx 217 movl $0, %edx C initial carry (if can't skip a div) 218 subl $STACK_SPACE, %esp 219deflit(`FRAME',STACK_SPACE) 220 221 movl %esi, SAVE_ESI 222 movl PARAM_SRC, %esi 223 224 movl %ebx, SAVE_EBX 225 movl PARAM_XSIZE, %ebx 226 227 movl %ebp, SAVE_EBP 228 movl PARAM_DIVISOR, %ebp 229 orl %ecx, %ecx C size 230 231 movl %edi, SAVE_EDI 232 movl PARAM_DST, %edi 233 leal -4(%edi,%ebx,4), %edi C &dst[xsize-1] 234 235 jz L(no_skip_div) C if size==0 236 movl -4(%esi,%ecx,4), %eax C src high limb 237 xorl %esi, %esi 238 239 cmpl %ebp, %eax C high cmp divisor 240 241 cmovc( %eax, %edx) C high is carry if high<divisor 242 cmovnc( %eax, %esi) C 0 if skip div, src high if not 243 244 movl %esi, (%edi,%ecx,4) C dst high limb 245 sbbl $0, %ecx C size-1 if high<divisor 246 movl PARAM_SRC, %esi C reload 247L(no_skip_div): 248 249 250L(start_1c): 251 C eax 252 C ebx xsize 253 C ecx size 254 C edx carry 255 C esi src 256 C edi &dst[xsize-1] 257 C ebp divisor 258 259 leal (%ebx,%ecx), %eax C size+xsize 260 cmpl $MUL_THRESHOLD, %eax 261 jae L(mul_by_inverse) 262 263 264C With MUL_THRESHOLD set to 3, the simple loops here only do 0 to 2 limbs. 265C It'd be possible to write them out without the looping, but no speedup 266C would be expected. 267C 268C Using PARAM_DIVISOR instead of %ebp measures 1 cycle/loop faster on the 269C integer part, but curiously not on the fractional part, where %ebp is a 270C (fixed) couple of cycles faster. 271 272 orl %ecx, %ecx 273 jz L(divide_no_integer) 274 275L(divide_integer): 276 C eax scratch (quotient) 277 C ebx xsize 278 C ecx counter 279 C edx scratch (remainder) 280 C esi src 281 C edi &dst[xsize-1] 282 C ebp divisor 283 284 movl -4(%esi,%ecx,4), %eax 285 286 divl PARAM_DIVISOR 287 288 movl %eax, (%edi,%ecx,4) 289 decl %ecx 290 jnz L(divide_integer) 291 292 293L(divide_no_integer): 294 movl PARAM_DST, %edi 295 orl %ebx, %ebx 296 jnz L(divide_fraction) 297 298L(divide_done): 299 movl SAVE_ESI, %esi 300 movl SAVE_EDI, %edi 301 movl %edx, %eax 302 303 movl SAVE_EBX, %ebx 304 movl SAVE_EBP, %ebp 305 addl $STACK_SPACE, %esp 306 307 ret 308 309 310L(divide_fraction): 311 C eax scratch (quotient) 312 C ebx counter 313 C ecx 314 C edx scratch (remainder) 315 C esi 316 C edi dst 317 C ebp divisor 318 319 movl $0, %eax 320 321 divl %ebp 322 323 movl %eax, -4(%edi,%ebx,4) 324 decl %ebx 325 jnz L(divide_fraction) 326 327 jmp L(divide_done) 328 329 330 331C ----------------------------------------------------------------------------- 332 333L(mul_by_inverse): 334 C eax 335 C ebx xsize 336 C ecx size 337 C edx carry 338 C esi src 339 C edi &dst[xsize-1] 340 C ebp divisor 341 342 bsrl %ebp, %eax C 31-l 343 344 leal 12(%edi), %ebx C &dst[xsize+2], loop dst stop 345 leal 4(%edi,%ecx,4), %edi C &dst[xsize+size] 346 347 movl %edi, VAR_DST 348 movl %ebx, VAR_DST_STOP 349 350 movl %ecx, %ebx C size 351 movl $31, %ecx 352 353 movl %edx, %edi C carry 354 movl $-1, %edx 355 356 C 357 358 xorl %eax, %ecx C l 359 incl %eax C 32-l 360 361 shll %cl, %ebp C d normalized 362 movl %ecx, VAR_NORM 363 364 movd %eax, %mm7 365 366 movl $-1, %eax 367 subl %ebp, %edx C (b-d)-1 giving edx:eax = b*(b-d)-1 368 369 divl %ebp C floor (b*(b-d)-1) / d 370 371L(start_preinv): 372 C eax inverse 373 C ebx size 374 C ecx shift 375 C edx 376 C esi src 377 C edi carry 378 C ebp divisor 379 C 380 C mm7 rshift 381 382 orl %ebx, %ebx C size 383 movl %eax, VAR_INVERSE 384 leal -12(%esi,%ebx,4), %eax C &src[size-3] 385 386 jz L(start_zero) 387 movl %eax, VAR_SRC 388 cmpl $1, %ebx 389 390 movl 8(%eax), %esi C src high limb 391 jz L(start_one) 392 393L(start_two_or_more): 394 movl 4(%eax), %edx C src second highest limb 395 396 shldl( %cl, %esi, %edi) C n2 = carry,high << l 397 398 shldl( %cl, %edx, %esi) C n10 = high,second << l 399 400 cmpl $2, %ebx 401 je L(integer_two_left) 402 jmp L(integer_top) 403 404 405L(start_one): 406 shldl( %cl, %esi, %edi) C n2 = carry,high << l 407 408 shll %cl, %esi C n10 = high << l 409 movl %eax, VAR_SRC 410 jmp L(integer_one_left) 411 412 413L(start_zero): 414 C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and 415 C skipped a division. 416 417 shll %cl, %edi C n2 = carry << l 418 movl %edi, %eax C return value for zero_done 419 cmpl $0, PARAM_XSIZE 420 421 je L(zero_done) 422 jmp L(fraction_some) 423 424 425 426C ----------------------------------------------------------------------------- 427C 428C The multiply by inverse loop is 17 cycles, and relies on some out-of-order 429C execution. The instruction scheduling is important, with various 430C apparently equivalent forms running 1 to 5 cycles slower. 431C 432C A lower bound for the time would seem to be 16 cycles, based on the 433C following successive dependencies. 434C 435C cycles 436C n2+n1 1 437C mul 6 438C q1+1 1 439C mul 6 440C sub 1 441C addback 1 442C --- 443C 16 444C 445C This chain is what the loop has already, but 16 cycles isn't achieved. 446C K7 has enough decode, and probably enough execute (depending maybe on what 447C a mul actually consumes), but nothing running under 17 has been found. 448C 449C In theory n2+n1 could be done in the sub and addback stages (by 450C calculating both n2 and n2+n1 there), but lack of registers makes this an 451C unlikely proposition. 452C 453C The jz in the loop keeps the q1+1 stage to 1 cycle. Handling an overflow 454C from q1+1 with an "sbbl $0, %ebx" would add a cycle to the dependent 455C chain, and nothing better than 18 cycles has been found when using it. 456C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will 457C be an extremely rare event. 458C 459C Branch mispredictions will hit random occurrences of q1==0xFFFFFFFF, but 460C if some special data is coming out with this always, the q1_ff special 461C case actually runs at 15 c/l. 0x2FFF...FFFD divided by 3 is a good way to 462C induce the q1_ff case, for speed measurements or testing. Note that 463C 0xFFF...FFF divided by 1 or 2 doesn't induce it. 464C 465C The instruction groupings and empty comments show the cycles for a naive 466C in-order view of the code (conveniently ignoring the load latency on 467C VAR_INVERSE). This shows some of where the time is going, but is nonsense 468C to the extent that out-of-order execution rearranges it. In this case 469C there's 19 cycles shown, but it executes at 17. 470 471 ALIGN(16) 472L(integer_top): 473 C eax scratch 474 C ebx scratch (nadj, q1) 475 C ecx scratch (src, dst) 476 C edx scratch 477 C esi n10 478 C edi n2 479 C ebp divisor 480 C 481 C mm0 scratch (src qword) 482 C mm7 rshift for normalization 483 484 cmpl $0x80000000, %esi C n1 as 0=c, 1=nc 485 movl %edi, %eax C n2 486 movl VAR_SRC, %ecx 487 488 leal (%ebp,%esi), %ebx 489 cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow 490 sbbl $-1, %eax C n2+n1 491 492 mull VAR_INVERSE C m*(n2+n1) 493 494 movq (%ecx), %mm0 C next limb and the one below it 495 subl $4, %ecx 496 497 movl %ecx, VAR_SRC 498 499 C 500 501 addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag 502 leal 1(%edi), %ebx C n2+1 503 movl %ebp, %eax C d 504 505 C 506 507 adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 508 jz L(q1_ff) 509 movl VAR_DST, %ecx 510 511 mull %ebx C (q1+1)*d 512 513 psrlq %mm7, %mm0 514 515 leal -4(%ecx), %ecx 516 517 C 518 519 subl %eax, %esi 520 movl VAR_DST_STOP, %eax 521 522 C 523 524 sbbl %edx, %edi C n - (q1+1)*d 525 movl %esi, %edi C remainder -> n2 526 leal (%ebp,%esi), %edx 527 528 movd %mm0, %esi 529 530 cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 531 sbbl $0, %ebx C q 532 cmpl %eax, %ecx 533 534 movl %ebx, (%ecx) 535 movl %ecx, VAR_DST 536 jne L(integer_top) 537 538 539L(integer_loop_done): 540 541 542C ----------------------------------------------------------------------------- 543C 544C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz 545C q1_ff special case. This make the code a bit smaller and simpler, and 546C costs only 1 cycle (each). 547 548L(integer_two_left): 549 C eax scratch 550 C ebx scratch (nadj, q1) 551 C ecx scratch (src, dst) 552 C edx scratch 553 C esi n10 554 C edi n2 555 C ebp divisor 556 C 557 C mm7 rshift 558 559 cmpl $0x80000000, %esi C n1 as 0=c, 1=nc 560 movl %edi, %eax C n2 561 movl PARAM_SRC, %ecx 562 563 leal (%ebp,%esi), %ebx 564 cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow 565 sbbl $-1, %eax C n2+n1 566 567 mull VAR_INVERSE C m*(n2+n1) 568 569 movd (%ecx), %mm0 C src low limb 570 571 movl VAR_DST_STOP, %ecx 572 573 C 574 575 addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag 576 leal 1(%edi), %ebx C n2+1 577 movl %ebp, %eax C d 578 579 adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 580 581 sbbl $0, %ebx 582 583 mull %ebx C (q1+1)*d 584 585 psllq $32, %mm0 586 587 psrlq %mm7, %mm0 588 589 C 590 591 subl %eax, %esi 592 593 C 594 595 sbbl %edx, %edi C n - (q1+1)*d 596 movl %esi, %edi C remainder -> n2 597 leal (%ebp,%esi), %edx 598 599 movd %mm0, %esi 600 601 cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 602 sbbl $0, %ebx C q 603 604 movl %ebx, -4(%ecx) 605 606 607C ----------------------------------------------------------------------------- 608L(integer_one_left): 609 C eax scratch 610 C ebx scratch (nadj, q1) 611 C ecx dst 612 C edx scratch 613 C esi n10 614 C edi n2 615 C ebp divisor 616 C 617 C mm7 rshift 618 619 movl VAR_DST_STOP, %ecx 620 cmpl $0x80000000, %esi C n1 as 0=c, 1=nc 621 movl %edi, %eax C n2 622 623 leal (%ebp,%esi), %ebx 624 cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow 625 sbbl $-1, %eax C n2+n1 626 627 mull VAR_INVERSE C m*(n2+n1) 628 629 C 630 631 C 632 633 C 634 635 addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag 636 leal 1(%edi), %ebx C n2+1 637 movl %ebp, %eax C d 638 639 C 640 641 adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 642 643 sbbl $0, %ebx C q1 if q1+1 overflowed 644 645 mull %ebx 646 647 C 648 649 C 650 651 C 652 653 subl %eax, %esi 654 655 C 656 657 sbbl %edx, %edi C n - (q1+1)*d 658 movl %esi, %edi C remainder -> n2 659 leal (%ebp,%esi), %edx 660 661 cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 662 sbbl $0, %ebx C q 663 664 movl %ebx, -8(%ecx) 665 subl $8, %ecx 666 667 668 669L(integer_none): 670 cmpl $0, PARAM_XSIZE 671 jne L(fraction_some) 672 673 movl %edi, %eax 674L(fraction_done): 675 movl VAR_NORM, %ecx 676L(zero_done): 677 movl SAVE_EBP, %ebp 678 679 movl SAVE_EDI, %edi 680 movl SAVE_ESI, %esi 681 682 movl SAVE_EBX, %ebx 683 addl $STACK_SPACE, %esp 684 685 shrl %cl, %eax 686 emms 687 688 ret 689 690 691C ----------------------------------------------------------------------------- 692C 693C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword 694C of q*d is simply -d and the remainder n-q*d = n10+d 695 696L(q1_ff): 697 C eax (divisor) 698 C ebx (q1+1 == 0) 699 C ecx 700 C edx 701 C esi n10 702 C edi n2 703 C ebp divisor 704 705 movl VAR_DST, %ecx 706 movl VAR_DST_STOP, %edx 707 subl $4, %ecx 708 709 psrlq %mm7, %mm0 710 leal (%ebp,%esi), %edi C n-q*d remainder -> next n2 711 movl %ecx, VAR_DST 712 713 movd %mm0, %esi C next n10 714 715 movl $-1, (%ecx) 716 cmpl %ecx, %edx 717 jne L(integer_top) 718 719 jmp L(integer_loop_done) 720 721 722 723C ----------------------------------------------------------------------------- 724C 725C Being the fractional part, the "source" limbs are all zero, meaning 726C n10=0, n1=0, and hence nadj=0, leading to many instructions eliminated. 727C 728C The loop runs at 15 cycles. The dependent chain is the same as the 729C general case above, but without the n2+n1 stage (due to n1==0), so 15 730C would seem to be the lower bound. 731C 732C A not entirely obvious simplification is that q1+1 never overflows a limb, 733C and so there's no need for the sbbl $0 or jz q1_ff from the general case. 734C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always. 735C rnd() means rounding down to a multiple of d. 736C 737C m*n2 + b*n2 <= m*(d-1) + b*(d-1) 738C = m*d + b*d - m - b 739C = floor((b(b-d)-1)/d)*d + b*d - m - b 740C = rnd(b(b-d)-1) + b*d - m - b 741C = rnd(b(b-d)-1 + b*d) - m - b 742C = rnd(b*b-1) - m - b 743C <= (b-2)*b 744C 745C Unchanged from the general case is that the final quotient limb q can be 746C either q1 or q1+1, and the q1+1 case occurs often. This can be seen from 747C equation 8.4 of the paper which simplifies as follows when n1==0 and 748C n0==0. 749C 750C n-q1*d = (n2*k+q0*d)/b <= d + (d*d-2d)/b 751C 752C As before, the instruction groupings and empty comments show a naive 753C in-order view of the code, which is made a nonsense by out of order 754C execution. There's 17 cycles shown, but it executes at 15. 755C 756C Rotating the store q and remainder->n2 instructions up to the top of the 757C loop gets the run time down from 16 to 15. 758 759 ALIGN(16) 760L(fraction_some): 761 C eax 762 C ebx 763 C ecx 764 C edx 765 C esi 766 C edi carry 767 C ebp divisor 768 769 movl PARAM_DST, %esi 770 movl VAR_DST_STOP, %ecx C &dst[xsize+2] 771 movl %edi, %eax 772 773 subl $8, %ecx C &dst[xsize] 774 jmp L(fraction_entry) 775 776 777 ALIGN(16) 778L(fraction_top): 779 C eax n2 carry, then scratch 780 C ebx scratch (nadj, q1) 781 C ecx dst, decrementing 782 C edx scratch 783 C esi dst stop point 784 C edi (will be n2) 785 C ebp divisor 786 787 movl %ebx, (%ecx) C previous q 788 movl %eax, %edi C remainder->n2 789 790L(fraction_entry): 791 mull VAR_INVERSE C m*n2 792 793 movl %ebp, %eax C d 794 subl $4, %ecx C dst 795 leal 1(%edi), %ebx 796 797 C 798 799 C 800 801 C 802 803 C 804 805 addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1 806 807 mull %ebx C (q1+1)*d 808 809 C 810 811 C 812 813 C 814 815 negl %eax C low of n - (q1+1)*d 816 817 C 818 819 sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry 820 leal (%ebp,%eax), %edx 821 822 cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1 823 sbbl $0, %ebx C q 824 cmpl %esi, %ecx 825 826 jne L(fraction_top) 827 828 829 movl %ebx, (%ecx) 830 jmp L(fraction_done) 831 832EPILOGUE() 833