div_qr_2n_pi1.asm revision 1.1.1.2
dnl  x86-64 mpn_div_qr_2n_pi1
dnl  -- Divide an mpn number by a normalized 2-limb number,
dnl     using a single-limb inverse.

dnl  Copyright 2007, 2008, 2010-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')


C		c/l

C mpn_div_qr_2n_pi1 (qp, rp, up, un, d1, d0, di)
C
C Divides the un-limb number at up by the 2-limb divisor (d1,d0), using the
C single-limb inverse di.
C
C Results, as established by the code below:
C   * %rax     qh, the top quotient limb (0 or 1).
C   * qp       quotient limbs qp[un-3] down to qp[0], one per loop pass.
C   * rp       the 2-limb remainder, rp[0] low and rp[1] high.
C
C The two top limbs up[un-1], up[un-2] are read unconditionally, so un must
C be at least 2.  NOTE(review): the header says the divisor is normalized and
C di is its inverse; both are caller obligations which are not checked here.

C INPUT PARAMETERS
define(`qp',		`%rdi')
define(`rp',		`%rsi')
define(`up_param',	`%rdx')
define(`un',		`%rcx')
define(`d1',		`%r8')
define(`d0',		`%r9')
define(`di_param',	`8(%rsp)')

C Working registers.  up is moved out of %rdx and di is loaded from the stack
C because mul writes %rdx:%rax.  %rbx and %r12-%r15 are saved and restored
C around the body.
define(`di',		`%r10')
define(`up',		`%r11')
define(`u2',		`%rbx')
define(`u1',		`%r12')
define(`t1',		`%r13')
define(`t0',		`%r14')
define(`md1',		`%r15')

C TODO
C * Store qh in the same stack slot as di_param, instead of pushing
C   it. (we could put it in register %rbp, but then we would need to
C   save and restore that instead, which doesn't seem like a win).

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_div_qr_2n_pi1)
	FUNC_ENTRY(4)
	C DOS64 only: fetch d1 and d0 from the stack and point di_param at the
	C DOS64 slot.  NOTE(review): the offsets presumably account for what
	C FUNC_ENTRY pushes; confirm against its definition.
IFDOS(`	mov	56(%rsp), %r8	')
IFDOS(`	mov	64(%rsp), %r9	')
IFDOS(`define(`di_param', `72(%rsp)')')
	C di must be read before the pushes below move %rsp.
	mov	di_param, di
	mov	up_param, up
	push	%r15
	push	%r14
	push	%r13
	push	%r12
	push	%rbx

	C (u2,u1) = the two most significant limbs of the dividend.
	mov	-16(up, un, 8), u1
	mov	-8(up, un, 8), u2

	C Conditionally subtract the divisor once: (t1,t0) = (u2,u1) - (d1,d0),
	C kept only when the subtraction did not borrow.
	mov	u1, t0
	mov	u2, t1
	sub	d0, t0
	sbb	d1, t1
	cmovnc  t0, u1
	cmovnc	t1, u2
	C push qh which is !carry
	sbb	%rax, %rax		C %rax = -carry
	inc	%rax			C %rax = 1 - carry
	push	%rax
	lea	-2(un), un		C un-2 limbs remain; lea keeps flags intact
	mov	d1, md1
	neg	md1			C md1 = -d1, used by the imul in the loop

	jmp	L(next)

	ALIGN(16)
L(loop):
	C udiv_qr_3by2 (q,u2,u1,u2,u1,n0, d1,d0,di)
	C Based on the optimized divrem_2.asm code.

	mov	di, %rax
	mul	u2			C %rdx:%rax = di * u2
	mov	u1, t0
	add	%rax, t0	C q0 in t0
	adc	u2, %rdx
	mov	%rdx, t1	C q in t1
	imul	md1, %rdx		C %rdx = -q * d1 (low limb)
	mov	d0, %rax
	lea	(%rdx, u1), u2		C u2 = u1 - q * d1
	mul	t1			C %rdx:%rax = q * d0
	mov	(up, un, 8), u1		C bring in the next dividend limb
	sub	d0, u1			C (u2,u1) -= (d1,d0)
	sbb	d1, u2
	sub	%rax, u1		C (u2,u1) -= q * d0
	sbb	%rdx, u2
	xor	R32(%rax), R32(%rax)
	xor	R32(%rdx), R32(%rdx)
	cmp	t0, u2			C carry set iff u2 < q0
	cmovnc	d0, %rax		C no carry: select (d1,d0) to add back,
	cmovnc	d1, %rdx		C otherwise the addend stays zero
	adc	$0, t1			C carry: q += 1 (cmov leaves flags alone)
	nop
	add	%rax, u1		C (u2,u1) += selected addend
	adc	%rdx, u2
	cmp	d1, u2
	jae	L(fix)			C flags from this cmp are reused at L(fix)
L(bck):
	mov	t1, (qp, un, 8)		C store this quotient limb
L(next):
	sub	$1, un
	jnc	L(loop)			C continue until un borrows below zero
L(end):
	mov	u2, 8(rp)
	mov	u1, (rp)

	C qh on stack
	pop	%rax

	pop	%rbx
	pop	%r12
	pop	%r13
	pop	%r14
	pop	%r15
	FUNC_EXIT()
	ret

L(fix):	C Unlikely update. u2 >= d1
	seta	%dl			C u2 > d1, from the cmp before the jae
	cmp	d0, u1
	setae	%al			C u1 >= d0
	orb	%dl, %al		C "orb" form to placate Sun tools
	je	L(bck)			C neither holds: (u2,u1) < (d1,d0), no fix
	inc	t1
	sub	d0, u1
	sbb	d1, u2
	jmp	L(bck)
EPILOGUE()