dnl  x86-64 mpn_divrem_1 -- mpn by limb division.

dnl  Copyright 2004, 2005, 2007, 2008, 2009 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     norm	unorm	frac
C K8		13	13	12
C P4		44.2	44.2	42.3
C P6 core2	25	24.5	19.3
C P6 corei7	21.5	20.7	18
C P6 atom	42	52	37

C TODO
C  * Compute the inverse without relying on the div instruction.
C    Newton's method and mulq, or perhaps the faster fdiv.
C  * Tune prologue.
C  * Optimize for Core 2.

C The code for unnormalized divisors works also for normalized divisors, but
C for some reason it runs really slowly (on K8) for that case.  Use special
C code until we can address this.  The Intel Atom is also affected, but
C understandably (shld slowness).
define(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',1)

C mp_limb_t
C mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
C               mp_srcptr np, mp_size_t nn, mp_limb_t d)

C mp_limb_t
C mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
C                      mp_srcptr np, mp_size_t nn, mp_limb_t d,
C                      mp_limb_t dinv, int cnt)

C INPUT PARAMETERS
define(`qp', `%rdi')		C quotient limb array
define(`fn_param', `%rsi')	C count of low (fraction) quotient limbs
define(`up_param', `%rdx')	C dividend limb array
define(`un_param', `%rcx')	C count of dividend limbs
define(`d', `%r8')		C divisor limb
define(`dinv', `%r9')		C only for mpn_preinv_divrem_1
C shift passed on stack		C only for mpn_preinv_divrem_1

C Internal register roles after the prologue.
define(`cnt', `%rcx')		C normalization shift count
define(`up', `%rsi')		C dividend pointer
define(`fn', `%r12')		C fraction limb count (callee-saved, survives div)
define(`un', `%rbx')		C integer limb count (callee-saved)


C  rax rbx rcx rdx rsi rdi rbp r8  r9  r10 r11 r12 r13 r14 r15
C          cnt         qp      d   dinv

ASM_START()
	TEXT
	ALIGN(16)
C Entry point with caller-supplied reciprocal: dinv arrives in %r9 and the
C normalization shift count is the 7th argument, passed on the stack.  It
C merges into the mpn_divrem_1 loops at L(nent) (normalized d, sign bit set)
C or L(uent) (unnormalized d).
PROLOGUE(mpn_preinv_divrem_1)
	xor	%eax, %eax		C initial remainder = 0
	push	%r13
	push	%r12
	push	%rbp
	push	%rbx

	mov	fn_param, fn
	mov	un_param, un
	add	fn_param, un_param	C un_param = total quotient limbs
	mov	up_param, up

	lea	-8(qp,un_param,8), qp	C point qp at most significant q limb

	test	d, d
	js	L(nent)			C d already normalized (high bit set)
	mov	40(%rsp), R8(cnt)	C 7th arg: 8 (ret addr) + 4*8 (pushes)
	shl	R8(cnt), d		C pre-shift divisor
	jmp	L(uent)
EPILOGUE()

	ALIGN(16)
C Plain entry point: computes the reciprocal itself with a div instruction,
C then falls into the same core loops.
PROLOGUE(mpn_divrem_1)
	xor	%eax, %eax		C initial remainder = 0
	push	%r13
	push	%r12
	push	%rbp
	push	%rbx

	mov	fn_param, fn
	mov	un_param, un
	add	fn_param, un_param	C total quotient limbs; sets ZF
	mov	up_param, up		C mov leaves flags intact
	je	L(ret)			C fn + nn == 0: nothing to do

	lea	-8(qp,un_param,8), qp	C point qp at most significant q limb
	xor	R32(%rbp), R32(%rbp)	C %rbp = running remainder = 0


ifdef(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',`
	test	d, d
	jns	L(unnormalized)		C sign bit clear: take shifted path

C Divisor has its high bit set, so no normalization shift is needed.
L(normalized):
	test	un, un
	je	L(8)			C un == 0
C Divide the top limb by hand: since d is normalized, q is 0 or 1.
	mov	-8(up,un,8), %rbp
	dec	un
	mov	%rbp, %rax
	sub	d, %rbp
	cmovb	%rax, %rbp		C keep original limb if it was < d
	sbb	%eax, %eax		C %eax = -(borrow)
	inc	%eax			C %eax = 1 - borrow = top q limb
	mov	%rax, (qp)
	lea	-8(qp), qp
L(8):
C Compute the reciprocal: dinv = floor((B^2 - 1)/d) - B, where B = 2^64,
C via the one div instruction  (B-1-d)*B + (B-1)  divided by d.
	mov	d, %rdx
	mov	$-1, %rax
	not	%rdx
	div	d		C FREE rax rdx rcx r9 r10 r11
	mov	%rax, dinv
	mov	%rbp, %rax	C %rax = partial remainder
	jmp	L(nent)

C Core loop, normalized d.  Each pass divides the two-limb value
C rax:limb by d: mul/add/adc form a candidate quotient limb in %r13,
C imul reconstructs the remainder, and cmovb plus the L(nfx) path
C apply the (at most two) corrections the estimate may need.
	ALIGN(16)
L(nloop):			C	cycK8  cycP6  cycP4
	mov	(up,un,8), %r10	C
	lea	1(%rax), %rbp	C
	mul	dinv		C	0,13   0,19   0,45
	add	%r10, %rax	C	4      8      12
	adc	%rbp, %rdx	C	5      9      13
	mov	%rax, %rbp	C	5      9      13
	mov	%rdx, %r13	C	6      11     23
	imul	d, %rdx		C	6      11     23
	sub	%rdx, %r10	C	10     16     33
	mov	d, %rax		C
	add	%r10, %rax	C	11     17     34
	cmp	%rbp, %r10	C	11     17     34
	cmovb	%r10, %rax	C	12     18     35
	adc	$-1, %r13	C	quotient adjust from the cmp carry
	cmp	d, %rax		C
	jae	L(nfx)		C	remainder still >= d: final fix-up
L(nok):	mov	%r13, (qp)	C
	sub	$8, qp		C
L(nent):dec	un		C
	jns	L(nloop)	C

	xor	%ecx, %ecx	C no shift was applied; clear cnt
	jmp	L(87)

L(nfx):	sub	d, %rax		C remainder -= d, quotient += 1
	inc	%r13
	jmp	L(nok)
')

C Divisor may lack its high bit; shift both d and the dividend left by
C cnt = count of leading zeros of d, and compensate at the end.
L(unnormalized):
	test	un, un
	je	L(44)
	mov	-8(up,un,8), %rax
	cmp	d, %rax
	jae	L(44)
C Top limb < d: top quotient limb is 0, top limb becomes the remainder.
	mov	%rbp, (qp)
	mov	%rax, %rbp
	lea	-8(qp), qp
	je	L(ret)		C ZF from cmp: dividend limb was 0
	dec	un
L(44):
	bsr	d, %rcx		C index of highest set bit of d
	not	%ecx		C low 6 bits: 63 - bsr = leading zeros
	sal	%cl, d		C normalize divisor
	sal	%cl, %rbp	C pre-shift partial remainder to match
C Reciprocal computation, same formula as in the normalized path.
	mov	d, %rdx
	mov	$-1, %rax
	not	%rdx
	div	d		C FREE rax rdx r9 r10 r11
	test	un, un
	mov	%rax, dinv
	mov	%rbp, %rax
	je	L(87)		C no integer limbs: go straight to fraction
L(uent):
C Fold the shifted-out high bits of the next limb into the remainder.
	mov	-8(up,un,8), %rbp
	shr	%cl, %rax
	shld	%cl, %rbp, %rax
	sub	$2, un
	js	L(ulast)

C Core loop, unnormalized d: same division step as L(nloop), with an
C extra shld per pass to feed shifted dividend limbs through %rbp.
	ALIGN(16)
L(uloop):
	nop
	mov	(up,un,8), %r10
	lea	1(%rax), %r11
	shld	%cl, %r10, %rbp
	mul	dinv
	add	%rbp, %rax
	adc	%r11, %rdx
	mov	%rax, %r11
	mov	%rdx, %r13
	imul	d, %rdx
	sub	%rdx, %rbp
	mov	d, %rax
	add	%rbp, %rax
	cmp	%r11, %rbp
	cmovb	%rbp, %rax
	adc	$-1, %r13	C quotient adjust from the cmp carry
	cmp	d, %rax
	jae	L(ufx)
L(uok):	mov	%r13, (qp)
	sub	$8, qp
	dec	un
	mov	%r10, %rbp
	jns	L(uloop)
C Last integer limb: only its shifted-in low bits remain in %rbp.
L(ulast):
	lea	1(%rax), %r11
	sal	%cl, %rbp
	mul	dinv
	add	%rbp, %rax
	adc	%r11, %rdx
	mov	%rax, %r11
	mov	%rdx, %r13
	imul	d, %rdx
	sub	%rdx, %rbp
	mov	d, %rax
	add	%rbp, %rax
	cmp	%r11, %rbp
	cmovb	%rbp, %rax
	adc	$-1, %r13
	cmp	d, %rax
	jae	L(93)
L(69):	mov	%r13, (qp)
	sub	$8, qp
	jmp	L(87)

L(ufx):	sub	d, %rax		C remainder -= d, quotient += 1
	inc	%r13
	jmp	L(uok)

L(93):	sub	d, %rax		C same fix-up for the L(ulast) step
	inc	%r13
	jmp	L(69)

C Fraction part: develop fn further quotient limbs from the remainder
C alone (dividend limbs are implicitly zero).  %rbp = -d for the imul.
L(87):	mov	d, %rbp
	neg	%rbp
	jmp	L(87b)

	ALIGN(16)
L(floop):			C	cycK8
	lea	1(%rax), %r11	C
	mul	dinv		C	0,12
	add	%r11, %rdx	C	5
	mov	%rax, %r11	C	4
	mov	%rdx, %r13	C	6
	imul	%rbp, %rdx	C	6	%rdx = new remainder (via -d)
	mov	d, %rax		C
	add	%rdx, %rax	C	10
	cmp	%r11, %rdx	C	10
	cmovb	%rdx, %rax	C	11
	adc	$-1, %r13	C	quotient adjust from the cmp carry
	mov	%r13, (qp)	C
	sub	$8, qp		C
L(87b):	dec	fn		C
	jns	L(floop)	C

	shr	%cl, %rax	C undo normalization shift of remainder
L(ret):	pop	%rbx
	pop	%rbp
	pop	%r12
	pop	%r13
	ret
EPILOGUE()