1dnl x86 mpn_divrem_1 -- mpn by limb division extending to fractional quotient. 2 3dnl Copyright 1999, 2000, 2001, 2002, 2003, 2007 Free Software Foundation, 4dnl Inc. 5dnl 6dnl This file is part of the GNU MP Library. 7dnl 8dnl The GNU MP Library is free software; you can redistribute it and/or 9dnl modify it under the terms of the GNU Lesser General Public License as 10dnl published by the Free Software Foundation; either version 3 of the 11dnl License, or (at your option) any later version. 12dnl 13dnl The GNU MP Library is distributed in the hope that it will be useful, 14dnl but WITHOUT ANY WARRANTY; without even the implied warranty of 15dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16dnl Lesser General Public License for more details. 17dnl 18dnl You should have received a copy of the GNU Lesser General Public License 19dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 20 21include(`../config.m4') 22 23 24C cycles/limb 25C 486 approx 43 maybe 26C P5 44 27C P6 39 28C P6MMX 39 29C K6 22 30C K7 42 31C P4 58 32 33 34C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, 35C mp_srcptr src, mp_size_t size, mp_limb_t divisor); 36C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, 37C mp_srcptr src, mp_size_t size, mp_limb_t divisor, 38C mp_limb_t carry); 39C 40C Divide src,size by divisor and store the quotient in dst+xsize,size. 41C Extend the division to fractional quotient limbs in dst,xsize. Return the 42C remainder. Either or both xsize and size can be 0. 43C 44C mpn_divrem_1c takes a carry parameter which is an initial high limb, 45C effectively one extra limb at the top of src,size. Must have 46C carry<divisor. 47C 48C 49C Essentially the code is the same as the division based part of 50C mpn/generic/divrem_1.c, but has the advantage that we get the desired divl 51C instruction even when gcc is not being used (when longlong.h only has the 52C rather slow generic C udiv_qrnnd(). 53C 54C A test is done to see if the high limb is less than the divisor, and if so 55C one less div is done. A div is between 20 and 40 cycles on the various 56C x86s, so assuming high<divisor about half the time, then this test saves 57C half that amount. The branch misprediction penalty on each chip is less 58C than half a div. 59C 60C 61C Notes for P5: 62C 63C It might be thought that moving the load down to pair with the store would 64C save 1 cycle, but that doesn't seem to happen in practice, and in any case 65C would be a mere 2.2% saving, so it's hardly worth bothering about. 66C 67C A mul-by-inverse might be a possibility for P5, as done in 68C mpn/x86/pentium/mod_1.asm. The number of auxiliary instructions required 69C is a hinderance, but there could be a 10-15% speedup available. 70C 71C 72C Notes for K6: 73C 74C K6 has its own version of this code, using loop and paying attention to 75C cache line boundary crossings. The target 20 c/l can be had with the 76C decl+jnz of the present code by pairing up the load and store in the 77C loops. But it's considered easier not to introduce complexity just for 78C that, but instead let k6 have its own code. 79C 80 81defframe(PARAM_CARRY, 24) 82defframe(PARAM_DIVISOR,20) 83defframe(PARAM_SIZE, 16) 84defframe(PARAM_SRC, 12) 85defframe(PARAM_XSIZE, 8) 86defframe(PARAM_DST, 4) 87 88 TEXT 89 ALIGN(16) 90 91PROLOGUE(mpn_divrem_1c) 92deflit(`FRAME',0) 93 94 movl PARAM_SIZE, %ecx 95 pushl %edi FRAME_pushl() 96 97 movl PARAM_SRC, %edi 98 pushl %esi FRAME_pushl() 99 100 movl PARAM_DIVISOR, %esi 101 pushl %ebx FRAME_pushl() 102 103 movl PARAM_DST, %ebx 104 pushl %ebp FRAME_pushl() 105 106 movl PARAM_XSIZE, %ebp 107 orl %ecx, %ecx 108 109 movl PARAM_CARRY, %edx 110 jz L(fraction) 111 112 leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part 113 jmp L(integer_top) 114 115EPILOGUE() 116 117 118PROLOGUE(mpn_divrem_1) 119deflit(`FRAME',0) 120 121 movl PARAM_SIZE, %ecx 122 pushl %edi FRAME_pushl() 123 124 movl PARAM_SRC, %edi 125 pushl %esi FRAME_pushl() 126 127 movl PARAM_DIVISOR, %esi 128 orl %ecx,%ecx 129 130 jz L(size_zero) 131 pushl %ebx FRAME_pushl() 132 133 movl -4(%edi,%ecx,4), %eax C src high limb 134 xorl %edx, %edx 135 136 movl PARAM_DST, %ebx 137 pushl %ebp FRAME_pushl() 138 139 movl PARAM_XSIZE, %ebp 140 cmpl %esi, %eax 141 142 leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part 143 jae L(integer_entry) 144 145 146 C high<divisor, so high of dst is zero, and avoid one div 147 148 movl %edx, (%ebx,%ecx,4) 149 decl %ecx 150 151 movl %eax, %edx 152 jz L(fraction) 153 154 155L(integer_top): 156 C eax scratch (quotient) 157 C ebx dst+4*xsize-4 158 C ecx counter 159 C edx scratch (remainder) 160 C esi divisor 161 C edi src 162 C ebp xsize 163 164 movl -4(%edi,%ecx,4), %eax 165L(integer_entry): 166 167 divl %esi 168 169 movl %eax, (%ebx,%ecx,4) 170 decl %ecx 171 jnz L(integer_top) 172 173 174L(fraction): 175 orl %ebp, %ecx 176 jz L(done) 177 178 movl PARAM_DST, %ebx 179 180 181L(fraction_top): 182 C eax scratch (quotient) 183 C ebx dst 184 C ecx counter 185 C edx scratch (remainder) 186 C esi divisor 187 C edi 188 C ebp 189 190 xorl %eax, %eax 191 192 divl %esi 193 194 movl %eax, -4(%ebx,%ecx,4) 195 decl %ecx 196 jnz L(fraction_top) 197 198 199L(done): 200 popl %ebp 201 movl %edx, %eax 202 popl %ebx 203 popl %esi 204 popl %edi 205 ret 206 207 208L(size_zero): 209deflit(`FRAME',8) 210 movl PARAM_XSIZE, %ecx 211 xorl %eax, %eax 212 213 movl PARAM_DST, %edi 214 215 cld C better safe than sorry, see mpn/x86/README 216 217 rep 218 stosl 219 220 popl %esi 221 popl %edi 222 ret 223EPILOGUE() 224