1dnl AMD K6 mpn_divexact_1 -- mpn by limb exact division. 2 3dnl Copyright 2000, 2001, 2002, 2007 Free Software Foundation, Inc. 4dnl 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or 8dnl modify it under the terms of the GNU Lesser General Public License as 9dnl published by the Free Software Foundation; either version 3 of the 10dnl License, or (at your option) any later version. 11dnl 12dnl The GNU MP Library is distributed in the hope that it will be useful, 13dnl but WITHOUT ANY WARRANTY; without even the implied warranty of 14dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15dnl Lesser General Public License for more details. 16dnl 17dnl You should have received a copy of the GNU Lesser General Public License 18dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 19 20include(`../config.m4') 21 22 23C divisor 24C odd even 25C K6: 10.0 12.0 cycles/limb 26C K6-2: 10.0 11.5 27 28 29C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, 30C mp_limb_t divisor); 31C 32C A simple divl is used for size==1. This is about 10 cycles faster for an 33C odd divisor or 20 cycles for an even divisor. 34C 35C The loops are quite sensitive to code alignment, speeds should be 36C rechecked (odd and even divisor, pic and non-pic) if contemplating 37C changing anything. 
dnl  Stack parameter offsets (cdecl: args pushed right-to-left above retaddr).
defframe(PARAM_DIVISOR,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,   8)
defframe(PARAM_DST,   4)

dnl  re-use parameter space
dnl  PARAM_DST is consumed early (dst end pointer kept in %edi), so its
dnl  stack slot is recycled to hold the 32-bit inverse of the divisor.
define(VAR_INVERSE,`PARAM_DST')

	TEXT

	ALIGN(32)
PROLOGUE(mpn_divexact_1)
deflit(`FRAME',0)

	C Exact division: since src is known to be an exact multiple of
	C divisor, each quotient limb is obtained as
	C     q = (s - c) * inverse  mod 2^32
	C where inverse is the multiplicative inverse of the odd part of
	C divisor mod 2^32, and c is the carry (high limb of q*d) from the
	C previous step.  An even divisor is handled by shifting out its
	C trailing zero bits from the source on the fly (MMX psrlq below).

	movl	PARAM_SIZE, %ecx

	movl	PARAM_SRC, %eax
	xorl	%edx, %edx			C edx=0: high half for divl, and twos counter

	cmpl	$1, %ecx
	jnz	L(two_or_more)

	C size==1: a plain divl is faster than setting up the inverse
	C (about 10 cycles saved for odd divisors, 20 for even -- see header).
	movl	(%eax), %eax

	divl	PARAM_DIVISOR

	movl	PARAM_DST, %ecx
	movl	%eax, (%ecx)

	ret


L(two_or_more):
	movl	PARAM_DIVISOR, %eax
	pushl	%ebx		FRAME_pushl()

	movl	PARAM_SRC, %ebx
	pushl	%ebp		FRAME_pushl()

L(strip_twos):
	C Count trailing zero bits of the divisor.  Loop exits with the
	C odd part (halved, minus its low 1 bit) in %eax and edx = twos+1.
	shrl	%eax
	incl	%edx			C will get shift+1

	jnc	L(strip_twos)
	pushl	%esi	FRAME_pushl()

	leal	1(%eax,%eax), %esi	C d without twos
	andl	$127, %eax		C d/2, 7 bits

	C Table lookup gives an inverse of d good to 8 bits; the table is
	C indexed by (d>>1)&0x7f, i.e. bits 1..7 of the odd divisor.
ifdef(`PIC',`
	LEA(	binvert_limb_table, %ebp)
Zdisp(	movzbl,	0,(%eax,%ebp), %eax)
',`
	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
')
	pushl	%edi	FRAME_pushl()

	C Two Newton iterations, each doubling the number of correct bits:
	C     inv' = 2*inv - inv*inv*d
	C 8 bits -> 16 bits -> 32 bits.  The iteration is interleaved with
	C unrelated pointer setup to keep the K6 pipes busy.
	leal	(%eax,%eax), %ebp	C 2*inv

	imull	%eax, %eax		C inv*inv

	movl	PARAM_DST, %edi

	imull	%esi, %eax		C inv*inv*d

	subl	%eax, %ebp		C inv = 2*inv - inv*inv*d, 16 bits
	leal	(%ebp,%ebp), %eax	C 2*inv

	imull	%ebp, %ebp		C inv*inv

	movl	%esi, PARAM_DIVISOR	C d without twos
	leal	(%ebx,%ecx,4), %ebx	C src end

	imull	%esi, %ebp		C inv*inv*d

	leal	(%edi,%ecx,4), %edi	C dst end
	negl	%ecx			C -size

	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d, 32 bits
	subl	$1, %edx		C shift amount, and clear carry

	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
	pushl	%eax	FRAME_pushl()
	imull	PARAM_DIVISOR, %eax
	cmpl	$1, %eax
	popl	%eax	FRAME_popl()')

	C ZF from "subl $1,%edx" above is still valid here (movl doesn't
	C touch flags): edx==0 means the divisor was odd, no shifting needed.
	movl	%eax, VAR_INVERSE
	jnz	L(even)

	movl	(%ebx,%ecx,4), %esi	C src low limb
	jmp	L(odd_entry)


	ALIGN(16)
	nop	C code alignment
L(odd_top):
	C eax	scratch
	C ebx	src end
	C ecx	counter, limbs, negative
	C edx	inverse
	C esi	next limb, adjusted for carry
	C edi	dst end
	C ebp	carry bit, 0 or -1

	C Odd-divisor loop, 10 c/l: q = esi*inv is stored, then mull
	C recovers the carry limb (high half of q*d) to subtract from the
	C next source limb; sbbl folds the borrow into ebp for next round.
	imull	%edx, %esi

	movl	PARAM_DIVISOR, %eax
	movl	%esi, -4(%edi,%ecx,4)

	mull	%esi			C carry limb in edx

	subl	%ebp, %edx		C apply carry bit
	movl	(%ebx,%ecx,4), %esi

L(odd_entry):
	subl	%edx, %esi		C apply carry limb
	movl	VAR_INVERSE, %edx

	sbbl	%ebp, %ebp		C 0 or -1

	incl	%ecx
	jnz	L(odd_top)


	C Final limb: no further carry is needed, a plain multiply by the
	C inverse gives the top quotient limb directly.
	imull	%edx, %esi

	movl	%esi, -4(%edi,%ecx,4)

	popl	%edi
	popl	%esi

	popl	%ebp
	popl	%ebx

	ret


L(even):
	C eax
	C ebx	src end
	C ecx	-size
	C edx	twos
	C esi
	C edi	dst end
	C ebp

	C Even divisor: the source is logically shifted right by "twos"
	C bits before the same multiply-by-inverse scheme is applied.  A
	C 64-bit MMX psrlq on each limb pair yields the shifted limb, so no
	C separate shift pass over the operand is needed.
	xorl	%ebp, %ebp
Zdisp(	movq,	0,(%ebx,%ecx,4), %mm0)	C src[0,1]

	movd	%edx, %mm7		C mm7 = shift count (twos)
	movl	VAR_INVERSE, %edx

	addl	$2, %ecx		C ZF set if size==2
	psrlq	%mm7, %mm0

	movd	%mm0, %esi		C src[0] >> twos
	jz	L(even_two)		C if only two limbs


C Out-of-order execution is good enough to hide the load/rshift/movd
C latency.  Having imul at the top of the loop gives 11.5 c/l instead of 12,
C on K6-2.  In fact there's only 11 of decode, but nothing running at 11 has
C been found.  Maybe the fact every second movq is unaligned costs the extra
C 0.5.

L(even_top):
	C eax	scratch
	C ebx	src end
	C ecx	counter, limbs, negative
	C edx	inverse
	C esi	next limb, adjusted for carry
	C edi	dst end
	C ebp	carry bit, 0 or -1
	C
	C mm0	scratch, source limbs
	C mm7	twos

	C Same carry scheme as the odd loop, but each source limb is the
	C low half of an unaligned 64-bit load shifted right by "twos".
	imull	%edx, %esi

	movl	%esi, -8(%edi,%ecx,4)
	movl	PARAM_DIVISOR, %eax

	mull	%esi			C carry limb in edx

	movq	-4(%ebx,%ecx,4), %mm0
	psrlq	%mm7, %mm0

	movd	%mm0, %esi
	subl	%ebp, %edx		C apply carry bit

	subl	%edx, %esi		C apply carry limb
	movl	VAR_INVERSE, %edx

	sbbl	%ebp, %ebp		C 0 or -1

	incl	%ecx
	jnz	L(even_top)


L(even_two):
	C Last two limbs: the final 64-bit load would run past the source
	C end, so the high limb is fetched with a 32-bit movd instead.
	movd	-4(%ebx), %mm0		C src high limb
	psrlq	%mm7, %mm0

	imull	%edx, %esi

	movl	%esi, -8(%edi)
	movl	PARAM_DIVISOR, %eax

	mull	%esi			C carry limb in edx

	movd	%mm0, %esi
	subl	%ebp, %edx		C apply carry bit

	movl	VAR_INVERSE, %eax
	subl	%edx, %esi		C apply carry limb

	imull	%eax, %esi

	movl	%esi, -4(%edi)

	popl	%edi
	popl	%esi

	popl	%ebp
	popl	%ebx

	emms_or_femms			C clear MMX state before returning to FP code

	ret

EPILOGUE()