1dnl Intel Pentium MMX mpn_mul_1 -- mpn by limb multiplication. 2 3dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc. 4dnl 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or 8dnl modify it under the terms of the GNU Lesser General Public License as 9dnl published by the Free Software Foundation; either version 3 of the 10dnl License, or (at your option) any later version. 11dnl 12dnl The GNU MP Library is distributed in the hope that it will be useful, 13dnl but WITHOUT ANY WARRANTY; without even the implied warranty of 14dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15dnl Lesser General Public License for more details. 16dnl 17dnl You should have received a copy of the GNU Lesser General Public License 18dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 19 20include(`../config.m4') 21 22 23C cycles/limb 24C P5: 12.0 for 32-bit multiplier 25C 7.0 for 16-bit multiplier 26 27 28C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, 29C mp_limb_t multiplier); 30C 31C When the multiplier is 16 bits some special case MMX code is used. Small 32C multipliers might arise reasonably often from mpz_mul_ui etc. If the size 33C is odd there's roughly a 5 cycle penalty, so times for say size==7 and 34C size==8 end up being quite close. If src isn't aligned to an 8 byte 35C boundary then one limb is processed separately with roughly a 5 cycle 36C penalty, so in that case it's say size==8 and size==9 which are close. 37C 38C Alternatives: 39C 40C MMX is not believed to be of any use for 32-bit multipliers, since for 41C instance the current method would just have to be more or less duplicated 42C for the high and low halves of the multiplier, and would probably 43C therefore run at about 14 cycles, which is slower than the plain integer 44C at 12. 45C 46C Adding the high and low MMX products using integer code seems best. An 47C attempt at using paddd and carry bit propagation with pcmpgtd didn't give 48C any joy. Perhaps something could be done keeping the values signed and 49C thereby avoiding adjustments to make pcmpgtd into an unsigned compare, or 50C perhaps not. 51C 52C Future: 53C 54C An mpn_mul_1c entrypoint would need a double carry out of the low result 55C limb in the 16-bit code, unless it could be assumed the carry fits in 16 56C bits, possibly as carry<multiplier, this being true of a big calculation 57C done piece by piece. But let's worry about that if/when mul_1c is 58C actually used. 59 60defframe(PARAM_MULTIPLIER,16) 61defframe(PARAM_SIZE, 12) 62defframe(PARAM_SRC, 8) 63defframe(PARAM_DST, 4) 64 65 TEXT 66 67 ALIGN(8) 68PROLOGUE(mpn_mul_1) 69deflit(`FRAME',0) 70 71 movl PARAM_SIZE, %ecx 72 movl PARAM_SRC, %edx 73 74 cmpl $1, %ecx 75 jne L(two_or_more) 76 77 C one limb only 78 79 movl PARAM_MULTIPLIER, %eax 80 movl PARAM_DST, %ecx 81 82 mull (%edx) 83 84 movl %eax, (%ecx) 85 movl %edx, %eax 86 87 ret 88 89 90L(two_or_more): 91 C eax size 92 C ebx 93 C ecx carry 94 C edx 95 C esi src 96 C edi 97 C ebp 98 99 pushl %esi FRAME_pushl() 100 pushl %edi FRAME_pushl() 101 102 movl %edx, %esi C src 103 movl PARAM_DST, %edi 104 105 movl PARAM_MULTIPLIER, %eax 106 pushl %ebx FRAME_pushl() 107 108 leal (%esi,%ecx,4), %esi C src end 109 leal (%edi,%ecx,4), %edi C dst end 110 111 negl %ecx C -size 112 113 pushl %ebp FRAME_pushl() 114 cmpl $65536, %eax 115 116 jb L(small) 117 118 119L(big): 120 xorl %ebx, %ebx C carry limb 121 sarl %ecx C -size/2 122 123 jnc L(top) C with carry flag clear 124 125 126 C size was odd, process one limb separately 127 128 mull 4(%esi,%ecx,8) C m * src[0] 129 130 movl %eax, 4(%edi,%ecx,8) 131 incl %ecx 132 133 orl %edx, %ebx C carry limb, and clear carry flag 134 135 136L(top): 137 C eax 138 C ebx carry 139 C ecx counter, negative 140 C edx 141 C esi src end 142 C edi dst end 143 C ebp (scratch carry) 144 145 adcl $0, %ebx 146 movl (%esi,%ecx,8), %eax 147 148 mull PARAM_MULTIPLIER 149 150 movl %edx, %ebp 151 addl %eax, %ebx 152 153 adcl $0, %ebp 154 movl 4(%esi,%ecx,8), %eax 155 156 mull PARAM_MULTIPLIER 157 158 movl %ebx, (%edi,%ecx,8) 159 addl %ebp, %eax 160 161 movl %eax, 4(%edi,%ecx,8) 162 incl %ecx 163 164 movl %edx, %ebx 165 jnz L(top) 166 167 168 adcl $0, %ebx 169 popl %ebp 170 171 movl %ebx, %eax 172 popl %ebx 173 174 popl %edi 175 popl %esi 176 177 ret 178 179 180L(small): 181 C Special case for 16-bit multiplier. 182 C 183 C eax multiplier 184 C ebx 185 C ecx -size 186 C edx src 187 C esi src end 188 C edi dst end 189 C ebp multiplier 190 191 C size<3 not supported here. At size==3 we're already a couple of 192 C cycles faster, so there's no threshold as such, just use the MMX 193 C as soon as possible. 194 195 cmpl $-3, %ecx 196 ja L(big) 197 198 movd %eax, %mm7 C m 199 pxor %mm6, %mm6 C initial carry word 200 201 punpcklwd %mm7, %mm7 C m replicated 2 times 202 addl $2, %ecx C -size+2 203 204 punpckldq %mm7, %mm7 C m replicated 4 times 205 andl $4, %edx C test alignment, clear carry flag 206 207 movq %mm7, %mm0 C m 208 jz L(small_entry) 209 210 211 C Source is unaligned, process one limb separately. 212 C 213 C Plain integer code is used here, since it's smaller and is about 214 C the same 13 cycles as an mmx block would be. 215 C 216 C An "addl $1,%ecx" doesn't clear the carry flag when size==3, hence 217 C the use of separate incl and orl. 218 219 mull -8(%esi,%ecx,4) C m * src[0] 220 221 movl %eax, -8(%edi,%ecx,4) C dst[0] 222 incl %ecx C one limb processed 223 224 movd %edx, %mm6 C initial carry 225 226 orl %eax, %eax C clear carry flag 227 jmp L(small_entry) 228 229 230C The scheduling here is quite tricky, since so many instructions have 231C pairing restrictions. In particular the js won't pair with a movd, and 232C can't be paired with an adc since it wants flags from the inc, so 233C instructions are rotated to the top of the loop to find somewhere useful 234C for it. 235C 236C Trouble has been taken to avoid overlapping successive loop iterations, 237C since that would greatly increase the size of the startup and finishup 238C code. Actually there's probably not much advantage to be had from 239C overlapping anyway, since the difficulties are mostly with pairing, not 240C with latencies as such. 241C 242C In the comments x represents the src data and m the multiplier (16 243C bits, but replicated 4 times). 244C 245C The m signs calculated in %mm3 are a loop invariant and could be held in 246C say %mm5, but that would save only one instruction and hence be no faster. 247 248L(small_top): 249 C eax l.low, then l.high 250 C ebx (h.low) 251 C ecx counter, -size+2 to 0 or 1 252 C edx (h.high) 253 C esi &src[size] 254 C edi &dst[size] 255 C ebp 256 C 257 C %mm0 (high products) 258 C %mm1 (low products) 259 C %mm2 (adjust for m using x signs) 260 C %mm3 (adjust for x using m signs) 261 C %mm4 262 C %mm5 263 C %mm6 h.low, then carry 264 C %mm7 m replicated 4 times 265 266 movd %mm6, %ebx C h.low 267 psrlq $32, %mm1 C l.high 268 269 movd %mm0, %edx C h.high 270 movq %mm0, %mm6 C new c 271 272 adcl %eax, %ebx 273 incl %ecx 274 275 movd %mm1, %eax C l.high 276 movq %mm7, %mm0 277 278 adcl %eax, %edx 279 movl %ebx, -16(%edi,%ecx,4) 280 281 movl %edx, -12(%edi,%ecx,4) 282 psrlq $32, %mm6 C c 283 284L(small_entry): 285 pmulhw -8(%esi,%ecx,4), %mm0 C h = (x*m).high 286 movq %mm7, %mm1 287 288 pmullw -8(%esi,%ecx,4), %mm1 C l = (x*m).low 289 movq %mm7, %mm3 290 291 movq -8(%esi,%ecx,4), %mm2 C x 292 psraw $15, %mm3 C m signs 293 294 pand -8(%esi,%ecx,4), %mm3 C x selected by m signs 295 psraw $15, %mm2 C x signs 296 297 paddw %mm3, %mm0 C add x to h if m neg 298 pand %mm7, %mm2 C m selected by x signs 299 300 paddw %mm2, %mm0 C add m to h if x neg 301 incl %ecx 302 303 movd %mm1, %eax C l.low 304 punpcklwd %mm0, %mm6 C c + h.low << 16 305 306 psrlq $16, %mm0 C h.high 307 js L(small_top) 308 309 310 311 312 movd %mm6, %ebx C h.low 313 psrlq $32, %mm1 C l.high 314 315 adcl %eax, %ebx 316 popl %ebp FRAME_popl() 317 318 movd %mm0, %edx C h.high 319 psrlq $32, %mm0 C l.high 320 321 movd %mm1, %eax C l.high 322 323 adcl %eax, %edx 324 movl %ebx, -12(%edi,%ecx,4) 325 326 movd %mm0, %eax C c 327 328 adcl $0, %eax 329 movl %edx, -8(%edi,%ecx,4) 330 331 orl %ecx, %ecx 332 jnz L(small_done) C final %ecx==1 means even, ==0 odd 333 334 335 C Size odd, one extra limb to process. 336 C Plain integer code is used here, since it's smaller and is about 337 C the same speed as another mmx block would be. 338 339 movl %eax, %ecx 340 movl PARAM_MULTIPLIER, %eax 341 342 mull -4(%esi) 343 344 addl %ecx, %eax 345 346 adcl $0, %edx 347 movl %eax, -4(%edi) 348 349 movl %edx, %eax 350L(small_done): 351 popl %ebx 352 353 popl %edi 354 popl %esi 355 356 emms 357 358 ret 359 360EPILOGUE() 361