1dnl AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple. 2 3dnl Copyright 1999, 2000, 2001, 2002, 2003, 2005 Free Software Foundation, 4dnl Inc. 5dnl 6dnl This file is part of the GNU MP Library. 7dnl 8dnl The GNU MP Library is free software; you can redistribute it and/or 9dnl modify it under the terms of the GNU Lesser General Public License as 10dnl published by the Free Software Foundation; either version 3 of the 11dnl License, or (at your option) any later version. 12dnl 13dnl The GNU MP Library is distributed in the hope that it will be useful, 14dnl but WITHOUT ANY WARRANTY; without even the implied warranty of 15dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16dnl Lesser General Public License for more details. 17dnl 18dnl You should have received a copy of the GNU Lesser General Public License 19dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 20 21include(`../config.m4') 22 23 24C cycles/limb 25C P5: 26C P6 model 0-8,10-12) 5.94 27C P6 model 9 (Banias) 28C P6 model 13 (Dothan) 5.57 29C P4 model 0 (Willamette) 30C P4 model 1 (?) 31C P4 model 2 (Northwood) 32C P4 model 3 (Prescott) 33C P4 model 4 (Nocona) 34C K6: 7.65-8.5 (data dependent) 35C K7: 36C K8: 37 38 39dnl K6: large multipliers small multipliers 40dnl UNROLL_COUNT cycles/limb cycles/limb 41dnl 4 9.5 7.78 42dnl 8 9.0 7.78 43dnl 16 8.4 7.65 44dnl 32 8.4 8.2 45dnl 46dnl Maximum possible unrolling with the current code is 32. 47dnl 48dnl Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256 49dnl byte block, which might explain the good speed at that unrolling. 50 51deflit(UNROLL_COUNT, 16) 52 53 54ifdef(`OPERATION_addmul_1', ` 55 define(M4_inst, addl) 56 define(M4_function_1, mpn_addmul_1) 57 define(M4_function_1c, mpn_addmul_1c) 58',`ifdef(`OPERATION_submul_1', ` 59 define(M4_inst, subl) 60 define(M4_function_1, mpn_submul_1) 61 define(M4_function_1c, mpn_submul_1c) 62',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1 63')')') 64 65MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c) 66 67 68C mp_limb_t mpn_addmul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, 69C mp_limb_t mult); 70C mp_limb_t mpn_addmul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, 71C mp_limb_t mult, mp_limb_t carry); 72C mp_limb_t mpn_submul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, 73C mp_limb_t mult); 74C mp_limb_t mpn_submul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, 75C mp_limb_t mult, mp_limb_t carry); 76C 77C The jadcl0()s in the unrolled loop makes the speed data dependent. Small 78C multipliers (most significant few bits clear) result in few carry bits and 79C speeds up to 7.65 cycles/limb are attained. Large multipliers (most 80C significant few bits set) make the carry bits 50/50 and lead to something 81C more like 8.4 c/l. With adcl's both of these would be 9.3 c/l. 82C 83C It's important that the gains for jadcl0 on small multipliers don't come 84C at the cost of slowing down other data. Tests on uniformly distributed 85C random data, designed to confound branch prediction, show about a 7% 86C speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all 87C overheads included). 88C 89C In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus 90C 11.0 cycles/limb), and hence isn't used. 91C 92C In the simple loop, note that running ecx from negative to zero and using 93C it as an index in the two movs wouldn't help. It would save one 94C instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired 95C that would be collapsed by this. 96C 97C Attempts at a simpler main loop, with less unrolling, haven't yielded much 98C success, generally running over 9 c/l. 99C 100C 101C jadcl0 102C ------ 103C 104C jadcl0() being faster than adcl $0 seems to be an artifact of two things, 105C firstly the instruction decoding and secondly the fact that there's a 106C carry bit for the jadcl0 only on average about 1/4 of the time. 107C 108C The code in the unrolled loop decodes something like the following. 109C 110C decode cycles 111C mull %ebp 2 112C M4_inst %esi, disp(%edi) 1 113C adcl %eax, %ecx 2 114C movl %edx, %esi \ 1 115C jnc 1f / 116C incl %esi \ 1 117C 1: movl disp(%ebx), %eax / 118C --- 119C 7 120C 121C In a back-to-back style test this measures 7 with the jnc not taken, or 8 122C with it taken (both when correctly predicted). This is opposite to the 123C measurements showing small multipliers running faster than large ones. 124C Don't really know why. 125C 126C It's not clear how much branch misprediction might be costing. The K6 127C doco says it will be 1 to 4 cycles, but presumably it's near the low end 128C of that range to get the measured results. 129C 130C 131C In the code the two carries are more or less the preceding mul product and 132C the calculation is roughly 133C 134C x*y + u*b+v 135C 136C where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and 137C v are the two limbs it's added to (being the low of the next mul, and a 138C limb from the destination). 139C 140C To get a carry requires x*y+u*b+v >= b^2, which is u*b+v >= b^2-x*y, and 141C there are b^2-(b^2-x*y) = x*y many such values, giving a probability of 142C x*y/b^2. If x, y, u and v are random and uniformly distributed between 0 143C and b-1, then the total probability can be summed over x and y, 144C 145C 1 b-1 b-1 x*y 1 b*(b-1) b*(b-1) 146C --- * sum sum --- = --- * ------- * ------- = 1/4 147C b^2 x=0 y=1 b^2 b^4 2 2 148C 149C Actually it's a very tiny bit less than 1/4 of course. If y is fixed, 150C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2. 151 152 153ifdef(`PIC',` 154deflit(UNROLL_THRESHOLD, 9) 155',` 156deflit(UNROLL_THRESHOLD, 6) 157') 158 159defframe(PARAM_CARRY, 20) 160defframe(PARAM_MULTIPLIER,16) 161defframe(PARAM_SIZE, 12) 162defframe(PARAM_SRC, 8) 163defframe(PARAM_DST, 4) 164 165 TEXT 166 ALIGN(32) 167 168PROLOGUE(M4_function_1c) 169 pushl %esi 170deflit(`FRAME',4) 171 movl PARAM_CARRY, %esi 172 jmp L(start_nc) 173EPILOGUE() 174 175PROLOGUE(M4_function_1) 176 push %esi 177deflit(`FRAME',4) 178 xorl %esi, %esi C initial carry 179 180L(start_nc): 181 movl PARAM_SIZE, %ecx 182 pushl %ebx 183deflit(`FRAME',8) 184 185 movl PARAM_SRC, %ebx 186 pushl %edi 187deflit(`FRAME',12) 188 189 cmpl $UNROLL_THRESHOLD, %ecx 190 movl PARAM_DST, %edi 191 192 pushl %ebp 193deflit(`FRAME',16) 194 jae L(unroll) 195 196 197 C simple loop 198 199 movl PARAM_MULTIPLIER, %ebp 200 201L(simple): 202 C eax scratch 203 C ebx src 204 C ecx counter 205 C edx scratch 206 C esi carry 207 C edi dst 208 C ebp multiplier 209 210 movl (%ebx), %eax 211 addl $4, %ebx 212 213 mull %ebp 214 215 addl $4, %edi 216 addl %esi, %eax 217 218 adcl $0, %edx 219 220 M4_inst %eax, -4(%edi) 221 222 adcl $0, %edx 223 224 movl %edx, %esi 225 loop L(simple) 226 227 228 popl %ebp 229 popl %edi 230 231 popl %ebx 232 movl %esi, %eax 233 234 popl %esi 235 ret 236 237 238 239C ----------------------------------------------------------------------------- 240C The unrolled loop uses a "two carry limbs" scheme. At the top of the loop 241C the carries are ecx=lo, esi=hi, then they swap for each limb processed. 242C For the computed jump an odd size means they start one way around, an even 243C size the other. 244C 245C VAR_JUMP holds the computed jump temporarily because there's not enough 246C registers at the point of doing the mul for the initial two carry limbs. 247C 248C The add/adc for the initial carry in %esi is necessary only for the 249C mpn_addmul/submul_1c entry points. Duplicating the startup code to 250C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good 251C idea. 252 253dnl overlapping with parameters already fetched 254define(VAR_COUNTER, `PARAM_SIZE') 255define(VAR_JUMP, `PARAM_DST') 256 257L(unroll): 258 C eax 259 C ebx src 260 C ecx size 261 C edx 262 C esi initial carry 263 C edi dst 264 C ebp 265 266 movl %ecx, %edx 267 decl %ecx 268 269 subl $2, %edx 270 negl %ecx 271 272 shrl $UNROLL_LOG2, %edx 273 andl $UNROLL_MASK, %ecx 274 275 movl %edx, VAR_COUNTER 276 movl %ecx, %edx 277 278 shll $4, %edx 279 negl %ecx 280 281 C 15 code bytes per limb 282ifdef(`PIC',` 283 call L(pic_calc) 284L(here): 285',` 286 leal L(entry) (%edx,%ecx,1), %edx 287') 288 movl (%ebx), %eax C src low limb 289 290 movl PARAM_MULTIPLIER, %ebp 291 movl %edx, VAR_JUMP 292 293 mull %ebp 294 295 addl %esi, %eax C initial carry (from _1c) 296 jadcl0( %edx) 297 298 299 leal 4(%ebx,%ecx,4), %ebx 300 movl %edx, %esi C high carry 301 302 movl VAR_JUMP, %edx 303 leal (%edi,%ecx,4), %edi 304 305 testl $1, %ecx 306 movl %eax, %ecx C low carry 307 308 jz L(noswap) 309 movl %esi, %ecx C high,low carry other way around 310 311 movl %eax, %esi 312L(noswap): 313 314 jmp *%edx 315 316 317ifdef(`PIC',` 318L(pic_calc): 319 C See mpn/x86/README about old gas bugs 320 leal (%edx,%ecx,1), %edx 321 addl $L(entry)-L(here), %edx 322 addl (%esp), %edx 323 ret_internal 324') 325 326 327C ----------------------------------------------------------- 328 ALIGN(32) 329L(top): 330deflit(`FRAME',16) 331 C eax scratch 332 C ebx src 333 C ecx carry lo 334 C edx scratch 335 C esi carry hi 336 C edi dst 337 C ebp multiplier 338 C 339 C 15 code bytes per limb 340 341 leal UNROLL_BYTES(%edi), %edi 342 343L(entry): 344forloop(`i', 0, UNROLL_COUNT/2-1, ` 345 deflit(`disp0', eval(2*i*4)) 346 deflit(`disp1', eval(disp0 + 4)) 347 348Zdisp( movl, disp0,(%ebx), %eax) 349 mull %ebp 350Zdisp( M4_inst,%ecx, disp0,(%edi)) 351 adcl %eax, %esi 352 movl %edx, %ecx 353 jadcl0( %ecx) 354 355 movl disp1(%ebx), %eax 356 mull %ebp 357 M4_inst %esi, disp1(%edi) 358 adcl %eax, %ecx 359 movl %edx, %esi 360 jadcl0( %esi) 361') 362 363 decl VAR_COUNTER 364 365 leal UNROLL_BYTES(%ebx), %ebx 366 jns L(top) 367 368 369 popl %ebp 370 M4_inst %ecx, UNROLL_BYTES(%edi) 371 372 popl %edi 373 movl %esi, %eax 374 375 popl %ebx 376 jadcl0( %eax) 377 378 popl %esi 379 ret 380 381EPILOGUE() 382