1dnl AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple. 2 3dnl Copyright 1999-2003, 2005 Free Software Foundation, Inc. 4 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or modify 8dnl it under the terms of either: 9dnl 10dnl * the GNU Lesser General Public License as published by the Free 11dnl Software Foundation; either version 3 of the License, or (at your 12dnl option) any later version. 13dnl 14dnl or 15dnl 16dnl * the GNU General Public License as published by the Free Software 17dnl Foundation; either version 2 of the License, or (at your option) any 18dnl later version. 19dnl 20dnl or both in parallel, as here. 21dnl 22dnl The GNU MP Library is distributed in the hope that it will be useful, but 23dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25dnl for more details. 26dnl 27dnl You should have received copies of the GNU General Public License and the 28dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29dnl see https://www.gnu.org/licenses/. 30 31include(`../config.m4') 32 33 34C cycles/limb 35C P5 36C P6 model 0-8,10-12 5.94 37C P6 model 9 (Banias) 5.51 38C P6 model 13 (Dothan) 5.57 39C P4 model 0 (Willamette) 40C P4 model 1 (?) 41C P4 model 2 (Northwood) 42C P4 model 3 (Prescott) 43C P4 model 4 (Nocona) 44C AMD K6 7.65-8.5 (data dependent) 45C AMD K7 46C AMD K8 47 48 49dnl K6: large multipliers small multipliers 50dnl UNROLL_COUNT cycles/limb cycles/limb 51dnl 4 9.5 7.78 52dnl 8 9.0 7.78 53dnl 16 8.4 7.65 54dnl 32 8.4 8.2 55dnl 56dnl Maximum possible unrolling with the current code is 32. 57dnl 58dnl Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256 59dnl byte block, which might explain the good speed at that unrolling. 60 61deflit(UNROLL_COUNT, 16) 62 63 64ifdef(`OPERATION_addmul_1', ` 65 define(M4_inst, addl) 66 define(M4_function_1, mpn_addmul_1) 67 define(M4_function_1c, mpn_addmul_1c) 68',`ifdef(`OPERATION_submul_1', ` 69 define(M4_inst, subl) 70 define(M4_function_1, mpn_submul_1) 71 define(M4_function_1c, mpn_submul_1c) 72',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1 73')')') 74 75MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c) 76 77 78C mp_limb_t mpn_addmul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, 79C mp_limb_t mult); 80C mp_limb_t mpn_addmul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, 81C mp_limb_t mult, mp_limb_t carry); 82C mp_limb_t mpn_submul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, 83C mp_limb_t mult); 84C mp_limb_t mpn_submul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, 85C mp_limb_t mult, mp_limb_t carry); 86C 87C The jadcl0()s in the unrolled loop makes the speed data dependent. Small 88C multipliers (most significant few bits clear) result in few carry bits and 89C speeds up to 7.65 cycles/limb are attained. Large multipliers (most 90C significant few bits set) make the carry bits 50/50 and lead to something 91C more like 8.4 c/l. With adcl's both of these would be 9.3 c/l. 92C 93C It's important that the gains for jadcl0 on small multipliers don't come 94C at the cost of slowing down other data. Tests on uniformly distributed 95C random data, designed to confound branch prediction, show about a 7% 96C speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all 97C overheads included). 98C 99C In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus 100C 11.0 cycles/limb), and hence isn't used. 101C 102C In the simple loop, note that running ecx from negative to zero and using 103C it as an index in the two movs wouldn't help. It would save one 104C instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired 105C that would be collapsed by this. 106C 107C Attempts at a simpler main loop, with less unrolling, haven't yielded much 108C success, generally running over 9 c/l. 109C 110C 111C jadcl0 112C ------ 113C 114C jadcl0() being faster than adcl $0 seems to be an artifact of two things, 115C firstly the instruction decoding and secondly the fact that there's a 116C carry bit for the jadcl0 only on average about 1/4 of the time. 117C 118C The code in the unrolled loop decodes something like the following. 119C 120C decode cycles 121C mull %ebp 2 122C M4_inst %esi, disp(%edi) 1 123C adcl %eax, %ecx 2 124C movl %edx, %esi \ 1 125C jnc 1f / 126C incl %esi \ 1 127C 1: movl disp(%ebx), %eax / 128C --- 129C 7 130C 131C In a back-to-back style test this measures 7 with the jnc not taken, or 8 132C with it taken (both when correctly predicted). This is opposite to the 133C measurements showing small multipliers running faster than large ones. 134C Don't really know why. 135C 136C It's not clear how much branch misprediction might be costing. The K6 137C doco says it will be 1 to 4 cycles, but presumably it's near the low end 138C of that range to get the measured results. 139C 140C 141C In the code the two carries are more or less the preceding mul product and 142C the calculation is roughly 143C 144C x*y + u*b+v 145C 146C where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and 147C v are the two limbs it's added to (being the low of the next mul, and a 148C limb from the destination). 149C 150C To get a carry requires x*y+u*b+v >= b^2, which is u*b+v >= b^2-x*y, and 151C there are b^2-(b^2-x*y) = x*y many such values, giving a probability of 152C x*y/b^2. If x, y, u and v are random and uniformly distributed between 0 153C and b-1, then the total probability can be summed over x and y, 154C 155C 1 b-1 b-1 x*y 1 b*(b-1) b*(b-1) 156C --- * sum sum --- = --- * ------- * ------- = 1/4 157C b^2 x=0 y=1 b^2 b^4 2 2 158C 159C Actually it's a very tiny bit less than 1/4 of course. If y is fixed, 160C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2. 161 162 163ifdef(`PIC',` 164deflit(UNROLL_THRESHOLD, 9) 165',` 166deflit(UNROLL_THRESHOLD, 6) 167') 168 169defframe(PARAM_CARRY, 20) 170defframe(PARAM_MULTIPLIER,16) 171defframe(PARAM_SIZE, 12) 172defframe(PARAM_SRC, 8) 173defframe(PARAM_DST, 4) 174 175 TEXT 176 ALIGN(32) 177 178PROLOGUE(M4_function_1c) 179 pushl %esi 180deflit(`FRAME',4) 181 movl PARAM_CARRY, %esi 182 jmp L(start_nc) 183EPILOGUE() 184 185PROLOGUE(M4_function_1) 186 push %esi 187deflit(`FRAME',4) 188 xorl %esi, %esi C initial carry 189 190L(start_nc): 191 movl PARAM_SIZE, %ecx 192 pushl %ebx 193deflit(`FRAME',8) 194 195 movl PARAM_SRC, %ebx 196 pushl %edi 197deflit(`FRAME',12) 198 199 cmpl $UNROLL_THRESHOLD, %ecx 200 movl PARAM_DST, %edi 201 202 pushl %ebp 203deflit(`FRAME',16) 204 jae L(unroll) 205 206 207 C simple loop 208 209 movl PARAM_MULTIPLIER, %ebp 210 211L(simple): 212 C eax scratch 213 C ebx src 214 C ecx counter 215 C edx scratch 216 C esi carry 217 C edi dst 218 C ebp multiplier 219 220 movl (%ebx), %eax 221 addl $4, %ebx 222 223 mull %ebp 224 225 addl $4, %edi 226 addl %esi, %eax 227 228 adcl $0, %edx 229 230 M4_inst %eax, -4(%edi) 231 232 adcl $0, %edx 233 234 movl %edx, %esi 235 loop L(simple) 236 237 238 popl %ebp 239 popl %edi 240 241 popl %ebx 242 movl %esi, %eax 243 244 popl %esi 245 ret 246 247 248 249C ----------------------------------------------------------------------------- 250C The unrolled loop uses a "two carry limbs" scheme. At the top of the loop 251C the carries are ecx=lo, esi=hi, then they swap for each limb processed. 252C For the computed jump an odd size means they start one way around, an even 253C size the other. 254C 255C VAR_JUMP holds the computed jump temporarily because there's not enough 256C registers at the point of doing the mul for the initial two carry limbs. 257C 258C The add/adc for the initial carry in %esi is necessary only for the 259C mpn_addmul/submul_1c entry points. Duplicating the startup code to 260C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good 261C idea. 262 263dnl overlapping with parameters already fetched 264define(VAR_COUNTER, `PARAM_SIZE') 265define(VAR_JUMP, `PARAM_DST') 266 267L(unroll): 268 C eax 269 C ebx src 270 C ecx size 271 C edx 272 C esi initial carry 273 C edi dst 274 C ebp 275 276 movl %ecx, %edx 277 decl %ecx 278 279 subl $2, %edx 280 negl %ecx 281 282 shrl $UNROLL_LOG2, %edx 283 andl $UNROLL_MASK, %ecx 284 285 movl %edx, VAR_COUNTER 286 movl %ecx, %edx 287 288 shll $4, %edx 289 negl %ecx 290 291 C 15 code bytes per limb 292ifdef(`PIC',` 293 call L(pic_calc) 294L(here): 295',` 296 leal L(entry) (%edx,%ecx,1), %edx 297') 298 movl (%ebx), %eax C src low limb 299 300 movl PARAM_MULTIPLIER, %ebp 301 movl %edx, VAR_JUMP 302 303 mull %ebp 304 305 addl %esi, %eax C initial carry (from _1c) 306 jadcl0( %edx) 307 308 309 leal 4(%ebx,%ecx,4), %ebx 310 movl %edx, %esi C high carry 311 312 movl VAR_JUMP, %edx 313 leal (%edi,%ecx,4), %edi 314 315 testl $1, %ecx 316 movl %eax, %ecx C low carry 317 318 jz L(noswap) 319 movl %esi, %ecx C high,low carry other way around 320 321 movl %eax, %esi 322L(noswap): 323 324 jmp *%edx 325 326 327ifdef(`PIC',` 328L(pic_calc): 329 C See mpn/x86/README about old gas bugs 330 leal (%edx,%ecx,1), %edx 331 addl $L(entry)-L(here), %edx 332 addl (%esp), %edx 333 ret_internal 334') 335 336 337C ----------------------------------------------------------- 338 ALIGN(32) 339L(top): 340deflit(`FRAME',16) 341 C eax scratch 342 C ebx src 343 C ecx carry lo 344 C edx scratch 345 C esi carry hi 346 C edi dst 347 C ebp multiplier 348 C 349 C 15 code bytes per limb 350 351 leal UNROLL_BYTES(%edi), %edi 352 353L(entry): 354forloop(`i', 0, UNROLL_COUNT/2-1, ` 355 deflit(`disp0', eval(2*i*4)) 356 deflit(`disp1', eval(disp0 + 4)) 357 358Zdisp( movl, disp0,(%ebx), %eax) 359 mull %ebp 360Zdisp( M4_inst,%ecx, disp0,(%edi)) 361 adcl %eax, %esi 362 movl %edx, %ecx 363 jadcl0( %ecx) 364 365 movl disp1(%ebx), %eax 366 mull %ebp 367 M4_inst %esi, disp1(%edi) 368 adcl %eax, %ecx 369 movl %edx, %esi 370 jadcl0( %esi) 371') 372 373 decl VAR_COUNTER 374 375 leal UNROLL_BYTES(%ebx), %ebx 376 jns L(top) 377 378 379 popl %ebp 380 M4_inst %ecx, UNROLL_BYTES(%edi) 381 382 popl %edi 383 movl %esi, %eax 384 385 popl %ebx 386 jadcl0( %eax) 387 388 popl %esi 389 ret 390 391EPILOGUE() 392