dnl  AMD64 mpn_sqr_basecase optimised for Intel Sandy bridge and Ivy bridge.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C cycles/limb	mul_2		addmul_2	sqr_diag_addlsh1
C AMD K8,K9	 ?		 ?		 ?
C AMD K10	 ?		 ?		 ?
C AMD bull	 ?		 ?		 ?
C AMD pile	 ?		 ?		 ?
C AMD steam	 ?		 ?		 ?
C AMD bobcat	 ?		 ?		 ?
C AMD jaguar	 ?		 ?		 ?
C Intel P4	 ?		 ?		 ?
C Intel core	 ?		 ?		 ?
C Intel NHM	 ?		 ?		 ?
C Intel SBR	 2.57		 2.93		 3.0
C Intel IBR	 2.35		 2.66		 3.0
C Intel HWL	 2.02		 2.5		 2.5
C Intel BWL	 ?		 ?		 ?
C Intel atom	 ?		 ?		 ?
C VIA nano	 ?		 ?		 ?

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund, except
C that the sqr_diag_addlsh1 loop was written manually.

C TODO
C  * Replace the current unoptimised sqr_diag_addlsh1 loop; 2.5 c/l should be
C    easy.
C  * Streamline pointer updates.
C  * Perhaps suppress a few more xor insns in feed-in code.
C  * Make sure we write no dead registers in feed-in code.
C  * We might use 32-bit size ops, since n >= 2^32 is non-terminating.  Watch
C    out for negative sizes being zero-extended, though.
C  * The straight-line code for n <= 3 comes from the K8 code, and might be
C    quite sub-optimal here.  Write specific code, and add code for n = 4.
C  * The mul_2 loop has a 10 insn common sequence in the loop start and the
C    wind-down code.  Try re-rolling it.
C  * This file has been subjected to only basic micro-optimisation.
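
C Added commentary, for orientation (not from the original authors): the
C routine computes {rp,2n} = {up,n}^2 by first accumulating the off-diagonal
C products u[i]*u[j], i < j, with one mul_2 pass and a series of addmul_2
C passes, then letting sqr_diag_addlsh1 form R = 2*T + the diagonal squares.
C A rough C model of the whole computation follows; it is a sketch only, and
C sqr_ref is this comment's name, not a GMP function.
C
C	#include <stdint.h>
C	typedef unsigned __int128 u128;
C
C	static void sqr_ref (uint64_t *rp, const uint64_t *up, int n)
C	{
C	  int i, j;
C	  for (i = 0; i < 2*n; i++)
C	    rp[i] = 0;
C	  for (i = 0; i < n - 1; i++)	/* off-diagonal triangle T */
C	    {
C	      uint64_t cy = 0;
C	      for (j = i + 1; j < n; j++)
C		{
C		  u128 t = (u128) up[i] * up[j] + rp[i+j] + cy;
C		  rp[i+j] = (uint64_t) t;
C		  cy = (uint64_t) (t >> 64);
C		}
C	      rp[i+n] = cy;
C	    }
C	  uint64_t cs = 0, ca = 0;	/* shift-out bit, addition carry */
C	  for (i = 0; i < 2*n; i++)	/* R = 2*T + diagonal squares */
C	    {
C	      u128 sq = (u128) up[i/2] * up[i/2];
C	      uint64_t d = (i & 1) ? (uint64_t) (sq >> 64) : (uint64_t) sq;
C	      uint64_t s = (rp[i] << 1) | cs;
C	      cs = rp[i] >> 63;
C	      u128 t = (u128) s + d + ca;
C	      rp[i] = (uint64_t) t;
C	      ca = (uint64_t) (t >> 64);
C	    }
C	}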
C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
define(`I',`$1')

define(`rp',	  `%rdi')
define(`up',	  `%rsi')
define(`un_param',`%rdx')


ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_sqr_basecase)
	FUNC_ENTRY(3)

	cmp	$2, un_param
	jae	L(gt1)

	mov	(up), %rax
	mul	%rax
	mov	%rax, (rp)
	mov	%rdx, 8(rp)
	FUNC_EXIT()
	ret

L(gt1):	jne	L(gt2)

	mov	(up), %rax
	mov	%rax, %r8
	mul	%rax
	mov	8(up), %r11
	mov	%rax, (rp)
	mov	%r11, %rax
	mov	%rdx, %r9
	mul	%rax
	mov	%rax, %r10
	mov	%r11, %rax
	mov	%rdx, %r11
	mul	%r8
	xor	%r8, %r8
	add	%rax, %r9
	adc	%rdx, %r10
	adc	%r8, %r11
	add	%rax, %r9
	mov	%r9, 8(rp)
	adc	%rdx, %r10
	mov	%r10, 16(rp)
	adc	%r8, %r11
	mov	%r11, 24(rp)
	FUNC_EXIT()
	ret

L(gt2):	cmp	$4, un_param
	jae	L(gt3)
define(`v0', `%r8')
define(`v1', `%r9')
define(`w0', `%r10')
define(`w2', `%r11')

	mov	(up), %rax
	mov	%rax, %r10
	mul	%rax
	mov	8(up), %r11
	mov	%rax, (rp)
	mov	%r11, %rax
	mov	%rdx, 8(rp)
	mul	%rax
	mov	16(up), %rcx
	mov	%rax, 16(rp)
	mov	%rcx, %rax
	mov	%rdx, 24(rp)
	mul	%rax
	mov	%rax, 32(rp)
	mov	%rdx, 40(rp)

	mov	%r11, %rax
	mul	%r10
	mov	%rax, %r8
	mov	%rcx, %rax
	mov	%rdx, %r9
	mul	%r10
	xor	%r10, %r10
	add	%rax, %r9
	mov	%r11, %rax
	mov	%r10, %r11
	adc	%rdx, %r10

	mul	%rcx
	add	%rax, %r10
	adc	%r11, %rdx
	add	%r8, %r8
	adc	%r9, %r9
	adc	%r10, %r10
	adc	%rdx, %rdx
	adc	%r11, %r11
	add	%r8, 8(rp)
	adc	%r9, 16(rp)
	adc	%r10, 24(rp)
	adc	%rdx, 32(rp)
	adc	%r11, 40(rp)
	FUNC_EXIT()
	ret

L(gt3):

define(`v0', `%r8')
define(`v1', `%r9')
define(`w0', `%r10')
define(`w1', `%r11')
define(`w2', `%rbx')
define(`w3', `%rbp')
define(`un', `%r12')
define(`n',  `%rcx')

define(`X0', `%r13')
define(`X1', `%r14')

L(do_mul_2):
	mov	(up), v0
	push	%rbx
	lea	(rp,un_param,8), rp	C point rp at R[un]
	mov	8(up), %rax
	push	%rbp
	lea	(up,un_param,8), up	C point up right after U's end
	mov	%rax, v1
	push	%r12
	mov	$1, R32(un)		C free up rdx
	push	%r13
	sub	un_param, un
	push	%r14
	push	un
	mul	v0
	mov	%rax, (rp,un,8)
	mov	8(up,un,8), %rax
	test	$1, R8(un)
	jnz	L(m2b1)

L(m2b0):lea	2(un), n
	xor	R32(w1), R32(w1)	C FIXME
	xor	R32(w2), R32(w2)	C FIXME
	mov	%rdx, w0
	jmp	L(m2l0)

L(m2b1):lea	1(un), n
	xor	R32(w3), R32(w3)	C FIXME
	xor	R32(w0), R32(w0)	C FIXME
	mov	%rdx, w2
	jmp	L(m2l1)

	ALIGN(32)
L(m2tp):
L(m2l0):mul	v0
	add	%rax, w0
	mov	%rdx, w3
	adc	$0, w3
	mov	-8(up,n,8), %rax
	mul	v1
	add	w1, w0
	adc	$0, w3
	add	%rax, w2
	mov	w0, -8(rp,n,8)
	mov	%rdx, w0
	adc	$0, w0
	mov	(up,n,8), %rax
L(m2l1):mul	v0
	add	%rax, w2
	mov	%rdx, w1
	adc	$0, w1
	add	w3, w2
	mov	(up,n,8), %rax
	adc	$0, w1
	mul	v1
	mov	w2, (rp,n,8)
	add	%rax, w0
	mov	%rdx, w2
	mov	8(up,n,8), %rax
	adc	$0, w2
	add	$2, n
	jnc	L(m2tp)

L(m2ed):mul	v0
	add	%rax, w0
	mov	%rdx, w3
	adc	$0, w3
	mov	I(-8(up),-8(up,n,8)), %rax
	mul	v1
	add	w1, w0
	adc	$0, w3
	add	%rax, w2
	mov	w0, I(-8(rp),-8(rp,n,8))
	adc	$0, %rdx
	add	w3, w2
	mov	w2, I((rp),(rp,n,8))
	adc	$0, %rdx
	mov	%rdx, I(8(rp),8(rp,n,8))

	add	$2, un			C decrease |un|
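
C Added commentary: from here on, each outer pass adds (v1*B + v0) times the
C remaining low part of U into R, where v0 and v1 are the next two limbs of
C U.  un is a negative index stepping toward zero by 2 per pass, and up/rp
C point just past their operands, so (up,un,8) and (rp,un,8) address limbs
C from the far end.  A rough C model of one such pass follows (sketch only;
C addmul_1_ref/addmul_2_ref are this comment's names, and where the real
C code stores the two top limbs directly, the model assumes they are zero
C on entry and adds into them):
C
C	#include <stdint.h>
C	typedef unsigned __int128 u128;
C
C	static uint64_t addmul_1_ref (uint64_t *r, const uint64_t *u, int m,
C				      uint64_t v)
C	{
C	  uint64_t cy = 0;
C	  for (int i = 0; i < m; i++)
C	    {
C	      u128 t = (u128) u[i] * v + r[i] + cy;
C	      r[i] = (uint64_t) t;
C	      cy = (uint64_t) (t >> 64);
C	    }
C	  return cy;
C	}
C
C	static void addmul_2_ref (uint64_t *r, const uint64_t *u, int m,
C				  uint64_t v0, uint64_t v1)
C	{
C	  r[m] = addmul_1_ref (r, u, m, v0);		/* r[m] was 0 */
C	  r[m+1] = addmul_1_ref (r + 1, u, m, v1);	/* r[m+1] was 0 */
C	}
C
C The loop below interleaves the v0 and v1 recurrences, keeping the carries
C in w0..w3 and the two result limbs in flight in X0/X1, so that a new mul
C can issue while earlier carries are still being folded in.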
L(do_addmul_2):
L(outer):
	lea	16(rp), rp
	cmp	$-2, R32(un)		C jump if un in {-1,0}	FIXME: jump if un in {-2,1}
	jge	L(corner)		C FIXME: move to before the lea above

	mov	-8(up,un,8), v0
	mov	(up,un,8), %rax
	mov	%rax, v1
	mul	v0
	test	$1, R8(un)
	jnz	L(a1x1)

L(a1x0):mov	(rp,un,8), X0
	xor	w0, w0
	mov	8(rp,un,8), X1
	add	%rax, X0
	mov	%rdx, w1
	adc	$0, w1
	xor	w2, w2
	mov	X0, (rp,un,8)
	mov	8(up,un,8), %rax
	test	$2, R8(un)
	jnz	L(a110)

L(a100):lea	2(un), n		C un = 4, 8, 12, ...
	jmp	L(lo0)

L(a110):lea	(un), n			C un = 2, 6, 10, ...
	jmp	L(lo2)

L(a1x1):mov	(rp,un,8), X1
	xor	w2, w2
	mov	8(rp,un,8), X0
	add	%rax, X1
	mov	%rdx, w3
	adc	$0, w3
	xor	w0, w0
	mov	8(up,un,8), %rax
	test	$2, R8(un)
	jz	L(a111)

L(a101):lea	3(un), n		C un = 1, 5, 9, ...
	jmp	L(lo1)

L(a111):lea	1(un), n		C un = 3, 7, 11, ...
	jmp	L(lo3)

	ALIGN(32)
L(top):	mul	v1
	mov	%rdx, w0
	add	%rax, X0
	adc	$0, w0
	add	w1, X1
	adc	$0, w3
	add	w2, X0
	adc	$0, w0
	mov	-16(up,n,8), %rax
L(lo1):	mul	v0
	add	%rax, X0
	mov	%rdx, w1
	adc	$0, w1
	mov	-16(up,n,8), %rax
	mul	v1
	mov	X1, -24(rp,n,8)
	mov	-8(rp,n,8), X1
	add	w3, X0
	adc	$0, w1
	mov	%rdx, w2
	mov	X0, -16(rp,n,8)
	add	%rax, X1
	adc	$0, w2
	mov	-8(up,n,8), %rax
	add	w0, X1
	adc	$0, w2
L(lo0):	mul	v0
	add	%rax, X1
	mov	%rdx, w3
	adc	$0, w3
	mov	-8(up,n,8), %rax
	mul	v1
	add	w1, X1
	mov	(rp,n,8), X0
	adc	$0, w3
	mov	%rdx, w0
	add	%rax, X0
	adc	$0, w0
	mov	(up,n,8), %rax
L(lo3):	mul	v0
	add	w2, X0
	mov	X1, -8(rp,n,8)
	mov	%rdx, w1
	adc	$0, w0
	add	%rax, X0
	adc	$0, w1
	mov	(up,n,8), %rax
	add	w3, X0
	adc	$0, w1
	mul	v1
	mov	8(rp,n,8), X1
	add	%rax, X1
	mov	%rdx, w2
	adc	$0, w2
	mov	8(up,n,8), %rax
	mov	X0, (rp,n,8)
L(lo2):	mul	v0
	add	w0, X1
	mov	%rdx, w3
	adc	$0, w2
	add	%rax, X1
	mov	8(up,n,8), %rax
	mov	16(rp,n,8), X0
	adc	$0, w3
	add	$4, n
	jnc	L(top)

L(end):	mul	v1
	add	w1, X1
	adc	$0, w3
	add	w2, %rax
	adc	$0, %rdx
	mov	X1, I(-8(rp),-24(rp,n,8))
	add	w3, %rax
	adc	$0, %rdx
	mov	%rax, I((rp),-16(rp,n,8))
	mov	%rdx, I(8(rp),-8(rp,n,8))

	add	$2, un			C decrease |un|
	jmp	L(outer)		C loop until a small corner remains

L(corner):
	pop	n
	jg	L(small_corner)

	lea	8(rp), rp
	mov	-24(up), v0
	mov	-16(up), %rax
	mov	%rax, v1
	mul	v0
	mov	-24(rp), X0
	mov	-16(rp), X1
	add	%rax, X0
	mov	%rdx, w1
	adc	$0, w1
	xor	w2, w2
	mov	X0, -24(rp)
	mov	-8(up), %rax
	mul	v0
	add	$0, X1
	mov	%rdx, w3
	adc	$0, w2
	add	%rax, X1
	mov	-8(up), %rax
	adc	$0, w3
	mul	v1
	add	w1, X1
	adc	$0, w3
	add	w2, %rax
	adc	$0, %rdx
	mov	X1, -16(rp)
	jmp	L(com)

L(small_corner):
	mov	-8(rp), w3
	mov	-16(up), v0
	mov	-8(up), %rax
	mul	v0
L(com):	add	w3, %rax
	adc	$0, %rdx
	mov	%rax, -8(rp)
	mov	%rdx, (rp)
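
C Added commentary: this final phase matches the second loop of the sqr_ref
C sketch in the big comment above: it doubles the off-diagonal sum in place
C while adding in the 128-bit squares u[k]^2.  The shl doubles n, so
C (up,n,4) still advances one U limb per two result limbs.  In the loop,
C the adc pair at L(dm) doubles the next two result limbs, setc latches the
C shifted-out bit in %rbx, and the lea folds the previous square's high half
C together with the previously latched bit into %r10; the add/adc at
C L(dtop) then combines the doubled limbs with the square, its carry-out
C flowing straight into the next adc doubling pair.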
L(sqr_diag_addlsh1):
	mov	-8(up,n,8), %rax
	shl	n
	mul	%rax
	mov	%rax, (rp,n,8)

	xor	R32(%rbx), R32(%rbx)
	mov	8(rp,n,8), %r8
	mov	16(rp,n,8), %r9
	jmp	L(dm)

	ALIGN(32)
L(dtop):add	%r8, %r10
	adc	%r9, %rax
	mov	8(rp,n,8), %r8
	mov	16(rp,n,8), %r9
	mov	%r10, -8(rp,n,8)
	mov	%rax, (rp,n,8)
L(dm):	adc	%r8, %r8
	adc	%r9, %r9
	mov	(up,n,4), %rax
	lea	(%rdx,%rbx), %r10
	setc	R8(%rbx)
	mul	%rax
	add	$2, n
	js	L(dtop)

L(dend):add	%r8, %r10
	adc	%r9, %rax
	mov	%r10, I(-8(rp),-8(rp,n,8))
	mov	%rax, I((rp),(rp,n,8))
	adc	%rbx, %rdx
	mov	%rdx, I(8(rp),8(rp,n,8))

	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()