dnl  AMD64 mpn_mul_basecase optimised for Intel Haswell.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C cycles/limb	mul_1		mul_2		mul_3		addmul_2
C AMD K8,K9	n/a		n/a		 -		n/a
C AMD K10	n/a		n/a		 -		n/a
C AMD bull	n/a		n/a		 -		n/a
C AMD pile	n/a		n/a		 -		n/a
C AMD steam	 ?		 ?		 -		 ?
C AMD bobcat	n/a		n/a		 -		n/a
C AMD jaguar	 ?		 ?		 -		 ?
C Intel P4	n/a		n/a		 -		n/a
C Intel core	n/a		n/a		 -		n/a
C Intel NHM	n/a		n/a		 -		n/a
C Intel SBR	n/a		n/a		 -		n/a
C Intel IBR	n/a		n/a		 -		n/a
C Intel HWL	 1.77		 1.86		 -		 2.15
C Intel BWL	 ?		 ?		 -		 ?
C Intel atom	n/a		n/a		 -		n/a
C VIA nano	n/a		n/a		 -		n/a

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.

C TODO
C  * Adjoin a mul_3.
C  * Further micro-optimise.

C ---------------------------------------------------------------------------
C mpn_mul_basecase
C
C Syntax: AT&T operand order, preprocessed by m4 with the macros pulled in
C from config.m4.
C
C Incoming arguments, as named by the defines below:
C   rp        %rdi   destination limbs, written by every pass
C   up        %rsi   first source operand
C   un_param  %rdx   limb count of up, moved to un so that rdx is free to be
C                    the implicit mulx source
C   vp        %rcx   second source operand
C   vn        %r8    limb count of vp, later spilled to a stack slot
C
C Working registers:
C   un        %rbx   negated copy of un_param
C   n         %rbp   inner loop counter, four limbs per iteration
C   w0..w3    %r10..%r13   product and carry limbs
C   v0        %r9    current multiplier limb, aliased as w4 in the mul_1 pass
C   v1        %r14 in the mul_2 pass, %r8 in the addmul_2 passes
C   w5        %r14   mul_1 pass only
C   X0, X1    %r14, %r15   limbs loaded from rp in the addmul_2 passes
C
C Flow:
C   1. vn odd:  one mul_1 pass multiplies up by the limb at vp and stores the
C      result at rp.
C      vn even: one mul_2 pass multiplies up by the two limbs at vp.
C   2. While limbs of vp remain, L(outer) runs addmul_2 passes, each reading
C      two more limbs of vp and adding the partial product into rp.
C
C Saved and restored: %rbx, %rbp, %r12, %r13, %r14 always, %r15 only on the
C addmul_2 path.  %rax, %rdx, %r8..%r11 and the flags are overwritten.
C
C NOTE(review): the C level contract, operand size requirements and overlap
C rules are not visible here, confirm against the GMP documentation.
C ---------------------------------------------------------------------------

define(`rp',      `%rdi')
define(`up',      `%rsi')
define(`un_param',`%rdx')
define(`vp',      `%rcx')
define(`vn',      `%r8')

define(`un',      `%rbx')

define(`w0',	`%r10')
define(`w1',	`%r11')
define(`w2',	`%r12')
define(`w3',	`%r13')
define(`n',	`%rbp')
define(`v0',	`%r9')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mul_basecase)
	FUNC_ENTRY(4)
C Presumably emitted for the DOS64 ABI only: fetch vn from the stack.
IFDOS(`	mov	56(%rsp), %r8d	')
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	mov	un_param, un		C free up rdx
	neg	un

	mov	un_param, n		C FIXME: share
	sar	$2, n			C FIXME: share

	test	$1, R8(vn)		C even vn starts with a mul_2 pass
	jz	L(do_mul_2)

C mul_1 pass.  w4 shares %r9 with v0 and w5 shares %r14; neither v0 nor v1
C is live in this pass.
define(`w4',	`%r9')
define(`w5',	`%r14')

	mov	(vp), %rdx		C the single multiplier limb

C Dispatch on the low two bits of the negated count to pick the entry point
C into the 4-way unrolled loop.  The label suffix is un mod 4 in binary.
L(do_mul_1):
	test	$1, R8(un)
	jnz	L(m1x1)

L(m1x0):test	$2, R8(un)
	jnz	L(m110)

L(m100):
	mulx(	(up), w5, w2)
	mulx(	8,(up), w1, w3)
	lea	-24(rp), rp
	jmp	L(m1l0)

L(m110):
	mulx(	(up), w3, w4)
	mulx(	8,(up), w1, w5)
	lea	-8(rp), rp
	test	n, n			C no full iteration left, go to the tail
	jz	L(cj2)
	mulx(	16,(up), w0, w2)
	lea	16(up), up
	jmp	L(m1l2)

L(m1x1):test	$2, R8(un)
	jz	L(m111)

L(m101):
	mulx(	(up), w4, w5)
	lea	-16(rp), rp
	test	n, n
	jz	L(cj1)
	mulx(	8,(up), w0, w2)
	lea	8(up), up
	jmp	L(m1l1)

L(m111):
	mulx(	(up), w2, w3)
	mulx(	8,(up), w0, w4)
	mulx(	16,(up), w1, w5)
	lea	24(up), up
	test	n, n
	jnz	L(gt3)
	add	w0, w3
	jmp	L(cj3)
L(gt3):	add	w0, w3
	jmp	L(m1l3)

C mul_1 inner loop, four limbs per iteration.  The carry flag is live across
C the back edge: mulx, mov, lea and dec all leave CF untouched, so the adc
C chain continues from one iteration into the next.
	ALIGN(32)
L(m1tp):lea	32(rp), rp
L(m1l3):mov	w2, (rp)
	mulx(	(up), w0, w2)
L(m1l2):mov	w3, 8(rp)
	adc	w1, w4
L(m1l1):adc	w0, w5
	mov	w4, 16(rp)
	mulx(	8,(up), w1, w3)
L(m1l0):mov	w5, 24(rp)
	mulx(	16,(up), w0, w4)
	adc	w1, w2
	mulx(	24,(up), w1, w5)
	adc	w0, w3
	lea	32(up), up
	dec	n
	jnz	L(m1tp)

C mul_1 tail: store the limbs still held in registers and fold in the last
C carry.  cj1..cj3 are also the targets for operands too short for the loop.
L(m1ed):lea	32(rp), rp
L(cj3):	mov	w2, (rp)
L(cj2):	mov	w3, 8(rp)
	adc	w1, w4
L(cj1):	mov	w4, 16(rp)
	adc	$0, w5
	mov	w5, 24(rp)

	dec	R32(vn)			C one limb of vp consumed
	jz	L(ret5)

	lea	8(vp), vp
	lea	32(rp), rp
C	push	%r12
C	push	%r13
C	push	%r14
	jmp	L(do_addmul)

C mul_2 pass: multiply up by the two limbs v0 and v1.
L(do_mul_2):
define(`v1',	`%r14')
C	push	%r12
C	push	%r13
C	push	%r14

	mov	(vp), v0
	mov	8(vp), v1

	lea	(un), n			C n is derived from the negated count and
	sar	$2, n			C is stepped with inc in the loop below

	test	$1, R8(un)
	jnz	L(m2x1)

L(m2x0):xor	w0, w0
	test	$2, R8(un)
	mov	(up), %rdx
	mulx(	v0, w2, w1)
	jz	L(m2l0)

L(m210):lea	-16(rp), rp
	lea	-16(up), up
	jmp	L(m2l2)

L(m2x1):xor	w2, w2
	test	$2, R8(un)
	mov	(up), %rdx
	mulx(	v0, w0, w3)
	jz	L(m211)

L(m201):lea	-24(rp), rp
	lea	8(up), up
	jmp	L(m2l1)

L(m211):lea	-8(rp), rp
	lea	-8(up), up
	jmp	L(m2l3)

C mul_2 inner loop, four limbs of up per iteration.  The jnz at the bottom
C tests the flags from inc; the lea in between does not alter them.
	ALIGN(16)
L(m2tp):mulx(	v1, %rax, w0)
	add	%rax, w2
	mov	(up), %rdx
	mulx(	v0, %rax, w1)
	adc	$0, w0
	add	%rax, w2
	adc	$0, w1
	add	w3, w2
L(m2l0):mov	w2, (rp)
	adc	$0, w1
	mulx(	v1, %rax, w2)
	add	%rax, w0
	mov	8(up), %rdx
	adc	$0, w2
	mulx(	v0, %rax, w3)
	add	%rax, w0
	adc	$0, w3
	add	w1, w0
L(m2l3):mov	w0, 8(rp)
	adc	$0, w3
	mulx(	v1, %rax, w0)
	add	%rax, w2
	mov	16(up), %rdx
	mulx(	v0, %rax, w1)
	adc	$0, w0
	add	%rax, w2
	adc	$0, w1
	add	w3, w2
L(m2l2):mov	w2, 16(rp)
	adc	$0, w1
	mulx(	v1, %rax, w2)
	add	%rax, w0
	mov	24(up), %rdx
	adc	$0, w2
	mulx(	v0, %rax, w3)
	add	%rax, w0
	adc	$0, w3
	add	w1, w0
	lea	32(up), up
L(m2l1):mov	w0, 24(rp)
	adc	$0, w3
	inc	n
	lea	32(rp), rp
	jnz	L(m2tp)

C mul_2 tail: last product by v1 plus the pending carries gives the two top
C limbs of this pass.
L(m2ed):mulx(	v1, %rdx, %rax)
	add	%rdx, w2
	adc	$0, %rax
	add	w3, w2
	mov	w2, (rp)
	adc	$0, %rax
	mov	%rax, 8(rp)

	add	$-2, R32(vn)		C two limbs of vp consumed
	jz	L(ret5)
	lea	16(vp), vp
	lea	16(rp), rp


C addmul_2 passes.  vn moves to a stack slot so that %r8 can hold v1; from
C here on the name vn expands to that slot.  %r15 is saved because X1 uses it.
L(do_addmul):
	push	%r15
	push	vn			C save vn in new stack slot
define(`vn',	`(%rsp)')
define(`X0',	`%r14')
define(`X1',	`%r15')
define(`v1',	`%r8')

C un holds the negated count, so these two lea step rp and up by un limbs.
	lea	(rp,un,8), rp
	lea	(up,un,8), up

L(outer):
	mov	(vp), v0
	mov	8(vp), v1

	lea	2(un), n
	sar	$2, n

	mov	(up), %rdx
	test	$1, R8(un)		C dispatch on the low two bits of un
	jnz	L(bx1)

L(bx0):	mov	(rp), X0
	mov	8(rp), X1
	mulx(	v0, %rax, w1)
	add	%rax, X0
	mulx(	v1, %rax, w2)
	adc	$0, w1
	mov	X0, (rp)
	add	%rax, X1
	adc	$0, w2
	mov	8(up), %rdx
	test	$2, R8(un)
	jnz	L(b10)

L(b00):	lea	16(up), up
	lea	16(rp), rp
	jmp	L(lo0)

L(b10):	mov	16(rp), X0
	lea	32(up), up
	mulx(	v0, %rax, w3)
	jmp	L(lo2)

L(bx1):	mov	(rp), X1
	mov	8(rp), X0
	mulx(	v0, %rax, w3)
	add	%rax, X1
	adc	$0, w3
	mulx(	v1, %rax, w0)
	add	%rax, X0
	adc	$0, w0
	mov	8(up), %rdx
	mov	X1, (rp)
	mulx(	v0, %rax, w1)
	test	$2, R8(un)
	jz	L(b11)

L(b01):	mov	16(rp), X1
	lea	24(rp), rp
	lea	24(up), up
	jmp	L(lo1)

L(b11):	lea	8(rp), rp
	lea	8(up), up
	jmp	L(lo3)

C addmul_2 inner loop, four limbs of up per iteration.  X0 and X1 alternate
C as the limb being accumulated; each is loaded from rp, summed and stored.
	ALIGN(16)
L(top):	mulx(	v0, %rax, w3)
	add	w0, X1
	adc	$0, w2
L(lo2):	add	%rax, X1
	adc	$0, w3
	mulx(	v1, %rax, w0)
	add	%rax, X0
	adc	$0, w0
	lea	32(rp), rp
	add	w1, X1
	mov	-16(up), %rdx
	mov	X1, -24(rp)
	adc	$0, w3
	add	w2, X0
	mov	-8(rp), X1
	mulx(	v0, %rax, w1)
	adc	$0, w0
L(lo1):	add	%rax, X0
	mulx(	v1, %rax, w2)
	adc	$0, w1
	add	w3, X0
	mov	X0, -16(rp)
	adc	$0, w1
	add	%rax, X1
	adc	$0, w2
	add	w0, X1
	mov	-8(up), %rdx
	adc	$0, w2
L(lo0):	mulx(	v0, %rax, w3)
	add	%rax, X1
	adc	$0, w3
	mov	(rp), X0
	mulx(	v1, %rax, w0)
	add	%rax, X0
	adc	$0, w0
	add	w1, X1
	mov	X1, -8(rp)
	adc	$0, w3
	mov	(up), %rdx
	add	w2, X0
	mulx(	v0, %rax, w1)
	adc	$0, w0
L(lo3):	add	%rax, X0
	adc	$0, w1
	mulx(	v1, %rax, w2)
	add	w3, X0
	mov	8(rp), X1
	mov	X0, (rp)
	mov	16(rp), X0
	adc	$0, w1
	add	%rax, X1
	adc	$0, w2
	mov	8(up), %rdx
	lea	32(up), up
	inc	n
	jnz	L(top)

C addmul_2 tail: finish the last limb pair and store the two top limbs.
L(end):	mulx(	v0, %rax, w3)
	add	w0, X1
	adc	$0, w2
	add	%rax, X1
	adc	$0, w3
	mulx(	v1, %rdx, %rax)
	add	w1, X1
	mov	X1, 8(rp)
	adc	$0, w3
	add	w2, %rdx
	adc	$0, %rax
	add	w3, %rdx
	mov	%rdx, 16(rp)
	adc	$0, %rax
	mov	%rax, 24(rp)

C Two more limbs of vp are done.  The jnz below tests the flags set by addl;
C the three lea in between leave them unchanged.
	addl	$-2, vn
	lea	16(vp), vp
	lea	-16(up,un,8), up
	lea	32(rp,un,8), rp
	jnz	L(outer)

	pop	%rax		C deallocate vn slot
	pop	%r15
C Shared epilogue: restore in reverse order of the pushes at entry.
L(ret5):pop	%r14
L(ret4):pop	%r13
L(ret3):pop	%r12
L(ret2):pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()