dnl  AMD64 mpn_mullo_basecase optimised for Intel Broadwell.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2017 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.

C Register roles (parameter registers per the GMP mpn calling convention).
define(`rp', `%rdi')		C result pointer
define(`up', `%rsi')		C first source operand pointer
define(`vp_param', `%rdx')	C second source operand pointer (entry register)
define(`n', `%rcx')		C limb count

define(`vp', `%r11')		C working copy of vp (rdx is needed by mulx)
define(`jmpreg',`%rbx')		C computed-goto target for next outer pass
define(`nn', `%rbp')		C derived count, drives outer-loop bookkeeping

C TODO
C  * Suppress more rp[] rewrites in corner.
C  * Rearrange feed-in jumps for short branch forms.
C  * Perhaps roll out the heavy artillery and 8-way unroll outer loop.  Since
C    feed-in code implodes, the blow-up will not be more than perhaps 4x.
C  * Micro-optimise critical lead-in code block around L(ent).
C  * Write n < 4 code specifically for Broadwell (current code is for Haswell).

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C mpn_mullo_basecase (rp, up, vp, n): store at rp the n low limbs of the
C product {up,n} * {vp,n}.  n < 4 is handled by straight-line code; n >= 4
C runs an 8-way unrolled mul_1 pass (L(mtop)) followed by addmul_1 passes
C (L(top)) using separate adcx/adox carry chains, each pass entered through
C a feed-in block selected by a jump table indexed by n mod 8.

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_mullo_basecase)
	FUNC_ENTRY(4)
	cmp	$4, R32(n)
	jae	L(big)

	mov	vp_param, vp
	mov	(up), %rdx

	cmp	$2, R32(n)
	jae	L(gt1)
C n = 1: single low product.
L(n1):	imul	(vp), %rdx
	mov	%rdx, (rp)
	FUNC_EXIT()
	ret
L(gt1):	ja	L(gt2)
C n = 2: full u0 x v0, then fold the two cross products into the high limb.
L(n2):	mov	(vp), %r9
	mulx(	%r9, %rax, %rdx)
	mov	%rax, (rp)
	mov	8(up), %rax
	imul	%r9, %rax
	add	%rax, %rdx
	mov	8(vp), %r9
	mov	(up), %rcx
	imul	%r9, %rcx
	add	%rcx, %rdx
	mov	%rdx, 8(rp)
	FUNC_EXIT()
	ret
L(gt2):
C n = 3: accumulate the six products contributing to the three low limbs.
L(n3):	mov	(vp), %r9
	mulx(	%r9, %rax, %r10)	C u0 x v0
	mov	%rax, (rp)
	mov	8(up), %rdx
	mulx(	%r9, %rax, %rdx)	C u1 x v0
	imul	16(up), %r9		C u2 x v0
	add	%rax, %r10
	adc	%rdx, %r9
	mov	8(vp), %r8
	mov	(up), %rdx
	mulx(	%r8, %rax, %rdx)	C u0 x v1
	add	%rax, %r10
	adc	%rdx, %r9
	imul	8(up), %r8		C u1 x v1
	add	%r8, %r9
	mov	%r10, 8(rp)
	mov	16(vp), %r10
	mov	(up), %rax
	imul	%rax, %r10		C u0 x v2
	add	%r10, %r9
	mov	%r9, 16(rp)
	FUNC_EXIT()
	ret

	ALIGN(16)
C n >= 4.  Save callee-saved regs; start r14 off as the top corner product
C u0 x v[n-1], accumulated into rp[n-1] at the very end.
L(big):	push	%r14
	push	%r12
	push	%rbx
	push	%rbp
	mov	-8(vp_param,n,8), %r14	C FIXME Put at absolute end
	imul	(up), %r14		C FIXME Put at absolute end
	lea	-3(n), R32(nn)
	lea	8(vp_param), vp
	mov	(vp_param), %rdx

	mov	R32(n), R32(%rax)
	shr	$3, R32(n)
	and	$7, R32(%rax)		C clear OF, CF as side-effect
C Dispatch on n mod 8 into the matching mul_1 feed-in block.
	lea	L(mtab)(%rip), %r10
ifdef(`PIC',
`	movslq	(%r10,%rax,4), %rax
	lea	(%rax, %r10), %r10
	jmp	*%r10
',`
	jmp	*(%r10,%rax,8)
')

C mul_1 feed-in blocks.  Each offsets up/rp for the 8-way unrolled loop and
C preloads jmpreg with the addmul feed-in used by subsequent outer passes.
L(mf0):	mulx(	(up), %r10, %r8)
	lea	56(up), up
	lea	-8(rp), rp
	lea	L(f7)(%rip), jmpreg
	jmp	L(mb0)

L(mf3):	mulx(	(up), %r9, %rax)
	lea	16(up), up
	lea	16(rp), rp
	jrcxz	L(mc)
	inc	R32(n)
	lea	L(f2)(%rip), jmpreg
	jmp	L(mb3)

C Short path taken from L(mf3) when rcx = 0: finish the mul_1 pass inline
C and go straight to the corner code.
L(mc):	mulx(	-8,(up), %r10, %r8)
	add	%rax, %r10
	mov	%r9, -16(rp)
	mulx(	(up), %r9, %rax)
	mov	%r10, -8(rp)
	adc	%r8, %r9
	mov	%r9, (rp)
	jmp	L(c2)

L(mf4):	mulx(	(up), %r10, %r8)
	lea	24(up), up
	lea	24(rp), rp
	inc	R32(n)
	lea	L(f3)(%rip), jmpreg
	jmp	L(mb4)

L(mf5):	mulx(	(up), %r9, %rax)
	lea	32(up), up
	lea	32(rp), rp
	inc	R32(n)
	lea	L(f4)(%rip), jmpreg
	jmp	L(mb5)

L(mf6):	mulx(	(up), %r10, %r8)
	lea	40(up), up
	lea	40(rp), rp
	inc	R32(n)
	lea	L(f5)(%rip), jmpreg
	jmp	L(mb6)

L(mf7):	mulx(	(up), %r9, %rax)
	lea	48(up), up
	lea	48(rp), rp
	lea	L(f6)(%rip), jmpreg
	jmp	L(mb7)

L(mf1):	mulx(	(up), %r9, %rax)
	lea	L(f0)(%rip), jmpreg
	jmp	L(mb1)

L(mf2):	mulx(	(up), %r10, %r8)
	lea	8(up), up
	lea	8(rp), rp
	lea	L(f1)(%rip), jmpreg
	mulx(	(up), %r9, %rax)

C FIXME ugly fallthrough FIXME
C 8-way unrolled mul_1 loop: rp[] = up[] * v0, plain adc carry chain.
	ALIGN(32)
L(mtop):mov	%r10, -8(rp)
	adc	%r8, %r9
L(mb1):	mulx(	8,(up), %r10, %r8)
	adc	%rax, %r10
	lea	64(up), up
	mov	%r9, (rp)
L(mb0):	mov	%r10, 8(rp)
	mulx(	-48,(up), %r9, %rax)
	lea	64(rp), rp
	adc	%r8, %r9
L(mb7):	mulx(	-40,(up), %r10, %r8)
	mov	%r9, -48(rp)
	adc	%rax, %r10
L(mb6):	mov	%r10, -40(rp)
	mulx(	-32,(up), %r9, %rax)
	adc	%r8, %r9
L(mb5):	mulx(	-24,(up), %r10, %r8)
	mov	%r9, -32(rp)
	adc	%rax, %r10
L(mb4):	mulx(	-16,(up), %r9, %rax)
	mov	%r10, -24(rp)
	adc	%r8, %r9
L(mb3):	mulx(	-8,(up), %r10, %r8)
	adc	%rax, %r10
	mov	%r9, -16(rp)
	dec	R32(n)
	mulx(	(up), %r9, %rax)
	jnz	L(mtop)

C Wind down the mul_1 pass, then set up outer-loop state:
C r12 = -8*(nn+1), the byte offset used to rewind up/rp each outer pass;
C nn>>3 becomes the per-pass outer count merged into n at L(ent).
L(mend):mov	%r10, -8(rp)
	adc	%r8, %r9
	mov	%r9, (rp)
	adc	%rcx, %rax

	lea	8(,nn,8), %r12
	neg	%r12
	shr	$3, R32(nn)
	jmp	L(ent)

C addmul_1 feed-in blocks, entered indirectly through jmpreg; each selects
C the next pass's feed-in so successive passes shorten by one limb.
L(f0):	mulx(	(up), %r10, %r8)
	lea	-8(up), up
	lea	-8(rp), rp
	lea	L(f7)(%rip), jmpreg
	jmp	L(b0)

L(f1):	mulx(	(up), %r9, %rax)
	lea	-1(nn), R32(nn)
	lea	L(f0)(%rip), jmpreg
	jmp	L(b1)

C Outer-loop tail: store the pass's last limb, then fall into L(ent).
L(end):	adox(	(rp), %r9)
	mov	%r9, (rp)
	adox(	%rcx, %rax)	C relies on rcx = 0
	adc	%rcx, %rax	C FIXME suppress, use adc below; reqs ent path edits
	lea	8(%r12), %r12
C Accumulate this pass's contribution to the top limb, rewind the pointers,
C fetch the next v limb, and dispatch to the next feed-in block.
L(ent):	mulx(	8,(up), %r10, %r8)	C r8 unused (use imul?)
	add	%rax, %r14
	add	%r10, %r14		C h
	lea	(up,%r12), up		C reset up
	lea	8(rp,%r12), rp		C reset rp
	mov	(vp), %rdx
	lea	8(vp), vp
	or	R32(nn), R32(n)		C copy count, clear CF,OF (n = 0 prior)
	jmp	*jmpreg

L(f7):	mulx(	(up), %r9, %rax)
	lea	-16(up), up
	lea	-16(rp), rp
	lea	L(f6)(%rip), jmpreg
	jmp	L(b7)

L(f2):	mulx(	(up), %r10, %r8)
	lea	8(up), up
	lea	8(rp), rp
	mulx(	(up), %r9, %rax)
	lea	L(f1)(%rip), jmpreg

C FIXME ugly fallthrough FIXME
C 8-way unrolled addmul_1 loop: rp[] += up[] * v[k].  adcx carries the
C multiply chain (CF) while adox carries the rp accumulation chain (OF);
C jrcxz exits without disturbing either flag.
	ALIGN(32)
L(top):	adox(	-8,(rp), %r10)
	adcx(	%r8, %r9)
	mov	%r10, -8(rp)
	jrcxz	L(end)
L(b1):	mulx(	8,(up), %r10, %r8)
	adox(	(rp), %r9)
	lea	-1(n), R32(n)
	mov	%r9, (rp)
	adcx(	%rax, %r10)
L(b0):	mulx(	16,(up), %r9, %rax)
	adcx(	%r8, %r9)
	adox(	8,(rp), %r10)
	mov	%r10, 8(rp)
L(b7):	mulx(	24,(up), %r10, %r8)
	lea	64(up), up
	adcx(	%rax, %r10)
	adox(	16,(rp), %r9)
	mov	%r9, 16(rp)
L(b6):	mulx(	-32,(up), %r9, %rax)
	adox(	24,(rp), %r10)
	adcx(	%r8, %r9)
	mov	%r10, 24(rp)
L(b5):	mulx(	-24,(up), %r10, %r8)
	adcx(	%rax, %r10)
	adox(	32,(rp), %r9)
	mov	%r9, 32(rp)
L(b4):	mulx(	-16,(up), %r9, %rax)
	adox(	40,(rp), %r10)
	adcx(	%r8, %r9)
	mov	%r10, 40(rp)
L(b3):	adox(	48,(rp), %r9)
	mulx(	-8,(up), %r10, %r8)
	mov	%r9, 48(rp)
	lea	64(rp), rp
	adcx(	%rax, %r10)
	mulx(	(up), %r9, %rax)
	jmp	L(top)

L(f6):	mulx(	(up), %r10, %r8)
	lea	40(up), up
	lea	-24(rp), rp
	lea	L(f5)(%rip), jmpreg
	jmp	L(b6)

L(f5):	mulx(	(up), %r9, %rax)
	lea	32(up), up
	lea	-32(rp), rp
	lea	L(f4)(%rip), jmpreg
	jmp	L(b5)

L(f4):	mulx(	(up), %r10, %r8)
	lea	24(up), up
	lea	-40(rp), rp
	lea	L(f3)(%rip), jmpreg
	jmp	L(b4)

L(f3):	mulx(	(up), %r9, %rax)
	lea	16(up), up
	lea	-48(rp), rp
	jrcxz	L(cor)
	lea	L(f2)(%rip), jmpreg
	jmp	L(b3)

C Corner: only three v limbs remain, so finish with partial passes that
C compute nothing above the n low limbs.
L(cor):	adox(	48,(rp), %r9)
	mulx(	-8,(up), %r10, %r8)
	mov	%r9, 48(rp)
	lea	64(rp), rp
	adcx(	%rax, %r10)
	mulx(	(up), %r9, %rax)
	adox(	-8,(rp), %r10)
	adcx(	%r8, %r9)
	mov	%r10, -8(rp)		C FIXME suppress
	adox(	(rp), %r9)
	mov	%r9, (rp)		C FIXME suppress
	adox(	%rcx, %rax)
C Last two v limbs: fold their contributions into rp[-1..0] and the top
C limb accumulator r14, then store r14 as the final result limb.
L(c2):
	mulx(	8,(up), %r10, %r8)
	adc	%rax, %r14
	add	%r10, %r14
	mov	(vp), %rdx
	test	R32(%rcx), R32(%rcx)
	mulx(	-16,(up), %r10, %r8)
	mulx(	-8,(up), %r9, %rax)
	adox(	-8,(rp), %r10)
	adcx(	%r8, %r9)
	mov	%r10, -8(rp)
	adox(	(rp), %r9)
	adox(	%rcx, %rax)
	adc	%rcx, %rax
	mulx(	(up), %r10, %r8)
	add	%rax, %r14
	add	%r10, %r14
	mov	8(vp), %rdx
	mulx(	-16,(up), %rcx, %rax)
	add	%r9, %rcx
	mov	%rcx, (rp)
	adc	$0, %rax
	mulx(	-8,(up), %r10, %r8)
	add	%rax, %r14
	add	%r10, %r14
	mov	%r14, 8(rp)
	pop	%rbp
	pop	%rbx
	pop	%r12
	pop	%r14
	FUNC_EXIT()
	ret
EPILOGUE()
C Jump table for the mul_1 feed-in dispatch, indexed by n mod 8.
	JUMPTABSECT
	ALIGN(8)
L(mtab):JMPENT(	L(mf7), L(mtab))
	JMPENT(	L(mf0), L(mtab))
	JMPENT(	L(mf1), L(mtab))
	JMPENT(	L(mf2), L(mtab))
	JMPENT(	L(mf3), L(mtab))
	JMPENT(	L(mf4), L(mtab))
	JMPENT(	L(mf5), L(mtab))
	JMPENT(	L(mf6), L(mtab))