dnl  AMD64 mpn_mullo_basecase optimised for Intel Haswell.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	cycles/limb	mul_2		addmul_2
C AMD K8,K9	n/a		n/a
C AMD K10	n/a		n/a
C AMD bull	n/a		n/a
C AMD pile	n/a		n/a
C AMD steam	 ?		 ?
C AMD bobcat	n/a		n/a
C AMD jaguar	 ?		 ?
C Intel P4	n/a		n/a
C Intel core	n/a		n/a
C Intel NHM	n/a		n/a
C Intel SBR	n/a		n/a
C Intel IBR	n/a		n/a
C Intel HWL	 1.86		 2.15
C Intel BWL	 ?		 ?
C Intel atom	n/a		n/a
C VIA nano	n/a		n/a

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.

C TODO
C  * Implement proper cor2, replacing current cor0.
C  * Micro-optimise.
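
C Operation: {rp,n} <- low n limbs of {up,n} * {vp,n}; the high half of the
C product is never formed.  As the label names and the table above suggest,
C the code runs a mulx-based mul_2 pass over v0,v1, then addmul_2 passes over
C the remaining vp limbs two at a time, and finishes in L(cor1)/L(cor0) with
C plain imul corner code for the last two or one limbs.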

define(`rp',       `%rdi')
define(`up',       `%rsi')
define(`vp_param', `%rdx')
define(`n',        `%rcx')

define(`vp',       `%r8')
define(`X0',       `%r14')
define(`X1',       `%r15')

define(`w0',       `%r10')
define(`w1',       `%r11')
define(`w2',       `%r12')
define(`w3',       `%r13')
define(`i',        `%rbp')
define(`v0',       `%r9')
define(`v1',       `%rbx')

C  rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_mullo_basecase)
	FUNC_ENTRY(4)

	mov	vp_param, vp
	mov	(up), %rdx

	cmp	$4, n
	jb	L(small)

	push	%rbx
	push	%rbp
	push	%r12
	push	%r13

	mov	(vp), v0
	mov	8(vp), v1

	lea	2(n), i
	shr	$2, i
	neg	n
	add	$2, n

	push	up			C put entry `up' on stack

	test	$1, R8(n)
	jnz	L(m2x1)

L(m2x0):mulx(	v0, w0, w3)
	xor	R32(w2), R32(w2)
	test	$2, R8(n)
	jz	L(m2b2)

L(m2b0):lea	-8(rp), rp
	lea	-8(up), up
	jmp	L(m2e0)

L(m2b2):lea	-24(rp), rp
	lea	8(up), up
	jmp	L(m2e2)

L(m2x1):mulx(	v0, w2, w1)
	xor	R32(w0), R32(w0)
	test	$2, R8(n)
	jnz	L(m2b3)

L(m2b1):jmp	L(m2e1)

L(m2b3):lea	-16(rp), rp
	lea	-16(up), up
	jmp	L(m2e3)

	ALIGN(16)
L(m2tp):mulx(	v1, %rax, w0)
	add	%rax, w2
	mov	(up), %rdx
	mulx(	v0, %rax, w1)
	adc	$0, w0
	add	%rax, w2
	adc	$0, w1
	add	w3, w2
L(m2e1):mov	w2, (rp)
	adc	$0, w1
	mulx(	v1, %rax, w2)
	add	%rax, w0
	mov	8(up), %rdx
	adc	$0, w2
	mulx(	v0, %rax, w3)
	add	%rax, w0
	adc	$0, w3
	add	w1, w0
L(m2e0):mov	w0, 8(rp)
	adc	$0, w3
	mulx(	v1, %rax, w0)
	add	%rax, w2
	mov	16(up), %rdx
	mulx(	v0, %rax, w1)
	adc	$0, w0
	add	%rax, w2
	adc	$0, w1
	add	w3, w2
L(m2e3):mov	w2, 16(rp)
	adc	$0, w1
	mulx(	v1, %rax, w2)
	add	%rax, w0
	mov	24(up), %rdx
	adc	$0, w2
	mulx(	v0, %rax, w3)
	add	%rax, w0
	adc	$0, w3
	add	w1, w0
	lea	32(up), up
L(m2e2):mov	w0, 24(rp)
	adc	$0, w3
	dec	i
	lea	32(rp), rp
	jnz	L(m2tp)

L(m2ed):mulx(	v1, %rax, w0)
	add	%rax, w2
	mov	(up), %rdx
	mulx(	v0, %rax, w1)
	add	w2, %rax
	add	w3, %rax
	mov	%rax, (rp)

	mov	(%rsp), up		C restore `up' to beginning
	lea	16(vp), vp
	lea	8(rp,n,8), rp		C put back rp to old rp + 2
	add	$2, n
	jge	L(cor1)

	push	%r14
	push	%r15

L(outer):
	mov	(vp), v0
	mov	8(vp), v1

	lea	(n), i
	sar	$2, i

	mov	(up), %rdx
	test	$1, R8(n)
	jnz	L(bx1)

L(bx0):	mov	(rp), X1
	mov	8(rp), X0
	mulx(	v0, %rax, w3)
	add	%rax, X1
	adc	$0, w3
	mulx(	v1, %rax, w0)
	add	%rax, X0
	adc	$0, w0
	mov	8(up), %rdx
	mov	X1, (rp)
	mulx(	v0, %rax, w1)
	test	$2, R8(n)
	jz	L(b2)

L(b0):	lea	8(rp), rp
	lea	8(up), up
	jmp	L(lo0)

L(b2):	mov	16(rp), X1
	lea	24(rp), rp
	lea	24(up), up
	jmp	L(lo2)

L(bx1):	mov	(rp), X0
	mov	8(rp), X1
	mulx(	v0, %rax, w1)
	add	%rax, X0
	mulx(	v1, %rax, w2)
	adc	$0, w1
	mov	X0, (rp)
	add	%rax, X1
	adc	$0, w2
	mov	8(up), %rdx
	test	$2, R8(n)
	jnz	L(b3)

L(b1):	lea	16(up), up
	lea	16(rp), rp
	jmp	L(lo1)

L(b3):	mov	16(rp), X0
	lea	32(up), up
	mulx(	v0, %rax, w3)
	inc	i
	jz	L(cj3)
	jmp	L(lo3)

	ALIGN(16)
L(top):	mulx(	v0, %rax, w3)
	add	w0, X1
	adc	$0, w2
L(lo3):	add	%rax, X1
	adc	$0, w3
	mulx(	v1, %rax, w0)
	add	%rax, X0
	adc	$0, w0
	lea	32(rp), rp
	add	w1, X1
	mov	-16(up), %rdx
	mov	X1, -24(rp)
	adc	$0, w3
	add	w2, X0
	mov	-8(rp), X1
	mulx(	v0, %rax, w1)
	adc	$0, w0
L(lo2):	add	%rax, X0
	mulx(	v1, %rax, w2)
	adc	$0, w1
	add	w3, X0
	mov	X0, -16(rp)
	adc	$0, w1
	add	%rax, X1
	adc	$0, w2
	add	w0, X1
	mov	-8(up), %rdx
	adc	$0, w2
L(lo1):	mulx(	v0, %rax, w3)
	add	%rax, X1
	adc	$0, w3
	mov	(rp), X0
	mulx(	v1, %rax, w0)
	add	%rax, X0
	adc	$0, w0
	add	w1, X1
	mov	X1, -8(rp)
	adc	$0, w3
	mov	(up), %rdx
	add	w2, X0
	mulx(	v0, %rax, w1)
	adc	$0, w0
L(lo0):	add	%rax, X0
	adc	$0, w1
	mulx(	v1, %rax, w2)
	add	w3, X0
	mov	8(rp), X1
	mov	X0, (rp)
	mov	16(rp), X0
	adc	$0, w1
	add	%rax, X1
	adc	$0, w2
	mov	8(up), %rdx
	lea	32(up), up
	inc	i
	jnz	L(top)

L(end):	mulx(	v0, %rax, w3)
	add	w0, X1
	adc	$0, w2
L(cj3):	add	%rax, X1
	adc	$0, w3
	mulx(	v1, %rax, w0)
	add	%rax, X0
	add	w1, X1
	mov	-16(up), %rdx
	mov	X1, 8(rp)
	adc	$0, w3
	add	w2, X0
	mulx(	v0, %rax, w1)
	add	X0, %rax
	add	w3, %rax
	mov	%rax, 16(rp)

	mov	16(%rsp), up		C restore `up' to beginning
	lea	16(vp), vp
	lea	24(rp,n,8), rp		C put back rp to old rp + 2
	add	$2, n
	jl	L(outer)

	pop	%r15
	pop	%r14

	jnz	L(cor0)

L(cor1):mov	(vp), v0
	mov	8(vp), v1
	mov	(up), %rdx
	mulx(	v0, %r12, %rbp)		C u0 x v2
	add	(rp), %r12		C FIXME: rp[0] still available in reg?
	adc	%rax, %rbp
	mov	8(up), %r10
	imul	v0, %r10
	imul	v1, %rdx
	mov	%r12, (rp)
	add	%r10, %rdx
	add	%rbp, %rdx
	mov	%rdx, 8(rp)
	pop	%rax			C deallocate `up' copy
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret

L(cor0):mov	(vp), %r11
	imul	(up), %r11
	add	%rax, %r11
	mov	%r11, (rp)
	pop	%rax			C deallocate `up' copy
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret

	ALIGN(16)
L(small):
	cmp	$2, n
	jae	L(gt1)
L(n1):	imul	(vp), %rdx
	mov	%rdx, (rp)
	FUNC_EXIT()
	ret
L(gt1):	ja	L(gt2)
L(n2):	mov	(vp), %r9
	mulx(	%r9, %rax, %rdx)
	mov	%rax, (rp)
	mov	8(up), %rax
	imul	%r9, %rax
	add	%rax, %rdx
	mov	8(vp), %r9
	mov	(up), %rcx
	imul	%r9, %rcx
	add	%rcx, %rdx
	mov	%rdx, 8(rp)
	FUNC_EXIT()
	ret
L(gt2):
L(n3):	mov	(vp), %r9
	mulx(	%r9, %rax, %r10)	C u0 x v0
	mov	%rax, (rp)
	mov	8(up), %rdx
	mulx(	%r9, %rax, %rdx)	C u1 x v0
	imul	16(up), %r9		C u2 x v0
	add	%rax, %r10
	adc	%rdx, %r9
	mov	8(vp), %r11
	mov	(up), %rdx
	mulx(	%r11, %rax, %rdx)	C u0 x v1
	add	%rax, %r10
	adc	%rdx, %r9
	imul	8(up), %r11		C u1 x v1
	add	%r11, %r9
	mov	%r10, 8(rp)
	mov	16(vp), %r10
	mov	(up), %rax
	imul	%rax, %r10		C u0 x v2
	add	%r10, %r9
	mov	%r9, 16(rp)
	FUNC_EXIT()
	ret
EPILOGUE()