dnl  AMD64 mpn_redc_1 optimised for Intel Haswell.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	n/a
C AMD K10	n/a
C AMD bull	n/a
C AMD pile	n/a
C AMD steam	 ?
C AMD bobcat	n/a
C AMD jaguar	 ?
C Intel P4	n/a
C Intel core	n/a
C Intel NHM	n/a
C Intel SBR	n/a
C Intel IBR	n/a
C Intel HWL	 2.32
C Intel BWL	 ?
C Intel atom	n/a
C VIA nano	n/a

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.

C TODO
C  * Micro-optimise.
C  * Consider inlining mpn_add_n.  Tests indicate that this saves just 1-2
C    cycles, though.
60 61define(`rp', `%rdi') C rcx 62define(`up', `%rsi') C rdx 63define(`mp_param', `%rdx') C r8 64define(`n', `%rcx') C r9 65define(`u0inv_param', `%r8') C stack 66 67define(`i', `%r14') 68define(`j', `%r15') 69define(`mp', `%rdi') 70define(`u0inv', `(%rsp)') C stack 71 72ABI_SUPPORT(DOS64) C FIXME: needs verification 73ABI_SUPPORT(STD64) 74 75ASM_START() 76 TEXT 77 ALIGN(16) 78PROLOGUE(mpn_redc_1) 79 FUNC_ENTRY(4) 80IFDOS(` mov 56(%rsp), %r8 ') 81 push %rbx 82 push %rbp 83 push %r12 84 push %r13 85 push %r14 86 push %r15 87 push rp 88 mov mp_param, mp C note that rp and mp shares register 89 mov (up), %rdx 90 91 neg n 92 push %r8 C put u0inv on stack 93 imul u0inv_param, %rdx C first iteration q0 94 mov n, j C outer loop induction var 95 96 test $1, R8(n) 97 jnz L(bx1) 98 99L(bx0): test $2, R8(n) 100 jz L(o0b) 101 102 cmp $-2, R32(n) 103 jnz L(o2) 104 105C Special code for n = 2 since general code cannot handle it 106 mov 8(%rsp), %rbx C rp 107 lea 16(%rsp), %rsp C deallocate two slots 108 mulx( (mp), %r9, %r12) 109 mulx( 8,(mp), %r11, %r10) 110 add %r12, %r11 111 adc $0, %r10 112 add (up), %r9 C = 0 113 adc 8(up), %r11 C r11 = up[1] 114 adc $0, %r10 C -> up[0] 115 mov %r11, %rdx 116 imul u0inv_param, %rdx 117 mulx( (mp), %r13, %r12) 118 mulx( 8,(mp), %r14, %r15) 119 xor R32(%rax), R32(%rax) 120 add %r12, %r14 121 adc $0, %r15 122 add %r11, %r13 C = 0 123 adc 16(up), %r14 C rp[2] 124 adc $0, %r15 C -> up[1] 125 add %r14, %r10 126 adc 24(up), %r15 127 mov %r10, (%rbx) 128 mov %r15, 8(%rbx) 129 setc R8(%rax) 130 jmp L(ret) 131 132L(o2): lea 2(n), i C inner loop induction var 133 mulx( (mp), %r9, %r8) 134 mulx( 8,(mp), %r11, %r10) 135 sar $2, i 136 add %r8, %r11 137 jmp L(lo2) 138 139 ALIGN(16) 140L(tp2): adc %rax, %r9 141 lea 32(up), up 142 adc %r8, %r11 143L(lo2): mulx( 16,(mp), %r13, %r12) 144 mov (up), %r8 145 mulx( 24,(mp), %rbx, %rax) 146 lea 32(mp), mp 147 adc %r10, %r13 148 adc %r12, %rbx 149 adc $0, %rax 150 mov 8(up), %r10 151 mov 16(up), %r12 152 add %r9, 
%r8 153 mov 24(up), %rbp 154 mov %r8, (up) 155 adc %r11, %r10 156 mulx( (mp), %r9, %r8) 157 mov %r10, 8(up) 158 adc %r13, %r12 159 mov %r12, 16(up) 160 adc %rbx, %rbp 161 mulx( 8,(mp), %r11, %r10) 162 mov %rbp, 24(up) 163 inc i 164 jnz L(tp2) 165 166L(ed2): mov 56(up,n,8), %rdx C next iteration up[0] 167 lea 16(mp,n,8), mp C mp = (last starting mp) 168 adc %rax, %r9 169 adc %r8, %r11 170 mov 32(up), %r8 171 adc $0, %r10 172 imul u0inv, %rdx C next iteration q0 173 mov 40(up), %rax 174 add %r9, %r8 175 mov %r8, 32(up) 176 adc %r11, %rax 177 mov %rax, 40(up) 178 lea 56(up,n,8), up C up = (last starting up) + 1 179 adc $0, %r10 180 mov %r10, -8(up) 181 inc j 182 jnz L(o2) 183 184 jmp L(cj) 185 186 187L(bx1): test $2, R8(n) 188 jz L(o3a) 189 190L(o1a): cmp $-1, R32(n) 191 jnz L(o1b) 192 193C Special code for n = 1 since general code cannot handle it 194 mov 8(%rsp), %rbx C rp 195 lea 16(%rsp), %rsp C deallocate two slots 196 mulx( (mp), %r11, %r10) 197 add (up), %r11 198 adc 8(up), %r10 199 mov %r10, (%rbx) 200 mov $0, R32(%rax) 201 setc R8(%rax) 202 jmp L(ret) 203 204L(o1b): lea 24(mp), mp 205L(o1): lea 1(n), i C inner loop induction var 206 mulx( -24,(mp), %r11, %r10) 207 mulx( -16,(mp), %r13, %r12) 208 mulx( -8,(mp), %rbx, %rax) 209 sar $2, i 210 add %r10, %r13 211 adc %r12, %rbx 212 adc $0, %rax 213 mov (up), %r10 214 mov 8(up), %r12 215 mov 16(up), %rbp 216 add %r11, %r10 217 jmp L(lo1) 218 219 ALIGN(16) 220L(tp1): adc %rax, %r9 221 lea 32(up), up 222 adc %r8, %r11 223 mulx( 16,(mp), %r13, %r12) 224 mov -8(up), %r8 225 mulx( 24,(mp), %rbx, %rax) 226 lea 32(mp), mp 227 adc %r10, %r13 228 adc %r12, %rbx 229 adc $0, %rax 230 mov (up), %r10 231 mov 8(up), %r12 232 add %r9, %r8 233 mov 16(up), %rbp 234 mov %r8, -8(up) 235 adc %r11, %r10 236L(lo1): mulx( (mp), %r9, %r8) 237 mov %r10, (up) 238 adc %r13, %r12 239 mov %r12, 8(up) 240 adc %rbx, %rbp 241 mulx( 8,(mp), %r11, %r10) 242 mov %rbp, 16(up) 243 inc i 244 jnz L(tp1) 245 246L(ed1): mov 48(up,n,8), %rdx C next 
iteration up[0] 247 lea 40(mp,n,8), mp C mp = (last starting mp) 248 adc %rax, %r9 249 adc %r8, %r11 250 mov 24(up), %r8 251 adc $0, %r10 252 imul u0inv, %rdx C next iteration q0 253 mov 32(up), %rax 254 add %r9, %r8 255 mov %r8, 24(up) 256 adc %r11, %rax 257 mov %rax, 32(up) 258 lea 48(up,n,8), up C up = (last starting up) + 1 259 adc $0, %r10 260 mov %r10, -8(up) 261 inc j 262 jnz L(o1) 263 264 jmp L(cj) 265 266L(o3a): cmp $-3, R32(n) 267 jnz L(o3b) 268 269C Special code for n = 3 since general code cannot handle it 270L(n3): mulx( (mp), %rbx, %rax) 271 mulx( 8,(mp), %r9, %r14) 272 add (up), %rbx 273 mulx( 16,(mp), %r11, %r10) 274 adc %rax, %r9 C W 1 275 adc %r14, %r11 C W 2 276 mov 8(up), %r14 277 mov u0inv_param, %rdx 278 adc $0, %r10 C W 3 279 mov 16(up), %rax 280 add %r9, %r14 C W 1 281 mov %r14, 8(up) 282 mulx( %r14, %rdx, %r13) C next iteration q0 283 adc %r11, %rax C W 2 284 mov %rax, 16(up) 285 adc $0, %r10 C W 3 286 mov %r10, (up) 287 lea 8(up), up C up = (last starting up) + 1 288 inc j 289 jnz L(n3) 290 291 jmp L(cj) 292 293L(o3b): lea 8(mp), mp 294L(o3): lea 4(n), i C inner loop induction var 295 mulx( -8,(mp), %rbx, %rax) 296 mulx( (mp), %r9, %r8) 297 mov (up), %rbp 298 mulx( 8,(mp), %r11, %r10) 299 sar $2, i 300 add %rbx, %rbp 301 nop 302 adc %rax, %r9 303 jmp L(lo3) 304 305 ALIGN(16) 306L(tp3): adc %rax, %r9 307 lea 32(up), up 308L(lo3): adc %r8, %r11 309 mulx( 16,(mp), %r13, %r12) 310 mov 8(up), %r8 311 mulx( 24,(mp), %rbx, %rax) 312 lea 32(mp), mp 313 adc %r10, %r13 314 adc %r12, %rbx 315 adc $0, %rax 316 mov 16(up), %r10 317 mov 24(up), %r12 318 add %r9, %r8 319 mov 32(up), %rbp 320 mov %r8, 8(up) 321 adc %r11, %r10 322 mulx( (mp), %r9, %r8) 323 mov %r10, 16(up) 324 adc %r13, %r12 325 mov %r12, 24(up) 326 adc %rbx, %rbp 327 mulx( 8,(mp), %r11, %r10) 328 mov %rbp, 32(up) 329 inc i 330 jnz L(tp3) 331 332L(ed3): mov 64(up,n,8), %rdx C next iteration up[0] 333 lea 24(mp,n,8), mp C mp = (last starting mp) 334 adc %rax, %r9 335 adc %r8, %r11 336 mov 
40(up), %r8 337 adc $0, %r10 338 imul u0inv, %rdx C next iteration q0 339 mov 48(up), %rax 340 add %r9, %r8 341 mov %r8, 40(up) 342 adc %r11, %rax 343 mov %rax, 48(up) 344 lea 64(up,n,8), up C up = (last starting up) + 1 345 adc $0, %r10 346 mov %r10, -8(up) 347 inc j 348 jnz L(o3) 349 350 jmp L(cj) 351 352L(o0b): lea 16(mp), mp 353L(o0): mov n, i C inner loop induction var 354 mulx( -16,(mp), %r13, %r12) 355 mulx( -8,(mp), %rbx, %rax) 356 sar $2, i 357 add %r12, %rbx 358 adc $0, %rax 359 mov (up), %r12 360 mov 8(up), %rbp 361 mulx( (mp), %r9, %r8) 362 add %r13, %r12 363 jmp L(lo0) 364 365 ALIGN(16) 366L(tp0): adc %rax, %r9 367 lea 32(up), up 368 adc %r8, %r11 369 mulx( 16,(mp), %r13, %r12) 370 mov -16(up), %r8 371 mulx( 24,(mp), %rbx, %rax) 372 lea 32(mp), mp 373 adc %r10, %r13 374 adc %r12, %rbx 375 adc $0, %rax 376 mov -8(up), %r10 377 mov (up), %r12 378 add %r9, %r8 379 mov 8(up), %rbp 380 mov %r8, -16(up) 381 adc %r11, %r10 382 mulx( (mp), %r9, %r8) 383 mov %r10, -8(up) 384 adc %r13, %r12 385 mov %r12, (up) 386L(lo0): adc %rbx, %rbp 387 mulx( 8,(mp), %r11, %r10) 388 mov %rbp, 8(up) 389 inc i 390 jnz L(tp0) 391 392L(ed0): mov 40(up,n,8), %rdx C next iteration up[0] 393 lea 32(mp,n,8), mp C mp = (last starting mp) 394 adc %rax, %r9 395 adc %r8, %r11 396 mov 16(up), %r8 397 adc $0, %r10 398 imul u0inv, %rdx C next iteration q0 399 mov 24(up), %rax 400 add %r9, %r8 401 mov %r8, 16(up) 402 adc %r11, %rax 403 mov %rax, 24(up) 404 lea 40(up,n,8), up C up = (last starting up) + 1 405 adc $0, %r10 406 mov %r10, -8(up) 407 inc j 408 jnz L(o0) 409 410L(cj): 411IFSTD(` mov 8(%rsp), %rdi C param 1: rp 412 lea 16-8(%rsp), %rsp C deallocate 2, add back for alignment 413 lea (up,n,8), %rdx C param 3: up - n 414 neg R32(n) ') C param 4: n 415 416IFDOS(` mov up, %rdx C param 2: up 417 lea (up,n,8), %r8 C param 3: up - n 418 neg R32(n) 419 mov n, %r9 C param 4: n 420 mov 8(%rsp), %rcx C param 1: rp 421 lea 16-32-8(%rsp), %rsp') C deallocate 2, allocate shadow, align 422 423 
ASSERT(nz, `test $15, %rsp') 424 CALL( mpn_add_n) 425 426IFSTD(` lea 8(%rsp), %rsp ') 427IFDOS(` lea 32+8(%rsp), %rsp') 428 429L(ret): pop %r15 430 pop %r14 431 pop %r13 432 pop %r12 433 pop %rbp 434 pop %rbx 435 FUNC_EXIT() 436 ret 437EPILOGUE() 438