dnl  X86-64 mpn_redc_1 optimised for Intel Conroe and Wolfdale.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 ?
C AMD K10	 ?
C AMD bull	 ?
C AMD pile	 ?
C AMD steam	 ?
C AMD bobcat	 ?
C AMD jaguar	 ?
C Intel P4	 ?
C Intel core	 4.5  (fluctuating)
C Intel NHM	 ?
C Intel SBR	 ?
C Intel IBR	 ?
C Intel HWL	 ?
C Intel BWL	 ?
C Intel atom	 ?
C VIA nano	 ?

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.

C TODO
C  * Micro-optimise, none performed thus far.
C  * Consider inlining mpn_add_n.
C  * Single basecases out before the pushes.
C  * Keep up[i] in registers for basecases (might require pushes).
C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
C As defined here I(a,b) expands to its first argument.  In the wind-down
C code the first argument is the address formed from up alone and the second
C is the same address with the i index included.
define(`I',`$1')

C Function arguments.  The register named in the trailing comment of each
C define is presumably where DOS64 passes the same argument (it matches the
C rcx, rdx, r8, r9 order used for the DOS64 call further down, and u0inv is
C fetched from the stack under IFDOS) -- TODO confirm.
define(`rp',          `%rdi')   C rcx
define(`up',          `%rsi')   C rdx
define(`mp_param',    `%rdx')   C r8
define(`n',           `%rcx')   C r9
define(`u0inv',       `%r8')    C stack

C Working registers used by the general (large n) paths.
define(`i',           `%r14')   C inner loop index, negative, stepped by 2 up to 0
define(`j',           `%r15')   C outer loop counter, initialised from n
define(`mp',          `%r12')   C mp_param + 8*n, so (mp,n,8) is mp[0] once n is negated
define(`q0',          `%r13')   C current q limb

C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
C  X  q0'  n   X  rp  up      u0i           mp   q0   i   j

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

define(`ALIGNx', `ALIGN(16)')

C mpn_redc_1 -- arguments rp, up, mp_param, n, u0inv (see the defines above).
C
C Outline of what the code below does:
C   1. q0 = low 64 bits of up[0] * u0inv.
C   2. It runs n outer passes (counter j).  Each pass adds q0 * {mp,n} into
C      the current n-limb window of up, computes the q limb for the next pass
C      from the updated second limb of the window, writes the carry-out limb
C      of the pass into the lowest slot of the window (the stores tagged
C      up[0]), and then advances up by one limb.
C   3. For n = 1, 2 and 3 the final sum is formed inline, stored through rp,
C      and the carry-out is returned in rax.  For every other n (including the
C      dedicated n = 4 loop) mpn_add_n is called with the arguments noted at
C      L(add_n), and rax is not touched after that call returns.
C
C The name suggests Montgomery reduction by one limb at a time; that reading
C is an inference from the name and the arithmetic, not something this file
C states.
C
C lea (reg), reg is used throughout as a register copy placed between add/adc
C pairs; lea does not modify the flags, so the carry chains stay intact.

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_redc_1)
	FUNC_ENTRY(4)
C NOTE(review): FUNC_ENTRY is defined elsewhere; presumably it moves the four
C DOS64 register arguments into the STD64 registers -- confirm.
IFDOS(`	mov	56(%rsp), %r8	')
C Save the registers that the code below uses for mp, q0, i, j and scratch.
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	(up), q0
	mov	n, j			C outer loop induction var
	lea	(mp_param,n,8), mp	C mp = one limb past the end of mp_param
	lea	-16(up,n,8), up		C bias up so 16(up,n,8) is up[0] after the neg
	neg	n			C n is negative from here on
	imul	u0inv, q0		C first iteration q0

C Dispatch on the parity of n, then single out the small sizes.
	test	$1, R8(n)
	jz	L(b0)

C Odd n.
L(b1):	cmp	$-1, R32(n)
	jz	L(n1)
	cmp	$-3, R32(n)
	jz	L(n3)

C The general loops use rdi (which is rp) as a scratch carry register, so rp is
C kept on the stack until L(cj).
	push	rp

C Outer loop for odd n >= 5.  The preamble handles the first four products of
C the pass, stores the updated second limb of the window, and derives the next
C q limb from it.
L(otp1):lea	3(n), i
	mov	(mp,n,8), %rax
	mul	q0
	lea	(%rax), %rbp
	mov	8(mp,n,8), %rax
	lea	(%rdx), %r9
	mul	q0
	lea	(%rax), %r11
	mov	16(mp,n,8), %rax
	mov	16(up,n,8), %r10
	lea	(%rdx), %rdi
	mul	q0
	add	%rbp, %r10		C sum for the lowest limb is not stored, only its carry is used
	lea	(%rax), %rbp
	mov	24(mp,n,8), %rax
	adc	%r9, %r11
	mov	24(up,n,8), %rbx
	lea	(%rdx), %r9
	adc	$0, %rdi
	mul	q0
	add	%r11, %rbx
	lea	(%rax), %r11
	mov	32(mp,n,8), %rax
	adc	%rdi, %rbp
	mov	%rbx, 24(up,n,8)
	mov	32(up,n,8), %r10
	lea	(%rdx), %rdi
	adc	$0, %r9
	imul	u0inv, %rbx		C next q limb
	add	$2, i
	jns	L(ed1)			C nothing left for the inner loop, go to wind-down

C Inner loop, two limbs per pass, running while i is negative.
	ALIGNx
L(tp1):	mul	q0
	add	%rbp, %r10
	lea	(%rax), %rbp
	mov	(mp,i,8), %rax
	adc	%r9, %r11
	mov	%r10, -8(up,i,8)
	mov	(up,i,8), %r10
	lea	(%rdx), %r9
	adc	$0, %rdi
	mul	q0
	add	%r11, %r10
	lea	(%rax), %r11
	mov	8(mp,i,8), %rax
	adc	%rdi, %rbp
	mov	%r10, (up,i,8)
	mov	8(up,i,8), %r10
	lea	(%rdx), %rdi
	adc	$0, %r9
	add	$2, i
	js	L(tp1)

C Wind-down: last multiply of the pass, then the three highest limbs of the
C window are completed and stored.
L(ed1):	mul	q0
	add	%rbp, %r10
	adc	%r9, %r11
	mov	%r10, I(-8(up),-8(up,i,8))
	mov	I((up),(up,i,8)), %r10
	adc	$0, %rdi
	add	%r11, %r10
	adc	%rdi, %rax
	mov	%r10, I((up),(up,i,8))
	mov	I(8(up),8(up,i,8)), %r10
	adc	$0, %rdx
	add	%rax, %r10
	mov	%r10, I(8(up),8(up,i,8))
	adc	$0, %rdx
	mov	%rdx, 16(up,n,8)	C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp1)
	jmp	L(cj)

C Even n.
L(b0):	cmp	$-2, R32(n)
	jz	L(n2)
	cmp	$-4, R32(n)
	jz	L(n4)

C As in the odd path, rdi is used as scratch below, so rp is saved first.
	push	rp

C Outer loop for even n >= 6.  Same structure as L(otp1) with the roles of
C the rbp/r9 and r11/rdi pairs swapped; the preamble ends by jumping into the
C middle of the inner loop at L(e0).
L(otp0):lea	4(n), i
	mov	(mp,n,8), %rax
	mul	q0
	lea	(%rax), %r11
	mov	8(mp,n,8), %rax
	lea	(%rdx), %rdi
	mul	q0
	lea	(%rax), %rbp
	mov	16(mp,n,8), %rax
	mov	16(up,n,8), %r10
	lea	(%rdx), %r9
	mul	q0
	add	%r11, %r10		C sum for the lowest limb is not stored, only its carry is used
	lea	(%rax), %r11
	mov	24(mp,n,8), %rax
	adc	%rdi, %rbp
	mov	24(up,n,8), %rbx
	lea	(%rdx), %rdi
	adc	$0, %r9
	mul	q0
	add	%rbp, %rbx
	lea	(%rax), %rbp
	mov	32(mp,n,8), %rax
	adc	%r9, %r11
	mov	%rbx, 24(up,n,8)
	mov	32(up,n,8), %r10
	lea	(%rdx), %r9
	adc	$0, %rdi
	imul	u0inv, %rbx		C next q limb
	jmp	L(e0)

C Inner loop, two limbs per pass, running while i is negative.
	ALIGNx
L(tp0):	mul	q0
	add	%rbp, %r10
	lea	(%rax), %rbp
	mov	(mp,i,8), %rax
	adc	%r9, %r11
	mov	%r10, -8(up,i,8)
	mov	(up,i,8), %r10
	lea	(%rdx), %r9
	adc	$0, %rdi
L(e0):	mul	q0
	add	%r11, %r10
	lea	(%rax), %r11
	mov	8(mp,i,8), %rax
	adc	%rdi, %rbp
	mov	%r10, (up,i,8)
	mov	8(up,i,8), %r10
	lea	(%rdx), %rdi
	adc	$0, %r9
	add	$2, i
	js	L(tp0)

C Wind-down, identical in form to L(ed1).
L(ed0):	mul	q0
	add	%rbp, %r10
	adc	%r9, %r11
	mov	%r10, I(-8(up),-8(up,i,8))
	mov	I((up),(up,i,8)), %r10
	adc	$0, %rdi
	add	%r11, %r10
	adc	%rdi, %rax
	mov	%r10, I((up),(up,i,8))
	mov	I(8(up),8(up,i,8)), %r10
	adc	$0, %rdx
	add	%rax, %r10
	mov	%r10, I(8(up),8(up,i,8))
	adc	$0, %rdx
	mov	%rdx, 16(up,n,8)	C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp0)

C Common exit of the general loops: remove the -16 bias applied to up at
C entry and recover rp, which was pushed before the loops.
L(cj):	lea	16(up), up		C FIXME
	pop	rp
C Set up the mpn_add_n call.  n is still negative here, so adding n*8 steps
C the pointer back by n limbs; the 32-bit neg turns n into the positive count.
L(add_n):
IFSTD(`	lea	(up,n,8), up		C param 2: up
	lea	(up,n,8), %rdx		C param 3: up - n
	neg	R32(n)		')	C param 4: n

IFDOS(`	lea	(up,n,8), %rdx		C param 2: up
	lea	(%rdx,n,8), %r8		C param 3: up - n
	neg	R32(n)
	mov	n, %r9			C param 4: n
	mov	rp, %rcx	')	C param 1: rp

C Stack adjustment around the call.  Presumably 8 bytes restore 16-byte
C alignment after the return address and six pushes on STD64, and 40 bytes are
C 32 bytes of shadow space plus 8 of alignment on DOS64 -- TODO confirm against
C FUNC_ENTRY.
IFSTD(`	sub	$8, %rsp	')
IFDOS(`	sub	$40, %rsp	')
C NOTE(review): the nz condition looks inverted for an aligned stack; it is
C correct only if ASSERT itself moves rsp by 8 (for example with pushf) before
C running the test -- confirm against the ASSERT definition.
	ASSERT(nz, `test $15, %rsp')
	CALL(	mpn_add_n)
IFSTD(`	add	$8, %rsp	')
IFDOS(`	add	$40, %rsp	')

C Shared epilogue: restore the saved registers in reverse push order.
L(ret):	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret

C n = 1.  With the entry bias up points one limb below the operand, so 8(up)
C and 16(up) are its two limbs.  The low sum is discarded and only its carry
C feeds the high limb.
L(n1):	mov	(mp_param), %rax
	mul	q0
	add	8(up), %rax
	adc	16(up), %rdx
	mov	%rdx, (rp)
	mov	$0, R32(%rax)		C mov, not xor: the carry flag is still needed
	adc	R32(%rax), R32(%rax)	C rax = carry-out
	jmp	L(ret)

C n = 2.  The entry bias is zero for this size, so (up) is the first limb.
C Both passes are unrolled; r11 carries the first pass carry-out limb into the
C final sum.
L(n2):	mov	(mp_param), %rax
	mov	(up), %rbp
	mul	q0
	add	%rax, %rbp
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-8(mp), %rax
	mov	8(up), %r10
	mul	q0
	add	%rax, %r10
	mov	%rdx, %r11
	adc	$0, %r11
	add	%r9, %r10
	adc	$0, %r11
	mov	%r10, q0
	imul	u0inv, q0		C next q0
	mov	-16(mp), %rax
	mul	q0
	add	%rax, %r10
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-8(mp), %rax
	mov	16(up), %r14
	mul	q0
	add	%rax, %r14
	adc	$0, %rdx
	add	%r9, %r14
	adc	$0, %rdx
	xor	R32(%rax), R32(%rax)	C zero rax before the final add/adc chain
	add	%r11, %r14
	adc	24(up), %rdx
	mov	%r14, (rp)
	mov	%rdx, 8(rp)
	adc	R32(%rax), R32(%rax)	C rax = carry-out
	jmp	L(ret)

C n = 3.  One loop pass per q limb, j counts the three passes.  Here -8(up)
C is the lowest limb of the current window; rdi is not used as scratch, so rp
C needs no saving.
	ALIGNx
L(n3):	mov	-24(mp), %rax
	mov	-8(up), %r10
	mul	q0
	add	%rax, %r10
	mov	-16(mp), %rax
	mov	%rdx, %r11
	adc	$0, %r11
	mov	(up), %rbp
	mul	q0
	add	%rax, %rbp
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-8(mp), %rax
	add	%r11, %rbp
	mov	8(up), %r10
	adc	$0, %r9
	mul	q0
	mov	%rbp, q0
	imul	u0inv, q0		C next q0
	add	%rax, %r10
	mov	%rdx, %r11
	adc	$0, %r11
	mov	%rbp, (up)
	add	%r9, %r10
	adc	$0, %r11
	mov	%r10, 8(up)
	mov	%r11, -8(up)		C up[0]
	lea	8(up), up		C up++
	dec	j
	jnz	L(n3)

C Final sum for n = 3, done inline.  rbp, r10 and r11 still hold the values
C from the last pass; -32(up) and -24(up) reload the carry limbs stored by the
C first two passes.
	mov	-32(up), %rdx
	mov	-24(up), %rbx
	xor	R32(%rax), R32(%rax)	C zero rax before the final add/adc chain
	add	%rbp, %rdx
	adc	%r10, %rbx
	adc	8(up), %r11
	mov	%rdx, (rp)
	mov	%rbx, 8(rp)
	mov	%r11, 16(rp)
	adc	R32(%rax), R32(%rax)	C rax = carry-out
	jmp	L(ret)

C n = 4.  One fully unrolled pass per q limb, j counts the four passes.  r14
C takes the scratch role that rdi has in the general loops, so rp stays intact
C and the code can join L(add_n) directly without the pop at L(cj).
	ALIGNx
L(n4):	mov	-32(mp), %rax
	mul	q0
	lea	(%rax), %r11
	mov	-24(mp), %rax
	lea	(%rdx), %r14
	mul	q0
	lea	(%rax), %rbp
	mov	-16(mp), %rax
	mov	-16(up), %r10
	lea	(%rdx), %r9
	mul	q0
	add	%r11, %r10		C sum for the lowest limb is not stored, only its carry is used
	lea	(%rax), %r11
	mov	-8(mp), %rax
	adc	%r14, %rbp
	mov	-8(up), %rbx
	lea	(%rdx), %r14
	adc	$0, %r9
	mul	q0
	add	%rbp, %rbx
	adc	%r9, %r11
	mov	%rbx, -8(up)
	mov	(up), %r10
	adc	$0, %r14
	imul	u0inv, %rbx		C next q limb
	add	%r11, %r10
	adc	%r14, %rax
	mov	%r10, (up)
	mov	8(up), %r10
	adc	$0, %rdx
	add	%rax, %r10
	mov	%r10, 8(up)
	adc	$0, %rdx
	mov	%rdx, -16(up)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(n4)
	lea	16(up), up		C same un-biasing of up as at L(cj)
	jmp	L(add_n)
EPILOGUE()
ASM_END()