redc_1.asm revision 1.1.1.1
dnl  X86-64 mpn_redc_1 optimised for Intel Nehalem and Westmere.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 ?
C AMD K10	 ?
C AMD bull	 ?
C AMD pile	 ?
C AMD steam	 ?
C AMD bobcat	 ?
C AMD jaguar	 ?
C Intel P4	 ?
C Intel core	 ?
C Intel NHM	 ?
C Intel SBR	 ?
C Intel IBR	 ?
C Intel HWL	 ?
C Intel BWL	 ?
C Intel atom	 ?
C VIA nano	 ?

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.

C TODO
C  * Micro-optimise, none performed thus far.
C  * Consider inlining mpn_add_n.
C  * Single basecases out before the pushes.

C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
C I(a,b) expands to its first argument as defined here.  It is used in the
C wind-down code of each variant to pick between two addressing forms for
C the last limb store.
define(`I',`$1')

C Incoming arguments.  The trailing comment on each line presumably names
C the location where the DOS64 ABI passes the same argument; FUNC_ENTRY is
C defined outside this file, so confirm there.
define(`rp',          `%rdi')   C rcx
define(`up',          `%rsi')   C rdx
define(`mp_param',    `%rdx')   C r8
define(`n',           `%rcx')   C r9
define(`u0inv',       `%r8')    C stack

C Working registers.
C   i   inner loop index, negative, stepped by 4 until the add carries out
C   j   outer loop counter
C   mp  mp_param plus 8*n bytes, indexed with the negated n
C   q0  multiplier limb for the current outer iteration
define(`i',           `%r14')
define(`j',           `%r15')
define(`mp',          `%r12')
define(`q0',          `%r13')

C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

define(`ALIGNx', `ALIGN(16)')

C mpn_redc_1 (rp, up, mp_param, n, u0inv)
C
C Overview of what the code below does:
C  * The prologue saves rbx, rbp and r12-r15, points mp and up n limbs past
C    their starting addresses, negates n so that it serves as a negative
C    index off those pointers, and forms the first q0 as up[0] * u0inv.
C  * Each outer iteration adds q0 times the n limbs at mp into the n limbs
C    at up.  The lowest limb of that sum is never stored; the carry-out limb
C    is written into the slot of the lowest limb, then up steps one limb.
C  * The q limb for the following iteration is computed early, in rbx, from
C    the second result limb as soon as that limb is complete.
C  * After j iterations L(cj) calls mpn_add_n.  Nothing writes rax after
C    that call, so the value it leaves in rax is what this function returns.
C  * n = 1, 2 and 3 are handled by straight line code at L(n1), L(n2) and
C    L(n3), which store to rp directly and return the final carry in rax.
C
C Registers are cleared with mov $0 rather than xor in the multiply-add
C chains.  Several of those clears sit between an add and the adc that
C consumes its carry, and mov leaves the flags untouched.

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_redc_1)
	FUNC_ENTRY(4)
C Fifth argument comes from the stack under DOS64.  The offset 56 presumably
C accounts for what FUNC_ENTRY pushed; confirm against its definition.
IFDOS(`	mov	56(%rsp), %r8	')
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	(up), q0
	mov	n, j			C outer loop induction var
	lea	(mp_param,n,8), mp
	lea	(up,n,8), up
	neg	n
	imul	u0inv, q0		C first iteration q0

C n is negative from here on.  Its two low bits select one of four variants.
C Every variant carries its own copy of the same 4-way unrolled inner loop;
C the variants differ in their feed-in code and in where they enter the loop.
	test	$1, R8(n)
	jz	L(bx0)

L(bx1):	test	$2, R8(n)
	jz	L(b3)

C Variant for n = 1 mod 4.  n = 1 itself goes to the dedicated code at L(n1).
L(b1):	cmp	$-1, R32(n)
	jz	L(n1)

C Feed-in: handle the first three limbs, compute the next q limb in rbx, and
C enter the unrolled loop at L(e1).
L(otp1):lea	3(n), i
	mov	(mp,n,8), %rax
	mov	(up,n,8), %rbp
	mul	q0
	add	%rax, %rbp
	mov	$0, R32(%r9)
	mov	8(mp,n,8), %rax
	adc	%rdx, %r9
	mul	q0
	mov	$0, R32(%r11)
	mov	8(up,n,8), %rbx
	add	%rax, %rbx
	mov	16(mp,n,8), %rax
	adc	%rdx, %r11
	add	%r9, %rbx
	adc	$0, %r11
	mov	16(up,n,8), %rbp
	mul	q0
	add	%rax, %rbp
	mov	$0, R32(%r9)
	mov	24(mp,n,8), %rax
	adc	%rdx, %r9
	mov	%rbx, 8(up,n,8)
	imul	u0inv, %rbx		C next q limb
	jmp	L(e1)

C Inner loop, four limbs per pass.  rbp and r10 alternate as the limb being
C accumulated, r9 and r11 alternate as the carry into the following limb.
	ALIGNx
L(tp1):	mul	q0
	add	%rax, %rbp
	mov	$0, R32(%r9)
	mov	-16(mp,i,8), %rax
	adc	%rdx, %r9
	mul	q0
	add	%r11, %rbp
	mov	$0, R32(%r11)
	mov	-16(up,i,8), %r10
	adc	$0, %r9
	add	%rax, %r10
	mov	-8(mp,i,8), %rax
	adc	%rdx, %r11
	mov	%rbp, -24(up,i,8)
	add	%r9, %r10
	adc	$0, %r11
	mov	-8(up,i,8), %rbp
	mul	q0
	add	%rax, %rbp
	mov	$0, R32(%r9)
	mov	(mp,i,8), %rax
	adc	%rdx, %r9
	mov	%r10, -16(up,i,8)
L(e1):	add	%r11, %rbp
	adc	$0, %r9
	mul	q0
	mov	(up,i,8), %r10
	mov	$0, R32(%r11)
	add	%rax, %r10
	mov	8(mp,i,8), %rax
	adc	%rdx, %r11
	mov	%rbp, -8(up,i,8)
	add	%r9, %r10
	adc	$0, %r11
	mov	8(up,i,8), %rbp
	mov	%r10, (up,i,8)
	add	$4, i
	jnc	L(tp1)

C Wind-down: finish the top limb, park the carry-out limb in the lowest
C slot, switch to the q limb computed during feed-in, and advance up.
L(ed1):	mul	q0
	add	%rax, %rbp
	adc	$0, %rdx
	add	%r11, %rbp
	adc	$0, %rdx
	mov	%rbp, I(-8(up),-24(up,i,8))
	mov	%rdx, (up,n,8)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp1)
	jmp	L(cj)

C Variant for n = 3 mod 4.  n = 3 itself goes to the dedicated code at L(n3).
L(b3):	cmp	$-3, R32(n)
	jz	L(n3)

C Feed-in: handle the first two limbs, compute the next q limb in rbx, and
C fall through into the top of the unrolled loop.
L(otp3):lea	5(n), i
	mov	(mp,n,8), %rax
	mov	(up,n,8), %rbp
	mul	q0
	add	%rax, %rbp
	mov	$0, R32(%r9)
	mov	8(mp,n,8), %rax
	adc	%rdx, %r9
	mul	q0
	mov	8(up,n,8), %rbx
	mov	$0, R32(%r11)
	add	%rax, %rbx
	mov	16(mp,n,8), %rax
	adc	%rdx, %r11
	add	%r9, %rbx
	adc	$0, %r11
	mov	16(up,n,8), %rbp
	mov	%rbx, 8(up,n,8)
	imul	u0inv, %rbx		C next q limb
C	jmp	L(tp3)

C Inner loop, same body as the other variants, entered from the top.
	ALIGNx
L(tp3):	mul	q0
	add	%rax, %rbp
	mov	$0, R32(%r9)
	mov	-16(mp,i,8), %rax
	adc	%rdx, %r9
	mul	q0
	add	%r11, %rbp
	mov	$0, R32(%r11)
	mov	-16(up,i,8), %r10
	adc	$0, %r9
	add	%rax, %r10
	mov	-8(mp,i,8), %rax
	adc	%rdx, %r11
	mov	%rbp, -24(up,i,8)
	add	%r9, %r10
	adc	$0, %r11
	mov	-8(up,i,8), %rbp
	mul	q0
	add	%rax, %rbp
	mov	$0, R32(%r9)
	mov	(mp,i,8), %rax
	adc	%rdx, %r9
	mov	%r10, -16(up,i,8)
	add	%r11, %rbp
	adc	$0, %r9
	mul	q0
	mov	(up,i,8), %r10
	mov	$0, R32(%r11)
	add	%rax, %r10
	mov	8(mp,i,8), %rax
	adc	%rdx, %r11
	mov	%rbp, -8(up,i,8)
	add	%r9, %r10
	adc	$0, %r11
	mov	8(up,i,8), %rbp
	mov	%r10, (up,i,8)
	add	$4, i
	jnc	L(tp3)

C Wind-down, as in the other variants; falls through into L(cj) when done.
L(ed3):	mul	q0
	add	%rax, %rbp
	adc	$0, %rdx
	add	%r11, %rbp
	adc	$0, %rdx
	mov	%rbp, I(-8(up),-24(up,i,8))
	mov	%rdx, (up,n,8)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp3)
C	jmp	L(cj)

C Common tail for the looping variants: call mpn_add_n with the arguments
C named in the comments below.  n is still negative on entry here, so the
C lea instructions step the pointers downwards; neg then makes n positive.
L(cj):
IFSTD(`	lea	(up,n,8), up		C param 2: up
	lea	(up,n,8), %rdx		C param 3: up - n
	neg	R32(n)		')	C param 4: n

IFDOS(`	lea	(up,n,8), %rdx		C param 2: up
	lea	(%rdx,n,8), %r8		C param 3: up - n
	neg	R32(n)
	mov	n, %r9			C param 4: n
	mov	rp, %rcx	')	C param 1: rp

C Stack adjustment around the call.  Presumably this keeps rsp 16-byte
C aligned at the call, with the DOS64 amount also covering the 32-byte
C shadow space.  ASSERT is defined outside this file; confirm how it
C evaluates the test before relying on the nz condition.
IFSTD(`	sub	$8, %rsp	')
IFDOS(`	sub	$40, %rsp	')
	ASSERT(nz, `test $15, %rsp')
	CALL(	mpn_add_n)
IFSTD(`	add	$8, %rsp	')
IFDOS(`	add	$40, %rsp	')

C Shared exit: restore the saved registers in reverse push order.  rax is
C not modified here, so it carries the return value set by the path taken.
L(ret):	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret

L(bx0):	test	$2, R8(n)
	jnz	L(b2)

C Variant for n = 0 mod 4.  There is no small-n special case on this path.
C Feed-in: handle the first three limbs, compute the next q limb in rbx, and
C enter the unrolled loop at L(e0).
L(b0):
L(otp0):lea	2(n), i
	mov	(mp,n,8), %rax
	mul	q0
	mov	$0, R32(%r11)
	mov	(up,n,8), %r10
	add	%rax, %r10
	mov	8(mp,n,8), %rax
	adc	%rdx, %r11
	mov	8(up,n,8), %rbx
	mul	q0
	add	%rax, %rbx
	mov	$0, R32(%r9)
	mov	16(mp,n,8), %rax
	adc	%rdx, %r9
	add	%r11, %rbx
	adc	$0, %r9
	mul	q0
	mov	16(up,n,8), %r10
	mov	$0, R32(%r11)
	add	%rax, %r10
	mov	24(mp,n,8), %rax
	adc	%rdx, %r11
	mov	%rbx, 8(up,n,8)
	imul	u0inv, %rbx		C next q limb
	jmp	L(e0)

C Inner loop, same body as the other variants, entered at L(e0).
	ALIGNx
L(tp0):	mul	q0
	add	%rax, %rbp
	mov	$0, R32(%r9)
	mov	-16(mp,i,8), %rax
	adc	%rdx, %r9
	mul	q0
	add	%r11, %rbp
	mov	$0, R32(%r11)
	mov	-16(up,i,8), %r10
	adc	$0, %r9
	add	%rax, %r10
	mov	-8(mp,i,8), %rax
	adc	%rdx, %r11
	mov	%rbp, -24(up,i,8)
	add	%r9, %r10
	adc	$0, %r11
	mov	-8(up,i,8), %rbp
	mul	q0
	add	%rax, %rbp
	mov	$0, R32(%r9)
	mov	(mp,i,8), %rax
	adc	%rdx, %r9
	mov	%r10, -16(up,i,8)
	add	%r11, %rbp
	adc	$0, %r9
	mul	q0
	mov	(up,i,8), %r10
	mov	$0, R32(%r11)
	add	%rax, %r10
	mov	8(mp,i,8), %rax
	adc	%rdx, %r11
	mov	%rbp, -8(up,i,8)
L(e0):	add	%r9, %r10
	adc	$0, %r11
	mov	8(up,i,8), %rbp
	mov	%r10, (up,i,8)
	add	$4, i
	jnc	L(tp0)

C Wind-down, as in the other variants.
L(ed0):	mul	q0
	add	%rax, %rbp
	adc	$0, %rdx
	add	%r11, %rbp
	adc	$0, %rdx
	mov	%rbp, I(-8(up),-24(up,i,8))
	mov	%rdx, (up,n,8)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp0)
	jmp	L(cj)

C Variant for n = 2 mod 4.  n = 2 itself goes to the dedicated code at L(n2).
L(b2):	cmp	$-2, R32(n)
	jz	L(n2)

C Feed-in: handle the first three limbs, compute the next q limb in rbx, and
C enter the unrolled loop at L(e2).
L(otp2):lea	4(n), i
	mov	(mp,n,8), %rax
	mul	q0
	mov	(up,n,8), %r10
	mov	$0, R32(%r11)
	add	%rax, %r10
	mov	8(mp,n,8), %rax
	adc	%rdx, %r11
	mov	8(up,n,8), %rbx
	mul	q0
	add	%rax, %rbx
	mov	$0, R32(%r9)
	mov	16(mp,n,8), %rax
	adc	%rdx, %r9
	mul	q0
	add	%r11, %rbx
	mov	$0, R32(%r11)
	mov	16(up,n,8), %r10
	adc	$0, %r9
	add	%rax, %r10
	mov	24(mp,n,8), %rax
	adc	%rdx, %r11
	mov	%rbx, 8(up,n,8)
	imul	u0inv, %rbx		C next q limb
	jmp	L(e2)

C Inner loop, same body as the other variants, entered at L(e2).
	ALIGNx
L(tp2):	mul	q0
	add	%rax, %rbp
	mov	$0, R32(%r9)
	mov	-16(mp,i,8), %rax
	adc	%rdx, %r9
	mul	q0
	add	%r11, %rbp
	mov	$0, R32(%r11)
	mov	-16(up,i,8), %r10
	adc	$0, %r9
	add	%rax, %r10
	mov	-8(mp,i,8), %rax
	adc	%rdx, %r11
	mov	%rbp, -24(up,i,8)
L(e2):	add	%r9, %r10
	adc	$0, %r11
	mov	-8(up,i,8), %rbp
	mul	q0
	add	%rax, %rbp
	mov	$0, R32(%r9)
	mov	(mp,i,8), %rax
	adc	%rdx, %r9
	mov	%r10, -16(up,i,8)
	add	%r11, %rbp
	adc	$0, %r9
	mul	q0
	mov	(up,i,8), %r10
	mov	$0, R32(%r11)
	add	%rax, %r10
	mov	8(mp,i,8), %rax
	adc	%rdx, %r11
	mov	%rbp, -8(up,i,8)
	add	%r9, %r10
	adc	$0, %r11
	mov	8(up,i,8), %rbp
	mov	%r10, (up,i,8)
	add	$4, i
	jnc	L(tp2)

C Wind-down, as in the other variants.
L(ed2):	mul	q0
	add	%rax, %rbp
	adc	$0, %rdx
	add	%r11, %rbp
	adc	$0, %rdx
	mov	%rbp, I(-8(up),-24(up,i,8))
	mov	%rdx, (up,n,8)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp2)
	jmp	L(cj)

C n = 1.  up was advanced by one limb in the prologue, so -8(up) is the low
C input limb and (up) the high one.  The low limb of the sum is only used
C for its carry; the high product limb plus the high input limb plus that
C carry goes to rp[0], and the carry out of that add is returned in rax.
L(n1):	mov	(mp_param), %rax
	mul	q0
	add	-8(up), %rax
	adc	(up), %rdx
	mov	%rdx, (rp)
	mov	$0, R32(%rax)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)

C n = 2.  Both outer iterations are written out in line with the partial
C results kept in registers.  The second q0 is formed from r10 once the
C first pass has finished it.  The result goes to rp[0] and rp[1] and the
C final carry is returned in rax.
L(n2):	mov	(mp_param), %rax
	mov	-16(up), %rbp
	mul	q0
	add	%rax, %rbp
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-8(mp), %rax
	mov	-8(up), %r10
	mul	q0
	add	%rax, %r10
	mov	%rdx, %r11
	adc	$0, %r11
	add	%r9, %r10
	adc	$0, %r11
	mov	%r10, q0
	imul	u0inv, q0		C next q0
	mov	-16(mp), %rax
	mul	q0
	add	%rax, %r10
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-8(mp), %rax
	mov	(up), %r14
	mul	q0
	add	%rax, %r14
	adc	$0, %rdx
	add	%r9, %r14
	adc	$0, %rdx
	xor	R32(%rax), R32(%rax)
	add	%r11, %r14
	adc	8(up), %rdx
	mov	%r14, (rp)
	mov	%rdx, 8(rp)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)

C n = 3.  A compact loop runs the j outer iterations over three limbs each.
C Every pass stores the two upper result limbs, parks the carry limb in the
C lowest slot, forms the next q0 from the middle limb, and steps up.
	ALIGNx
L(n3):	mov	-24(mp), %rax
	mov	-24(up), %r10
	mul	q0
	add	%rax, %r10
	mov	-16(mp), %rax
	mov	%rdx, %r11
	adc	$0, %r11
	mov	-16(up), %rbp
	mul	q0
	add	%rax, %rbp
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-8(mp), %rax
	add	%r11, %rbp
	mov	-8(up), %r10
	adc	$0, %r9
	mul	q0
	mov	%rbp, q0
	imul	u0inv, q0		C next q0
	add	%rax, %r10
	mov	%rdx, %r11
	adc	$0, %r11
	mov	%rbp, -16(up)
	add	%r9, %r10
	adc	$0, %r11
	mov	%r10, -8(up)
	mov	%r11, -24(up)		C up[0]
	lea	8(up), up		C up++
	dec	j
	jnz	L(n3)

C Final add for n = 3, done in line instead of calling mpn_add_n.  Two of
C the parked carry limbs are reloaded from memory and the third is still in
C r11; they are added to rbp, r10 and the limb at -8(up).  The sums go to
C rp[0], rp[1] and rp[2], and the carry out is returned in rax.
	mov	-48(up), %rdx
	mov	-40(up), %rbx
	xor	R32(%rax), R32(%rax)
	add	%rbp, %rdx
	adc	%r10, %rbx
	adc	-8(up), %r11
	mov	%rdx, (rp)
	mov	%rbx, 8(rp)
	mov	%r11, 16(rp)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)
EPILOGUE()
ASM_END()