dnl  redc_1.asm revision 1.1.1.1
dnl  X86-64 mpn_redc_1 optimised for Intel Sandy Bridge and Ivy Bridge.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 ?
C AMD K10	 ?
C AMD bull	 ?
C AMD pile	 ?
C AMD steam	 ?
C AMD bobcat	 ?
C AMD jaguar	 ?
C Intel P4	 ?
C Intel core	 ?
C Intel NHM	 ?
C Intel SBR	 3.24
C Intel IBR	 3.04
C Intel HWL	 ?
C Intel BWL	 ?
C Intel atom	 ?
C VIA nano	 ?

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.

C TODO
C  * Micro-optimise, none performed thus far.
C  * Consider inlining mpn_add_n.
C  * Single basecases out before the pushes.

C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
define(`I',`$1')

C Argument registers.  The trailing note on each line is the location of the
C same argument under the DOS64 ABI.
define(`rp',          `%rdi')   C rcx
define(`up',          `%rsi')   C rdx
define(`mp_param',    `%rdx')   C r8
define(`n',           `%rcx')   C r9
define(`u0inv',       `%r8')    C stack

C Working registers, all callee-saved and pushed in the prologue.
define(`i',           `%r14')
define(`j',           `%r15')
define(`mp',          `%r12')
define(`q0',          `%r13')

C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

define(`ALIGNx', `ALIGN(16)')

C mpn_redc_1 -- arguments in order: rp, up, mp_param, n, u0inv.
C
C Register roles once the setup code below has run:
C   q0   current quotient limb; first one is the limb at up times u0inv
C   j    outer loop counter, starts at the limb count
C   n    negated limb count, used as a negative index
C   mp   mp_param + 8*n + 8
C   up   advanced by 8*n + 8 bytes, then by 8 more after every outer pass
C   i    inner loop index; stepped by 4 until the add carries out
C   rbx  next quotient limb, computed near the start of each outer pass
C
C The two low bits of the negated count select one of four outer loops
C (otp0..otp3).  Their 4-way unrolled inner loops are the same instruction
C stream entered at different points (e0..e3).  Counts of 1 and 2 are done by
C straight-line code at n1 and n2, a count of 3 by a dedicated loop at n3.
C Every path except n1 and n2 ends at cj, which calls mpn_add_n.
C
C NOTE(review): a count of 0 would enter the otp0 loop; callers presumably
C guarantee at least one limb -- confirm.

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_redc_1)
	FUNC_ENTRY(4)
C DOS64 only: the fifth argument u0inv is on the stack, load it into r8.
IFDOS(`	mov	56(%rsp), %r8	')
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	(up), q0
	mov	n, j			C outer loop induction var
	lea	8(mp_param,n,8), mp
	lea	8(up,n,8), up
	neg	n
	imul	u0inv, q0		C first iteration q0

C n is negated here, but bit 0 is still the parity of the limb count.
	test	$1, R8(n)
	jz	L(bx0)

C Odd count.  Bit 1 of the negated count clear means count = 3 mod 4.
L(bx1):	test	$2, R8(n)
	jz	L(b3)

C Count = 1 mod 4; a count of exactly 1 goes to the straight-line n1 code.
L(b1):	cmp	$-1, R32(n)
	jz	L(n1)

C Outer loop, count = 1 mod 4.
L(otp1):lea	1(n), i
	mov	-8(mp,n,8), %rax
	mul	q0
	mov	-8(up,n,8), %r10
	mov	%rdx, %r11
	add	%rax, %r10
	mov	(mp,n,8), %rax
	adc	$0, %r11
	mul	q0
	mov	%rdx, %r9
	mov	(up,n,8), %rbx
	add	%rax, %rbx
	adc	$0, %r9
	mov	(mp,i,8), %rax
	mul	q0
	mov	(up,i,8), %r10
	add	%r11, %rbx
	mov	%rbx, -8(up,i,8)	C next low remainder limb
	adc	$0, %r9
	imul	u0inv, %rbx		C next q limb
	jmp	L(e1)

	ALIGNx
L(tp1):	mul	q0
	mov	-16(up,i,8), %r10
	add	%r11, %rbp
	mov	%rdx, %r11
	adc	$0, %r9
	mov	%rbp, -24(up,i,8)
	add	%rax, %r10
	mov	-8(mp,i,8), %rax
	adc	$0, %r11
	mul	q0
	add	%r9, %r10
	mov	%rdx, %r9
	mov	-8(up,i,8), %rbp
	adc	$0, %r11
	mov	%r10, -16(up,i,8)
	add	%rax, %rbp
	adc	$0, %r9
	mov	(mp,i,8), %rax
	mul	q0
	mov	(up,i,8), %r10
	add	%r11, %rbp
	mov	%rbp, -8(up,i,8)
	adc	$0, %r9
L(e1):	mov	%rdx, %r11
	add	%rax, %r10
	mov	8(mp,i,8), %rax
	adc	$0, %r11
	mul	q0
	mov	8(up,i,8), %rbp
	add	%r9, %r10
	mov	%rdx, %r9
	mov	%r10, (up,i,8)
	adc	$0, %r11
	add	%rax, %rbp
	adc	$0, %r9
	mov	16(mp,i,8), %rax
	add	$4, i
	jnc	L(tp1)

C Wind-down: last product, store the top carry limb in the slot of the
C consumed low limb, switch to the precomputed q limb and step up.
L(ed1):	mul	q0
	mov	I(-16(up),-16(up,i,8)), %r10
	add	%r11, %rbp
	adc	$0, %r9
	mov	%rbp, I(-24(up),-24(up,i,8))
	add	%rax, %r10
	adc	$0, %rdx
	add	%r9, %r10
	adc	$0, %rdx
	mov	%r10, I(-16(up),-16(up,i,8))
	mov	%rdx, -8(up,n,8)	C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp1)
	jmp	L(cj)

C Count = 3 mod 4; a count of exactly 3 goes to the dedicated n3 loop.
L(b3):	cmp	$-3, R32(n)
	jz	L(n3)

C Outer loop, count = 3 mod 4.
L(otp3):lea	3(n), i
	mov	-8(mp,n,8), %rax
	mul	q0
	mov	-8(up,n,8), %r10
	mov	%rdx, %r11
	add	%rax, %r10
	mov	(mp,n,8), %rax
	adc	$0, %r11
	mul	q0
	mov	(up,n,8), %rbx
	mov	%rdx, %r9
	add	%rax, %rbx
	adc	$0, %r9
	mov	8(mp,n,8), %rax
	mul	q0
	mov	8(up,n,8), %r10
	add	%r11, %rbx
	mov	%rdx, %r11
	adc	$0, %r9
	mov	%rbx, (up,n,8)
	imul	u0inv, %rbx		C next q limb
	jmp	L(e3)

	ALIGNx
L(tp3):	mul	q0
	mov	-16(up,i,8), %r10
	add	%r11, %rbp
	mov	%rdx, %r11
	adc	$0, %r9
	mov	%rbp, -24(up,i,8)
L(e3):	add	%rax, %r10
	mov	-8(mp,i,8), %rax
	adc	$0, %r11
	mul	q0
	add	%r9, %r10
	mov	%rdx, %r9
	mov	-8(up,i,8), %rbp
	adc	$0, %r11
	mov	%r10, -16(up,i,8)
	add	%rax, %rbp
	adc	$0, %r9
	mov	(mp,i,8), %rax
	mul	q0
	mov	(up,i,8), %r10
	add	%r11, %rbp
	mov	%rbp, -8(up,i,8)
	adc	$0, %r9
	mov	%rdx, %r11
	add	%rax, %r10
	mov	8(mp,i,8), %rax
	adc	$0, %r11
	mul	q0
	mov	8(up,i,8), %rbp
	add	%r9, %r10
	mov	%rdx, %r9
	mov	%r10, (up,i,8)
	adc	$0, %r11
	add	%rax, %rbp
	adc	$0, %r9
	mov	16(mp,i,8), %rax
	add	$4, i
	jnc	L(tp3)

L(ed3):	mul	q0
	mov	I(-16(up),-16(up,i,8)), %r10
	add	%r11, %rbp
	adc	$0, %r9
	mov	%rbp, I(-24(up),-24(up,i,8))
	add	%rax, %r10
	adc	$0, %rdx
	add	%r9, %r10
	adc	$0, %rdx
	mov	%r10, I(-16(up),-16(up,i,8))
	mov	%rdx, -8(up,n,8)	C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp3)
C	jmp	L(cj)

C Common tail: load the argument registers for mpn_add_n as noted on each
C line, then call it.  n is still negative on entry, so the scaled index
C moves the pointers backwards; the 32-bit neg makes the count positive.
L(cj):
IFSTD(`	lea	-8(up,n,8), up		C param 2: up
	lea	(up,n,8), %rdx		C param 3: up - n
	neg	R32(n)		')	C param 4: n

IFDOS(`	lea	-8(up,n,8), %rdx	C param 2: up
	lea	(%rdx,n,8), %r8		C param 3: up - n
	neg	R32(n)
	mov	n, %r9			C param 4: n
	mov	rp, %rcx	')	C param 1: rp

C STD64: the return address plus six pushes leave rsp at 8 mod 16, so 8 more
C bytes are dropped around the call.
C NOTE(review): the DOS64 figure of 40 presumably is 32 bytes of shadow space
C plus 8 for alignment, and depends on what FUNC_ENTRY pushes -- confirm.
IFSTD(`	sub	$8, %rsp	')
IFDOS(`	sub	$40, %rsp	')
	ASSERT(nz, `test $15, %rsp')
	CALL(	mpn_add_n)
IFSTD(`	add	$8, %rsp	')
IFDOS(`	add	$40, %rsp	')

C Restore the callee-saved registers in reverse push order.  None of the
C instructions visible here writes rax between this label and ret.
L(ret):	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret

C Even count.  Bit 1 of the negated count set means count = 2 mod 4.
L(bx0):	test	$2, R8(n)
	jnz	L(b2)

C Outer loop, count = 0 mod 4.
L(b0):
L(otp0):lea	(n), i
	mov	-8(mp,n,8), %rax
	mul	q0
	mov	%rdx, %r9
	mov	-8(up,n,8), %rbp
	add	%rax, %rbp
	adc	$0, %r9
	mov	(mp,n,8), %rax
	mul	q0
	mov	(up,n,8), %rbx
	mov	%rdx, %r11
	add	%rax, %rbx
	mov	8(mp,n,8), %rax
	adc	$0, %r11
	mul	q0
	mov	8(up,n,8), %rbp
	add	%r9, %rbx
	mov	%rdx, %r9
	mov	%rbx, (up,n,8)
	adc	$0, %r11
	imul	u0inv, %rbx		C next q limb
	jmp	L(e0)

	ALIGNx
L(tp0):	mul	q0
	mov	-16(up,i,8), %r10
	add	%r11, %rbp
	mov	%rdx, %r11
	adc	$0, %r9
	mov	%rbp, -24(up,i,8)
	add	%rax, %r10
	mov	-8(mp,i,8), %rax
	adc	$0, %r11
	mul	q0
	add	%r9, %r10
	mov	%rdx, %r9
	mov	-8(up,i,8), %rbp
	adc	$0, %r11
	mov	%r10, -16(up,i,8)
	add	%rax, %rbp
	adc	$0, %r9
	mov	(mp,i,8), %rax
	mul	q0
	mov	(up,i,8), %r10
	add	%r11, %rbp
	mov	%rbp, -8(up,i,8)
	adc	$0, %r9
	mov	%rdx, %r11
	add	%rax, %r10
	mov	8(mp,i,8), %rax
	adc	$0, %r11
	mul	q0
	mov	8(up,i,8), %rbp
	add	%r9, %r10
	mov	%rdx, %r9
	mov	%r10, (up,i,8)
	adc	$0, %r11
L(e0):	add	%rax, %rbp
	adc	$0, %r9
	mov	16(mp,i,8), %rax
	add	$4, i
	jnc	L(tp0)

L(ed0):	mul	q0
	mov	I(-16(up),-16(up,i,8)), %r10
	add	%r11, %rbp
	adc	$0, %r9
	mov	%rbp, I(-24(up),-24(up,i,8))
	add	%rax, %r10
	adc	$0, %rdx
	add	%r9, %r10
	adc	$0, %rdx
	mov	%r10, I(-16(up),-16(up,i,8))
	mov	%rdx, -8(up,n,8)	C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp0)
	jmp	L(cj)

C Count = 2 mod 4; a count of exactly 2 goes to the straight-line n2 code.
L(b2):	cmp	$-2, R32(n)
	jz	L(n2)

C Outer loop, count = 2 mod 4.
L(otp2):lea	2(n), i
	mov	-8(mp,n,8), %rax
	mul	q0
	mov	-8(up,n,8), %rbp
	mov	%rdx, %r9
	add	%rax, %rbp
	adc	$0, %r9
	mov	(mp,n,8), %rax
	mul	q0
	mov	(up,n,8), %rbx
	mov	%rdx, %r11
	add	%rax, %rbx
	mov	8(mp,n,8), %rax
	adc	$0, %r11
	mul	q0
	add	%r9, %rbx
	mov	%rdx, %r9
	mov	8(up,n,8), %rbp
	adc	$0, %r11
	mov	%rbx, (up,n,8)
	imul	u0inv, %rbx		C next q limb
	jmp	L(e2)

	ALIGNx
L(tp2):	mul	q0
	mov	-16(up,i,8), %r10
	add	%r11, %rbp
	mov	%rdx, %r11
	adc	$0, %r9
	mov	%rbp, -24(up,i,8)
	add	%rax, %r10
	mov	-8(mp,i,8), %rax
	adc	$0, %r11
	mul	q0
	add	%r9, %r10
	mov	%rdx, %r9
	mov	-8(up,i,8), %rbp
	adc	$0, %r11
	mov	%r10, -16(up,i,8)
L(e2):	add	%rax, %rbp
	adc	$0, %r9
	mov	(mp,i,8), %rax
	mul	q0
	mov	(up,i,8), %r10
	add	%r11, %rbp
	mov	%rbp, -8(up,i,8)
	adc	$0, %r9
	mov	%rdx, %r11
	add	%rax, %r10
	mov	8(mp,i,8), %rax
	adc	$0, %r11
	mul	q0
	mov	8(up,i,8), %rbp
	add	%r9, %r10
	mov	%rdx, %r9
	mov	%r10, (up,i,8)
	adc	$0, %r11
	add	%rax, %rbp
	adc	$0, %r9
	mov	16(mp,i,8), %rax
	add	$4, i
	jnc	L(tp2)

L(ed2):	mul	q0
	mov	I(-16(up),-16(up,i,8)), %r10
	add	%r11, %rbp
	adc	$0, %r9
	mov	%rbp, I(-24(up),-24(up,i,8))
	add	%rax, %r10
	adc	$0, %rdx
	add	%r9, %r10
	adc	$0, %rdx
	mov	%r10, I(-16(up),-16(up,i,8))
	mov	%rdx, -8(up,n,8)	C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp2)
	jmp	L(cj)

C Count = 1.  mp_param is read before mul overwrites rdx.  The result limb is
C stored at rp and the carry is left in eax; mov is used to zero eax because
C it leaves the carry flag intact for the following adc.
L(n1):	mov	(mp_param), %rax
	mul	q0
	add	-16(up), %rax
	adc	-8(up), %rdx
	mov	%rdx, (rp)
	mov	$0, R32(%rax)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)

C Count = 2.  Both reduction passes are unrolled; the two result limbs are
C stored at rp and the final carry is left in eax.  r14 is free as scratch
C here because no inner loop index is needed on this path.
L(n2):	mov	(mp_param), %rax
	mov	-24(up), %rbp
	mul	q0
	add	%rax, %rbp
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-16(mp), %rax
	mov	-16(up), %r10
	mul	q0
	add	%rax, %r10
	mov	%rdx, %r11
	adc	$0, %r11
	add	%r9, %r10
	adc	$0, %r11
	mov	%r10, q0
	imul	u0inv, q0		C next q0
	mov	-24(mp), %rax
	mul	q0
	add	%rax, %r10
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-16(mp), %rax
	mov	-8(up), %r14
	mul	q0
	add	%rax, %r14
	adc	$0, %rdx
	add	%r9, %r14
	adc	$0, %rdx
	xor	R32(%rax), R32(%rax)
	add	%r11, %r14
	adc	(up), %rdx
	mov	%r14, (rp)
	mov	%rdx, 8(rp)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)

C Count = 3.  One fully unrolled reduction pass per iteration, repeated j
C times, then the common mpn_add_n tail.
	ALIGNx
L(n3):	mov	-32(mp), %rax
	mov	-32(up), %r10
	mul	q0
	add	%rax, %r10
	mov	-24(mp), %rax
	mov	%rdx, %r11
	adc	$0, %r11
	mov	-24(up), %rbp
	mul	q0
	add	%rax, %rbp
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-16(mp), %rax
	add	%r11, %rbp
	mov	-16(up), %r10
	adc	$0, %r9
	mul	q0
	mov	%rbp, q0
	imul	u0inv, q0		C next q0
	add	%rax, %r10
	mov	%rdx, %r11
	adc	$0, %r11
	mov	%rbp, -24(up)
	add	%r9, %r10
	adc	$0, %r11
	mov	%r10, -16(up)
	mov	%r11, -32(up)		C up[0]
	lea	8(up), up		C up++
	dec	j
	jnz	L(n3)
	jmp	L(cj)
EPILOGUE()
ASM_END()