dnl  X86-64 mpn_sqr_basecase optimised for Intel Nehalem/Westmere.
dnl  It also seems good for Conroe/Wolfdale.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	cycles/limb	mul_2		addmul_2	sqr_diag_addlsh1
C AMD K8,K9
C AMD K10
C AMD bull
C AMD pile
C AMD steam
C AMD bobcat
C AMD jaguar
C Intel P4
C Intel core	4.9		4.18-4.25	3.87
C Intel NHM	3.8		4.06-4.2	3.5
C Intel SBR
C Intel IBR
C Intel HWL
C Intel BWL
C Intel atom
C VIA nano

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.

C Code structure:
C
C
C	m_2(0m4)	m_2(2m4)	m_2(1m4)	m_2(3m4)
C	   |		   |		   |		   |
C	   |		   |		   |		   |
C	   |		   |		   |		   |
C	  \|/		  \|/		  \|/		  \|/
C	      ____________		      ____________
C	     /		  \		     /		  \
C	    \|/		   \		    \|/		   \
C	 am_2(3m4)	am_2(1m4)	 am_2(0m4)	am_2(2m4)
C	    \		  /|\		    \		  /|\
C	     \____________/		     \____________/
C		   \				   /
C		    \				  /
C		     \				 /
C		 tail(0m2)		     tail(1m2)
C			\			/
C			 \		       /
C			sqr_diag_addlsh1

C TODO
C  * Tune.  None done so far.
C  * Currently 2761 bytes, making it smaller would be nice.
C  * Consider using a jumptab-based entry sequence.  One might even use a
C    mask-less sequence, if the table is large enough to support tuneup's
C    needs.  The code would be, using non-PIC code,
C	lea tab(%rip),%rax; jmp *(n,%rax)
C    or, using PIC code,
C	lea tab(%rip),%rax; lea (%rip),%rbx; add (n,%rax),%rbx; jmp *%rbx
C    The table entries would be Ln1,Ln2,Ln3,Lm0,Lm1,Lm2,Lm3,.. with the last
C    four entries repeated a safe number of times.
C  * Consider expanding feed-in code in order to avoid zeroing registers.
C  * Zero consistently with xor.
C  * Check if using "lea (reg),reg" should be done in more places; we have some
C    explicit "mov %rax,reg" now.
C  * Try zeroing with xor in m2 loops.
C  * Try re-rolling the m2 loops to avoid the current 9 insn code duplication
C    between loop header and wind-down code.
C  * Consider adc reg,reg instead of adc $0,reg in m2 loops.  This saves a
C    byte.
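C In C terms, the file computes U^2 for U = {up,n} in two stages (a sketch
C only, shown with one-limb passes; the real code inlines two-limb mul_2
C and addmul_2 passes instead, as per the structure diagram above):
C
C	/* off-diagonal triangle T = sum(i<j) u[i]*u[j]*B^(i+j), built
C	   in place in rp[1..2n-2] */
C	rp[n] = mpn_mul_1 (rp + 1, up + 1, n - 1, up[0]);
C	for (i = 1; i < n - 1; i++)
C	  rp[n+i] = mpn_addmul_1 (rp + 2*i + 1, up + i + 1, n - 1 - i, up[i]);
C	/* then U^2 = 2*T + sum(i) u[i]^2*B^(2i); see sqr_diag_addlsh1 below */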
C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
define(`I',`$1')

C Define this to $1 to use late loop index variable as zero, $2 to use an
C explicit $0.
define(`Z',`$1')

define(`rp',	  `%rdi')
define(`up',	  `%rsi')
define(`n_param', `%rdx')

define(`n',	  `%r8')

define(`v0',	  `%r10')
define(`v1',	  `%r11')
define(`w0',	  `%rbx')
define(`w1',	  `%rcx')
define(`w2',	  `%rbp')
define(`w3',	  `%r9')
define(`i',	  `%r13')

define(`X0',	  `%r12')
define(`X1',	  `%r14')

C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

define(`ALIGNx', `ALIGN(16)')

define(`N', 85)
ifdef(`N',,`define(`N',0)')
define(`MOV', `ifelse(eval(N & $3),0,`mov	$1, $2',`lea	($1), $2')')

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_sqr_basecase)
	FUNC_ENTRY(3)

	cmp	$4, n_param
	jl	L(small)

	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14

	mov	(up), v0
	mov	8(up), %rax
	mov	%rax, v1

	mov	$1, R32(n)
	sub	n_param, n		C n = -n_param+1
	push	n

	lea	(up,n_param,8), up
	lea	(rp,n_param,8), rp

	mul	v0

	test	$1, R8(n)
	jnz	L(bx1)

L(bx0):	test	$2, R8(n)
	mov	%rax, (rp,n,8)
	jnz	L(b10)

L(b00):	lea	(n), i			C n = 5, 9, ...
	mov	%rdx, w1		C FIXME: Use lea?
	xor	R32(w2), R32(w2)
	jmp	L(m2e0)

L(b10):	lea	2(n), i			C n = 7, 11, ...
	mov	8(up,n,8), %rax
	mov	%rdx, w3		C FIXME: Use lea?
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	jmp	L(m2e2)

L(bx1):	test	$2, R8(n)
	mov	%rax, (rp,n,8)
	jz	L(b11)

L(b01):	lea	1(n), i			C n = 6, 10, ...
	mov	%rdx, w0		C FIXME: Use lea?
	xor	R32(w1), R32(w1)
	jmp	L(m2e1)

L(b11):	lea	-1(n), i		C n = 4, 8, 12, ...
	mov	%rdx, w2		C FIXME: Use lea?
	xor	R32(w3), R32(w3)
	jmp	L(m2e3)


	ALIGNx
L(m2top1):
	mul	v0
	add	%rax, w3
	mov	-8(up,i,8), %rax
	mov	w3, -8(rp,i,8)
	adc	%rdx, w0
	adc	$0, R32(w1)
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
L(m2e1):mov	$0, R32(w2)
	mov	(up,i,8), %rax
	mul	v0
	add	%rax, w0
	mov	w0, (rp,i,8)
	adc	%rdx, w1
	mov	(up,i,8), %rax
	adc	$0, R32(w2)
	mul	v1
	add	%rax, w1
	adc	%rdx, w2
	mov	8(up,i,8), %rax
	mul	v0
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	adc	$0, R32(w3)
	mov	8(up,i,8), %rax
	mul	v1
	add	%rax, w2
	mov	w1, 8(rp,i,8)
	adc	%rdx, w3
	mov	$0, R32(w0)
	mov	16(up,i,8), %rax
	mul	v0
	add	%rax, w2
	mov	16(up,i,8), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
	mul	v1
	mov	$0, R32(w1)
	add	%rax, w3
	mov	24(up,i,8), %rax
	mov	w2, 16(rp,i,8)
	adc	%rdx, w0
	add	$4, i
	js	L(m2top1)

	mul	v0
	add	%rax, w3
	mov	I(-8(up),-8(up,i,8)), %rax
	mov	w3, I(-8(rp),-8(rp,i,8))
	adc	%rdx, w0
	adc	R32(w1), R32(w1)
	mul	v1
	add	w0, %rax
	adc	w1, %rdx
	mov	%rax, I((rp),(rp,i,8))
	mov	%rdx, I(8(rp),8(rp,i,8))

	lea	16(rp), rp
	add	$2, n			C decrease |n|
	jmp	L(am2o3)

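C The mul_2 loop below is the same 4-way unrolled body as L(m2top1) above,
C but with its entry label at a different phase; the four copies (entries
C L(m2e0)..L(m2e3)) let the feed-in code enter the unrolled loop at the
C phase matching n mod 4.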
	ALIGNx
L(m2top3):
	mul	v0
	add	%rax, w3
	mov	-8(up,i,8), %rax
	mov	w3, -8(rp,i,8)
	adc	%rdx, w0
	adc	$0, R32(w1)
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	$0, R32(w2)
	mov	(up,i,8), %rax
	mul	v0
	add	%rax, w0
	mov	w0, (rp,i,8)
	adc	%rdx, w1
	mov	(up,i,8), %rax
	adc	$0, R32(w2)
	mul	v1
	add	%rax, w1
	adc	%rdx, w2
	mov	8(up,i,8), %rax
	mul	v0
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	adc	$0, R32(w3)
	mov	8(up,i,8), %rax
	mul	v1
	add	%rax, w2
	mov	w1, 8(rp,i,8)
	adc	%rdx, w3
L(m2e3):mov	$0, R32(w0)
	mov	16(up,i,8), %rax
	mul	v0
	add	%rax, w2
	mov	16(up,i,8), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
	mul	v1
	mov	$0, R32(w1)
	add	%rax, w3
	mov	24(up,i,8), %rax
	mov	w2, 16(rp,i,8)
	adc	%rdx, w0
	add	$4, i
	js	L(m2top3)

	mul	v0
	add	%rax, w3
	mov	I(-8(up),-8(up,i,8)), %rax
	mov	w3, I(-8(rp),-8(rp,i,8))
	adc	%rdx, w0
	adc	R32(w1), R32(w1)
	mul	v1
	add	w0, %rax
	adc	w1, %rdx
	mov	%rax, I((rp),(rp,i,8))
	mov	%rdx, I(8(rp),8(rp,i,8))

	lea	16(rp), rp
	add	$2, n			C decrease |n|
	cmp	$-1, n
	jz	L(cor1)			C jumps iff entry n = 4

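C Each am2 block below loads the next two multiplier limbs into v0 and v1
C and lets its inner loop accumulate {up,k} x {v0,v1} into the partial
C triangle at rp, i.e. per block roughly (cf. mpn_addmul_2)
C
C	rp[k+1] = mpn_addmul_2 (rp, up, k, vp);		/* vp = {v0,v1} */
C
C with the rp and up offsets, and n, advanced by two limbs per block.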
L(am2o1):
	mov	-8(up,n,8), v0
	mov	(up,n,8), %rax
	mov	%rax, v1
	lea	1(n), i
	mul	v0
	mov	%rax, X1
	MOV(	%rdx, X0, 128)
	mov	(rp,n,8), w1
	xor	R32(w2), R32(w2)
	mov	8(up,n,8), %rax
	xor	R32(w3), R32(w3)
	jmp	L(lo1)

	ALIGNx
L(am2top1):
	mul	v1
	add	w0, w1
	adc	%rax, w2
	mov	(up,i,8), %rax
	MOV(	%rdx, w3, 1)
	adc	$0, w3
L(lo1):	mul	v0
	add	w1, X1
	mov	X1, -8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 2)
	adc	$0, X1
	mov	(up,i,8), %rax
	mul	v1
	MOV(	%rdx, w0, 4)
	mov	(rp,i,8), w1
	add	w1, w2
	adc	%rax, w3
	adc	$0, w0
	mov	8(up,i,8), %rax
	mul	v0
	add	w2, X0
	adc	%rax, X1
	mov	X0, (rp,i,8)
	MOV(	%rdx, X0, 8)
	adc	$0, X0
	mov	8(up,i,8), %rax
	mov	8(rp,i,8), w2
	mul	v1
	add	w2, w3
	adc	%rax, w0
	MOV(	%rdx, w1, 16)
	adc	$0, w1
	mov	16(up,i,8), %rax
	mul	v0
	add	w3, X1
	mov	X1, 8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 32)
	mov	16(rp,i,8), w3
	adc	$0, X1
	mov	16(up,i,8), %rax
	mul	v1
	add	w3, w0
	MOV(	%rdx, w2, 64)
	adc	%rax, w1
	mov	24(up,i,8), %rax
	adc	$0, w2
	mul	v0
	add	w0, X0
	mov	X0, 16(rp,i,8)
	MOV(	%rdx, X0, 128)
	adc	%rax, X1
	mov	24(up,i,8), %rax
	mov	24(rp,i,8), w0
	adc	$0, X0
	add	$4, i
	jnc	L(am2top1)

	mul	v1
	add	w0, w1
	adc	w2, %rax
	adc	Z(i,$0), %rdx
	add	w1, X1
	adc	Z(i,$0), X0
	mov	X1, I(-8(rp),-8(rp,i,8))
	add	X0, %rax
	mov	%rax, I((rp),(rp,i,8))
	adc	Z(i,$0), %rdx
	mov	%rdx, I(8(rp),8(rp,i,8))

	lea	16(rp), rp
	add	$2, n

L(am2o3):
	mov	-8(up,n,8), v0
	mov	(up,n,8), %rax
	mov	%rax, v1
	lea	-1(n), i
	mul	v0
	mov	%rax, X1
	MOV(	%rdx, X0, 8)
	mov	(rp,n,8), w3
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	mov	8(up,n,8), %rax
	jmp	L(lo3)

	ALIGNx
L(am2top3):
	mul	v1
	add	w0, w1
	adc	%rax, w2
	mov	(up,i,8), %rax
	MOV(	%rdx, w3, 1)
	adc	$0, w3
	mul	v0
	add	w1, X1
	mov	X1, -8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 2)
	adc	$0, X1
	mov	(up,i,8), %rax
	mul	v1
	MOV(	%rdx, w0, 4)
	mov	(rp,i,8), w1
	add	w1, w2
	adc	%rax, w3
	adc	$0, w0
	mov	8(up,i,8), %rax
	mul	v0
	add	w2, X0
	adc	%rax, X1
	mov	X0, (rp,i,8)
	MOV(	%rdx, X0, 8)
	adc	$0, X0
	mov	8(up,i,8), %rax
	mov	8(rp,i,8), w2
	mul	v1
	add	w2, w3
	adc	%rax, w0
	MOV(	%rdx, w1, 16)
	adc	$0, w1
	mov	16(up,i,8), %rax
L(lo3):	mul	v0
	add	w3, X1
	mov	X1, 8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 32)
	mov	16(rp,i,8), w3
	adc	$0, X1
	mov	16(up,i,8), %rax
	mul	v1
	add	w3, w0
	MOV(	%rdx, w2, 64)
	adc	%rax, w1
	mov	24(up,i,8), %rax
	adc	$0, w2
	mul	v0
	add	w0, X0
	mov	X0, 16(rp,i,8)
	MOV(	%rdx, X0, 128)
	adc	%rax, X1
	mov	24(up,i,8), %rax
	mov	24(rp,i,8), w0
	adc	$0, X0
	add	$4, i
	jnc	L(am2top3)

	mul	v1
	add	w0, w1
	adc	w2, %rax
	adc	Z(i,$0), %rdx
	add	w1, X1
	adc	Z(i,$0), X0
	mov	X1, I(-8(rp),-8(rp,i,8))
	add	X0, %rax
	mov	%rax, I((rp),(rp,i,8))
	adc	Z(i,$0), %rdx
	mov	%rdx, I(8(rp),8(rp,i,8))

	lea	16(rp), rp
	add	$2, n
	cmp	$-1, n
	jnz	L(am2o1)

L(cor1):pop	n
	mov	%rdx, w3
	mov	-16(up), v0
	mov	-8(up), %rax
	mul	v0
	add	w3, %rax
	adc	$0, %rdx
	mov	%rax, -8(rp)
	mov	%rdx, (rp)
	jmp	L(sqr_diag_addlsh1)

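C The code below mirrors the path above for the remaining entry residues:
C L(m2e2)/L(m2e0) feed L(am2o2)/L(am2o0), and L(cor2), like L(cor1) above,
C finishes the last corner of the triangle without a full addmul_2 pass
C before reaching the diagonal code.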
	ALIGNx
L(m2top2):
L(m2e2):mul	v0
	add	%rax, w3
	mov	-8(up,i,8), %rax
	mov	w3, -8(rp,i,8)
	adc	%rdx, w0
	adc	$0, R32(w1)
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	$0, R32(w2)
	mov	(up,i,8), %rax
	mul	v0
	add	%rax, w0
	mov	w0, (rp,i,8)
	adc	%rdx, w1
	mov	(up,i,8), %rax
	adc	$0, R32(w2)
	mul	v1
	add	%rax, w1
	adc	%rdx, w2
	mov	8(up,i,8), %rax
	mul	v0
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	adc	$0, R32(w3)
	mov	8(up,i,8), %rax
	mul	v1
	add	%rax, w2
	mov	w1, 8(rp,i,8)
	adc	%rdx, w3
	mov	$0, R32(w0)
	mov	16(up,i,8), %rax
	mul	v0
	add	%rax, w2
	mov	16(up,i,8), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
	mul	v1
	mov	$0, R32(w1)
	add	%rax, w3
	mov	24(up,i,8), %rax
	mov	w2, 16(rp,i,8)
	adc	%rdx, w0
	add	$4, i
	js	L(m2top2)

	mul	v0
	add	%rax, w3
	mov	I(-8(up),-8(up,i,8)), %rax
	mov	w3, I(-8(rp),-8(rp,i,8))
	adc	%rdx, w0
	adc	R32(w1), R32(w1)
	mul	v1
	add	w0, %rax
	adc	w1, %rdx
	mov	%rax, I((rp),(rp,i,8))
	mov	%rdx, I(8(rp),8(rp,i,8))

	lea	16(rp), rp
	add	$2, n			C decrease |n|
	jmp	L(am2o0)

	ALIGNx
L(m2top0):
	mul	v0
	add	%rax, w3
	mov	-8(up,i,8), %rax
	mov	w3, -8(rp,i,8)
	adc	%rdx, w0
	adc	$0, R32(w1)
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	$0, R32(w2)
	mov	(up,i,8), %rax
	mul	v0
	add	%rax, w0
	mov	w0, (rp,i,8)
	adc	%rdx, w1
	mov	(up,i,8), %rax
	adc	$0, R32(w2)
	mul	v1
	add	%rax, w1
	adc	%rdx, w2
L(m2e0):mov	8(up,i,8), %rax
	mul	v0
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	adc	$0, R32(w3)
	mov	8(up,i,8), %rax
	mul	v1
	add	%rax, w2
	mov	w1, 8(rp,i,8)
	adc	%rdx, w3
	mov	$0, R32(w0)
	mov	16(up,i,8), %rax
	mul	v0
	add	%rax, w2
	mov	16(up,i,8), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
	mul	v1
	mov	$0, R32(w1)
	add	%rax, w3
	mov	24(up,i,8), %rax
	mov	w2, 16(rp,i,8)
	adc	%rdx, w0
	add	$4, i
	js	L(m2top0)

	mul	v0
	add	%rax, w3
	mov	I(-8(up),-8(up,i,8)), %rax
	mov	w3, I(-8(rp),-8(rp,i,8))
	adc	%rdx, w0
	adc	R32(w1), R32(w1)
	mul	v1
	add	w0, %rax
	adc	w1, %rdx
	mov	%rax, I((rp),(rp,i,8))
	mov	%rdx, I(8(rp),8(rp,i,8))

	lea	16(rp), rp
	add	$2, n			C decrease |n|
	cmp	$-2, n
	jz	L(cor2)			C jumps iff entry n = 5

L(am2o2):
	mov	-8(up,n,8), v0
	mov	(up,n,8), %rax
	mov	%rax, v1
	lea	-2(n), i
	mul	v0
	mov	%rax, X0
	MOV(	%rdx, X1, 32)
	mov	(rp,n,8), w0
	xor	R32(w1), R32(w1)
	xor	R32(w2), R32(w2)
	mov	8(up,n,8), %rax
	jmp	L(lo2)

	ALIGNx
L(am2top2):
	mul	v1
	add	w0, w1
	adc	%rax, w2
	mov	(up,i,8), %rax
	MOV(	%rdx, w3, 1)
	adc	$0, w3
	mul	v0
	add	w1, X1
	mov	X1, -8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 2)
	adc	$0, X1
	mov	(up,i,8), %rax
	mul	v1
	MOV(	%rdx, w0, 4)
	mov	(rp,i,8), w1
	add	w1, w2
	adc	%rax, w3
	adc	$0, w0
	mov	8(up,i,8), %rax
	mul	v0
	add	w2, X0
	adc	%rax, X1
	mov	X0, (rp,i,8)
	MOV(	%rdx, X0, 8)
	adc	$0, X0
	mov	8(up,i,8), %rax
	mov	8(rp,i,8), w2
	mul	v1
	add	w2, w3
	adc	%rax, w0
	MOV(	%rdx, w1, 16)
	adc	$0, w1
	mov	16(up,i,8), %rax
	mul	v0
	add	w3, X1
	mov	X1, 8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 32)
	mov	16(rp,i,8), w3
	adc	$0, X1
	mov	16(up,i,8), %rax
	mul	v1
	add	w3, w0
	MOV(	%rdx, w2, 64)
	adc	%rax, w1
	mov	24(up,i,8), %rax
	adc	$0, w2
L(lo2):	mul	v0
	add	w0, X0
	mov	X0, 16(rp,i,8)
	MOV(	%rdx, X0, 128)
	adc	%rax, X1
	mov	24(up,i,8), %rax
	mov	24(rp,i,8), w0
	adc	$0, X0
	add	$4, i
	jnc	L(am2top2)

	mul	v1
	add	w0, w1
	adc	w2, %rax
	adc	Z(i,$0), %rdx
	add	w1, X1
	adc	Z(i,$0), X0
	mov	X1, I(-8(rp),-8(rp,i,8))
	add	X0, %rax
	mov	%rax, I((rp),(rp,i,8))
	adc	Z(i,$0), %rdx
	mov	%rdx, I(8(rp),8(rp,i,8))

	lea	16(rp), rp
	add	$2, n

L(am2o0):
	mov	-8(up,n,8), v0
	mov	(up,n,8), %rax
	mov	%rax, v1
	lea	0(n), i
	mul	v0
	mov	%rax, X0
	MOV(	%rdx, X1, 2)
	xor	R32(w0), R32(w0)
	mov	(rp,n,8), w2
	xor	R32(w3), R32(w3)
	jmp	L(lo0)

	ALIGNx
L(am2top0):
	mul	v1
	add	w0, w1
	adc	%rax, w2
	mov	(up,i,8), %rax
	MOV(	%rdx, w3, 1)
	adc	$0, w3
	mul	v0
	add	w1, X1
	mov	X1, -8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 2)
	adc	$0, X1
	mov	(up,i,8), %rax
	mul	v1
	MOV(	%rdx, w0, 4)
	mov	(rp,i,8), w1
	add	w1, w2
	adc	%rax, w3
	adc	$0, w0
L(lo0):	mov	8(up,i,8), %rax
	mul	v0
	add	w2, X0
	adc	%rax, X1
	mov	X0, (rp,i,8)
	MOV(	%rdx, X0, 8)
	adc	$0, X0
	mov	8(up,i,8), %rax
	mov	8(rp,i,8), w2
	mul	v1
	add	w2, w3
	adc	%rax, w0
	MOV(	%rdx, w1, 16)
	adc	$0, w1
	mov	16(up,i,8), %rax
	mul	v0
	add	w3, X1
	mov	X1, 8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 32)
	mov	16(rp,i,8), w3
	adc	$0, X1
	mov	16(up,i,8), %rax
	mul	v1
	add	w3, w0
	MOV(	%rdx, w2, 64)
	adc	%rax, w1
	mov	24(up,i,8), %rax
	adc	$0, w2
	mul	v0
	add	w0, X0
	mov	X0, 16(rp,i,8)
	MOV(	%rdx, X0, 128)
	adc	%rax, X1
	mov	24(up,i,8), %rax
	mov	24(rp,i,8), w0
	adc	$0, X0
	add	$4, i
	jnc	L(am2top0)

	mul	v1
	add	w0, w1
	adc	w2, %rax
	adc	Z(i,$0), %rdx
	add	w1, X1
	adc	Z(i,$0), X0
	mov	X1, I(-8(rp),-8(rp,i,8))
	add	X0, %rax
	mov	%rax, I((rp),(rp,i,8))
	adc	Z(i,$0), %rdx
	mov	%rdx, I(8(rp),8(rp,i,8))

	lea	16(rp), rp
	add	$2, n
	cmp	$-2, n
	jnz	L(am2o2)

L(cor2):pop	n
	mov	-24(up), v0
	mov	%rax, w2
	mov	%rdx, w0
	mov	-16(up), %rax
	mov	%rax, v1
	mul	v0
	mov	%rax, X0
	MOV(	%rdx, X1, 32)
	mov	-8(up), %rax
	mul	v0
	add	w2, X0
	mov	X0, -16(rp)
	MOV(	%rdx, X0, 128)
	adc	%rax, X1
	mov	-8(up), %rax
	adc	$0, X0
	mul	v1
	add	w0, X1
	adc	$0, X0
	mov	X1, -8(rp)
	add	X0, %rax
	mov	%rax, (rp)
	adc	$0, %rdx
	mov	%rdx, 8(rp)
	lea	8(rp), rp

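C Final pass: combine the triangle T now sitting in rp[1..2n-2] with the
C diagonal squares, using U^2 = 2*T + sum(i) u[i]^2 * B^(2i).  Roughly
C equivalent C (a sketch only, not part of the build; umul_ppmm and
C GMP_NUMB_BITS as in longlong.h/gmp-impl.h):
C
C	mp_limb_t sb = 0, ab = 0;	/* lsh1 carry resp. add carry */
C	for (i = 0; i < n; i++)
C	  {
C	    mp_limb_t hi, lo, t0, t1, d0, d1;
C	    umul_ppmm (hi, lo, up[i], up[i]);	/* u[i]^2 */
C	    t0 = i > 0 ? rp[2*i] : 0;		/* T sits in rp[1..2n-2] */
C	    t1 = i < n - 1 ? rp[2*i+1] : 0;
C	    d0 = (t0 << 1) | sb;		/* 2*T, one pair at a time */
C	    d1 = (t1 << 1) | (t0 >> (GMP_NUMB_BITS - 1));
C	    sb = t1 >> (GMP_NUMB_BITS - 1);
C	    lo += ab;  ab = lo < ab;		/* (hi,lo) += (d1,d0) + ab */
C	    lo += d0;  ab += lo < d0;
C	    hi += ab;  ab = hi < ab;
C	    hi += d1;  ab += hi < d1;
C	    rp[2*i] = lo;  rp[2*i+1] = hi;
C	  }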
L(sqr_diag_addlsh1):
	mov	-8(up,n,8), %rax
	shl	n
	xor	R32(%rbx), R32(%rbx)
	mul	%rax
	mov	8(rp,n,8), %r11
	lea	(%rdx), %r10
	mov	16(rp,n,8), %r9
	add	%r11, %r11
	jmp	L(dm)

	ALIGNx
L(dtop):mul	%rax
	add	%r11, %r10
	mov	8(rp,n,8), %r11
	mov	%r10, -8(rp,n,8)
	adc	%r9, %rax
	lea	(%rdx,%rbx), %r10
	mov	16(rp,n,8), %r9
	adc	%r11, %r11
L(dm):	mov	%rax, (rp,n,8)
	mov	(up,n,4), %rax
	adc	%r9, %r9
	setc	R8(%rbx)
	add	$2, n
	js	L(dtop)

	mul	%rax
	add	%r11, %r10
	mov	%r10, -8(rp)
	adc	%r9, %rax
	lea	(%rdx,%rbx), %r10
	mov	%rax, (rp)
	adc	$0, %r10
	mov	%r10, 8(rp)

	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret

	ALIGN(16)
L(small):
	mov	(up), %rax
	cmp	$2, n_param
	jae	L(gt1)
L(n1):
	mul	%rax
	mov	%rax, (rp)
	mov	%rdx, 8(rp)
	FUNC_EXIT()
	ret

L(gt1):	jne	L(gt2)
L(n2):	mov	%rax, %r8
	mul	%rax
	mov	8(up), %r11
	mov	%rax, (rp)
	mov	%r11, %rax
	mov	%rdx, %r9
	mul	%rax
	mov	%rax, %r10
	mov	%r11, %rax
	mov	%rdx, %r11
	mul	%r8
	xor	%r8, %r8
	add	%rax, %r9
	adc	%rdx, %r10
	adc	%r8, %r11
	add	%rax, %r9
	mov	%r9, 8(rp)
	adc	%rdx, %r10
	mov	%r10, 16(rp)
	adc	%r8, %r11
	mov	%r11, 24(rp)
	FUNC_EXIT()
	ret

L(gt2):
L(n3):	mov	%rax, %r10
	mul	%rax
	mov	8(up), %r11
	mov	%rax, (rp)
	mov	%r11, %rax
	mov	%rdx, 8(rp)
	mul	%rax
	mov	16(up), %rcx
	mov	%rax, 16(rp)
	mov	%rcx, %rax
	mov	%rdx, 24(rp)
	mul	%rax
	mov	%rax, 32(rp)
	mov	%rdx, 40(rp)

	mov	%r11, %rax
	mul	%r10
	mov	%rax, %r8
	mov	%rcx, %rax
	mov	%rdx, %r9
	mul	%r10
	xor	%r10, %r10
	add	%rax, %r9
	mov	%r11, %rax
	mov	%r10, %r11
	adc	%rdx, %r10

	mul	%rcx
	add	%rax, %r10
	adc	%r11, %rdx
	add	%r8, %r8
	adc	%r9, %r9
	adc	%r10, %r10
	adc	%rdx, %rdx
	adc	%r11, %r11
	add	%r8, 8(rp)
	adc	%r9, 16(rp)
	adc	%r10, 24(rp)
	adc	%rdx, 32(rp)
	adc	%r11, 40(rp)
	FUNC_EXIT()
	ret
EPILOGUE()