x86_64-mont.S revision 298999
1185380Ssam # $FreeBSD: stable/10/secure/lib/libcrypto/amd64/x86_64-mont.S 298999 2016-05-03 18:54:20Z jkim $ 2185380Ssam.text 3185380Ssam 4185380Ssam.globl bn_mul_mont 5185380Ssam.type bn_mul_mont,@function 6185380Ssam.align 16 7185380Ssambn_mul_mont: 8185380Ssam testl $3,%r9d 9185380Ssam jnz .Lmul_enter 10185380Ssam cmpl $8,%r9d 11185380Ssam jb .Lmul_enter 12185380Ssam cmpq %rsi,%rdx 13185380Ssam jne .Lmul4x_enter 14185380Ssam jmp .Lsqr4x_enter 15185380Ssam 16185380Ssam.align 16 17187129Ssam.Lmul_enter: 18185380Ssam pushq %rbx 19185380Ssam pushq %rbp 20185380Ssam pushq %r12 21185380Ssam pushq %r13 22185380Ssam pushq %r14 23185380Ssam pushq %r15 24185380Ssam 25185380Ssam movl %r9d,%r9d 26185380Ssam leaq 2(%r9),%r10 27185380Ssam movq %rsp,%r11 28185380Ssam negq %r10 29185380Ssam leaq (%rsp,%r10,8),%rsp 30185380Ssam andq $-1024,%rsp 31185380Ssam 32185380Ssam movq %r11,8(%rsp,%r9,8) 33185380Ssam.Lmul_body: 34185380Ssam 35185380Ssam 36185380Ssam 37185380Ssam 38185380Ssam 39185380Ssam 40185380Ssam subq %rsp,%r11 41185380Ssam andq $-4096,%r11 42185380Ssam.Lmul_page_walk: 43185380Ssam movq (%rsp,%r11,1),%r10 44185380Ssam subq $4096,%r11 45185380Ssam.byte 0x66,0x2e 46185380Ssam jnc .Lmul_page_walk 47185380Ssam 48185380Ssam movq %rdx,%r12 49185380Ssam movq (%r8),%r8 50185380Ssam movq (%r12),%rbx 51185380Ssam movq (%rsi),%rax 52185380Ssam 53185380Ssam xorq %r14,%r14 54185380Ssam xorq %r15,%r15 55185380Ssam 56185380Ssam movq %r8,%rbp 57185380Ssam mulq %rbx 58185380Ssam movq %rax,%r10 59185380Ssam movq (%rcx),%rax 60185380Ssam 61185380Ssam imulq %r10,%rbp 62185380Ssam movq %rdx,%r11 63185380Ssam 64185380Ssam mulq %rbp 65185380Ssam addq %rax,%r10 66185380Ssam movq 8(%rsi),%rax 67185380Ssam adcq $0,%rdx 68185380Ssam movq %rdx,%r13 69185380Ssam 70185380Ssam leaq 1(%r15),%r15 71185380Ssam jmp .L1st_enter 72185380Ssam 73185380Ssam.align 16 74185380Ssam.L1st: 75185380Ssam addq %rax,%r13 76185380Ssam movq (%rsi,%r15,8),%rax 77185380Ssam adcq $0,%rdx 78185380Ssam addq %r11,%r13 79185380Ssam movq %r10,%r11 80185380Ssam adcq $0,%rdx 81185380Ssam movq %r13,-16(%rsp,%r15,8) 82185380Ssam movq %rdx,%r13 83185380Ssam 84185380Ssam.L1st_enter: 85185380Ssam mulq %rbx 86185380Ssam addq %rax,%r11 87185380Ssam movq (%rcx,%r15,8),%rax 88185380Ssam adcq $0,%rdx 89185380Ssam leaq 1(%r15),%r15 90185380Ssam movq %rdx,%r10 91185380Ssam 92185380Ssam mulq %rbp 93185380Ssam cmpq %r9,%r15 94185380Ssam jne .L1st 95185380Ssam 96185380Ssam addq %rax,%r13 97185380Ssam movq (%rsi),%rax 98185380Ssam adcq $0,%rdx 99185380Ssam addq %r11,%r13 100185380Ssam adcq $0,%rdx 101185380Ssam movq %r13,-16(%rsp,%r15,8) 102185380Ssam movq %rdx,%r13 103185380Ssam movq %r10,%r11 104185380Ssam 105185380Ssam xorq %rdx,%rdx 106185380Ssam addq %r11,%r13 107185380Ssam adcq $0,%rdx 108185380Ssam movq %r13,-8(%rsp,%r9,8) 109185380Ssam movq %rdx,(%rsp,%r9,8) 110185380Ssam 111185380Ssam leaq 1(%r14),%r14 112185380Ssam jmp .Louter 113185380Ssam.align 16 114185380Ssam.Louter: 115185380Ssam movq (%r12,%r14,8),%rbx 116185380Ssam xorq %r15,%r15 117185380Ssam movq %r8,%rbp 118185380Ssam movq (%rsp),%r10 119185380Ssam mulq %rbx 120185380Ssam addq %rax,%r10 121185380Ssam movq (%rcx),%rax 122185380Ssam adcq $0,%rdx 123185380Ssam 124185380Ssam imulq %r10,%rbp 125185380Ssam movq %rdx,%r11 126185380Ssam 127185380Ssam mulq %rbp 128185380Ssam addq %rax,%r10 129185380Ssam movq 8(%rsi),%rax 130185380Ssam adcq $0,%rdx 131185380Ssam movq 8(%rsp),%r10 132185380Ssam movq %rdx,%r13 133185380Ssam 134185380Ssam leaq 1(%r15),%r15 135185380Ssam jmp .Linner_enter 136185380Ssam 137185380Ssam.align 16 138185380Ssam.Linner: 139185380Ssam addq %rax,%r13 140185380Ssam movq (%rsi,%r15,8),%rax 141185380Ssam adcq $0,%rdx 142185380Ssam addq %r10,%r13 143185380Ssam movq (%rsp,%r15,8),%r10 144185380Ssam adcq $0,%rdx 145185380Ssam movq %r13,-16(%rsp,%r15,8) 146185380Ssam movq %rdx,%r13 147185380Ssam 148185380Ssam.Linner_enter: 149185380Ssam mulq %rbx 150185380Ssam addq %rax,%r11 151185380Ssam movq (%rcx,%r15,8),%rax 152185380Ssam adcq $0,%rdx 153185380Ssam addq %r11,%r10 154185380Ssam movq %rdx,%r11 155185380Ssam adcq $0,%r11 156185380Ssam leaq 1(%r15),%r15 157185380Ssam 158185380Ssam mulq %rbp 159185380Ssam cmpq %r9,%r15 160185380Ssam jne .Linner 161185380Ssam 162185380Ssam addq %rax,%r13 163185380Ssam movq (%rsi),%rax 164185380Ssam adcq $0,%rdx 165185380Ssam addq %r10,%r13 166185380Ssam movq (%rsp,%r15,8),%r10 167185380Ssam adcq $0,%rdx 168185380Ssam movq %r13,-16(%rsp,%r15,8) 169185380Ssam movq %rdx,%r13 170185380Ssam 171185380Ssam xorq %rdx,%rdx 172185380Ssam addq %r11,%r13 173185380Ssam adcq $0,%rdx 174185380Ssam addq %r10,%r13 175185380Ssam adcq $0,%rdx 176185380Ssam movq %r13,-8(%rsp,%r9,8) 177185380Ssam movq %rdx,(%rsp,%r9,8) 178185380Ssam 179185380Ssam leaq 1(%r14),%r14 180185380Ssam cmpq %r9,%r14 181185380Ssam jl .Louter 182185380Ssam 183185380Ssam xorq %r14,%r14 184185380Ssam movq (%rsp),%rax 185185380Ssam leaq (%rsp),%rsi 186185380Ssam movq %r9,%r15 187185380Ssam jmp .Lsub 188185380Ssam.align 16 189185380Ssam.Lsub: sbbq (%rcx,%r14,8),%rax 190185380Ssam movq %rax,(%rdi,%r14,8) 191185380Ssam movq 8(%rsi,%r14,8),%rax 192185380Ssam leaq 1(%r14),%r14 193185380Ssam decq %r15 194185380Ssam jnz .Lsub 195185380Ssam 196185380Ssam sbbq $0,%rax 197185380Ssam xorq %r14,%r14 198185380Ssam andq %rax,%rsi 199185380Ssam notq %rax 200185380Ssam movq %rdi,%rcx 201185380Ssam andq %rax,%rcx 202185380Ssam movq %r9,%r15 203185380Ssam orq %rcx,%rsi 204185380Ssam.align 16 205185380Ssam.Lcopy: 206185380Ssam movq (%rsi,%r14,8),%rax 207185380Ssam movq %r14,(%rsp,%r14,8) 208185380Ssam movq %rax,(%rdi,%r14,8) 209185380Ssam leaq 1(%r14),%r14 210185380Ssam subq $1,%r15 211185380Ssam jnz .Lcopy 212185380Ssam 213185380Ssam movq 8(%rsp,%r9,8),%rsi 214185380Ssam movq $1,%rax 215185380Ssam movq (%rsi),%r15 216185380Ssam movq 8(%rsi),%r14 217185380Ssam movq 16(%rsi),%r13 218185380Ssam movq 24(%rsi),%r12 219185380Ssam movq 32(%rsi),%rbp 220185380Ssam movq 40(%rsi),%rbx 221185380Ssam leaq 48(%rsi),%rsp 222185380Ssam.Lmul_epilogue: 223185380Ssam .byte 0xf3,0xc3 224185380Ssam.size bn_mul_mont,.-bn_mul_mont 225185380Ssam.type bn_mul4x_mont,@function 226185380Ssam.align 16 227185380Ssambn_mul4x_mont: 228185380Ssam.Lmul4x_enter: 229185380Ssam pushq %rbx 230185380Ssam pushq %rbp 231185380Ssam pushq %r12 232185380Ssam pushq %r13 233185380Ssam pushq %r14 234185380Ssam pushq %r15 235185380Ssam 236185380Ssam movl %r9d,%r9d 237185380Ssam leaq 4(%r9),%r10 238185380Ssam movq %rsp,%r11 239185380Ssam negq %r10 240185380Ssam leaq (%rsp,%r10,8),%rsp 241185380Ssam andq $-1024,%rsp 242188197Ssam 243185380Ssam movq %r11,8(%rsp,%r9,8) 244185380Ssam.Lmul4x_body: 245185380Ssam subq %rsp,%r11 246188197Ssam andq $-4096,%r11 247185380Ssam.Lmul4x_page_walk: 248185380Ssam movq (%rsp,%r11,1),%r10 249185380Ssam subq $4096,%r11 250188197Ssam.byte 0x2e 251185380Ssam jnc .Lmul4x_page_walk 252188197Ssam 253185380Ssam movq %rdi,16(%rsp,%r9,8) 254185380Ssam movq %rdx,%r12 255185380Ssam movq (%r8),%r8 256185380Ssam movq (%r12),%rbx 257185380Ssam movq (%rsi),%rax 258188197Ssam 259185380Ssam xorq %r14,%r14 260185380Ssam xorq %r15,%r15 261188197Ssam 262185380Ssam movq %r8,%rbp 263185380Ssam mulq %rbx 264188197Ssam movq %rax,%r10 265185380Ssam movq (%rcx),%rax 266185380Ssam 267188197Ssam imulq %r10,%rbp 268185380Ssam movq %rdx,%r11 269185380Ssam 270185380Ssam mulq %rbp 271188197Ssam addq %rax,%r10 272185380Ssam movq 8(%rsi),%rax 273185380Ssam adcq $0,%rdx 274185380Ssam movq %rdx,%rdi 275185380Ssam 276185380Ssam mulq %rbx 277185380Ssam addq %rax,%r11 278185380Ssam movq 8(%rcx),%rax 279185380Ssam adcq $0,%rdx 280185380Ssam movq %rdx,%r10 281185380Ssam 282185380Ssam mulq %rbp 283185380Ssam addq %rax,%rdi 284185380Ssam movq 16(%rsi),%rax 285187129Ssam adcq $0,%rdx 286187129Ssam addq %r11,%rdi 287185380Ssam leaq 4(%r15),%r15 288185380Ssam adcq $0,%rdx 289185380Ssam movq %rdi,(%rsp) 290185380Ssam movq %rdx,%r13 291185380Ssam jmp .L1st4x 292185380Ssam.align 16 293185380Ssam.L1st4x: 294185380Ssam mulq %rbx 295185380Ssam addq %rax,%r10 296185380Ssam movq -16(%rcx,%r15,8),%rax 297185380Ssam adcq $0,%rdx 298185380Ssam movq %rdx,%r11 299185380Ssam 300185380Ssam mulq %rbp 301185380Ssam addq %rax,%r13 302185380Ssam movq -8(%rsi,%r15,8),%rax 303185380Ssam adcq $0,%rdx 304185380Ssam addq %r10,%r13 305185380Ssam adcq $0,%rdx 306185380Ssam movq %r13,-24(%rsp,%r15,8) 307185380Ssam movq %rdx,%rdi 308188197Ssam 309188197Ssam mulq %rbx 310188197Ssam addq %rax,%r11 311185380Ssam movq -8(%rcx,%r15,8),%rax 312185380Ssam adcq $0,%rdx 313185380Ssam movq %rdx,%r10 314185380Ssam 315185380Ssam mulq %rbp 316185380Ssam addq %rax,%rdi 317185380Ssam movq (%rsi,%r15,8),%rax 318185380Ssam adcq $0,%rdx 319185380Ssam addq %r11,%rdi 320185380Ssam adcq $0,%rdx 321185380Ssam movq %rdi,-16(%rsp,%r15,8) 322185380Ssam movq %rdx,%r13 323185380Ssam 324185380Ssam mulq %rbx 325185380Ssam addq %rax,%r10 326185380Ssam movq (%rcx,%r15,8),%rax 327185380Ssam adcq $0,%rdx 328185380Ssam movq %rdx,%r11 329185380Ssam 330185380Ssam mulq %rbp 331185380Ssam addq %rax,%r13 332185380Ssam movq 8(%rsi,%r15,8),%rax 333185380Ssam adcq $0,%rdx 334 addq %r10,%r13 335 adcq $0,%rdx 336 movq %r13,-8(%rsp,%r15,8) 337 movq %rdx,%rdi 338 339 mulq %rbx 340 addq %rax,%r11 341 movq 8(%rcx,%r15,8),%rax 342 adcq $0,%rdx 343 leaq 4(%r15),%r15 344 movq %rdx,%r10 345 346 mulq %rbp 347 addq %rax,%rdi 348 movq -16(%rsi,%r15,8),%rax 349 adcq $0,%rdx 350 addq %r11,%rdi 351 adcq $0,%rdx 352 movq %rdi,-32(%rsp,%r15,8) 353 movq %rdx,%r13 354 cmpq %r9,%r15 355 jl .L1st4x 356 357 mulq %rbx 358 addq %rax,%r10 359 movq -16(%rcx,%r15,8),%rax 360 adcq $0,%rdx 361 movq %rdx,%r11 362 363 mulq %rbp 364 addq %rax,%r13 365 movq -8(%rsi,%r15,8),%rax 366 adcq $0,%rdx 367 addq %r10,%r13 368 adcq $0,%rdx 369 movq %r13,-24(%rsp,%r15,8) 370 movq %rdx,%rdi 371 372 mulq %rbx 373 addq %rax,%r11 374 movq -8(%rcx,%r15,8),%rax 375 adcq $0,%rdx 376 movq %rdx,%r10 377 378 mulq %rbp 379 addq %rax,%rdi 380 movq (%rsi),%rax 381 adcq $0,%rdx 382 addq %r11,%rdi 383 adcq $0,%rdx 384 movq %rdi,-16(%rsp,%r15,8) 385 movq %rdx,%r13 386 387 xorq %rdi,%rdi 388 addq %r10,%r13 389 adcq $0,%rdi 390 movq %r13,-8(%rsp,%r15,8) 391 movq %rdi,(%rsp,%r15,8) 392 393 leaq 1(%r14),%r14 394.align 4 395.Louter4x: 396 movq (%r12,%r14,8),%rbx 397 xorq %r15,%r15 398 movq (%rsp),%r10 399 movq %r8,%rbp 400 mulq %rbx 401 addq %rax,%r10 402 movq (%rcx),%rax 403 adcq $0,%rdx 404 405 imulq %r10,%rbp 406 movq %rdx,%r11 407 408 mulq %rbp 409 addq %rax,%r10 410 movq 8(%rsi),%rax 411 adcq $0,%rdx 412 movq %rdx,%rdi 413 414 mulq %rbx 415 addq %rax,%r11 416 movq 8(%rcx),%rax 417 adcq $0,%rdx 418 addq 8(%rsp),%r11 419 adcq $0,%rdx 420 movq %rdx,%r10 421 422 mulq %rbp 423 addq %rax,%rdi 424 movq 16(%rsi),%rax 425 adcq $0,%rdx 426 addq %r11,%rdi 427 leaq 4(%r15),%r15 428 adcq $0,%rdx 429 movq %rdi,(%rsp) 430 movq %rdx,%r13 431 jmp .Linner4x 432.align 16 433.Linner4x: 434 mulq %rbx 435 addq %rax,%r10 436 movq -16(%rcx,%r15,8),%rax 437 adcq $0,%rdx 438 addq -16(%rsp,%r15,8),%r10 439 adcq $0,%rdx 440 movq %rdx,%r11 441 442 mulq %rbp 443 addq %rax,%r13 444 movq -8(%rsi,%r15,8),%rax 445 adcq $0,%rdx 446 addq %r10,%r13 447 adcq $0,%rdx 448 movq %r13,-24(%rsp,%r15,8) 449 movq %rdx,%rdi 450 451 mulq %rbx 452 addq %rax,%r11 453 movq -8(%rcx,%r15,8),%rax 454 adcq $0,%rdx 455 addq -8(%rsp,%r15,8),%r11 456 adcq $0,%rdx 457 movq %rdx,%r10 458 459 mulq %rbp 460 addq %rax,%rdi 461 movq (%rsi,%r15,8),%rax 462 adcq $0,%rdx 463 addq %r11,%rdi 464 adcq $0,%rdx 465 movq %rdi,-16(%rsp,%r15,8) 466 movq %rdx,%r13 467 468 mulq %rbx 469 addq %rax,%r10 470 movq (%rcx,%r15,8),%rax 471 adcq $0,%rdx 472 addq (%rsp,%r15,8),%r10 473 adcq $0,%rdx 474 movq %rdx,%r11 475 476 mulq %rbp 477 addq %rax,%r13 478 movq 8(%rsi,%r15,8),%rax 479 adcq $0,%rdx 480 addq %r10,%r13 481 adcq $0,%rdx 482 movq %r13,-8(%rsp,%r15,8) 483 movq %rdx,%rdi 484 485 mulq %rbx 486 addq %rax,%r11 487 movq 8(%rcx,%r15,8),%rax 488 adcq $0,%rdx 489 addq 8(%rsp,%r15,8),%r11 490 adcq $0,%rdx 491 leaq 4(%r15),%r15 492 movq %rdx,%r10 493 494 mulq %rbp 495 addq %rax,%rdi 496 movq -16(%rsi,%r15,8),%rax 497 adcq $0,%rdx 498 addq %r11,%rdi 499 adcq $0,%rdx 500 movq %rdi,-32(%rsp,%r15,8) 501 movq %rdx,%r13 502 cmpq %r9,%r15 503 jl .Linner4x 504 505 mulq %rbx 506 addq %rax,%r10 507 movq -16(%rcx,%r15,8),%rax 508 adcq $0,%rdx 509 addq -16(%rsp,%r15,8),%r10 510 adcq $0,%rdx 511 movq %rdx,%r11 512 513 mulq %rbp 514 addq %rax,%r13 515 movq -8(%rsi,%r15,8),%rax 516 adcq $0,%rdx 517 addq %r10,%r13 518 adcq $0,%rdx 519 movq %r13,-24(%rsp,%r15,8) 520 movq %rdx,%rdi 521 522 mulq %rbx 523 addq %rax,%r11 524 movq -8(%rcx,%r15,8),%rax 525 adcq $0,%rdx 526 addq -8(%rsp,%r15,8),%r11 527 adcq $0,%rdx 528 leaq 1(%r14),%r14 529 movq %rdx,%r10 530 531 mulq %rbp 532 addq %rax,%rdi 533 movq (%rsi),%rax 534 adcq $0,%rdx 535 addq %r11,%rdi 536 adcq $0,%rdx 537 movq %rdi,-16(%rsp,%r15,8) 538 movq %rdx,%r13 539 540 xorq %rdi,%rdi 541 addq %r10,%r13 542 adcq $0,%rdi 543 addq (%rsp,%r9,8),%r13 544 adcq $0,%rdi 545 movq %r13,-8(%rsp,%r15,8) 546 movq %rdi,(%rsp,%r15,8) 547 548 cmpq %r9,%r14 549 jl .Louter4x 550 movq 16(%rsp,%r9,8),%rdi 551 movq 0(%rsp),%rax 552 pxor %xmm0,%xmm0 553 movq 8(%rsp),%rdx 554 shrq $2,%r9 555 leaq (%rsp),%rsi 556 xorq %r14,%r14 557 558 subq 0(%rcx),%rax 559 movq 16(%rsi),%rbx 560 movq 24(%rsi),%rbp 561 sbbq 8(%rcx),%rdx 562 leaq -1(%r9),%r15 563 jmp .Lsub4x 564.align 16 565.Lsub4x: 566 movq %rax,0(%rdi,%r14,8) 567 movq %rdx,8(%rdi,%r14,8) 568 sbbq 16(%rcx,%r14,8),%rbx 569 movq 32(%rsi,%r14,8),%rax 570 movq 40(%rsi,%r14,8),%rdx 571 sbbq 24(%rcx,%r14,8),%rbp 572 movq %rbx,16(%rdi,%r14,8) 573 movq %rbp,24(%rdi,%r14,8) 574 sbbq 32(%rcx,%r14,8),%rax 575 movq 48(%rsi,%r14,8),%rbx 576 movq 56(%rsi,%r14,8),%rbp 577 sbbq 40(%rcx,%r14,8),%rdx 578 leaq 4(%r14),%r14 579 decq %r15 580 jnz .Lsub4x 581 582 movq %rax,0(%rdi,%r14,8) 583 movq 32(%rsi,%r14,8),%rax 584 sbbq 16(%rcx,%r14,8),%rbx 585 movq %rdx,8(%rdi,%r14,8) 586 sbbq 24(%rcx,%r14,8),%rbp 587 movq %rbx,16(%rdi,%r14,8) 588 589 sbbq $0,%rax 590 movq %rbp,24(%rdi,%r14,8) 591 xorq %r14,%r14 592 andq %rax,%rsi 593 notq %rax 594 movq %rdi,%rcx 595 andq %rax,%rcx 596 leaq -1(%r9),%r15 597 orq %rcx,%rsi 598 599 movdqu (%rsi),%xmm1 600 movdqa %xmm0,(%rsp) 601 movdqu %xmm1,(%rdi) 602 jmp .Lcopy4x 603.align 16 604.Lcopy4x: 605 movdqu 16(%rsi,%r14,1),%xmm2 606 movdqu 32(%rsi,%r14,1),%xmm1 607 movdqa %xmm0,16(%rsp,%r14,1) 608 movdqu %xmm2,16(%rdi,%r14,1) 609 movdqa %xmm0,32(%rsp,%r14,1) 610 movdqu %xmm1,32(%rdi,%r14,1) 611 leaq 32(%r14),%r14 612 decq %r15 613 jnz .Lcopy4x 614 615 shlq $2,%r9 616 movdqu 16(%rsi,%r14,1),%xmm2 617 movdqa %xmm0,16(%rsp,%r14,1) 618 movdqu %xmm2,16(%rdi,%r14,1) 619 movq 8(%rsp,%r9,8),%rsi 620 movq $1,%rax 621 movq (%rsi),%r15 622 movq 8(%rsi),%r14 623 movq 16(%rsi),%r13 624 movq 24(%rsi),%r12 625 movq 32(%rsi),%rbp 626 movq 40(%rsi),%rbx 627 leaq 48(%rsi),%rsp 628.Lmul4x_epilogue: 629 .byte 0xf3,0xc3 630.size bn_mul4x_mont,.-bn_mul4x_mont 631.type bn_sqr4x_mont,@function 632.align 16 633bn_sqr4x_mont: 634.Lsqr4x_enter: 635 movq %rsp,%rax 636 pushq %rbx 637 pushq %rbp 638 pushq %r12 639 pushq %r13 640 pushq %r14 641 pushq %r15 642 643 shll $3,%r9d 644 movq %rsp,%r11 645 negq %r9 646 movq (%r8),%r8 647 leaq -72(%rsp,%r9,2),%rsp 648 andq $-1024,%rsp 649 650 subq %rsp,%r11 651 andq $-4096,%r11 652.Lsqr4x_page_walk: 653 movq (%rsp,%r11,1),%r10 654 subq $4096,%r11 655.byte 0x2e 656 jnc .Lsqr4x_page_walk 657 658 movq %r9,%r10 659 negq %r9 660 leaq -48(%rax),%r11 661 662 663 664 665 666 667 668 669 670 671 672 movq %rdi,32(%rsp) 673 movq %rcx,40(%rsp) 674 movq %r8,48(%rsp) 675 movq %r11,56(%rsp) 676.Lsqr4x_body: 677 678 679 680 681 682 683 684 leaq 32(%r10),%rbp 685 leaq (%rsi,%r9,1),%rsi 686 687 movq %r9,%rcx 688 689 690 movq -32(%rsi,%rbp,1),%r14 691 leaq 64(%rsp,%r9,2),%rdi 692 movq -24(%rsi,%rbp,1),%rax 693 leaq -32(%rdi,%rbp,1),%rdi 694 movq -16(%rsi,%rbp,1),%rbx 695 movq %rax,%r15 696 697 mulq %r14 698 movq %rax,%r10 699 movq %rbx,%rax 700 movq %rdx,%r11 701 movq %r10,-24(%rdi,%rbp,1) 702 703 xorq %r10,%r10 704 mulq %r14 705 addq %rax,%r11 706 movq %rbx,%rax 707 adcq %rdx,%r10 708 movq %r11,-16(%rdi,%rbp,1) 709 710 leaq -16(%rbp),%rcx 711 712 713 movq 8(%rsi,%rcx,1),%rbx 714 mulq %r15 715 movq %rax,%r12 716 movq %rbx,%rax 717 movq %rdx,%r13 718 719 xorq %r11,%r11 720 addq %r12,%r10 721 leaq 16(%rcx),%rcx 722 adcq $0,%r11 723 mulq %r14 724 addq %rax,%r10 725 movq %rbx,%rax 726 adcq %rdx,%r11 727 movq %r10,-8(%rdi,%rcx,1) 728 jmp .Lsqr4x_1st 729 730.align 16 731.Lsqr4x_1st: 732 movq (%rsi,%rcx,1),%rbx 733 xorq %r12,%r12 734 mulq %r15 735 addq %rax,%r13 736 movq %rbx,%rax 737 adcq %rdx,%r12 738 739 xorq %r10,%r10 740 addq %r13,%r11 741 adcq $0,%r10 742 mulq %r14 743 addq %rax,%r11 744 movq %rbx,%rax 745 adcq %rdx,%r10 746 movq %r11,(%rdi,%rcx,1) 747 748 749 movq 8(%rsi,%rcx,1),%rbx 750 xorq %r13,%r13 751 mulq %r15 752 addq %rax,%r12 753 movq %rbx,%rax 754 adcq %rdx,%r13 755 756 xorq %r11,%r11 757 addq %r12,%r10 758 adcq $0,%r11 759 mulq %r14 760 addq %rax,%r10 761 movq %rbx,%rax 762 adcq %rdx,%r11 763 movq %r10,8(%rdi,%rcx,1) 764 765 movq 16(%rsi,%rcx,1),%rbx 766 xorq %r12,%r12 767 mulq %r15 768 addq %rax,%r13 769 movq %rbx,%rax 770 adcq %rdx,%r12 771 772 xorq %r10,%r10 773 addq %r13,%r11 774 adcq $0,%r10 775 mulq %r14 776 addq %rax,%r11 777 movq %rbx,%rax 778 adcq %rdx,%r10 779 movq %r11,16(%rdi,%rcx,1) 780 781 782 movq 24(%rsi,%rcx,1),%rbx 783 xorq %r13,%r13 784 mulq %r15 785 addq %rax,%r12 786 movq %rbx,%rax 787 adcq %rdx,%r13 788 789 xorq %r11,%r11 790 addq %r12,%r10 791 leaq 32(%rcx),%rcx 792 adcq $0,%r11 793 mulq %r14 794 addq %rax,%r10 795 movq %rbx,%rax 796 adcq %rdx,%r11 797 movq %r10,-8(%rdi,%rcx,1) 798 799 cmpq $0,%rcx 800 jne .Lsqr4x_1st 801 802 xorq %r12,%r12 803 addq %r11,%r13 804 adcq $0,%r12 805 mulq %r15 806 addq %rax,%r13 807 adcq %rdx,%r12 808 809 movq %r13,(%rdi) 810 leaq 16(%rbp),%rbp 811 movq %r12,8(%rdi) 812 jmp .Lsqr4x_outer 813 814.align 16 815.Lsqr4x_outer: 816 movq -32(%rsi,%rbp,1),%r14 817 leaq 64(%rsp,%r9,2),%rdi 818 movq -24(%rsi,%rbp,1),%rax 819 leaq -32(%rdi,%rbp,1),%rdi 820 movq -16(%rsi,%rbp,1),%rbx 821 movq %rax,%r15 822 823 movq -24(%rdi,%rbp,1),%r10 824 xorq %r11,%r11 825 mulq %r14 826 addq %rax,%r10 827 movq %rbx,%rax 828 adcq %rdx,%r11 829 movq %r10,-24(%rdi,%rbp,1) 830 831 xorq %r10,%r10 832 addq -16(%rdi,%rbp,1),%r11 833 adcq $0,%r10 834 mulq %r14 835 addq %rax,%r11 836 movq %rbx,%rax 837 adcq %rdx,%r10 838 movq %r11,-16(%rdi,%rbp,1) 839 840 leaq -16(%rbp),%rcx 841 xorq %r12,%r12 842 843 844 movq 8(%rsi,%rcx,1),%rbx 845 xorq %r13,%r13 846 addq 8(%rdi,%rcx,1),%r12 847 adcq $0,%r13 848 mulq %r15 849 addq %rax,%r12 850 movq %rbx,%rax 851 adcq %rdx,%r13 852 853 xorq %r11,%r11 854 addq %r12,%r10 855 adcq $0,%r11 856 mulq %r14 857 addq %rax,%r10 858 movq %rbx,%rax 859 adcq %rdx,%r11 860 movq %r10,8(%rdi,%rcx,1) 861 862 leaq 16(%rcx),%rcx 863 jmp .Lsqr4x_inner 864 865.align 16 866.Lsqr4x_inner: 867 movq (%rsi,%rcx,1),%rbx 868 xorq %r12,%r12 869 addq (%rdi,%rcx,1),%r13 870 adcq $0,%r12 871 mulq %r15 872 addq %rax,%r13 873 movq %rbx,%rax 874 adcq %rdx,%r12 875 876 xorq %r10,%r10 877 addq %r13,%r11 878 adcq $0,%r10 879 mulq %r14 880 addq %rax,%r11 881 movq %rbx,%rax 882 adcq %rdx,%r10 883 movq %r11,(%rdi,%rcx,1) 884 885 movq 8(%rsi,%rcx,1),%rbx 886 xorq %r13,%r13 887 addq 8(%rdi,%rcx,1),%r12 888 adcq $0,%r13 889 mulq %r15 890 addq %rax,%r12 891 movq %rbx,%rax 892 adcq %rdx,%r13 893 894 xorq %r11,%r11 895 addq %r12,%r10 896 leaq 16(%rcx),%rcx 897 adcq $0,%r11 898 mulq %r14 899 addq %rax,%r10 900 movq %rbx,%rax 901 adcq %rdx,%r11 902 movq %r10,-8(%rdi,%rcx,1) 903 904 cmpq $0,%rcx 905 jne .Lsqr4x_inner 906 907 xorq %r12,%r12 908 addq %r11,%r13 909 adcq $0,%r12 910 mulq %r15 911 addq %rax,%r13 912 adcq %rdx,%r12 913 914 movq %r13,(%rdi) 915 movq %r12,8(%rdi) 916 917 addq $16,%rbp 918 jnz .Lsqr4x_outer 919 920 921 movq -32(%rsi),%r14 922 leaq 64(%rsp,%r9,2),%rdi 923 movq -24(%rsi),%rax 924 leaq -32(%rdi,%rbp,1),%rdi 925 movq -16(%rsi),%rbx 926 movq %rax,%r15 927 928 xorq %r11,%r11 929 mulq %r14 930 addq %rax,%r10 931 movq %rbx,%rax 932 adcq %rdx,%r11 933 movq %r10,-24(%rdi) 934 935 xorq %r10,%r10 936 addq %r13,%r11 937 adcq $0,%r10 938 mulq %r14 939 addq %rax,%r11 940 movq %rbx,%rax 941 adcq %rdx,%r10 942 movq %r11,-16(%rdi) 943 944 movq -8(%rsi),%rbx 945 mulq %r15 946 addq %rax,%r12 947 movq %rbx,%rax 948 adcq $0,%rdx 949 950 xorq %r11,%r11 951 addq %r12,%r10 952 movq %rdx,%r13 953 adcq $0,%r11 954 mulq %r14 955 addq %rax,%r10 956 movq %rbx,%rax 957 adcq %rdx,%r11 958 movq %r10,-8(%rdi) 959 960 xorq %r12,%r12 961 addq %r11,%r13 962 adcq $0,%r12 963 mulq %r15 964 addq %rax,%r13 965 movq -16(%rsi),%rax 966 adcq %rdx,%r12 967 968 movq %r13,(%rdi) 969 movq %r12,8(%rdi) 970 971 mulq %rbx 972 addq $16,%rbp 973 xorq %r14,%r14 974 subq %r9,%rbp 975 xorq %r15,%r15 976 977 addq %r12,%rax 978 adcq $0,%rdx 979 movq %rax,8(%rdi) 980 movq %rdx,16(%rdi) 981 movq %r15,24(%rdi) 982 983 movq -16(%rsi,%rbp,1),%rax 984 leaq 64(%rsp,%r9,2),%rdi 985 xorq %r10,%r10 986 movq -24(%rdi,%rbp,2),%r11 987 988 leaq (%r14,%r10,2),%r12 989 shrq $63,%r10 990 leaq (%rcx,%r11,2),%r13 991 shrq $63,%r11 992 orq %r10,%r13 993 movq -16(%rdi,%rbp,2),%r10 994 movq %r11,%r14 995 mulq %rax 996 negq %r15 997 movq -8(%rdi,%rbp,2),%r11 998 adcq %rax,%r12 999 movq -8(%rsi,%rbp,1),%rax 1000 movq %r12,-32(%rdi,%rbp,2) 1001 adcq %rdx,%r13 1002 1003 leaq (%r14,%r10,2),%rbx 1004 movq %r13,-24(%rdi,%rbp,2) 1005 sbbq %r15,%r15 1006 shrq $63,%r10 1007 leaq (%rcx,%r11,2),%r8 1008 shrq $63,%r11 1009 orq %r10,%r8 1010 movq 0(%rdi,%rbp,2),%r10 1011 movq %r11,%r14 1012 mulq %rax 1013 negq %r15 1014 movq 8(%rdi,%rbp,2),%r11 1015 adcq %rax,%rbx 1016 movq 0(%rsi,%rbp,1),%rax 1017 movq %rbx,-16(%rdi,%rbp,2) 1018 adcq %rdx,%r8 1019 leaq 16(%rbp),%rbp 1020 movq %r8,-40(%rdi,%rbp,2) 1021 sbbq %r15,%r15 1022 jmp .Lsqr4x_shift_n_add 1023 1024.align 16 1025.Lsqr4x_shift_n_add: 1026 leaq (%r14,%r10,2),%r12 1027 shrq $63,%r10 1028 leaq (%rcx,%r11,2),%r13 1029 shrq $63,%r11 1030 orq %r10,%r13 1031 movq -16(%rdi,%rbp,2),%r10 1032 movq %r11,%r14 1033 mulq %rax 1034 negq %r15 1035 movq -8(%rdi,%rbp,2),%r11 1036 adcq %rax,%r12 1037 movq -8(%rsi,%rbp,1),%rax 1038 movq %r12,-32(%rdi,%rbp,2) 1039 adcq %rdx,%r13 1040 1041 leaq (%r14,%r10,2),%rbx 1042 movq %r13,-24(%rdi,%rbp,2) 1043 sbbq %r15,%r15 1044 shrq $63,%r10 1045 leaq (%rcx,%r11,2),%r8 1046 shrq $63,%r11 1047 orq %r10,%r8 1048 movq 0(%rdi,%rbp,2),%r10 1049 movq %r11,%r14 1050 mulq %rax 1051 negq %r15 1052 movq 8(%rdi,%rbp,2),%r11 1053 adcq %rax,%rbx 1054 movq 0(%rsi,%rbp,1),%rax 1055 movq %rbx,-16(%rdi,%rbp,2) 1056 adcq %rdx,%r8 1057 1058 leaq (%r14,%r10,2),%r12 1059 movq %r8,-8(%rdi,%rbp,2) 1060 sbbq %r15,%r15 1061 shrq $63,%r10 1062 leaq (%rcx,%r11,2),%r13 1063 shrq $63,%r11 1064 orq %r10,%r13 1065 movq 16(%rdi,%rbp,2),%r10 1066 movq %r11,%r14 1067 mulq %rax 1068 negq %r15 1069 movq 24(%rdi,%rbp,2),%r11 1070 adcq %rax,%r12 1071 movq 8(%rsi,%rbp,1),%rax 1072 movq %r12,0(%rdi,%rbp,2) 1073 adcq %rdx,%r13 1074 1075 leaq (%r14,%r10,2),%rbx 1076 movq %r13,8(%rdi,%rbp,2) 1077 sbbq %r15,%r15 1078 shrq $63,%r10 1079 leaq (%rcx,%r11,2),%r8 1080 shrq $63,%r11 1081 orq %r10,%r8 1082 movq 32(%rdi,%rbp,2),%r10 1083 movq %r11,%r14 1084 mulq %rax 1085 negq %r15 1086 movq 40(%rdi,%rbp,2),%r11 1087 adcq %rax,%rbx 1088 movq 16(%rsi,%rbp,1),%rax 1089 movq %rbx,16(%rdi,%rbp,2) 1090 adcq %rdx,%r8 1091 movq %r8,24(%rdi,%rbp,2) 1092 sbbq %r15,%r15 1093 addq $32,%rbp 1094 jnz .Lsqr4x_shift_n_add 1095 1096 leaq (%r14,%r10,2),%r12 1097 shrq $63,%r10 1098 leaq (%rcx,%r11,2),%r13 1099 shrq $63,%r11 1100 orq %r10,%r13 1101 movq -16(%rdi),%r10 1102 movq %r11,%r14 1103 mulq %rax 1104 negq %r15 1105 movq -8(%rdi),%r11 1106 adcq %rax,%r12 1107 movq -8(%rsi),%rax 1108 movq %r12,-32(%rdi) 1109 adcq %rdx,%r13 1110 1111 leaq (%r14,%r10,2),%rbx 1112 movq %r13,-24(%rdi) 1113 sbbq %r15,%r15 1114 shrq $63,%r10 1115 leaq (%rcx,%r11,2),%r8 1116 shrq $63,%r11 1117 orq %r10,%r8 1118 mulq %rax 1119 negq %r15 1120 adcq %rax,%rbx 1121 adcq %rdx,%r8 1122 movq %rbx,-16(%rdi) 1123 movq %r8,-8(%rdi) 1124 movq 40(%rsp),%rsi 1125 movq 48(%rsp),%r8 1126 xorq %rcx,%rcx 1127 movq %r9,0(%rsp) 1128 subq %r9,%rcx 1129 movq 64(%rsp),%r10 1130 movq %r8,%r14 1131 leaq 64(%rsp,%r9,2),%rax 1132 leaq 64(%rsp,%r9,1),%rdi 1133 movq %rax,8(%rsp) 1134 leaq (%rsi,%r9,1),%rsi 1135 xorq %rbp,%rbp 1136 1137 movq 0(%rsi,%rcx,1),%rax 1138 movq 8(%rsi,%rcx,1),%r9 1139 imulq %r10,%r14 1140 movq %rax,%rbx 1141 jmp .Lsqr4x_mont_outer 1142 1143.align 16 1144.Lsqr4x_mont_outer: 1145 xorq %r11,%r11 1146 mulq %r14 1147 addq %rax,%r10 1148 movq %r9,%rax 1149 adcq %rdx,%r11 1150 movq %r8,%r15 1151 1152 xorq %r10,%r10 1153 addq 8(%rdi,%rcx,1),%r11 1154 adcq $0,%r10 1155 mulq %r14 1156 addq %rax,%r11 1157 movq %rbx,%rax 1158 adcq %rdx,%r10 1159 1160 imulq %r11,%r15 1161 1162 movq 16(%rsi,%rcx,1),%rbx 1163 xorq %r13,%r13 1164 addq %r11,%r12 1165 adcq $0,%r13 1166 mulq %r15 1167 addq %rax,%r12 1168 movq %rbx,%rax 1169 adcq %rdx,%r13 1170 movq %r12,8(%rdi,%rcx,1) 1171 1172 xorq %r11,%r11 1173 addq 16(%rdi,%rcx,1),%r10 1174 adcq $0,%r11 1175 mulq %r14 1176 addq %rax,%r10 1177 movq %r9,%rax 1178 adcq %rdx,%r11 1179 1180 movq 24(%rsi,%rcx,1),%r9 1181 xorq %r12,%r12 1182 addq %r10,%r13 1183 adcq $0,%r12 1184 mulq %r15 1185 addq %rax,%r13 1186 movq %r9,%rax 1187 adcq %rdx,%r12 1188 movq %r13,16(%rdi,%rcx,1) 1189 1190 xorq %r10,%r10 1191 addq 24(%rdi,%rcx,1),%r11 1192 leaq 32(%rcx),%rcx 1193 adcq $0,%r10 1194 mulq %r14 1195 addq %rax,%r11 1196 movq %rbx,%rax 1197 adcq %rdx,%r10 1198 jmp .Lsqr4x_mont_inner 1199 1200.align 16 1201.Lsqr4x_mont_inner: 1202 movq (%rsi,%rcx,1),%rbx 1203 xorq %r13,%r13 1204 addq %r11,%r12 1205 adcq $0,%r13 1206 mulq %r15 1207 addq %rax,%r12 1208 movq %rbx,%rax 1209 adcq %rdx,%r13 1210 movq %r12,-8(%rdi,%rcx,1) 1211 1212 xorq %r11,%r11 1213 addq (%rdi,%rcx,1),%r10 1214 adcq $0,%r11 1215 mulq %r14 1216 addq %rax,%r10 1217 movq %r9,%rax 1218 adcq %rdx,%r11 1219 1220 movq 8(%rsi,%rcx,1),%r9 1221 xorq %r12,%r12 1222 addq %r10,%r13 1223 adcq $0,%r12 1224 mulq %r15 1225 addq %rax,%r13 1226 movq %r9,%rax 1227 adcq %rdx,%r12 1228 movq %r13,(%rdi,%rcx,1) 1229 1230 xorq %r10,%r10 1231 addq 8(%rdi,%rcx,1),%r11 1232 adcq $0,%r10 1233 mulq %r14 1234 addq %rax,%r11 1235 movq %rbx,%rax 1236 adcq %rdx,%r10 1237 1238 1239 movq 16(%rsi,%rcx,1),%rbx 1240 xorq %r13,%r13 1241 addq %r11,%r12 1242 adcq $0,%r13 1243 mulq %r15 1244 addq %rax,%r12 1245 movq %rbx,%rax 1246 adcq %rdx,%r13 1247 movq %r12,8(%rdi,%rcx,1) 1248 1249 xorq %r11,%r11 1250 addq 16(%rdi,%rcx,1),%r10 1251 adcq $0,%r11 1252 mulq %r14 1253 addq %rax,%r10 1254 movq %r9,%rax 1255 adcq %rdx,%r11 1256 1257 movq 24(%rsi,%rcx,1),%r9 1258 xorq %r12,%r12 1259 addq %r10,%r13 1260 adcq $0,%r12 1261 mulq %r15 1262 addq %rax,%r13 1263 movq %r9,%rax 1264 adcq %rdx,%r12 1265 movq %r13,16(%rdi,%rcx,1) 1266 1267 xorq %r10,%r10 1268 addq 24(%rdi,%rcx,1),%r11 1269 leaq 32(%rcx),%rcx 1270 adcq $0,%r10 1271 mulq %r14 1272 addq %rax,%r11 1273 movq %rbx,%rax 1274 adcq %rdx,%r10 1275 cmpq $0,%rcx 1276 jne .Lsqr4x_mont_inner 1277 1278 subq 0(%rsp),%rcx 1279 movq %r8,%r14 1280 1281 xorq %r13,%r13 1282 addq %r11,%r12 1283 adcq $0,%r13 1284 mulq %r15 1285 addq %rax,%r12 1286 movq %r9,%rax 1287 adcq %rdx,%r13 1288 movq %r12,-8(%rdi) 1289 1290 xorq %r11,%r11 1291 addq (%rdi),%r10 1292 adcq $0,%r11 1293 movq 0(%rsi,%rcx,1),%rbx 1294 addq %rbp,%r10 1295 adcq $0,%r11 1296 1297 imulq 16(%rdi,%rcx,1),%r14 1298 xorq %r12,%r12 1299 movq 8(%rsi,%rcx,1),%r9 1300 addq %r10,%r13 1301 movq 16(%rdi,%rcx,1),%r10 1302 adcq $0,%r12 1303 mulq %r15 1304 addq %rax,%r13 1305 movq %rbx,%rax 1306 adcq %rdx,%r12 1307 movq %r13,(%rdi) 1308 1309 xorq %rbp,%rbp 1310 addq 8(%rdi),%r12 1311 adcq %rbp,%rbp 1312 addq %r11,%r12 1313 leaq 16(%rdi),%rdi 1314 adcq $0,%rbp 1315 movq %r12,-8(%rdi) 1316 cmpq 8(%rsp),%rdi 1317 jb .Lsqr4x_mont_outer 1318 1319 movq 0(%rsp),%r9 1320 movq %rbp,(%rdi) 1321 movq 64(%rsp,%r9,1),%rax 1322 leaq 64(%rsp,%r9,1),%rbx 1323 movq 40(%rsp),%rsi 1324 shrq $5,%r9 1325 movq 8(%rbx),%rdx 1326 xorq %rbp,%rbp 1327 1328 movq 32(%rsp),%rdi 1329 subq 0(%rsi),%rax 1330 movq 16(%rbx),%r10 1331 movq 24(%rbx),%r11 1332 sbbq 8(%rsi),%rdx 1333 leaq -1(%r9),%rcx 1334 jmp .Lsqr4x_sub 1335.align 16 1336.Lsqr4x_sub: 1337 movq %rax,0(%rdi,%rbp,8) 1338 movq %rdx,8(%rdi,%rbp,8) 1339 sbbq 16(%rsi,%rbp,8),%r10 1340 movq 32(%rbx,%rbp,8),%rax 1341 movq 40(%rbx,%rbp,8),%rdx 1342 sbbq 24(%rsi,%rbp,8),%r11 1343 movq %r10,16(%rdi,%rbp,8) 1344 movq %r11,24(%rdi,%rbp,8) 1345 sbbq 32(%rsi,%rbp,8),%rax 1346 movq 48(%rbx,%rbp,8),%r10 1347 movq 56(%rbx,%rbp,8),%r11 1348 sbbq 40(%rsi,%rbp,8),%rdx 1349 leaq 4(%rbp),%rbp 1350 decq %rcx 1351 jnz .Lsqr4x_sub 1352 1353 movq %rax,0(%rdi,%rbp,8) 1354 movq 32(%rbx,%rbp,8),%rax 1355 sbbq 16(%rsi,%rbp,8),%r10 1356 movq %rdx,8(%rdi,%rbp,8) 1357 sbbq 24(%rsi,%rbp,8),%r11 1358 movq %r10,16(%rdi,%rbp,8) 1359 1360 sbbq $0,%rax 1361 movq %r11,24(%rdi,%rbp,8) 1362 xorq %rbp,%rbp 1363 andq %rax,%rbx 1364 notq %rax 1365 movq %rdi,%rsi 1366 andq %rax,%rsi 1367 leaq -1(%r9),%rcx 1368 orq %rsi,%rbx 1369 1370 pxor %xmm0,%xmm0 1371 leaq 64(%rsp,%r9,8),%rsi 1372 movdqu (%rbx),%xmm1 1373 leaq (%rsi,%r9,8),%rsi 1374 movdqa %xmm0,64(%rsp) 1375 movdqa %xmm0,(%rsi) 1376 movdqu %xmm1,(%rdi) 1377 jmp .Lsqr4x_copy 1378.align 16 1379.Lsqr4x_copy: 1380 movdqu 16(%rbx,%rbp,1),%xmm2 1381 movdqu 32(%rbx,%rbp,1),%xmm1 1382 movdqa %xmm0,80(%rsp,%rbp,1) 1383 movdqa %xmm0,96(%rsp,%rbp,1) 1384 movdqa %xmm0,16(%rsi,%rbp,1) 1385 movdqa %xmm0,32(%rsi,%rbp,1) 1386 movdqu %xmm2,16(%rdi,%rbp,1) 1387 movdqu %xmm1,32(%rdi,%rbp,1) 1388 leaq 32(%rbp),%rbp 1389 decq %rcx 1390 jnz .Lsqr4x_copy 1391 1392 movdqu 16(%rbx,%rbp,1),%xmm2 1393 movdqa %xmm0,80(%rsp,%rbp,1) 1394 movdqa %xmm0,16(%rsi,%rbp,1) 1395 movdqu %xmm2,16(%rdi,%rbp,1) 1396 movq 56(%rsp),%rsi 1397 movq $1,%rax 1398 movq 0(%rsi),%r15 1399 movq 8(%rsi),%r14 1400 movq 16(%rsi),%r13 1401 movq 24(%rsi),%r12 1402 movq 32(%rsi),%rbp 1403 movq 40(%rsi),%rbx 1404 leaq 48(%rsi),%rsp 1405.Lsqr4x_epilogue: 1406 .byte 0xf3,0xc3 1407.size bn_sqr4x_mont,.-bn_sqr4x_mont 1408.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 1409.align 16 1410