ecp_nistz256-x86_64.pl revision 306195
#!/usr/bin/env perl

##############################################################################
#                                                                            #
# Copyright 2014 Intel Corporation                                           #
#                                                                            #
# Licensed under the Apache License, Version 2.0 (the "License");            #
# you may not use this file except in compliance with the License.           #
# You may obtain a copy of the License at                                    #
#                                                                            #
#    http://www.apache.org/licenses/LICENSE-2.0                              #
#                                                                            #
# Unless required by applicable law or agreed to in writing, software        #
# distributed under the License is distributed on an "AS IS" BASIS,          #
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.   #
# See the License for the specific language governing permissions and        #
# limitations under the License.                                             #
#                                                                            #
##############################################################################
#                                                                            #
# Developers and authors:                                                    #
#  Shay Gueron (1, 2), and Vlad Krasnov (1)                                  #
#  (1) Intel Corporation, Israel Development Center                          #
#  (2) University of Haifa                                                   #
# Reference:                                                                 #
# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with #
#                          256 Bit Primes"                                   #
#                                                                            #
##############################################################################

# Further optimization by <appro@openssl.org>:
#
#		this/original	with/without -DECP_NISTZ256_ASM(*)
# Opteron	+12-49%		+110-150%
# Bulldozer	+14-45%		+175-210%
# P4		+18-46%		n/a :-(
# Westmere	+12-34%		+80-87%
# Sandy Bridge	+9-35%		+110-120%
# Ivy Bridge	+9-35%		+110-125%
# Haswell	+8-37%		+140-160%
# Broadwell	+18-58%		+145-210%
# Atom		+15-50%		+130-180%
# VIA Nano	+43-160%	+300-480%
#
# (*)	"without -DECP_NISTZ256_ASM" refers to build with
#	"enable-ec_nistp_64_gcc_128";
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, relatively fastest
# server-side operation. Keep in mind that +100% means 2x improvement.
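# A typical perlasm invocation (an illustration of the argument handling
# below, assuming the standard OpenSSL build flow; the flavour selects the
# output syntax/ABI and the second argument names the output file):
#
#	perl ecp_nistz256-x86_64.pl elf  ecp_nistz256-x86_64.s
#	perl ecp_nistz256-x86_64.pl nasm ecp_nistz256-x86_64.asm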
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$avx = ($ver>=3.0) + ($ver>=3.01);
	$addx = ($ver>=3.03);
}

$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

# The polynomial
.align 64
.Lpoly:
.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001

# 2^512 mod P precomputed for NIST P256 polynomial
.LRR:
.quad 0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd

.LOne:
.long 1,1,1,1,1,1,1,1
.LTwo:
.long 2,2,2,2,2,2,2,2
.LThree:
.long 3,3,3,3,3,3,3,3
.LONE_mont:
.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
___
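# The two Montgomery constants above can be re-derived with a few lines of
# Perl; a minimal sketch, kept commented out so the generator's behaviour is
# unchanged (assumes only core Math::BigInt):
#
#	use Math::BigInt;
#	my $p = Math::BigInt->new(2)->bpow(256)
#	      ->bsub(Math::BigInt->new(2)->bpow(224))
#	      ->badd(Math::BigInt->new(2)->bpow(192))
#	      ->badd(Math::BigInt->new(2)->bpow(96))
#	      ->bsub(1);				# NIST P-256 prime
#	print Math::BigInt->new(2)->bpow(512)->bmod($p)->as_hex, "\n"; # .LRR
#	print Math::BigInt->new(2)->bpow(256)->bmod($p)->as_hex, "\n"; # .LONE_mont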
{
################################################################################
# void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]);

my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");

$code.=<<___;

.globl	ecp_nistz256_mul_by_2
.type	ecp_nistz256_mul_by_2,\@function,2
.align	64
ecp_nistz256_mul_by_2:
	push	%r12
	push	%r13

	mov	8*0($a_ptr), $a0
	xor	$t4,$t4
	mov	8*1($a_ptr), $a1
	add	$a0, $a0		# a0:a3+a0:a3
	mov	8*2($a_ptr), $a2
	adc	$a1, $a1
	mov	8*3($a_ptr), $a3
	lea	.Lpoly(%rip), $a_ptr
	mov	$a0, $t0
	adc	$a2, $a2
	adc	$a3, $a3
	mov	$a1, $t1
	adc	\$0, $t4

	sub	8*0($a_ptr), $a0
	mov	$a2, $t2
	sbb	8*1($a_ptr), $a1
	sbb	8*2($a_ptr), $a2
	mov	$a3, $t3
	sbb	8*3($a_ptr), $a3
	sbb	\$0, $t4

	cmovc	$t0, $a0
	cmovc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	pop	%r13
	pop	%r12
	ret
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2

################################################################################
# void ecp_nistz256_div_by_2(uint64_t res[4], uint64_t a[4]);
.globl	ecp_nistz256_div_by_2
.type	ecp_nistz256_div_by_2,\@function,2
.align	32
ecp_nistz256_div_by_2:
	push	%r12
	push	%r13

	mov	8*0($a_ptr), $a0
	mov	8*1($a_ptr), $a1
	mov	8*2($a_ptr), $a2
	mov	$a0, $t0
	mov	8*3($a_ptr), $a3
	lea	.Lpoly(%rip), $a_ptr

	mov	$a1, $t1
	xor	$t4, $t4
	add	8*0($a_ptr), $a0
	mov	$a2, $t2
	adc	8*1($a_ptr), $a1
	adc	8*2($a_ptr), $a2
	mov	$a3, $t3
	adc	8*3($a_ptr), $a3
	adc	\$0, $t4
	xor	$a_ptr, $a_ptr		# borrow $a_ptr
	test	\$1, $t0

	cmovz	$t0, $a0
	cmovz	$t1, $a1
	cmovz	$t2, $a2
	cmovz	$t3, $a3
	cmovz	$a_ptr, $t4

	mov	$a1, $t0		# a0:a3>>1
	shr	\$1, $a0
	shl	\$63, $t0
	mov	$a2, $t1
	shr	\$1, $a1
	or	$t0, $a0
	shl	\$63, $t1
	mov	$a3, $t2
	shr	\$1, $a2
	or	$t1, $a1
	shl	\$63, $t2
	shr	\$1, $a3
	shl	\$63, $t4
	or	$t2, $a2
	or	$t4, $a3

	mov	$a0, 8*0($r_ptr)
	mov	$a1, 8*1($r_ptr)
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	pop	%r13
	pop	%r12
	ret
.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
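# A scalar model of the halving above (illustrative only): p is odd, so
# exactly one of a and a+p is even, and
#
#	res = (a & 1) ? (a + p) >> 1 : a >> 1
#
# where the 257th bit of a+p is kept in $t4 and shifted back into the top
# word. The cmovz chain undoes the addition when a was already even.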
################################################################################
# void ecp_nistz256_mul_by_3(uint64_t res[4], uint64_t a[4]);
.globl	ecp_nistz256_mul_by_3
.type	ecp_nistz256_mul_by_3,\@function,2
.align	32
ecp_nistz256_mul_by_3:
	push	%r12
	push	%r13

	mov	8*0($a_ptr), $a0
	xor	$t4, $t4
	mov	8*1($a_ptr), $a1
	add	$a0, $a0		# a0:a3+a0:a3
	mov	8*2($a_ptr), $a2
	adc	$a1, $a1
	mov	8*3($a_ptr), $a3
	mov	$a0, $t0
	adc	$a2, $a2
	adc	$a3, $a3
	mov	$a1, $t1
	adc	\$0, $t4

	sub	\$-1, $a0
	mov	$a2, $t2
	sbb	.Lpoly+8*1(%rip), $a1
	sbb	\$0, $a2
	mov	$a3, $t3
	sbb	.Lpoly+8*3(%rip), $a3
	sbb	\$0, $t4

	cmovc	$t0, $a0
	cmovc	$t1, $a1
	cmovc	$t2, $a2
	cmovc	$t3, $a3

	xor	$t4, $t4
	add	8*0($a_ptr), $a0	# a0:a3+=a_ptr[0:3]
	adc	8*1($a_ptr), $a1
	mov	$a0, $t0
	adc	8*2($a_ptr), $a2
	adc	8*3($a_ptr), $a3
	mov	$a1, $t1
	adc	\$0, $t4

	sub	\$-1, $a0
	mov	$a2, $t2
	sbb	.Lpoly+8*1(%rip), $a1
	sbb	\$0, $a2
	mov	$a3, $t3
	sbb	.Lpoly+8*3(%rip), $a3
	sbb	\$0, $t4

	cmovc	$t0, $a0
	cmovc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	pop	%r13
	pop	%r12
	ret
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

################################################################################
# void ecp_nistz256_add(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
.globl	ecp_nistz256_add
.type	ecp_nistz256_add,\@function,3
.align	32
ecp_nistz256_add:
	push	%r12
	push	%r13

	mov	8*0($a_ptr), $a0
	xor	$t4, $t4
	mov	8*1($a_ptr), $a1
	mov	8*2($a_ptr), $a2
	mov	8*3($a_ptr), $a3
	lea	.Lpoly(%rip), $a_ptr

	add	8*0($b_ptr), $a0
	adc	8*1($b_ptr), $a1
	mov	$a0, $t0
	adc	8*2($b_ptr), $a2
	adc	8*3($b_ptr), $a3
	mov	$a1, $t1
	adc	\$0, $t4

	sub	8*0($a_ptr), $a0
	mov	$a2, $t2
	sbb	8*1($a_ptr), $a1
	sbb	8*2($a_ptr), $a2
	mov	$a3, $t3
	sbb	8*3($a_ptr), $a3
	sbb	\$0, $t4

	cmovc	$t0, $a0
	cmovc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	pop	%r13
	pop	%r12
	ret
.size	ecp_nistz256_add,.-ecp_nistz256_add

################################################################################
# void ecp_nistz256_sub(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
.globl	ecp_nistz256_sub
.type	ecp_nistz256_sub,\@function,3
.align	32
ecp_nistz256_sub:
	push	%r12
	push	%r13

	mov	8*0($a_ptr), $a0
	xor	$t4, $t4
	mov	8*1($a_ptr), $a1
	mov	8*2($a_ptr), $a2
	mov	8*3($a_ptr), $a3
	lea	.Lpoly(%rip), $a_ptr

	sub	8*0($b_ptr), $a0
	sbb	8*1($b_ptr), $a1
	mov	$a0, $t0
	sbb	8*2($b_ptr), $a2
	sbb	8*3($b_ptr), $a3
	mov	$a1, $t1
	sbb	\$0, $t4

	add	8*0($a_ptr), $a0
	mov	$a2, $t2
	adc	8*1($a_ptr), $a1
	adc	8*2($a_ptr), $a2
	mov	$a3, $t3
	adc	8*3($a_ptr), $a3
	test	$t4, $t4

	cmovz	$t0, $a0
	cmovz	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovz	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovz	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	pop	%r13
	pop	%r12
	ret
.size	ecp_nistz256_sub,.-ecp_nistz256_sub

################################################################################
# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
.globl	ecp_nistz256_neg
.type	ecp_nistz256_neg,\@function,2
.align	32
ecp_nistz256_neg:
	push	%r12
	push	%r13

	xor	$a0, $a0
	xor	$a1, $a1
	xor	$a2, $a2
	xor	$a3, $a3
	xor	$t4, $t4

	sub	8*0($a_ptr), $a0
	sbb	8*1($a_ptr), $a1
	sbb	8*2($a_ptr), $a2
	mov	$a0, $t0
	sbb	8*3($a_ptr), $a3
	lea	.Lpoly(%rip), $a_ptr
	mov	$a1, $t1
	sbb	\$0, $t4

	add	8*0($a_ptr), $a0
	mov	$a2, $t2
	adc	8*1($a_ptr), $a1
	adc	8*2($a_ptr), $a2
	mov	$a3, $t3
	adc	8*3($a_ptr), $a3
	test	$t4, $t4

	cmovz	$t0, $a0
	cmovz	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovz	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovz	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	pop	%r13
	pop	%r12
	ret
.size	ecp_nistz256_neg,.-ecp_nistz256_neg
___
}
{
my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
my ($poly1,$poly3)=($acc6,$acc7);

$code.=<<___;
################################################################################
# void ecp_nistz256_to_mont(
#	uint64_t res[4],
#	uint64_t in[4]);
.globl	ecp_nistz256_to_mont
.type	ecp_nistz256_to_mont,\@function,2
.align	32
ecp_nistz256_to_mont:
___
$code.=<<___	if ($addx);
	mov	\$0x80100, %ecx
	and	OPENSSL_ia32cap_P+8(%rip), %ecx
___
$code.=<<___;
	lea	.LRR(%rip), $b_org
	jmp	.Lmul_mont
.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
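# Everything below works in the Montgomery domain with R = 2^256: a value
# a is represented as a*R mod p, and mul_mont(x, y) returns x*y/R mod p.
# The conversion above is therefore just mul_mont(a, RR), since
# a * (2^512 mod p) / 2^256 = a*R mod p.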
################################################################################
# void ecp_nistz256_mul_mont(
#	uint64_t res[4],
#	uint64_t a[4],
#	uint64_t b[4]);

.globl	ecp_nistz256_mul_mont
.type	ecp_nistz256_mul_mont,\@function,3
.align	32
ecp_nistz256_mul_mont:
___
$code.=<<___	if ($addx);
	mov	\$0x80100, %ecx
	and	OPENSSL_ia32cap_P+8(%rip), %ecx
___
$code.=<<___;
.Lmul_mont:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___	if ($addx);
	cmp	\$0x80100, %ecx
	je	.Lmul_montx
___
$code.=<<___;
	mov	$b_org, $b_ptr
	mov	8*0($b_org), %rax
	mov	8*0($a_ptr), $acc1
	mov	8*1($a_ptr), $acc2
	mov	8*2($a_ptr), $acc3
	mov	8*3($a_ptr), $acc4

	call	__ecp_nistz256_mul_montq
___
$code.=<<___	if ($addx);
	jmp	.Lmul_mont_done

.align	32
.Lmul_montx:
	mov	$b_org, $b_ptr
	mov	8*0($b_org), %rdx
	mov	8*0($a_ptr), $acc1
	mov	8*1($a_ptr), $acc2
	mov	8*2($a_ptr), $acc3
	mov	8*3($a_ptr), $acc4
	lea	-128($a_ptr), $a_ptr	# control u-op density

	call	__ecp_nistz256_mul_montx
___
$code.=<<___;
.Lmul_mont_done:
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

.type	__ecp_nistz256_mul_montq,\@abi-omnipotent
.align	32
__ecp_nistz256_mul_montq:
	########################################################################
	# Multiply a by b[0]
	mov	%rax, $t1
	mulq	$acc1
	mov	.Lpoly+8*1(%rip),$poly1
	mov	%rax, $acc0
	mov	$t1, %rax
	mov	%rdx, $acc1

	mulq	$acc2
	mov	.Lpoly+8*3(%rip),$poly3
	add	%rax, $acc1
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc2

	mulq	$acc3
	add	%rax, $acc2
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc3

	mulq	$acc4
	add	%rax, $acc3
	mov	$acc0, %rax
	adc	\$0, %rdx
	xor	$acc5, $acc5
	mov	%rdx, $acc4

	########################################################################
	# First reduction step
	# Basically now we want to multiply acc[0] by p256,
	# and add the result to the acc.
	# Due to the special form of p256 we do some optimizations
	#
	# acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
	# then we add acc[0] and get acc[0] x 2^96

	mov	$acc0, $t1
	shl	\$32, $acc0
	mulq	$poly3
	shr	\$32, $t1
	add	$acc0, $acc1		# +=acc[0]<<96
	adc	$t1, $acc2
	adc	%rax, $acc3
	mov	8*1($b_ptr), %rax
	adc	%rdx, $acc4
	adc	\$0, $acc5
	xor	$acc0, $acc0

	########################################################################
	# Multiply by b[1]
	mov	%rax, $t1
	mulq	8*0($a_ptr)
	add	%rax, $acc1
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*1($a_ptr)
	add	$t0, $acc2
	adc	\$0, %rdx
	add	%rax, $acc2
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*2($a_ptr)
	add	$t0, $acc3
	adc	\$0, %rdx
	add	%rax, $acc3
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*3($a_ptr)
	add	$t0, $acc4
	adc	\$0, %rdx
	add	%rax, $acc4
	mov	$acc1, %rax
	adc	%rdx, $acc5
	adc	\$0, $acc0

	########################################################################
	# Second reduction step
	mov	$acc1, $t1
	shl	\$32, $acc1
	mulq	$poly3
	shr	\$32, $t1
	add	$acc1, $acc2
	adc	$t1, $acc3
	adc	%rax, $acc4
	mov	8*2($b_ptr), %rax
	adc	%rdx, $acc5
	adc	\$0, $acc0
	xor	$acc1, $acc1

	########################################################################
	# Multiply by b[2]
	mov	%rax, $t1
	mulq	8*0($a_ptr)
	add	%rax, $acc2
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*1($a_ptr)
	add	$t0, $acc3
	adc	\$0, %rdx
	add	%rax, $acc3
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*2($a_ptr)
	add	$t0, $acc4
	adc	\$0, %rdx
	add	%rax, $acc4
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*3($a_ptr)
	add	$t0, $acc5
	adc	\$0, %rdx
	add	%rax, $acc5
	mov	$acc2, %rax
	adc	%rdx, $acc0
	adc	\$0, $acc1

	########################################################################
	# Third reduction step
	mov	$acc2, $t1
	shl	\$32, $acc2
	mulq	$poly3
	shr	\$32, $t1
	add	$acc2, $acc3
	adc	$t1, $acc4
	adc	%rax, $acc5
	mov	8*3($b_ptr), %rax
	adc	%rdx, $acc0
	adc	\$0, $acc1
	xor	$acc2, $acc2

	########################################################################
	# Multiply by b[3]
	mov	%rax, $t1
	mulq	8*0($a_ptr)
	add	%rax, $acc3
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*1($a_ptr)
	add	$t0, $acc4
	adc	\$0, %rdx
	add	%rax, $acc4
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*2($a_ptr)
	add	$t0, $acc5
	adc	\$0, %rdx
	add	%rax, $acc5
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*3($a_ptr)
	add	$t0, $acc0
	adc	\$0, %rdx
	add	%rax, $acc0
	mov	$acc3, %rax
	adc	%rdx, $acc1
	adc	\$0, $acc2

	########################################################################
	# Final reduction step
	mov	$acc3, $t1
	shl	\$32, $acc3
	mulq	$poly3
	shr	\$32, $t1
	add	$acc3, $acc4
	adc	$t1, $acc5
	mov	$acc4, $t0
	adc	%rax, $acc0
	adc	%rdx, $acc1
	mov	$acc5, $t1
	adc	\$0, $acc2

	########################################################################
	# Branch-less conditional subtraction of P
	sub	\$-1, $acc4		# .Lpoly[0]
	mov	$acc0, $t2
	sbb	$poly1, $acc5		# .Lpoly[1]
	sbb	\$0, $acc0		# .Lpoly[2]
	mov	$acc1, $t3
	sbb	$poly3, $acc1		# .Lpoly[3]
	sbb	\$0, $acc2

	cmovc	$t0, $acc4
	cmovc	$t1, $acc5
	mov	$acc4, 8*0($r_ptr)
	cmovc	$t2, $acc0
	mov	$acc5, 8*1($r_ptr)
	cmovc	$t3, $acc1
	mov	$acc0, 8*2($r_ptr)
	mov	$acc1, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
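# A compact way to see the reduction trick used in each of the four steps
# above: the two low words of p256 form 2^96 - 1, so
#
#	acc[0] * (p[0] + p[1]*2^64) = acc[0]*2^96 - acc[0]
#
# and adding back the acc[0] being cancelled leaves exactly acc[0]*2^96,
# i.e. the shl/shr pair feeding limbs 1-2. With p[2] = 0, the only real
# multiplication left is acc[0]*p[3] (one mulq) into limbs 3-4. The trick
# relies on p256 = -1 mod 2^64, which makes the Montgomery multiplier for
# each step equal to the accumulator's low word itself.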
################################################################################
# void ecp_nistz256_sqr_mont(
#	uint64_t res[4],
#	uint64_t a[4]);

# we optimize the square according to S.Gueron and V.Krasnov,
# "Speeding up Big-Number Squaring"
.globl	ecp_nistz256_sqr_mont
.type	ecp_nistz256_sqr_mont,\@function,2
.align	32
ecp_nistz256_sqr_mont:
___
$code.=<<___	if ($addx);
	mov	\$0x80100, %ecx
	and	OPENSSL_ia32cap_P+8(%rip), %ecx
___
$code.=<<___;
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___	if ($addx);
	cmp	\$0x80100, %ecx
	je	.Lsqr_montx
___
$code.=<<___;
	mov	8*0($a_ptr), %rax
	mov	8*1($a_ptr), $acc6
	mov	8*2($a_ptr), $acc7
	mov	8*3($a_ptr), $acc0

	call	__ecp_nistz256_sqr_montq
___
$code.=<<___	if ($addx);
	jmp	.Lsqr_mont_done

.align	32
.Lsqr_montx:
	mov	8*0($a_ptr), %rdx
	mov	8*1($a_ptr), $acc6
	mov	8*2($a_ptr), $acc7
	mov	8*3($a_ptr), $acc0
	lea	-128($a_ptr), $a_ptr	# control u-op density

	call	__ecp_nistz256_sqr_montx
___
$code.=<<___;
.Lsqr_mont_done:
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

.type	__ecp_nistz256_sqr_montq,\@abi-omnipotent
.align	32
__ecp_nistz256_sqr_montq:
	mov	%rax, $acc5
	mulq	$acc6			# a[1]*a[0]
	mov	%rax, $acc1
	mov	$acc7, %rax
	mov	%rdx, $acc2

	mulq	$acc5			# a[0]*a[2]
	add	%rax, $acc2
	mov	$acc0, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc3

	mulq	$acc5			# a[0]*a[3]
	add	%rax, $acc3
	mov	$acc7, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc4

	#################################
	mulq	$acc6			# a[1]*a[2]
	add	%rax, $acc3
	mov	$acc0, %rax
	adc	\$0, %rdx
	mov	%rdx, $t1

	mulq	$acc6			# a[1]*a[3]
	add	%rax, $acc4
	mov	$acc0, %rax
	adc	\$0, %rdx
	add	$t1, $acc4
	mov	%rdx, $acc5
	adc	\$0, $acc5

	#################################
	mulq	$acc7			# a[2]*a[3]
	xor	$acc7, $acc7
	add	%rax, $acc5
	mov	8*0($a_ptr), %rax
	mov	%rdx, $acc6
	adc	\$0, $acc6

	add	$acc1, $acc1		# acc1:6<<1
	adc	$acc2, $acc2
	adc	$acc3, $acc3
	adc	$acc4, $acc4
	adc	$acc5, $acc5
	adc	$acc6, $acc6
	adc	\$0, $acc7

	mulq	%rax
	mov	%rax, $acc0
	mov	8*1($a_ptr), %rax
	mov	%rdx, $t0

	mulq	%rax
	add	$t0, $acc1
	adc	%rax, $acc2
	mov	8*2($a_ptr), %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	%rax
	add	$t0, $acc3
	adc	%rax, $acc4
	mov	8*3($a_ptr), %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	%rax
	add	$t0, $acc5
	adc	%rax, $acc6
	mov	$acc0, %rax
	adc	%rdx, $acc7

	mov	.Lpoly+8*1(%rip), $a_ptr
	mov	.Lpoly+8*3(%rip), $t1

	##########################################
	# Now the reduction
	# First iteration
	mov	$acc0, $t0
	shl	\$32, $acc0
	mulq	$t1
	shr	\$32, $t0
	add	$acc0, $acc1		# +=acc[0]<<96
	adc	$t0, $acc2
	adc	%rax, $acc3
	mov	$acc1, %rax
	adc	\$0, %rdx

	##########################################
	# Second iteration
	mov	$acc1, $t0
	shl	\$32, $acc1
	mov	%rdx, $acc0
	mulq	$t1
	shr	\$32, $t0
	add	$acc1, $acc2
	adc	$t0, $acc3
	adc	%rax, $acc0
	mov	$acc2, %rax
	adc	\$0, %rdx

	##########################################
	# Third iteration
	mov	$acc2, $t0
	shl	\$32, $acc2
	mov	%rdx, $acc1
	mulq	$t1
	shr	\$32, $t0
	add	$acc2, $acc3
	adc	$t0, $acc0
	adc	%rax, $acc1
	mov	$acc3, %rax
	adc	\$0, %rdx

	###########################################
	# Last iteration
	mov	$acc3, $t0
	shl	\$32, $acc3
	mov	%rdx, $acc2
	mulq	$t1
	shr	\$32, $t0
	add	$acc3, $acc0
	adc	$t0, $acc1
	adc	%rax, $acc2
	adc	\$0, %rdx
	xor	$acc3, $acc3

	############################################
	# Add the rest of the acc
	add	$acc0, $acc4
	adc	$acc1, $acc5
	mov	$acc4, $acc0
	adc	$acc2, $acc6
	adc	%rdx, $acc7
	mov	$acc5, $acc1
	adc	\$0, $acc3

	sub	\$-1, $acc4		# .Lpoly[0]
	mov	$acc6, $acc2
	sbb	$a_ptr, $acc5		# .Lpoly[1]
	sbb	\$0, $acc6		# .Lpoly[2]
	mov	$acc7, $t0
	sbb	$t1, $acc7		# .Lpoly[3]
	sbb	\$0, $acc3

	cmovc	$acc0, $acc4
	cmovc	$acc1, $acc5
	mov	$acc4, 8*0($r_ptr)
	cmovc	$acc2, $acc6
	mov	$acc5, 8*1($r_ptr)
	cmovc	$t0, $acc7
	mov	$acc6, 8*2($r_ptr)
	mov	$acc7, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
___
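# The cross-product structure exploited above, with B = 2^64:
#
#	(a0 + a1*B + a2*B^2 + a3*B^3)^2
#	  = a0^2 + a1^2*B^2 + a2^2*B^4 + a3^2*B^6
#	  + 2*(a0*a1*B + a0*a2*B^2 + (a0*a3 + a1*a2)*B^3 + a1*a3*B^4 + a2*a3*B^5)
#
# Only six cross products are computed and doubled with a single add/adc
# chain, then the four squares are folded in; the reduction reuses the same
# four shift-and-mulq iterations as __ecp_nistz256_mul_montq.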
if ($addx) {
$code.=<<___;
.type	__ecp_nistz256_mul_montx,\@abi-omnipotent
.align	32
__ecp_nistz256_mul_montx:
	########################################################################
	# Multiply by b[0]
	mulx	$acc1, $acc0, $acc1
	mulx	$acc2, $t0, $acc2
	mov	\$32, $poly1
	xor	$acc5, $acc5		# cf=0
	mulx	$acc3, $t1, $acc3
	mov	.Lpoly+8*3(%rip), $poly3
	adc	$t0, $acc1
	mulx	$acc4, $t0, $acc4
	mov	$acc0, %rdx
	adc	$t1, $acc2
	shlx	$poly1,$acc0,$t1
	adc	$t0, $acc3
	shrx	$poly1,$acc0,$t0
	adc	\$0, $acc4

	########################################################################
	# First reduction step
	add	$t1, $acc1
	adc	$t0, $acc2

	mulx	$poly3, $t0, $t1
	mov	8*1($b_ptr), %rdx
	adc	$t0, $acc3
	adc	$t1, $acc4
	adc	\$0, $acc5
	xor	$acc0, $acc0		# $acc0=0,cf=0,of=0

	########################################################################
	# Multiply by b[1]
	mulx	8*0+128($a_ptr), $t0, $t1
	adcx	$t0, $acc1
	adox	$t1, $acc2

	mulx	8*1+128($a_ptr), $t0, $t1
	adcx	$t0, $acc2
	adox	$t1, $acc3

	mulx	8*2+128($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*3+128($a_ptr), $t0, $t1
	mov	$acc1, %rdx
	adcx	$t0, $acc4
	shlx	$poly1, $acc1, $t0
	adox	$t1, $acc5
	shrx	$poly1, $acc1, $t1

	adcx	$acc0, $acc5
	adox	$acc0, $acc0
	adc	\$0, $acc0

	########################################################################
	# Second reduction step
	add	$t0, $acc2
	adc	$t1, $acc3

	mulx	$poly3, $t0, $t1
	mov	8*2($b_ptr), %rdx
	adc	$t0, $acc4
	adc	$t1, $acc5
	adc	\$0, $acc0
	xor	$acc1, $acc1		# $acc1=0,cf=0,of=0

	########################################################################
	# Multiply by b[2]
	mulx	8*0+128($a_ptr), $t0, $t1
	adcx	$t0, $acc2
	adox	$t1, $acc3

	mulx	8*1+128($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*2+128($a_ptr), $t0, $t1
	adcx	$t0, $acc4
	adox	$t1, $acc5

	mulx	8*3+128($a_ptr), $t0, $t1
	mov	$acc2, %rdx
	adcx	$t0, $acc5
	shlx	$poly1, $acc2, $t0
	adox	$t1, $acc0
	shrx	$poly1, $acc2, $t1

	adcx	$acc1, $acc0
	adox	$acc1, $acc1
	adc	\$0, $acc1

	########################################################################
	# Third reduction step
	add	$t0, $acc3
	adc	$t1, $acc4

	mulx	$poly3, $t0, $t1
	mov	8*3($b_ptr), %rdx
	adc	$t0, $acc5
	adc	$t1, $acc0
	adc	\$0, $acc1
	xor	$acc2, $acc2		# $acc2=0,cf=0,of=0

	########################################################################
	# Multiply by b[3]
	mulx	8*0+128($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*1+128($a_ptr), $t0, $t1
	adcx	$t0, $acc4
	adox	$t1, $acc5

	mulx	8*2+128($a_ptr), $t0, $t1
	adcx	$t0, $acc5
	adox	$t1, $acc0

	mulx	8*3+128($a_ptr), $t0, $t1
	mov	$acc3, %rdx
	adcx	$t0, $acc0
	shlx	$poly1, $acc3, $t0
	adox	$t1, $acc1
	shrx	$poly1, $acc3, $t1

	adcx	$acc2, $acc1
	adox	$acc2, $acc2
	adc	\$0, $acc2

	########################################################################
	# Fourth reduction step
	add	$t0, $acc4
	adc	$t1, $acc5

	mulx	$poly3, $t0, $t1
	mov	$acc4, $t2
	mov	.Lpoly+8*1(%rip), $poly1
	adc	$t0, $acc0
	mov	$acc5, $t3
	adc	$t1, $acc1
	adc	\$0, $acc2

	########################################################################
	# Branch-less conditional subtraction of P
	xor	%eax, %eax
	mov	$acc0, $t0
	sbb	\$-1, $acc4		# .Lpoly[0]
	sbb	$poly1, $acc5		# .Lpoly[1]
	sbb	\$0, $acc0		# .Lpoly[2]
	mov	$acc1, $t1
	sbb	$poly3, $acc1		# .Lpoly[3]
	sbb	\$0, $acc2

	cmovc	$t2, $acc4
	cmovc	$t3, $acc5
	mov	$acc4, 8*0($r_ptr)
	cmovc	$t0, $acc0
	mov	$acc5, 8*1($r_ptr)
	cmovc	$t1, $acc1
	mov	$acc0, 8*2($r_ptr)
	mov	$acc1, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
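# Note on the mulx/adcx/adox idiom: adcx only reads and writes CF, while
# adox only reads and writes OF, so the two column sums can be interleaved
# as independent carry chains without serializing on a single flags bit.
# The xor that opens each b[i] block clears both flags at once, which is
# what the "$accN=0,cf=0,of=0" comments refer to.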
.type	__ecp_nistz256_sqr_montx,\@abi-omnipotent
.align	32
__ecp_nistz256_sqr_montx:
	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
	xor	%eax, %eax
	adc	$t0, $acc2
	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
	mov	$acc6, %rdx
	adc	$t1, $acc3
	adc	\$0, $acc4
	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0

	#################################
	mulx	$acc7, $t0, $t1		# a[1]*a[2]
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	$acc0, $t0, $t1		# a[1]*a[3]
	mov	$acc7, %rdx
	adcx	$t0, $acc4
	adox	$t1, $acc5
	adc	\$0, $acc5

	#################################
	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
	mov	8*0+128($a_ptr), %rdx
	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
	adcx	$acc1, $acc1		# acc1:6<<1
	adox	$t0, $acc5
	adcx	$acc2, $acc2
	adox	$acc7, $acc6		# of=0

	mulx	%rdx, $acc0, $t1
	mov	8*1+128($a_ptr), %rdx
	adcx	$acc3, $acc3
	adox	$t1, $acc1
	adcx	$acc4, $acc4
	mulx	%rdx, $t0, $t4
	mov	8*2+128($a_ptr), %rdx
	adcx	$acc5, $acc5
	adox	$t0, $acc2
	adcx	$acc6, $acc6
	.byte	0x67
	mulx	%rdx, $t0, $t1
	mov	8*3+128($a_ptr), %rdx
	adox	$t4, $acc3
	adcx	$acc7, $acc7
	adox	$t0, $acc4
	mov	\$32, $a_ptr
	adox	$t1, $acc5
	.byte	0x67,0x67
	mulx	%rdx, $t0, $t4
	mov	$acc0, %rdx
	adox	$t0, $acc6
	shlx	$a_ptr, $acc0, $t0
	adox	$t4, $acc7
	shrx	$a_ptr, $acc0, $t4
	mov	.Lpoly+8*3(%rip), $t1

	# reduction step 1
	add	$t0, $acc1
	adc	$t4, $acc2

	mulx	$t1, $t0, $acc0
	mov	$acc1, %rdx
	adc	$t0, $acc3
	shlx	$a_ptr, $acc1, $t0
	adc	\$0, $acc0
	shrx	$a_ptr, $acc1, $t4

	# reduction step 2
	add	$t0, $acc2
	adc	$t4, $acc3

	mulx	$t1, $t0, $acc1
	mov	$acc2, %rdx
	adc	$t0, $acc0
	shlx	$a_ptr, $acc2, $t0
	adc	\$0, $acc1
	shrx	$a_ptr, $acc2, $t4

	# reduction step 3
	add	$t0, $acc3
	adc	$t4, $acc0

	mulx	$t1, $t0, $acc2
	mov	$acc3, %rdx
	adc	$t0, $acc1
	shlx	$a_ptr, $acc3, $t0
	adc	\$0, $acc2
	shrx	$a_ptr, $acc3, $t4

	# reduction step 4
	add	$t0, $acc0
	adc	$t4, $acc1

	mulx	$t1, $t0, $acc3
	adc	$t0, $acc2
	adc	\$0, $acc3

	xor	$t3, $t3		# cf=0
	adc	$acc0, $acc4		# accumulate upper half
	mov	.Lpoly+8*1(%rip), $a_ptr
	adc	$acc1, $acc5
	mov	$acc4, $acc0
	adc	$acc2, $acc6
	adc	$acc3, $acc7
	mov	$acc5, $acc1
	adc	\$0, $t3

	xor	%eax, %eax		# cf=0
	sbb	\$-1, $acc4		# .Lpoly[0]
	mov	$acc6, $acc2
	sbb	$a_ptr, $acc5		# .Lpoly[1]
	sbb	\$0, $acc6		# .Lpoly[2]
	mov	$acc7, $acc3
	sbb	$t1, $acc7		# .Lpoly[3]
	sbb	\$0, $t3

	cmovc	$acc0, $acc4
	cmovc	$acc1, $acc5
	mov	$acc4, 8*0($r_ptr)
	cmovc	$acc2, $acc6
	mov	$acc5, 8*1($r_ptr)
	cmovc	$acc3, $acc7
	mov	$acc6, 8*2($r_ptr)
	mov	$acc7, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
___
}
}
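# The routine below leaves the Montgomery domain. Multiplying by 1 makes
# all four word products trivial, so only the four reduction iterations
# remain; for any a < p, from_mont(to_mont(a)) == a.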
{
my ($r_ptr,$in_ptr)=("%rdi","%rsi");
my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11));
my ($t0,$t1,$t2)=("%rcx","%r12","%r13");

$code.=<<___;
################################################################################
# void ecp_nistz256_from_mont(
#	uint64_t res[4],
#	uint64_t in[4]);
# This one performs Montgomery multiplication by 1, so we only need the reduction

.globl	ecp_nistz256_from_mont
.type	ecp_nistz256_from_mont,\@function,2
.align	32
ecp_nistz256_from_mont:
	push	%r12
	push	%r13

	mov	8*0($in_ptr), %rax
	mov	.Lpoly+8*3(%rip), $t2
	mov	8*1($in_ptr), $acc1
	mov	8*2($in_ptr), $acc2
	mov	8*3($in_ptr), $acc3
	mov	%rax, $acc0
	mov	.Lpoly+8*1(%rip), $t1

	#########################################
	# First iteration
	mov	%rax, $t0
	shl	\$32, $acc0
	mulq	$t2
	shr	\$32, $t0
	add	$acc0, $acc1
	adc	$t0, $acc2
	adc	%rax, $acc3
	mov	$acc1, %rax
	adc	\$0, %rdx

	#########################################
	# Second iteration
	mov	$acc1, $t0
	shl	\$32, $acc1
	mov	%rdx, $acc0
	mulq	$t2
	shr	\$32, $t0
	add	$acc1, $acc2
	adc	$t0, $acc3
	adc	%rax, $acc0
	mov	$acc2, %rax
	adc	\$0, %rdx

	##########################################
	# Third iteration
	mov	$acc2, $t0
	shl	\$32, $acc2
	mov	%rdx, $acc1
	mulq	$t2
	shr	\$32, $t0
	add	$acc2, $acc3
	adc	$t0, $acc0
	adc	%rax, $acc1
	mov	$acc3, %rax
	adc	\$0, %rdx

	###########################################
	# Last iteration
	mov	$acc3, $t0
	shl	\$32, $acc3
	mov	%rdx, $acc2
	mulq	$t2
	shr	\$32, $t0
	add	$acc3, $acc0
	adc	$t0, $acc1
	mov	$acc0, $t0
	adc	%rax, $acc2
	mov	$acc1, $in_ptr
	adc	\$0, %rdx

	###########################################
	# Branch-less conditional subtraction
	sub	\$-1, $acc0
	mov	$acc2, %rax
	sbb	$t1, $acc1
	sbb	\$0, $acc2
	mov	%rdx, $acc3
	sbb	$t2, %rdx
	sbb	$t2, $t2

	cmovnz	$t0, $acc0
	cmovnz	$in_ptr, $acc1
	mov	$acc0, 8*0($r_ptr)
	cmovnz	%rax, $acc2
	mov	$acc1, 8*1($r_ptr)
	cmovz	%rdx, $acc3
	mov	$acc2, 8*2($r_ptr)
	mov	$acc3, 8*3($r_ptr)

	pop	%r13
	pop	%r12
	ret
.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
___
}
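# The select_w5/w7 routines below are constant-time table lookups: every
# entry is read and masked with a pcmpeqd-generated all-ones/all-zeros
# pattern, so the memory access sequence is independent of the secret
# index. A scalar model of select_w5 (illustrative only; 16 entries of
# twelve 64-bit words each):
#
#	for my $i (1..16) {
#		my $mask = ($i == $index) ? ~0 : 0;
#		$val[$_] |= $table[$i][$_] & $mask for (0..11);
#	}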
{
my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));

$code.=<<___;
################################################################################
# void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
.globl	ecp_nistz256_select_w5
.type	ecp_nistz256_select_w5,\@abi-omnipotent
.align	32
ecp_nistz256_select_w5:
___
$code.=<<___	if ($avx>1);
	mov	OPENSSL_ia32cap_P+8(%rip), %eax
	test	\$`1<<5`, %eax
	jnz	.Lavx2_select_w5
___
$code.=<<___	if ($win64);
	lea	-0x88(%rsp), %rax
.LSEH_begin_ecp_nistz256_select_w5:
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
___
$code.=<<___;
	movdqa	.LOne(%rip), $ONE
	movd	$index, $INDEX

	pxor	$Ra, $Ra
	pxor	$Rb, $Rb
	pxor	$Rc, $Rc
	pxor	$Rd, $Rd
	pxor	$Re, $Re
	pxor	$Rf, $Rf

	movdqa	$ONE, $M0
	pshufd	\$0, $INDEX, $INDEX

	mov	\$16, %rax
.Lselect_loop_sse_w5:

	movdqa	$M0, $TMP0
	paddd	$ONE, $M0
	pcmpeqd	$INDEX, $TMP0

	movdqa	16*0($in_t), $T0a
	movdqa	16*1($in_t), $T0b
	movdqa	16*2($in_t), $T0c
	movdqa	16*3($in_t), $T0d
	movdqa	16*4($in_t), $T0e
	movdqa	16*5($in_t), $T0f
	lea	16*6($in_t), $in_t

	pand	$TMP0, $T0a
	pand	$TMP0, $T0b
	por	$T0a, $Ra
	pand	$TMP0, $T0c
	por	$T0b, $Rb
	pand	$TMP0, $T0d
	por	$T0c, $Rc
	pand	$TMP0, $T0e
	por	$T0d, $Rd
	pand	$TMP0, $T0f
	por	$T0e, $Re
	por	$T0f, $Rf

	dec	%rax
	jnz	.Lselect_loop_sse_w5

	movdqu	$Ra, 16*0($val)
	movdqu	$Rb, 16*1($val)
	movdqu	$Rc, 16*2($val)
	movdqu	$Rd, 16*3($val)
	movdqu	$Re, 16*4($val)
	movdqu	$Rf, 16*5($val)
___
$code.=<<___	if ($win64);
	movaps	(%rsp), %xmm6
	movaps	0x10(%rsp), %xmm7
	movaps	0x20(%rsp), %xmm8
	movaps	0x30(%rsp), %xmm9
	movaps	0x40(%rsp), %xmm10
	movaps	0x50(%rsp), %xmm11
	movaps	0x60(%rsp), %xmm12
	movaps	0x70(%rsp), %xmm13
	movaps	0x80(%rsp), %xmm14
	movaps	0x90(%rsp), %xmm15
	lea	0xa8(%rsp), %rsp
.LSEH_end_ecp_nistz256_select_w5:
___
$code.=<<___;
	ret
.size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
################################################################################
# void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
.globl	ecp_nistz256_select_w7
.type	ecp_nistz256_select_w7,\@abi-omnipotent
.align	32
ecp_nistz256_select_w7:
___
$code.=<<___	if ($avx>1);
	mov	OPENSSL_ia32cap_P+8(%rip), %eax
	test	\$`1<<5`, %eax
	jnz	.Lavx2_select_w7
___
$code.=<<___	if ($win64);
	lea	-0x88(%rsp), %rax
.LSEH_begin_ecp_nistz256_select_w7:
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
___
$code.=<<___;
	movdqa	.LOne(%rip), $M0
	movd	$index, $INDEX

	pxor	$Ra, $Ra
	pxor	$Rb, $Rb
	pxor	$Rc, $Rc
	pxor	$Rd, $Rd

	movdqa	$M0, $ONE
	pshufd	\$0, $INDEX, $INDEX
	mov	\$64, %rax

.Lselect_loop_sse_w7:
	movdqa	$M0, $TMP0
	paddd	$ONE, $M0
	movdqa	16*0($in_t), $T0a
	movdqa	16*1($in_t), $T0b
	pcmpeqd	$INDEX, $TMP0
	movdqa	16*2($in_t), $T0c
	movdqa	16*3($in_t), $T0d
	lea	16*4($in_t), $in_t

	pand	$TMP0, $T0a
	pand	$TMP0, $T0b
	por	$T0a, $Ra
	pand	$TMP0, $T0c
	por	$T0b, $Rb
	pand	$TMP0, $T0d
	por	$T0c, $Rc
	prefetcht0	255($in_t)
	por	$T0d, $Rd

	dec	%rax
	jnz	.Lselect_loop_sse_w7

	movdqu	$Ra, 16*0($val)
	movdqu	$Rb, 16*1($val)
	movdqu	$Rc, 16*2($val)
	movdqu	$Rd, 16*3($val)
___
$code.=<<___	if ($win64);
	movaps	(%rsp), %xmm6
	movaps	0x10(%rsp), %xmm7
	movaps	0x20(%rsp), %xmm8
	movaps	0x30(%rsp), %xmm9
	movaps	0x40(%rsp), %xmm10
	movaps	0x50(%rsp), %xmm11
	movaps	0x60(%rsp), %xmm12
	movaps	0x70(%rsp), %xmm13
	movaps	0x80(%rsp), %xmm14
	movaps	0x90(%rsp), %xmm15
	lea	0xa8(%rsp), %rsp
.LSEH_end_ecp_nistz256_select_w7:
___
$code.=<<___;
	ret
.size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
___
}
if ($avx>1) {
my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4));
my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9));
my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14));

$code.=<<___;
################################################################################
# void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index);
.type	ecp_nistz256_avx2_select_w5,\@abi-omnipotent
.align	32
ecp_nistz256_avx2_select_w5:
.Lavx2_select_w5:
	vzeroupper
___
$code.=<<___	if ($win64);
	lea	-0x88(%rsp), %rax
.LSEH_begin_ecp_nistz256_avx2_select_w5:
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
	.byte	0xc5,0xf8,0x29,0x70,0xe0	#vmovaps %xmm6, -0x20(%rax)
	.byte	0xc5,0xf8,0x29,0x78,0xf0	#vmovaps %xmm7, -0x10(%rax)
	.byte	0xc5,0x78,0x29,0x40,0x00	#vmovaps %xmm8, 8(%rax)
	.byte	0xc5,0x78,0x29,0x48,0x10	#vmovaps %xmm9, 0x10(%rax)
	.byte	0xc5,0x78,0x29,0x50,0x20	#vmovaps %xmm10, 0x20(%rax)
	.byte	0xc5,0x78,0x29,0x58,0x30	#vmovaps %xmm11, 0x30(%rax)
	.byte	0xc5,0x78,0x29,0x60,0x40	#vmovaps %xmm12, 0x40(%rax)
	.byte	0xc5,0x78,0x29,0x68,0x50	#vmovaps %xmm13, 0x50(%rax)
	.byte	0xc5,0x78,0x29,0x70,0x60	#vmovaps %xmm14, 0x60(%rax)
	.byte	0xc5,0x78,0x29,0x78,0x70	#vmovaps %xmm15, 0x70(%rax)
___
$code.=<<___;
	vmovdqa	.LTwo(%rip), $TWO

	vpxor	$Ra, $Ra, $Ra
	vpxor	$Rb, $Rb, $Rb
	vpxor	$Rc, $Rc, $Rc

	vmovdqa	.LOne(%rip), $M0
	vmovdqa	.LTwo(%rip), $M1

	vmovd	$index, %xmm1
	vpermd	$INDEX, $Ra, $INDEX

	mov	\$8, %rax
.Lselect_loop_avx2_w5:

	vmovdqa	32*0($in_t), $T0a
	vmovdqa	32*1($in_t), $T0b
	vmovdqa	32*2($in_t), $T0c

	vmovdqa	32*3($in_t), $T1a
	vmovdqa	32*4($in_t), $T1b
	vmovdqa	32*5($in_t), $T1c

	vpcmpeqd	$INDEX, $M0, $TMP0
	vpcmpeqd	$INDEX, $M1, $TMP1

	vpaddd	$TWO, $M0, $M0
	vpaddd	$TWO, $M1, $M1
	lea	32*6($in_t), $in_t

	vpand	$TMP0, $T0a, $T0a
	vpand	$TMP0, $T0b, $T0b
	vpand	$TMP0, $T0c, $T0c
	vpand	$TMP1, $T1a, $T1a
	vpand	$TMP1, $T1b, $T1b
	vpand	$TMP1, $T1c, $T1c

	vpxor	$T0a, $Ra, $Ra
	vpxor	$T0b, $Rb, $Rb
	vpxor	$T0c, $Rc, $Rc
	vpxor	$T1a, $Ra, $Ra
	vpxor	$T1b, $Rb, $Rb
	vpxor	$T1c, $Rc, $Rc

	dec	%rax
	jnz	.Lselect_loop_avx2_w5

	vmovdqu	$Ra, 32*0($val)
	vmovdqu	$Rb, 32*1($val)
	vmovdqu	$Rc, 32*2($val)
	vzeroupper
___
$code.=<<___	if ($win64);
	movaps	(%rsp), %xmm6
	movaps	0x10(%rsp), %xmm7
	movaps	0x20(%rsp), %xmm8
	movaps	0x30(%rsp), %xmm9
	movaps	0x40(%rsp), %xmm10
	movaps	0x50(%rsp), %xmm11
	movaps	0x60(%rsp), %xmm12
	movaps	0x70(%rsp), %xmm13
	movaps	0x80(%rsp), %xmm14
	movaps	0x90(%rsp), %xmm15
	lea	0xa8(%rsp), %rsp
.LSEH_end_ecp_nistz256_avx2_select_w5:
___
$code.=<<___;
	ret
.size	ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
___
}
if ($avx>1) {
my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3));
my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7));
my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11));
my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15));

$code.=<<___;

################################################################################
# void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index);
.globl	ecp_nistz256_avx2_select_w7
.type	ecp_nistz256_avx2_select_w7,\@abi-omnipotent
.align	32
ecp_nistz256_avx2_select_w7:
.Lavx2_select_w7:
	vzeroupper
___
$code.=<<___	if ($win64);
	lea	-0x88(%rsp), %rax
.LSEH_begin_ecp_nistz256_avx2_select_w7:
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
	.byte	0xc5,0xf8,0x29,0x70,0xe0	#vmovaps %xmm6, -0x20(%rax)
	.byte	0xc5,0xf8,0x29,0x78,0xf0	#vmovaps %xmm7, -0x10(%rax)
	.byte	0xc5,0x78,0x29,0x40,0x00	#vmovaps %xmm8, 8(%rax)
	.byte	0xc5,0x78,0x29,0x48,0x10	#vmovaps %xmm9, 0x10(%rax)
	.byte	0xc5,0x78,0x29,0x50,0x20	#vmovaps %xmm10, 0x20(%rax)
	.byte	0xc5,0x78,0x29,0x58,0x30	#vmovaps %xmm11, 0x30(%rax)
	.byte	0xc5,0x78,0x29,0x60,0x40	#vmovaps %xmm12, 0x40(%rax)
	.byte	0xc5,0x78,0x29,0x68,0x50	#vmovaps %xmm13, 0x50(%rax)
	.byte	0xc5,0x78,0x29,0x70,0x60	#vmovaps %xmm14, 0x60(%rax)
	.byte	0xc5,0x78,0x29,0x78,0x70	#vmovaps %xmm15, 0x70(%rax)
___
$code.=<<___;
	vmovdqa	.LThree(%rip), $THREE

	vpxor	$Ra, $Ra, $Ra
	vpxor	$Rb, $Rb, $Rb

	vmovdqa	.LOne(%rip), $M0
	vmovdqa	.LTwo(%rip), $M1
	vmovdqa	.LThree(%rip), $M2

	vmovd	$index, %xmm1
	vpermd	$INDEX, $Ra, $INDEX
	# Skip index = 0, because it is implicitly the point at infinity

	mov	\$21, %rax
.Lselect_loop_avx2_w7:

	vmovdqa	32*0($in_t), $T0a
	vmovdqa	32*1($in_t), $T0b

	vmovdqa	32*2($in_t), $T1a
	vmovdqa	32*3($in_t), $T1b

	vmovdqa	32*4($in_t), $T2a
	vmovdqa	32*5($in_t), $T2b

	vpcmpeqd	$INDEX, $M0, $TMP0
	vpcmpeqd	$INDEX, $M1, $TMP1
	vpcmpeqd	$INDEX, $M2, $TMP2

	vpaddd	$THREE, $M0, $M0
	vpaddd	$THREE, $M1, $M1
	vpaddd	$THREE, $M2, $M2
	lea	32*6($in_t), $in_t

	vpand	$TMP0, $T0a, $T0a
	vpand	$TMP0, $T0b, $T0b
	vpand	$TMP1, $T1a, $T1a
	vpand	$TMP1, $T1b, $T1b
	vpand	$TMP2, $T2a, $T2a
	vpand	$TMP2, $T2b, $T2b

	vpxor	$T0a, $Ra, $Ra
	vpxor	$T0b, $Rb, $Rb
	vpxor	$T1a, $Ra, $Ra
	vpxor	$T1b, $Rb, $Rb
	vpxor	$T2a, $Ra, $Ra
	vpxor	$T2b, $Rb, $Rb

	dec	%rax
	jnz	.Lselect_loop_avx2_w7


	vmovdqa	32*0($in_t), $T0a
	vmovdqa	32*1($in_t), $T0b

	vpcmpeqd	$INDEX, $M0, $TMP0

	vpand	$TMP0, $T0a, $T0a
	vpand	$TMP0, $T0b, $T0b

	vpxor	$T0a, $Ra, $Ra
	vpxor	$T0b, $Rb, $Rb

	vmovdqu	$Ra, 32*0($val)
	vmovdqu	$Rb, 32*1($val)
	vzeroupper
___
$code.=<<___	if ($win64);
	movaps	(%rsp), %xmm6
	movaps	0x10(%rsp), %xmm7
	movaps	0x20(%rsp), %xmm8
	movaps	0x30(%rsp), %xmm9
	movaps	0x40(%rsp), %xmm10
	movaps	0x50(%rsp), %xmm11
	movaps	0x60(%rsp), %xmm12
	movaps	0x70(%rsp), %xmm13
	movaps	0x80(%rsp), %xmm14
	movaps	0x90(%rsp), %xmm15
	lea	0xa8(%rsp), %rsp
.LSEH_end_ecp_nistz256_avx2_select_w7:
___
$code.=<<___;
	ret
.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
___
} else {
$code.=<<___;
.globl	ecp_nistz256_avx2_select_w7
.type	ecp_nistz256_avx2_select_w7,\@function,3
.align	32
ecp_nistz256_avx2_select_w7:
	.byte	0x0f,0x0b	# ud2
	ret
.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
___
}
{{{
########################################################################
# This block implements higher level point_double, point_add and
# point_add_affine. The key to performance in this case is to allow
# out-of-order execution logic to overlap computations from next step
# with tail processing from current step. By using tailored calling
# sequence we minimize inter-step overhead to give processor better
# shot at overlapping operations...
#
# You will notice that input data is copied to stack. Trouble is that
# there are no registers to spare for holding original pointers, and
# reloading the pointers would create undesired dependencies on
# effective address calculation paths. In other words it's all done
# in order to favour out-of-order execution logic.
#						<appro@openssl.org>

my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
my ($poly1,$poly3)=($acc6,$acc7);

sub load_for_mul () {
my ($a,$b,$src0) = @_;
my $bias = $src0 eq "%rax" ? 0 : -128;

"	mov	$b, $src0
	lea	$b, $b_ptr
	mov	8*0+$a, $acc1
	mov	8*1+$a, $acc2
	lea	$bias+$a, $a_ptr
	mov	8*2+$a, $acc3
	mov	8*3+$a, $acc4"
}

sub load_for_sqr () {
my ($a,$src0) = @_;
my $bias = $src0 eq "%rax" ? 0 : -128;

"	mov	8*0+$a, $src0
	mov	8*1+$a, $acc6
	lea	$bias+$a, $a_ptr
	mov	8*2+$a, $acc7
	mov	8*3+$a, $acc0"
}
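# load_for_mul/load_for_sqr emit the operand-loading preamble for the next
# __ecp_nistz256_*mont$x call: values go to the fixed input registers, and
# $a_ptr is biased by -128 on the mulx path so the 8*n+128 displacements
# inside the callee line up (the same "control u-op density" tuning noted
# earlier). For example, `&load_for_sqr("$S(%rsp)", "$src0")` expands to
# four mov instructions plus one lea.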
 {
########################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
#
my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);

$code.=<<___;
.type	__ecp_nistz256_add_toq,\@abi-omnipotent
.align	32
__ecp_nistz256_add_toq:
	xor	$t4,$t4
	add	8*0($b_ptr), $a0
	adc	8*1($b_ptr), $a1
	mov	$a0, $t0
	adc	8*2($b_ptr), $a2
	adc	8*3($b_ptr), $a3
	mov	$a1, $t1
	adc	\$0, $t4

	sub	\$-1, $a0
	mov	$a2, $t2
	sbb	$poly1, $a1
	sbb	\$0, $a2
	mov	$a3, $t3
	sbb	$poly3, $a3
	sbb	\$0, $t4

	cmovc	$t0, $a0
	cmovc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq

.type	__ecp_nistz256_sub_fromq,\@abi-omnipotent
.align	32
__ecp_nistz256_sub_fromq:
	sub	8*0($b_ptr), $a0
	sbb	8*1($b_ptr), $a1
	mov	$a0, $t0
	sbb	8*2($b_ptr), $a2
	sbb	8*3($b_ptr), $a3
	mov	$a1, $t1
	sbb	$t4, $t4

	add	\$-1, $a0
	mov	$a2, $t2
	adc	$poly1, $a1
	adc	\$0, $a2
	mov	$a3, $t3
	adc	$poly3, $a3
	test	$t4, $t4

	cmovz	$t0, $a0
	cmovz	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovz	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovz	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq

.type	__ecp_nistz256_subq,\@abi-omnipotent
.align	32
__ecp_nistz256_subq:
	sub	$a0, $t0
	sbb	$a1, $t1
	mov	$t0, $a0
	sbb	$a2, $t2
	sbb	$a3, $t3
	mov	$t1, $a1
	sbb	$t4, $t4

	add	\$-1, $t0
	mov	$t2, $a2
	adc	$poly1, $t1
	adc	\$0, $t2
	mov	$t3, $a3
	adc	$poly3, $t3
	test	$t4, $t4

	cmovnz	$t0, $a0
	cmovnz	$t1, $a1
	cmovnz	$t2, $a2
	cmovnz	$t3, $a3

	ret
.size	__ecp_nistz256_subq,.-__ecp_nistz256_subq

.type	__ecp_nistz256_mul_by_2q,\@abi-omnipotent
.align	32
__ecp_nistz256_mul_by_2q:
	xor	$t4, $t4
	add	$a0, $a0		# a0:a3+a0:a3
	adc	$a1, $a1
	mov	$a0, $t0
	adc	$a2, $a2
	adc	$a3, $a3
	mov	$a1, $t1
	adc	\$0, $t4

	sub	\$-1, $a0
	mov	$a2, $t2
	sbb	$poly1, $a1
	sbb	\$0, $a2
	mov	$a3, $t3
	sbb	$poly3, $a3
	sbb	\$0, $t4

	cmovc	$t0, $a0
	cmovc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
___
 }
sub gen_double () {
    my $x = shift;
    my ($src0,$sfx,$bias);
    my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));

    if ($x ne "x") {
	$src0 = "%rax";
	$sfx  = "";
	$bias = 0;

$code.=<<___;
.globl	ecp_nistz256_point_double
.type	ecp_nistz256_point_double,\@function,2
.align	32
ecp_nistz256_point_double:
___
$code.=<<___	if ($addx);
	mov	\$0x80100, %ecx
	and	OPENSSL_ia32cap_P+8(%rip), %ecx
	cmp	\$0x80100, %ecx
	je	.Lpoint_doublex
___
    } else {
	$src0 = "%rdx";
	$sfx  = "x";
	$bias = 128;

$code.=<<___;
.type	ecp_nistz256_point_doublex,\@function,2
.align	32
ecp_nistz256_point_doublex:
.Lpoint_doublex:
___
    }
$code.=<<___;
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	sub	\$32*5+8, %rsp

.Lpoint_double_shortcut$x:
	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr.x
	mov	$a_ptr, $b_ptr			# backup copy
	movdqu	0x10($a_ptr), %xmm1
	mov	0x20+8*0($a_ptr), $acc4		# load in_y in "5-4-0-1" order
	mov	0x20+8*1($a_ptr), $acc5
	mov	0x20+8*2($a_ptr), $acc0
	mov	0x20+8*3($a_ptr), $acc1
	mov	.Lpoly+8*1(%rip), $poly1
	mov	.Lpoly+8*3(%rip), $poly3
	movdqa	%xmm0, $in_x(%rsp)
	movdqa	%xmm1, $in_x+0x10(%rsp)
	lea	0x20($r_ptr), $acc2
	lea	0x40($r_ptr), $acc3
	movq	$r_ptr, %xmm0
	movq	$acc2, %xmm1
	movq	$acc3, %xmm2

	lea	$S(%rsp), $r_ptr
	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(S, in_y);

	mov	0x40+8*0($a_ptr), $src0
	mov	0x40+8*1($a_ptr), $acc6
	mov	0x40+8*2($a_ptr), $acc7
	mov	0x40+8*3($a_ptr), $acc0
	lea	0x40-$bias($a_ptr), $a_ptr
	lea	$Zsqr(%rsp), $r_ptr
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Zsqr, in_z);

	`&load_for_sqr("$S(%rsp)", "$src0")`
	lea	$S(%rsp), $r_ptr
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(S, S);

	mov	0x20($b_ptr), $src0		# $b_ptr is still valid
	mov	0x40+8*0($b_ptr), $acc1
	mov	0x40+8*1($b_ptr), $acc2
	mov	0x40+8*2($b_ptr), $acc3
	mov	0x40+8*3($b_ptr), $acc4
	lea	0x40-$bias($b_ptr), $a_ptr
	lea	0x20($b_ptr), $b_ptr
	movq	%xmm2, $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, in_z, in_y);
	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(res_z, res_z);

	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
	mov	$in_x+8*1(%rsp), $acc5
	lea	$Zsqr(%rsp), $b_ptr
	mov	$in_x+8*2(%rsp), $acc0
	mov	$in_x+8*3(%rsp), $acc1
	lea	$M(%rsp), $r_ptr
	call	__ecp_nistz256_add_to$x		# p256_add(M, in_x, Zsqr);

	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
	mov	$in_x+8*1(%rsp), $acc5
	lea	$Zsqr(%rsp), $b_ptr
	mov	$in_x+8*2(%rsp), $acc0
	mov	$in_x+8*3(%rsp), $acc1
	lea	$Zsqr(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(Zsqr, in_x, Zsqr);

	`&load_for_sqr("$S(%rsp)", "$src0")`
	movq	%xmm1, $r_ptr
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_y, S);
___
{
######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
# operate in 4-5-6-7 "name space" that matches squaring output
#
my ($poly1,$poly3)=($a_ptr,$t1);
my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2);

$code.=<<___;
	xor	$t4, $t4
	mov	$a0, $t0
	add	\$-1, $a0
	mov	$a1, $t1
	adc	$poly1, $a1
	mov	$a2, $t2
	adc	\$0, $a2
	mov	$a3, $t3
	adc	$poly3, $a3
	adc	\$0, $t4
	xor	$a_ptr, $a_ptr		# borrow $a_ptr
	test	\$1, $t0

	cmovz	$t0, $a0
	cmovz	$t1, $a1
	cmovz	$t2, $a2
	cmovz	$t3, $a3
	cmovz	$a_ptr, $t4

	mov	$a1, $t0		# a0:a3>>1
	shr	\$1, $a0
	shl	\$63, $t0
	mov	$a2, $t1
	shr	\$1, $a1
	or	$t0, $a0
	shl	\$63, $t1
	mov	$a3, $t2
	shr	\$1, $a2
	or	$t1, $a1
	shl	\$63, $t2
	mov	$a0, 8*0($r_ptr)
	shr	\$1, $a3
	mov	$a1, 8*1($r_ptr)
	shl	\$63, $t4
	or	$t2, $a2
	or	$t4, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)
___
}
$code.=<<___;
	`&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`
	lea	$M(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(M, M, Zsqr);

	lea	$tmp0(%rsp), $r_ptr
	call	__ecp_nistz256_mul_by_2$x

	lea	$M(%rsp), $b_ptr
	lea	$M(%rsp), $r_ptr
	call	__ecp_nistz256_add_to$x		# p256_mul_by_3(M, M);

	`&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")`
	lea	$S(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, in_x);

	lea	$tmp0(%rsp), $r_ptr
	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(tmp0, S);

	`&load_for_sqr("$M(%rsp)", "$src0")`
	movq	%xmm0, $r_ptr
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_x, M);

	lea	$tmp0(%rsp), $b_ptr
	mov	$acc6, $acc0			# harmonize sqr output and sub input
	mov	$acc7, $acc1
	mov	$a_ptr, $poly1
	mov	$t1, $poly3
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, tmp0);

	mov	$S+8*0(%rsp), $t0
	mov	$S+8*1(%rsp), $t1
	mov	$S+8*2(%rsp), $t2
	mov	$S+8*3(%rsp), $acc2		# "4-5-0-1" order
	lea	$S(%rsp), $r_ptr
	call	__ecp_nistz256_sub$x		# p256_sub(S, S, res_x);

	mov	$M(%rsp), $src0
	lea	$M(%rsp), $b_ptr
	mov	$acc4, $acc6			# harmonize sub output and mul input
	xor	%ecx, %ecx
	mov	$acc4, $S+8*0(%rsp)		# have to save:-(
	mov	$acc5, $acc2
	mov	$acc5, $S+8*1(%rsp)
	cmovz	$acc0, $acc3
	mov	$acc0, $S+8*2(%rsp)
	lea	$S-$bias(%rsp), $a_ptr
	cmovz	$acc1, $acc4
	mov	$acc1, $S+8*3(%rsp)
	mov	$acc6, $acc1
	lea	$S(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, M);

	movq	%xmm1, $b_ptr
	movq	%xmm1, $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, S, res_y);

	add	\$32*5+8, %rsp
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
.size	ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
___
}
&gen_double("q");
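# For reference, the doubling sequence just generated computes, in Jacobian
# coordinates (a sketch assembled from the inline comments; the standard
# doubling exploiting a = -3 for P-256):
#
#	S  = (2*Y)^2			# 4*Y^2
#	M  = 3*(X + Z^2)*(X - Z^2)
#	X' = M^2 - 2*(S*X)
#	Y' = M*(S*X - X') - 8*Y^4	# 8*Y^4 = (S^2)/2, via the inlined
#	Z' = 2*Y*Z			#   div_by_2 block above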
"x") { 2200 $src0 = "%rax"; 2201 $sfx = ""; 2202 $bias = 0; 2203 2204$code.=<<___; 2205.globl ecp_nistz256_point_add 2206.type ecp_nistz256_point_add,\@function,3 2207.align 32 2208ecp_nistz256_point_add: 2209___ 2210$code.=<<___ if ($addx); 2211 mov \$0x80100, %ecx 2212 and OPENSSL_ia32cap_P+8(%rip), %ecx 2213 cmp \$0x80100, %ecx 2214 je .Lpoint_addx 2215___ 2216 } else { 2217 $src0 = "%rdx"; 2218 $sfx = "x"; 2219 $bias = 128; 2220 2221$code.=<<___; 2222.type ecp_nistz256_point_addx,\@function,3 2223.align 32 2224ecp_nistz256_point_addx: 2225.Lpoint_addx: 2226___ 2227 } 2228$code.=<<___; 2229 push %rbp 2230 push %rbx 2231 push %r12 2232 push %r13 2233 push %r14 2234 push %r15 2235 sub \$32*18+8, %rsp 2236 2237 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr 2238 movdqu 0x10($a_ptr), %xmm1 2239 movdqu 0x20($a_ptr), %xmm2 2240 movdqu 0x30($a_ptr), %xmm3 2241 movdqu 0x40($a_ptr), %xmm4 2242 movdqu 0x50($a_ptr), %xmm5 2243 mov $a_ptr, $b_ptr # reassign 2244 mov $b_org, $a_ptr # reassign 2245 movdqa %xmm0, $in1_x(%rsp) 2246 movdqa %xmm1, $in1_x+0x10(%rsp) 2247 movdqa %xmm2, $in1_y(%rsp) 2248 movdqa %xmm3, $in1_y+0x10(%rsp) 2249 movdqa %xmm4, $in1_z(%rsp) 2250 movdqa %xmm5, $in1_z+0x10(%rsp) 2251 por %xmm4, %xmm5 2252 2253 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_ptr 2254 pshufd \$0xb1, %xmm5, %xmm3 2255 movdqu 0x10($a_ptr), %xmm1 2256 movdqu 0x20($a_ptr), %xmm2 2257 por %xmm3, %xmm5 2258 movdqu 0x30($a_ptr), %xmm3 2259 mov 0x40+8*0($a_ptr), $src0 # load original in2_z 2260 mov 0x40+8*1($a_ptr), $acc6 2261 mov 0x40+8*2($a_ptr), $acc7 2262 mov 0x40+8*3($a_ptr), $acc0 2263 movdqa %xmm0, $in2_x(%rsp) 2264 pshufd \$0x1e, %xmm5, %xmm4 2265 movdqa %xmm1, $in2_x+0x10(%rsp) 2266 movdqu 0x40($a_ptr),%xmm0 # in2_z again 2267 movdqu 0x50($a_ptr),%xmm1 2268 movdqa %xmm2, $in2_y(%rsp) 2269 movdqa %xmm3, $in2_y+0x10(%rsp) 2270 por %xmm4, %xmm5 2271 pxor %xmm4, %xmm4 2272 por %xmm0, %xmm1 2273 movq $r_ptr, %xmm0 # save $r_ptr 2274 2275 lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid 2276 mov $src0, $in2_z+8*0(%rsp) # make in2_z copy 2277 mov $acc6, $in2_z+8*1(%rsp) 2278 mov $acc7, $in2_z+8*2(%rsp) 2279 mov $acc0, $in2_z+8*3(%rsp) 2280 lea $Z2sqr(%rsp), $r_ptr # Z2^2 2281 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z); 2282 2283 pcmpeqd %xmm4, %xmm5 2284 pshufd \$0xb1, %xmm1, %xmm4 2285 por %xmm1, %xmm4 2286 pshufd \$0, %xmm5, %xmm5 # in1infty 2287 pshufd \$0x1e, %xmm4, %xmm3 2288 por %xmm3, %xmm4 2289 pxor %xmm3, %xmm3 2290 pcmpeqd %xmm3, %xmm4 2291 pshufd \$0, %xmm4, %xmm4 # in2infty 2292 mov 0x40+8*0($b_ptr), $src0 # load original in1_z 2293 mov 0x40+8*1($b_ptr), $acc6 2294 mov 0x40+8*2($b_ptr), $acc7 2295 mov 0x40+8*3($b_ptr), $acc0 2296 movq $b_ptr, %xmm1 2297 2298 lea 0x40-$bias($b_ptr), $a_ptr 2299 lea $Z1sqr(%rsp), $r_ptr # Z1^2 2300 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); 2301 2302 `&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")` 2303 lea $S1(%rsp), $r_ptr # S1 = Z2^3 2304 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, Z2sqr, in2_z); 2305 2306 `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` 2307 lea $S2(%rsp), $r_ptr # S2 = Z1^3 2308 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); 2309 2310 `&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")` 2311 lea $S1(%rsp), $r_ptr # S1 = Y1*Z2^3 2312 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, S1, in1_y); 2313 2314 `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` 2315 lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 2316 call __ecp_nistz256_mul_mont$x # 

	lea	$S1(%rsp), $b_ptr
	lea	$R(%rsp), $r_ptr		# R = S2 - S1
	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, S1);

	or	$acc5, $acc4			# see if result is zero
	movdqa	%xmm4, %xmm2
	or	$acc0, $acc4
	or	$acc1, $acc4
	por	%xmm5, %xmm2			# in1infty || in2infty
	movq	$acc4, %xmm3

	`&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
	lea	$U1(%rsp), $r_ptr		# U1 = X1*Z2^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U1, in1_x, Z2sqr);

	`&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in2_x, Z1sqr);

	lea	$U1(%rsp), $b_ptr
	lea	$H(%rsp), $r_ptr		# H = U2 - U1
	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, U1);

	or	$acc5, $acc4			# see if result is zero
	or	$acc0, $acc4
	or	$acc1, $acc4

	.byte	0x3e				# predict taken
	jnz	.Ladd_proceed$x			# is_equal(U1,U2)?
	movq	%xmm2, $acc0
	movq	%xmm3, $acc1
	test	$acc0, $acc0
	jnz	.Ladd_proceed$x			# (in1infty || in2infty)?
	test	$acc1, $acc1
	jz	.Ladd_double$x			# is_equal(S1,S2)?

	movq	%xmm0, $r_ptr			# restore $r_ptr
	pxor	%xmm0, %xmm0
	movdqu	%xmm0, 0x00($r_ptr)
	movdqu	%xmm0, 0x10($r_ptr)
	movdqu	%xmm0, 0x20($r_ptr)
	movdqu	%xmm0, 0x30($r_ptr)
	movdqu	%xmm0, 0x40($r_ptr)
	movdqu	%xmm0, 0x50($r_ptr)
	jmp	.Ladd_done$x

.align	32
.Ladd_double$x:
	movq	%xmm1, $a_ptr			# restore $a_ptr
	movq	%xmm0, $r_ptr			# restore $r_ptr
	add	\$`32*(18-5)`, %rsp		# difference in frame sizes
	jmp	.Lpoint_double_shortcut$x

.align	32
.Ladd_proceed$x:
	`&load_for_sqr("$R(%rsp)", "$src0")`
	lea	$Rsqr(%rsp), $r_ptr		# R^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);

	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);

	`&load_for_sqr("$H(%rsp)", "$src0")`
	lea	$Hsqr(%rsp), $r_ptr		# H^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);

	`&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, res_z, in2_z);

	`&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
	lea	$Hcub(%rsp), $r_ptr		# H^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);

	`&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
	lea	$U2(%rsp), $r_ptr		# U1*H^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, U1, Hsqr);
___
{
#######################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
#
my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
my ($poly1, $poly3)=($acc6,$acc7);

$code.=<<___;
	#lea	$U2(%rsp), $a_ptr
	#lea	$Hsqr(%rsp), $r_ptr		# 2*U1*H^2
	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);

	xor	$t4, $t4
	add	$acc0, $acc0			# a0:a3+a0:a3
	lea	$Rsqr(%rsp), $a_ptr
	adc	$acc1, $acc1
	mov	$acc0, $t0
	adc	$acc2, $acc2
	adc	$acc3, $acc3
	mov	$acc1, $t1
	adc	\$0, $t4

	sub	\$-1, $acc0
	mov	$acc2, $t2
	sbb	$poly1, $acc1
	sbb	\$0, $acc2
	mov	$acc3, $t3
	sbb	$poly3, $acc3
	sbb	\$0, $t4

	cmovc	$t0, $acc0
	mov	8*0($a_ptr), $t0
	cmovc	$t1, $acc1
	mov	8*1($a_ptr), $t1
	cmovc	$t2, $acc2
	mov	8*2($a_ptr), $t2
	cmovc	$t3, $acc3
	mov	8*3($a_ptr), $t3
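
	# The mul_by_2 call commented out above has been inlined: the
	# doubling of U1*H^2 runs in the accumulator registers while Rsqr
	# streams into the temporaries, so the subtraction below yields
	# res_x = Rsqr - 2*U1*H^2 without a store/load round trip.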

	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);

	lea	$Hcub(%rsp), $b_ptr
	lea	$res_x(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);

	mov	$U2+8*0(%rsp), $t0
	mov	$U2+8*1(%rsp), $t1
	mov	$U2+8*2(%rsp), $t2
	mov	$U2+8*3(%rsp), $t3
	lea	$res_y(%rsp), $r_ptr

	call	__ecp_nistz256_sub$x		# p256_sub(res_y, U2, res_x);

	mov	$acc0, 8*0($r_ptr)		# save the result, as
	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't write it
	mov	$acc2, 8*2($r_ptr)
	mov	$acc3, 8*3($r_ptr)
___
}
$code.=<<___;
	`&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S1, Hcub);

	`&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
	lea	$res_y(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_y, R, res_y);

	lea	$S2(%rsp), $b_ptr
	lea	$res_y(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, res_y, S2);

	movq	%xmm0, $r_ptr			# restore $r_ptr

	movdqa	%xmm5, %xmm0			# copy_conditional(res_z, in2_z, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_z(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_z+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	$in2_z(%rsp), %xmm2
	pand	$in2_z+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0			# copy_conditional(res_z, in1_z, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_z(%rsp), %xmm2
	pand	$in1_z+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x40($r_ptr)
	movdqu	%xmm3, 0x50($r_ptr)

	movdqa	%xmm5, %xmm0			# copy_conditional(res_x, in2_x, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_x(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_x+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	$in2_x(%rsp), %xmm2
	pand	$in2_x+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0			# copy_conditional(res_x, in1_x, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_x(%rsp), %xmm2
	pand	$in1_x+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x00($r_ptr)
	movdqu	%xmm3, 0x10($r_ptr)

	movdqa	%xmm5, %xmm0			# copy_conditional(res_y, in2_y, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_y(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_y+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	$in2_y(%rsp), %xmm2
	pand	$in2_y+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0			# copy_conditional(res_y, in1_y, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_y(%rsp), %xmm2
	pand	$in1_y+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x20($r_ptr)
	movdqu	%xmm3, 0x30($r_ptr)

.Ladd_done$x:
	add	\$32*18+8, %rsp
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
.size	ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
___
}
&gen_add("q");
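
########################################################################
# The pand/pandn/por blocks above implement copy_conditional: a
# branch-free select keyed on the in1infty/in2infty masks produced with
# pcmpeqd, so inputs at infinity are handled without data-dependent
# branches. A hedged per-limb C sketch of the same selection
# (illustrative only; the name and signature are assumptions):
#
# #include <stdint.h>
#
# static void copy_conditional(uint64_t dst[4], const uint64_t src[4],
#                              uint64_t mask)	/* 0 or all-ones */
# {
#	int i;
#
#	for (i = 0; i < 4; i++)
#		dst[i] = (src[i] & mask) | (dst[i] & ~mask);
# }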

sub gen_add_affine () {
    my $x = shift;
    my ($src0,$sfx,$bias);
    my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
	$res_x,$res_y,$res_z,
	$in1_x,$in1_y,$in1_z,
	$in2_x,$in2_y)=map(32*$_,(0..14));
    my $Z1sqr = $S2;

    if ($x ne "x") {
	$src0 = "%rax";
	$sfx = "";
	$bias = 0;

$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.type	ecp_nistz256_point_add_affine,\@function,3
.align	32
ecp_nistz256_point_add_affine:
___
$code.=<<___ if ($addx);
	mov	\$0x80100, %ecx
	and	OPENSSL_ia32cap_P+8(%rip), %ecx
	cmp	\$0x80100, %ecx
	je	.Lpoint_add_affinex
___
    } else {
	$src0 = "%rdx";
	$sfx = "x";
	$bias = 128;

$code.=<<___;
.type	ecp_nistz256_point_add_affinex,\@function,3
.align	32
ecp_nistz256_point_add_affinex:
.Lpoint_add_affinex:
___
    }
$code.=<<___;
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	sub	\$32*15+8, %rsp

	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr
	mov	$b_org, $b_ptr			# reassign
	movdqu	0x10($a_ptr), %xmm1
	movdqu	0x20($a_ptr), %xmm2
	movdqu	0x30($a_ptr), %xmm3
	movdqu	0x40($a_ptr), %xmm4
	movdqu	0x50($a_ptr), %xmm5
	mov	0x40+8*0($a_ptr), $src0		# load original in1_z
	mov	0x40+8*1($a_ptr), $acc6
	mov	0x40+8*2($a_ptr), $acc7
	mov	0x40+8*3($a_ptr), $acc0
	movdqa	%xmm0, $in1_x(%rsp)
	movdqa	%xmm1, $in1_x+0x10(%rsp)
	movdqa	%xmm2, $in1_y(%rsp)
	movdqa	%xmm3, $in1_y+0x10(%rsp)
	movdqa	%xmm4, $in1_z(%rsp)
	movdqa	%xmm5, $in1_z+0x10(%rsp)
	por	%xmm4, %xmm5

	movdqu	0x00($b_ptr), %xmm0		# copy	*(P256_POINT_AFFINE *)$b_ptr
	pshufd	\$0xb1, %xmm5, %xmm3
	movdqu	0x10($b_ptr), %xmm1
	movdqu	0x20($b_ptr), %xmm2
	por	%xmm3, %xmm5
	movdqu	0x30($b_ptr), %xmm3
	movdqa	%xmm0, $in2_x(%rsp)
	pshufd	\$0x1e, %xmm5, %xmm4
	movdqa	%xmm1, $in2_x+0x10(%rsp)
	por	%xmm0, %xmm1
	movq	$r_ptr, %xmm0			# save $r_ptr
	movdqa	%xmm2, $in2_y(%rsp)
	movdqa	%xmm3, $in2_y+0x10(%rsp)
	por	%xmm2, %xmm3
	por	%xmm4, %xmm5
	pxor	%xmm4, %xmm4
	por	%xmm1, %xmm3

	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);

	pcmpeqd	%xmm4, %xmm5
	pshufd	\$0xb1, %xmm3, %xmm4
	mov	0x00($b_ptr), $src0		# $b_ptr is still valid
	#lea	0x00($b_ptr), $b_ptr
	mov	$acc4, $acc1			# harmonize sqr output and mul input
	por	%xmm3, %xmm4
	pshufd	\$0, %xmm5, %xmm5		# in1infty
	pshufd	\$0x1e, %xmm4, %xmm3
	mov	$acc5, $acc2
	por	%xmm3, %xmm4
	pxor	%xmm3, %xmm3
	mov	$acc6, $acc3
	pcmpeqd	%xmm3, %xmm4
	pshufd	\$0, %xmm4, %xmm4		# in2infty

	lea	$Z1sqr-$bias(%rsp), $a_ptr
	mov	$acc7, $acc4
	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, Z1sqr, in2_x);

	lea	$in1_x(%rsp), $b_ptr
	lea	$H(%rsp), $r_ptr		# H = U2 - U1
	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, in1_x);

	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);

	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);
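
	# The Z3 comment above is inherited from the full addition; with an
	# affine second input Z2 == 1 in the Montgomery domain, so Z3 is
	# effectively H*Z1, U1 is simply in1_x and S1 is in1_y. This is what
	# saves this path several p256_mul_mont calls relative to
	# ecp_nistz256_point_add.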

	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);

	lea	$in1_y(%rsp), $b_ptr
	lea	$R(%rsp), $r_ptr		# R = S2 - S1
	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, in1_y);

	`&load_for_sqr("$H(%rsp)", "$src0")`
	lea	$Hsqr(%rsp), $r_ptr		# H^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);

	`&load_for_sqr("$R(%rsp)", "$src0")`
	lea	$Rsqr(%rsp), $r_ptr		# R^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);

	`&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
	lea	$Hcub(%rsp), $r_ptr		# H^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);

	`&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
	lea	$U2(%rsp), $r_ptr		# U1*H^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in1_x, Hsqr);
___
{
#######################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
#
my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
my ($poly1, $poly3)=($acc6,$acc7);

$code.=<<___;
	#lea	$U2(%rsp), $a_ptr
	#lea	$Hsqr(%rsp), $r_ptr		# 2*U1*H^2
	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);

	xor	$t4, $t4
	add	$acc0, $acc0			# a0:a3+a0:a3
	lea	$Rsqr(%rsp), $a_ptr
	adc	$acc1, $acc1
	mov	$acc0, $t0
	adc	$acc2, $acc2
	adc	$acc3, $acc3
	mov	$acc1, $t1
	adc	\$0, $t4

	sub	\$-1, $acc0
	mov	$acc2, $t2
	sbb	$poly1, $acc1
	sbb	\$0, $acc2
	mov	$acc3, $t3
	sbb	$poly3, $acc3
	sbb	\$0, $t4

	cmovc	$t0, $acc0
	mov	8*0($a_ptr), $t0
	cmovc	$t1, $acc1
	mov	8*1($a_ptr), $t1
	cmovc	$t2, $acc2
	mov	8*2($a_ptr), $t2
	cmovc	$t3, $acc3
	mov	8*3($a_ptr), $t3

	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);

	lea	$Hcub(%rsp), $b_ptr
	lea	$res_x(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);

	mov	$U2+8*0(%rsp), $t0
	mov	$U2+8*1(%rsp), $t1
	mov	$U2+8*2(%rsp), $t2
	mov	$U2+8*3(%rsp), $t3
	lea	$H(%rsp), $r_ptr

	call	__ecp_nistz256_sub$x		# p256_sub(H, U2, res_x);

	mov	$acc0, 8*0($r_ptr)		# save the result, as
	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't write it
	mov	$acc2, 8*2($r_ptr)
	mov	$acc3, 8*3($r_ptr)
___
}
$code.=<<___;
	`&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Hcub, in1_y);

	`&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
	lea	$H(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(H, H, R);

	lea	$S2(%rsp), $b_ptr
	lea	$res_y(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, H, S2);

	movq	%xmm0, $r_ptr			# restore $r_ptr

	movdqa	%xmm5, %xmm0			# copy_conditional(res_z, ONE, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_z(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_z+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	.LONE_mont(%rip), %xmm2
	pand	.LONE_mont+0x10(%rip), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0			# copy_conditional(res_z, in1_z, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_z(%rsp), %xmm2
	pand	$in1_z+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x40($r_ptr)
	movdqu	%xmm3, 0x50($r_ptr)
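
	# If in1 was the point at infinity, the result must become the
	# affine second input promoted to a Jacobian point, so its Z
	# coordinate is taken from .LONE_mont, i.e. 1 in Montgomery
	# representation.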

	movdqa	%xmm5, %xmm0			# copy_conditional(res_x, in2_x, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_x(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_x+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	$in2_x(%rsp), %xmm2
	pand	$in2_x+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0			# copy_conditional(res_x, in1_x, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_x(%rsp), %xmm2
	pand	$in1_x+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x00($r_ptr)
	movdqu	%xmm3, 0x10($r_ptr)

	movdqa	%xmm5, %xmm0			# copy_conditional(res_y, in2_y, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_y(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_y+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	$in2_y(%rsp), %xmm2
	pand	$in2_y+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0			# copy_conditional(res_y, in1_y, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_y(%rsp), %xmm2
	pand	$in1_y+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x20($r_ptr)
	movdqu	%xmm3, 0x30($r_ptr)

	add	\$32*15+8, %rsp
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
.size	ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
___
}
&gen_add_affine("q");

########################################################################
# AD*X magic
#
if ($addx) { {
########################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
#
my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);

$code.=<<___;
.type	__ecp_nistz256_add_tox,\@abi-omnipotent
.align	32
__ecp_nistz256_add_tox:
	xor	$t4, $t4
	adc	8*0($b_ptr), $a0
	adc	8*1($b_ptr), $a1
	mov	$a0, $t0
	adc	8*2($b_ptr), $a2
	adc	8*3($b_ptr), $a3
	mov	$a1, $t1
	adc	\$0, $t4

	xor	$t3, $t3
	sbb	\$-1, $a0
	mov	$a2, $t2
	sbb	$poly1, $a1
	sbb	\$0, $a2
	mov	$a3, $t3
	sbb	$poly3, $a3
	sbb	\$0, $t4

	cmovc	$t0, $a0
	cmovc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox

.type	__ecp_nistz256_sub_fromx,\@abi-omnipotent
.align	32
__ecp_nistz256_sub_fromx:
	xor	$t4, $t4
	sbb	8*0($b_ptr), $a0
	sbb	8*1($b_ptr), $a1
	mov	$a0, $t0
	sbb	8*2($b_ptr), $a2
	sbb	8*3($b_ptr), $a3
	mov	$a1, $t1
	sbb	\$0, $t4

	xor	$t3, $t3
	adc	\$-1, $a0
	mov	$a2, $t2
	adc	$poly1, $a1
	adc	\$0, $a2
	mov	$a3, $t3
	adc	$poly3, $a3

	bt	\$0, $t4
	cmovnc	$t0, $a0
	cmovnc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovnc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovnc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
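
# In the subtraction helpers above and below, the modulus is added back
# unconditionally after the sbb chain; the borrow captured in the spare
# register is then replayed with bt, so the cmov chain keeps the
# uncorrected result whenever the subtraction did not actually
# underflow.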

.type	__ecp_nistz256_subx,\@abi-omnipotent
.align	32
__ecp_nistz256_subx:
	xor	$t4, $t4
	sbb	$a0, $t0
	sbb	$a1, $t1
	mov	$t0, $a0
	sbb	$a2, $t2
	sbb	$a3, $t3
	mov	$t1, $a1
	sbb	\$0, $t4

	xor	$a3, $a3
	adc	\$-1, $t0
	mov	$t2, $a2
	adc	$poly1, $t1
	adc	\$0, $t2
	mov	$t3, $a3
	adc	$poly3, $t3

	bt	\$0, $t4
	cmovc	$t0, $a0
	cmovc	$t1, $a1
	cmovc	$t2, $a2
	cmovc	$t3, $a3

	ret
.size	__ecp_nistz256_subx,.-__ecp_nistz256_subx

.type	__ecp_nistz256_mul_by_2x,\@abi-omnipotent
.align	32
__ecp_nistz256_mul_by_2x:
	xor	$t4, $t4
	adc	$a0, $a0		# a0:a3+a0:a3
	adc	$a1, $a1
	mov	$a0, $t0
	adc	$a2, $a2
	adc	$a3, $a3
	mov	$a1, $t1
	adc	\$0, $t4

	xor	$t3, $t3
	sbb	\$-1, $a0
	mov	$a2, $t2
	sbb	$poly1, $a1
	sbb	\$0, $a2
	mov	$a3, $t3
	sbb	$poly3, $a3
	sbb	\$0, $t4

	cmovc	$t0, $a0
	cmovc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
___
}
&gen_double("x");
&gen_add("x");
&gen_add_affine("x");
}
}}}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;