1#!/usr/bin/env perl 2 3############################################################################## 4# # 5# Copyright 2014 Intel Corporation # 6# # 7# Licensed under the Apache License, Version 2.0 (the "License"); # 8# you may not use this file except in compliance with the License. # 9# You may obtain a copy of the License at # 10# # 11# http://www.apache.org/licenses/LICENSE-2.0 # 12# # 13# Unless required by applicable law or agreed to in writing, software # 14# distributed under the License is distributed on an "AS IS" BASIS, # 15# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 16# See the License for the specific language governing permissions and # 17# limitations under the License. # 18# # 19############################################################################## 20# # 21# Developers and authors: # 22# Shay Gueron (1, 2), and Vlad Krasnov (1) # 23# (1) Intel Corporation, Israel Development Center # 24# (2) University of Haifa # 25# Reference: # 26# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with# 27# 256 Bit Primes" # 28# # 29############################################################################## 30 31# Further optimization by <appro@openssl.org>: 32# 33# this/original with/without -DECP_NISTZ256_ASM(*) 34# Opteron +12-49% +110-150% 35# Bulldozer +14-45% +175-210% 36# P4 +18-46% n/a :-( 37# Westmere +12-34% +80-87% 38# Sandy Bridge +9-35% +110-120% 39# Ivy Bridge +9-35% +110-125% 40# Haswell +8-37% +140-160% 41# Broadwell +18-58% +145-210% 42# Atom +15-50% +130-180% 43# VIA Nano +43-160% +300-480% 44# 45# (*) "without -DECP_NISTZ256_ASM" refers to build with 46# "enable-ec_nistp_64_gcc_128"; 47# 48# Ranges denote minimum and maximum improvement coefficients depending 49# on benchmark. Lower coefficients are for ECDSA sign, relatively fastest 50# server-side operation. Keep in mind that +100% means 2x improvement. 

# Command line: [flavour] output.  A lone argument containing a dot is taken
# to be the output file name and the flavour is left undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 conventions are selected by flavour or by an .asm output file name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator, either next to this script or in
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything written to STDOUT from here on is piped through the translator.
# The translator path is quoted so that directories containing blanks work,
# and a pipe that cannot be started is fatal instead of silently producing
# no output.
open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
	or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Assembler capability probes.  $avx ends up as 0, 1 or 2 depending on how
# many version thresholds are met (the AVX2 select paths further down are
# emitted only when it exceeds 1); $addx enables the mulx/adcx/adox code
# paths.  Each later probe runs only while $addx is still false.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$avx = ($ver>=3.0) + ($ver>=3.01);
	$addx = ($ver>=3.03);
}

# Constant pool: the P-256 prime, 2^512 mod P, vectors of 1/2/3 used by the
# table-select routines, and a further constant labelled .LONE_mont.
$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

# The polynomial
.align 64
.Lpoly:
.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001

# 2^512 mod P precomputed for NIST P256 polynomial
.LRR:
.quad 0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd

.LOne:
.long 1,1,1,1,1,1,1,1
.LTwo:
.long 2,2,2,2,2,2,2,2
.LThree:
.long 3,3,3,3,3,3,3,3
.LONE_mont:
.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
___

{
################################################################################
# void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]);

my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");

# Simple modular helpers on 4x64-bit values: mul_by_2, div_by_2, mul_by_3,
# add, sub and neg.  Each loads its operand(s), performs the raw operation,
# computes the corrected alternative (minus P or plus P) and picks one of
# the two with cmov, so there are no data-dependent branches.
#
# mul_by_2, mul_by_3 and add reduce strictly: the carry out of bit 256 is
# kept in a fifth word and the borrow of the trial subtraction of P is
# propagated into it, so the subtracted value is discarded only when the
# full sum is below P.  Testing the carry alone would leave sums in the
# range [P, 2^256) unreduced.
#
# Note that $t1 and $b_ptr are the same register (%rdx); in the
# three-operand routines $t1 is written only after the last load through
# $b_ptr.
$code.=<<___;

.globl	ecp_nistz256_mul_by_2
.type	ecp_nistz256_mul_by_2,\@function,2
.align	64
ecp_nistz256_mul_by_2:
	push	%r12
	push	%r13

	xor	$t4, $t4		# collects the carry out of bit 256
	mov	8*0($a_ptr), $a0
	mov	8*1($a_ptr), $a1
	add	$a0, $a0		# a0:a3+a0:a3
	mov	8*2($a_ptr), $a2
	adc	$a1, $a1
	mov	8*3($a_ptr), $a3
	lea	.Lpoly(%rip), $a_ptr
	mov	$a0, $t0
	adc	$a2, $a2
	adc	$a3, $a3
	mov	$a1, $t1
	adc	\$0, $t4

	sub	8*0($a_ptr), $a0
	mov	$a2, $t2
	sbb	8*1($a_ptr), $a1
	sbb	8*2($a_ptr), $a2
	mov	$a3, $t3
	sbb	8*3($a_ptr), $a3
	sbb	\$0, $t4		# borrow survives only if sum is below P

	cmovc	$t0, $a0
	cmovc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	pop	%r13
	pop	%r12
	ret
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2

################################################################################
# void ecp_nistz256_div_by_2(uint64_t res[4], uint64_t a[4]);
.globl	ecp_nistz256_div_by_2
.type	ecp_nistz256_div_by_2,\@function,2
.align	32
ecp_nistz256_div_by_2:
	push	%r12
	push	%r13

	mov	8*0($a_ptr), $a0
	mov	8*1($a_ptr), $a1
	mov	8*2($a_ptr), $a2
	mov	$a0, $t0
	mov	8*3($a_ptr), $a3
	lea	.Lpoly(%rip), $a_ptr

	mov	$a1, $t1
	xor	$t4, $t4
	add	8*0($a_ptr), $a0
	mov	$a2, $t2
	adc	8*1($a_ptr), $a1
	adc	8*2($a_ptr), $a2
	mov	$a3, $t3
	adc	8*3($a_ptr), $a3
	adc	\$0, $t4
	xor	$a_ptr, $a_ptr		# borrow $a_ptr
	test	\$1, $t0

	cmovz	$t0, $a0
	cmovz	$t1, $a1
	cmovz	$t2, $a2
	cmovz	$t3, $a3
	cmovz	$a_ptr, $t4

	mov	$a1, $t0		# a0:a3>>1
	shr	\$1, $a0
	shl	\$63, $t0
	mov	$a2, $t1
	shr	\$1, $a1
	or	$t0, $a0
	shl	\$63, $t1
	mov	$a3, $t2
	shr	\$1, $a2
	or	$t1, $a1
	shl	\$63, $t2
	shr	\$1, $a3
	shl	\$63, $t4
	or	$t2, $a2
	or	$t4, $a3

	mov	$a0, 8*0($r_ptr)
	mov	$a1, 8*1($r_ptr)
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	pop	%r13
	pop	%r12
	ret
.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2

################################################################################
# void ecp_nistz256_mul_by_3(uint64_t res[4], uint64_t a[4]);
.globl	ecp_nistz256_mul_by_3
.type	ecp_nistz256_mul_by_3,\@function,2
.align	32
ecp_nistz256_mul_by_3:
	push	%r12
	push	%r13

	mov	8*0($a_ptr), $a0
	xor	$t4, $t4
	mov	8*1($a_ptr), $a1
	add	$a0, $a0		# a0:a3+a0:a3
	mov	8*2($a_ptr), $a2
	adc	$a1, $a1
	mov	8*3($a_ptr), $a3
	mov	$a0, $t0
	adc	$a2, $a2
	adc	$a3, $a3
	mov	$a1, $t1
	adc	\$0, $t4

	sub	\$-1, $a0
	mov	$a2, $t2
	sbb	.Lpoly+8*1(%rip), $a1
	sbb	\$0, $a2
	mov	$a3, $t3
	sbb	.Lpoly+8*3(%rip), $a3
	sbb	\$0, $t4		# borrow survives only if sum is below P

	cmovc	$t0, $a0
	cmovc	$t1, $a1
	cmovc	$t2, $a2
	cmovc	$t3, $a3

	xor	$t4, $t4
	add	8*0($a_ptr), $a0	# a0:a3+=a_ptr[0:3]
	adc	8*1($a_ptr), $a1
	mov	$a0, $t0
	adc	8*2($a_ptr), $a2
	adc	8*3($a_ptr), $a3
	mov	$a1, $t1
	adc	\$0, $t4

	sub	\$-1, $a0
	mov	$a2, $t2
	sbb	.Lpoly+8*1(%rip), $a1
	sbb	\$0, $a2
	mov	$a3, $t3
	sbb	.Lpoly+8*3(%rip), $a3
	sbb	\$0, $t4		# borrow survives only if sum is below P

	cmovc	$t0, $a0
	cmovc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	pop	%r13
	pop	%r12
	ret
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

################################################################################
# void ecp_nistz256_add(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
.globl	ecp_nistz256_add
.type	ecp_nistz256_add,\@function,3
.align	32
ecp_nistz256_add:
	push	%r12
	push	%r13

	mov	8*0($a_ptr), $a0
	xor	$t4, $t4
	mov	8*1($a_ptr), $a1
	mov	8*2($a_ptr), $a2
	mov	8*3($a_ptr), $a3
	lea	.Lpoly(%rip), $a_ptr

	add	8*0($b_ptr), $a0
	adc	8*1($b_ptr), $a1
	mov	$a0, $t0
	adc	8*2($b_ptr), $a2
	adc	8*3($b_ptr), $a3
	mov	$a1, $t1
	adc	\$0, $t4

	sub	8*0($a_ptr), $a0
	mov	$a2, $t2
	sbb	8*1($a_ptr), $a1
	sbb	8*2($a_ptr), $a2
	mov	$a3, $t3
	sbb	8*3($a_ptr), $a3
	sbb	\$0, $t4		# borrow survives only if sum is below P

	cmovc	$t0, $a0
	cmovc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	pop	%r13
	pop	%r12
	ret
.size	ecp_nistz256_add,.-ecp_nistz256_add

################################################################################
# void ecp_nistz256_sub(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
.globl	ecp_nistz256_sub
.type	ecp_nistz256_sub,\@function,3
.align	32
ecp_nistz256_sub:
	push	%r12
	push	%r13

	mov	8*0($a_ptr), $a0
	xor	$t4, $t4
	mov	8*1($a_ptr), $a1
	mov	8*2($a_ptr), $a2
	mov	8*3($a_ptr), $a3
	lea	.Lpoly(%rip), $a_ptr

	sub	8*0($b_ptr), $a0
	sbb	8*1($b_ptr), $a1
	mov	$a0, $t0
	sbb	8*2($b_ptr), $a2
	sbb	8*3($b_ptr), $a3
	mov	$a1, $t1
	sbb	\$0, $t4

	add	8*0($a_ptr), $a0
	mov	$a2, $t2
	adc	8*1($a_ptr), $a1
	adc	8*2($a_ptr), $a2
	mov	$a3, $t3
	adc	8*3($a_ptr), $a3
	test	$t4, $t4

	cmovz	$t0, $a0
	cmovz	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovz	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovz	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	pop	%r13
	pop	%r12
	ret
.size	ecp_nistz256_sub,.-ecp_nistz256_sub

################################################################################
# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
.globl	ecp_nistz256_neg
.type	ecp_nistz256_neg,\@function,2
.align	32
ecp_nistz256_neg:
	push	%r12
	push	%r13

	xor	$a0, $a0
	xor	$a1, $a1
	xor	$a2, $a2
	xor	$a3, $a3
	xor	$t4, $t4

	sub	8*0($a_ptr), $a0
	sbb	8*1($a_ptr), $a1
	sbb	8*2($a_ptr), $a2
	mov	$a0, $t0
	sbb	8*3($a_ptr), $a3
	lea	.Lpoly(%rip), $a_ptr
	mov	$a1, $t1
	sbb	\$0, $t4

	add	8*0($a_ptr), $a0
	mov	$a2, $t2
	adc	8*1($a_ptr), $a1
	adc	8*2($a_ptr), $a2
	mov	$a3, $t3
	adc	8*3($a_ptr), $a3
	test	$t4, $t4

	cmovz	$t0, $a0
	cmovz	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovz	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovz	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	pop	%r13
	pop	%r12
	ret
.size	ecp_nistz256_neg,.-ecp_nistz256_neg
___
}
{
# Register map for the Montgomery multiply/square routines.  The second
# operand arrives in %rdx ($b_org) and is moved to %rbx ($b_ptr) because
# %rdx is an implicit operand of mulq/mulx.  $poly1/$poly3 alias $acc6/$acc7.
my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
my ($poly1,$poly3)=($acc6,$acc7);

$code.=<<___;
################################################################################
# void ecp_nistz256_to_mont(
#   uint64_t res[4],
#   uint64_t in[4]);
.globl	ecp_nistz256_to_mont
.type	ecp_nistz256_to_mont,\@function,2
.align	32
ecp_nistz256_to_mont:
___
# Capability probe, emitted only when the assembler can encode the
# mulx/adcx/adox path; the result in %ecx is compared after the pushes.
$code.=<<___	if ($addx);
	mov	\$0x80100, %ecx
	and	OPENSSL_ia32cap_P+8(%rip), %ecx
___
# to_mont is mul_mont with .LRR (2^512 mod P) as the second operand.
$code.=<<___;
	lea	.LRR(%rip), $b_org
	jmp	.Lmul_mont
.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont

################################################################################
# void ecp_nistz256_mul_mont(
#   uint64_t res[4],
#   uint64_t a[4],
#   uint64_t b[4]);

.globl	ecp_nistz256_mul_mont
.type	ecp_nistz256_mul_mont,\@function,3
.align	32
ecp_nistz256_mul_mont:
___
$code.=<<___	if ($addx);
	mov	\$0x80100, %ecx
	and	OPENSSL_ia32cap_P+8(%rip), %ecx
___
$code.=<<___;
.Lmul_mont:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
# Take the mulx path only when every bit of the 0x80100 mask is set.
$code.=<<___	if ($addx);
	cmp	\$0x80100, %ecx
	je	.Lmul_montx
___
$code.=<<___;
	mov	$b_org, $b_ptr
	mov	8*0($b_org), %rax
	mov	8*0($a_ptr), $acc1
	mov	8*1($a_ptr), $acc2
	mov	8*2($a_ptr), $acc3
	mov	8*3($a_ptr), $acc4

	call	__ecp_nistz256_mul_montq
___
$code.=<<___	if ($addx);
	jmp	.Lmul_mont_done

.align	32
.Lmul_montx:
	mov	$b_org, $b_ptr
	mov	8*0($b_org), %rdx
	mov	8*0($a_ptr), $acc1
	mov	8*1($a_ptr), $acc2
	mov	8*2($a_ptr), $acc3
	mov	8*3($a_ptr), $acc4
	lea	-128($a_ptr), $a_ptr	# control u-op density

	call	__ecp_nistz256_mul_montx
___
$code.=<<___;
.Lmul_mont_done:
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

.type	__ecp_nistz256_mul_montq,\@abi-omnipotent
.align	32
__ecp_nistz256_mul_montq:
	########################################################################
	# Multiply a by b[0]
	mov	%rax, $t1
	mulq	$acc1
	mov	.Lpoly+8*1(%rip),$poly1
	mov	%rax, $acc0
	mov	$t1, %rax
	mov	%rdx, $acc1

	mulq	$acc2
	mov	.Lpoly+8*3(%rip),$poly3
	add	%rax, $acc1
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc2

	mulq	$acc3
	add	%rax, $acc2
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc3

	mulq	$acc4
	add	%rax, $acc3
	mov	$acc0, %rax
	adc	\$0, %rdx
	xor	$acc5, $acc5
	mov	%rdx, $acc4

	########################################################################
	# First reduction step
	# Basically now we want to multiply acc[0] by p256,
	# and add the result to the acc.
	# Due to the special form of p256 we do some optimizations
	#
	# acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
	# then we add acc[0] and get acc[0] x 2^96

	mov	$acc0, $t1
	shl	\$32, $acc0
	mulq	$poly3
	shr	\$32, $t1
	add	$acc0, $acc1		# +=acc[0]<<96
	adc	$t1, $acc2
	adc	%rax, $acc3
	mov	8*1($b_ptr), %rax
	adc	%rdx, $acc4
	adc	\$0, $acc5
	xor	$acc0, $acc0

	########################################################################
	# Multiply by b[1]
	mov	%rax, $t1
	mulq	8*0($a_ptr)
	add	%rax, $acc1
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*1($a_ptr)
	add	$t0, $acc2
	adc	\$0, %rdx
	add	%rax, $acc2
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*2($a_ptr)
	add	$t0, $acc3
	adc	\$0, %rdx
	add	%rax, $acc3
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*3($a_ptr)
	add	$t0, $acc4
	adc	\$0, %rdx
	add	%rax, $acc4
	mov	$acc1, %rax
	adc	%rdx, $acc5
	adc	\$0, $acc0

	########################################################################
	# Second reduction step
	mov	$acc1, $t1
	shl	\$32, $acc1
	mulq	$poly3
	shr	\$32, $t1
	add	$acc1, $acc2
	adc	$t1, $acc3
	adc	%rax, $acc4
	mov	8*2($b_ptr), %rax
	adc	%rdx, $acc5
	adc	\$0, $acc0
	xor	$acc1, $acc1

	########################################################################
	# Multiply by b[2]
	mov	%rax, $t1
	mulq	8*0($a_ptr)
	add	%rax, $acc2
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*1($a_ptr)
	add	$t0, $acc3
	adc	\$0, %rdx
	add	%rax, $acc3
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*2($a_ptr)
	add	$t0, $acc4
	adc	\$0, %rdx
	add	%rax, $acc4
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*3($a_ptr)
	add	$t0, $acc5
	adc	\$0, %rdx
	add	%rax, $acc5
	mov	$acc2, %rax
	adc	%rdx, $acc0
	adc	\$0, $acc1

	########################################################################
	# Third reduction step
	mov	$acc2, $t1
	shl	\$32, $acc2
	mulq	$poly3
	shr	\$32, $t1
	add	$acc2, $acc3
	adc	$t1, $acc4
	adc	%rax, $acc5
	mov	8*3($b_ptr), %rax
	adc	%rdx, $acc0
	adc	\$0, $acc1
	xor	$acc2, $acc2

	########################################################################
	# Multiply by b[3]
	mov	%rax, $t1
	mulq	8*0($a_ptr)
	add	%rax, $acc3
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*1($a_ptr)
	add	$t0, $acc4
	adc	\$0, %rdx
	add	%rax, $acc4
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*2($a_ptr)
	add	$t0, $acc5
	adc	\$0, %rdx
	add	%rax, $acc5
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*3($a_ptr)
	add	$t0, $acc0
	adc	\$0, %rdx
	add	%rax, $acc0
	mov	$acc3, %rax
	adc	%rdx, $acc1
	adc	\$0, $acc2

	########################################################################
	# Final reduction step
	mov	$acc3, $t1
	shl	\$32, $acc3
	mulq	$poly3
	shr	\$32, $t1
	add	$acc3, $acc4
	adc	$t1, $acc5
	mov	$acc4, $t0
	adc	%rax, $acc0
	adc	%rdx, $acc1
	mov	$acc5, $t1
	adc	\$0, $acc2

	########################################################################
	# Branch-less conditional subtraction of P
	sub	\$-1, $acc4		# .Lpoly[0]
	mov	$acc0, $t2
	sbb	$poly1, $acc5		# .Lpoly[1]
	sbb	\$0, $acc0		# .Lpoly[2]
	mov	$acc1, $t3
	sbb	$poly3, $acc1		# .Lpoly[3]
	sbb	\$0, $acc2

	cmovc	$t0, $acc4
	cmovc	$t1, $acc5
	mov	$acc4, 8*0($r_ptr)
	cmovc	$t2, $acc0
	mov	$acc5, 8*1($r_ptr)
	cmovc	$t3, $acc1
	mov	$acc0, 8*2($r_ptr)
	mov	$acc1, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq

################################################################################
# void ecp_nistz256_sqr_mont(
#   uint64_t res[4],
#   uint64_t a[4]);

# we optimize the square according to S.Gueron and V.Krasnov,
# "Speeding up Big-Number Squaring"
.globl	ecp_nistz256_sqr_mont
.type	ecp_nistz256_sqr_mont,\@function,2
.align	32
ecp_nistz256_sqr_mont:
___
# Capability probe, emitted only when $addx is set: %ecx receives the
# OPENSSL_ia32cap_P+8 word masked with 0x80100.  It is done before the
# pushes and compared afterwards.
$code.=<<___	if ($addx);
	mov	\$0x80100, %ecx
	and	OPENSSL_ia32cap_P+8(%rip), %ecx
___
# Common prologue: save the six registers that the epilogue restores.
$code.=<<___;
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
# Branch to the mulx-based squaring only when every masked bit is set.
$code.=<<___	if ($addx);
	cmp	\$0x80100, %ecx
	je	.Lsqr_montx
___
# mulq path: a[0] is passed in %rax, a[1..3] in $acc6, $acc7 and $acc0.
$code.=<<___;
	mov	8*0($a_ptr), %rax
	mov	8*1($a_ptr), $acc6
	mov	8*2($a_ptr), $acc7
	mov	8*3($a_ptr), $acc0

	call	__ecp_nistz256_sqr_montq
___
# mulx path: a[0] is passed in %rdx instead, and $a_ptr is biased by -128;
# the callee compensates with +128 displacements.
$code.=<<___	if ($addx);
	jmp	.Lsqr_mont_done

.align	32
.Lsqr_montx:
	mov	8*0($a_ptr), %rdx
	mov	8*1($a_ptr), $acc6
	mov	8*2($a_ptr), $acc7
	mov	8*3($a_ptr), $acc0
	lea	-128($a_ptr), $a_ptr	# control u-op density

	call	__ecp_nistz256_sqr_montx
___
# Shared epilogue, followed by the mulq-based squaring helper.  The helper
# reuses $a_ptr as a scratch register for .Lpoly[1] once its last load
# through the pointer is done.
$code.=<<___;
.Lsqr_mont_done:
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

.type	__ecp_nistz256_sqr_montq,\@abi-omnipotent
.align	32
__ecp_nistz256_sqr_montq:
	mov	%rax, $acc5
	mulq	$acc6			# a[1]*a[0]
	mov	%rax, $acc1
	mov	$acc7, %rax
	mov	%rdx, $acc2

	mulq	$acc5			# a[0]*a[2]
	add	%rax, $acc2
	mov	$acc0, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc3

	mulq	$acc5			# a[0]*a[3]
	add	%rax, $acc3
	mov	$acc7, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc4

	#################################
	mulq	$acc6			# a[1]*a[2]
	add	%rax, $acc3
	mov	$acc0, %rax
	adc	\$0, %rdx
	mov	%rdx, $t1

	mulq	$acc6			# a[1]*a[3]
	add	%rax, $acc4
	mov	$acc0, %rax
	adc	\$0, %rdx
	add	$t1, $acc4
	mov	%rdx, $acc5
	adc	\$0, $acc5

	#################################
	mulq	$acc7			# a[2]*a[3]
	xor	$acc7, $acc7
	add	%rax, $acc5
	mov	8*0($a_ptr), %rax
	mov	%rdx, $acc6
	adc	\$0, $acc6

	add	$acc1, $acc1		# acc1:6<<1
	adc	$acc2, $acc2
	adc	$acc3, $acc3
	adc	$acc4, $acc4
	adc	$acc5, $acc5
	adc	$acc6, $acc6
	adc	\$0, $acc7

	mulq	%rax
	mov	%rax, $acc0
	mov	8*1($a_ptr), %rax
	mov	%rdx, $t0

	mulq	%rax
	add	$t0, $acc1
	adc	%rax, $acc2
	mov	8*2($a_ptr), %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	%rax
	add	$t0, $acc3
	adc	%rax, $acc4
	mov	8*3($a_ptr), %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	%rax
	add	$t0, $acc5
	adc	%rax, $acc6
	mov	$acc0, %rax
	adc	%rdx, $acc7

	mov	.Lpoly+8*1(%rip), $a_ptr
	mov	.Lpoly+8*3(%rip), $t1

	##########################################
	# Now the reduction
	# First iteration
	mov	$acc0, $t0
	shl	\$32, $acc0
	mulq	$t1
	shr	\$32, $t0
	add	$acc0, $acc1		# +=acc[0]<<96
	adc	$t0, $acc2
	adc	%rax, $acc3
	mov	$acc1, %rax
	adc	\$0, %rdx

	##########################################
	# Second iteration
	mov	$acc1, $t0
	shl	\$32, $acc1
	mov	%rdx, $acc0
	mulq	$t1
	shr	\$32, $t0
	add	$acc1, $acc2
	adc	$t0, $acc3
	adc	%rax, $acc0
	mov	$acc2, %rax
	adc	\$0, %rdx

	##########################################
	# Third iteration
	mov	$acc2, $t0
	shl	\$32, $acc2
	mov	%rdx, $acc1
	mulq	$t1
	shr	\$32, $t0
	add	$acc2, $acc3
	adc	$t0, $acc0
	adc	%rax, $acc1
	mov	$acc3, %rax
	adc	\$0, %rdx

	###########################################
	# Last iteration
	mov	$acc3, $t0
	shl	\$32, $acc3
	mov	%rdx, $acc2
	mulq	$t1
	shr	\$32, $t0
	add	$acc3, $acc0
	adc	$t0, $acc1
	adc	%rax, $acc2
	adc	\$0, %rdx
	xor	$acc3, $acc3

	############################################
	# Add the rest of the acc
	add	$acc0, $acc4
	adc	$acc1, $acc5
	mov	$acc4, $acc0
	adc	$acc2, $acc6
	adc	%rdx, $acc7
	mov	$acc5, $acc1
	adc	\$0, $acc3

	sub	\$-1, $acc4		# .Lpoly[0]
	mov	$acc6, $acc2
	sbb	$a_ptr, $acc5		# .Lpoly[1]
	sbb	\$0, $acc6		# .Lpoly[2]
	mov	$acc7, $t0
	sbb	$t1, $acc7		# .Lpoly[3]
	sbb	\$0, $acc3

	cmovc	$acc0, $acc4
	cmovc	$acc1, $acc5
	mov	$acc4, 8*0($r_ptr)
	cmovc	$acc2, $acc6
	mov	$acc5, 8*1($r_ptr)
	cmovc	$t0, $acc7
	mov	$acc6, 8*2($r_ptr)
	mov	$acc7, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
___

# The mulx/adcx/adox helpers are emitted only when the assembler probe set
# $addx.  __ecp_nistz256_mul_montx expects b[0] in %rdx, a[0..3] in
# $acc1..$acc4 and $a_ptr biased by -128, as set up at .Lmul_montx.  During
# its multiply steps $poly1 holds the shift count 32 rather than .Lpoly[1].
if ($addx) {
$code.=<<___;
.type	__ecp_nistz256_mul_montx,\@abi-omnipotent
.align	32
__ecp_nistz256_mul_montx:
	########################################################################
	# Multiply by b[0]
	mulx	$acc1, $acc0, $acc1
	mulx	$acc2, $t0, $acc2
	mov	\$32, $poly1
	xor	$acc5, $acc5		# cf=0
	mulx	$acc3, $t1, $acc3
	mov	.Lpoly+8*3(%rip), $poly3
	adc	$t0, $acc1
	mulx	$acc4, $t0, $acc4
	mov	$acc0, %rdx
	adc	$t1, $acc2
	shlx	$poly1,$acc0,$t1
	adc	$t0, $acc3
	shrx	$poly1,$acc0,$t0
	adc	\$0, $acc4

	########################################################################
	# First reduction step
	add	$t1, $acc1
	adc	$t0, $acc2

	mulx	$poly3, $t0, $t1
	mov	8*1($b_ptr), %rdx
	adc	$t0, $acc3
	adc	$t1, $acc4
	adc	\$0, $acc5
	xor	$acc0, $acc0		# $acc0=0,cf=0,of=0

	########################################################################
	# Multiply by b[1]
	mulx	8*0+128($a_ptr), $t0, $t1
	adcx	$t0, $acc1
	adox	$t1, $acc2

	mulx	8*1+128($a_ptr), $t0, $t1
	adcx	$t0, $acc2
	adox	$t1, $acc3

	mulx	8*2+128($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*3+128($a_ptr), $t0, $t1
	mov	$acc1, %rdx
	adcx	$t0, $acc4
	shlx	$poly1, $acc1, $t0
	adox	$t1, $acc5
	shrx	$poly1, $acc1, $t1

	adcx	$acc0, $acc5
	adox	$acc0, $acc0
	adc	\$0, $acc0

	########################################################################
	# Second reduction step
	add	$t0, $acc2
	adc	$t1, $acc3

	mulx	$poly3, $t0, $t1
	mov	8*2($b_ptr), %rdx
	adc	$t0, $acc4
	adc	$t1, $acc5
	adc	\$0, $acc0
	xor	$acc1	,$acc1		# $acc1=0,cf=0,of=0

	########################################################################
	# Multiply by b[2]
	mulx	8*0+128($a_ptr), $t0, $t1
	adcx	$t0, $acc2
	adox	$t1, $acc3

	mulx	8*1+128($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*2+128($a_ptr), $t0, $t1
	adcx	$t0, $acc4
	adox	$t1, $acc5

	mulx	8*3+128($a_ptr), $t0, $t1
	mov	$acc2, %rdx
	adcx	$t0, $acc5
	shlx	$poly1, $acc2, $t0
	adox	$t1, $acc0
	shrx	$poly1, $acc2, $t1

	adcx	$acc1, $acc0
	adox	$acc1, $acc1
	adc	\$0, $acc1

	########################################################################
	# Third reduction step
	add	$t0, $acc3
	adc	$t1, $acc4

	mulx	$poly3, $t0, $t1
	mov	8*3($b_ptr), %rdx
	adc	$t0, $acc5
	adc	$t1, $acc0
	adc	\$0, $acc1
	xor	$acc2, $acc2		# $acc2=0,cf=0,of=0

	########################################################################
	# Multiply by b[3]
	mulx	8*0+128($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*1+128($a_ptr), $t0, $t1
	adcx	$t0, $acc4
	adox	$t1, $acc5

	mulx	8*2+128($a_ptr), $t0, $t1
	adcx	$t0, $acc5
	adox	$t1, $acc0

	mulx	8*3+128($a_ptr), $t0, $t1
	mov	$acc3, %rdx
	adcx	$t0, $acc0
	shlx	$poly1, $acc3, $t0
	adox	$t1, $acc1
	shrx	$poly1, $acc3, $t1

	adcx	$acc2, $acc1
	adox	$acc2, $acc2
	adc	\$0, $acc2

	########################################################################
	# Fourth reduction step
	add	$t0, $acc4
	adc	$t1, $acc5

	mulx	$poly3, $t0, $t1
	mov	$acc4, $t2
	mov	.Lpoly+8*1(%rip), $poly1
	adc	$t0, $acc0
	mov	$acc5, $t3
	adc	$t1, $acc1
	adc	\$0, $acc2

	########################################################################
	# Branch-less conditional subtraction of P
	xor	%eax, %eax
	mov	$acc0, $t0
	sbb	\$-1, $acc4		# .Lpoly[0]
	sbb	$poly1, $acc5		# .Lpoly[1]
	sbb	\$0, $acc0		# .Lpoly[2]
	mov	$acc1, $t1
	sbb	$poly3, $acc1		# .Lpoly[3]
	sbb	\$0, $acc2

	cmovc	$t2, $acc4
	cmovc	$t3, $acc5
	mov	$acc4, 8*0($r_ptr)
	cmovc	$t0, $acc0
	mov	$acc5, 8*1($r_ptr)
	cmovc	$t1, $acc1
	mov	$acc0, 8*2($r_ptr)
	mov	$acc1, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx

.type	__ecp_nistz256_sqr_montx,\@abi-omnipotent
.align	32
__ecp_nistz256_sqr_montx:
	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
	xor	%eax, %eax
	adc	$t0, $acc2
	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
	mov	$acc6, %rdx
	adc	$t1, $acc3
	adc	\$0, $acc4
	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0

	#################################
	mulx	$acc7, $t0, $t1		# a[1]*a[2]
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	$acc0, $t0, $t1		# a[1]*a[3]
	mov	$acc7, %rdx
	adcx	$t0, $acc4
	adox	$t1, $acc5
	adc	\$0, $acc5

	#################################
	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
	mov	8*0+128($a_ptr), %rdx
	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
	adcx	$acc1, $acc1		# acc1:6<<1
	adox	$t0, $acc5
	adcx	$acc2, $acc2
	adox	$acc7, $acc6		# of=0

	mulx	%rdx, $acc0, $t1
	mov	8*1+128($a_ptr), %rdx
	adcx	$acc3, $acc3
	adox	$t1, $acc1
	adcx	$acc4, $acc4
	mulx	%rdx, $t0, $t4
	mov	8*2+128($a_ptr), %rdx
	adcx	$acc5, $acc5
	adox	$t0, $acc2
	adcx	$acc6, $acc6
	.byte	0x67
	mulx	%rdx, $t0, $t1
	mov	8*3+128($a_ptr), %rdx
	adox	$t4, $acc3
	adcx	$acc7, $acc7
	adox	$t0, $acc4
	mov	\$32, $a_ptr
	adox	$t1, $acc5
	.byte	0x67,0x67
	mulx	%rdx, $t0, $t4
	mov	$acc0, %rdx
	adox	$t0, $acc6
	shlx	$a_ptr, $acc0, $t0
	adox	$t4, $acc7
	shrx	$a_ptr, $acc0, $t4
	mov	.Lpoly+8*3(%rip), $t1

	# reduction step 1
	add	$t0, $acc1
	adc	$t4, $acc2

	mulx	$t1, $t0, $acc0
	mov	$acc1, %rdx
	adc	$t0, $acc3
	shlx	$a_ptr, $acc1, $t0
	adc	\$0, $acc0
	shrx	$a_ptr, $acc1, $t4

	# reduction step 2
	add	$t0, $acc2
	adc	$t4, $acc3

	mulx	$t1, $t0, $acc1
	mov	$acc2, %rdx
	adc	$t0, $acc0
	shlx	$a_ptr, $acc2, $t0
	adc	\$0, $acc1
	shrx	$a_ptr, $acc2, $t4

	# reduction step 3
	add	$t0, $acc3
	adc	$t4, $acc0

	mulx	$t1, $t0, $acc2
	mov	$acc3, %rdx
	adc	$t0, $acc1
	shlx	$a_ptr, $acc3, $t0
	adc	\$0, $acc2
	shrx	$a_ptr, $acc3, $t4

	# reduction step 4
	add	$t0, $acc0
	adc	$t4, $acc1

	mulx	$t1, $t0, $acc3
	adc	$t0, $acc2
	adc	\$0, $acc3

	xor	$t3, $t3		# cf=0
	adc	$acc0, $acc4		# accumulate upper half
	mov	.Lpoly+8*1(%rip), $a_ptr
	adc	$acc1, $acc5
	mov	$acc4, $acc0
	adc	$acc2, $acc6
	adc	$acc3, $acc7
	mov	$acc5, $acc1
	adc	\$0, $t3

	xor	%eax, %eax		# cf=0
	sbb	\$-1, $acc4		# .Lpoly[0]
	mov	$acc6, $acc2
	sbb	$a_ptr, $acc5		# .Lpoly[1]
	sbb	\$0, $acc6		# .Lpoly[2]
	mov	$acc7, $acc3
	sbb	$t1, $acc7		# .Lpoly[3]
	sbb	\$0, $t3

	cmovc	$acc0, $acc4
	cmovc	$acc1, $acc5
	mov	$acc4, 8*0($r_ptr)
	cmovc	$acc2, $acc6
	mov	$acc5, 8*1($r_ptr)
	cmovc	$acc3, $acc7
	mov	$acc6, 8*2($r_ptr)
	mov	$acc7, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
___
# The first brace below ends the "if ($addx)" section that holds the
# mulx-based helpers; the second ends the lexical scope whose register map
# is shared by the mul_mont/sqr_mont family.
}
}
{
# Fresh scope with its own register map: result and input pointers, four
# accumulator words in %r8..%r11 and three temporaries.  The heredoc opened
# at the end of this span uses these names.
my ($r_ptr,$in_ptr)=("%rdi","%rsi");
my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11));
my ($t0,$t1,$t2)=("%rcx","%r12","%r13");

$code.=<<___;
################################################################################
# void ecp_nistz256_from_mont(
#   uint64_t res[4],
#   uint64_t in[4]);
# This one performs Montgomery multiplication by 1, so we only need the reduction

.globl	ecp_nistz256_from_mont
.type	ecp_nistz256_from_mont,\@function,2
.align	32
ecp_nistz256_from_mont:
	push	%r12
	push	%r13

	mov	8*0($in_ptr), %rax
	mov	.Lpoly+8*3(%rip), $t2
	mov	8*1($in_ptr), $acc1
	mov	8*2($in_ptr), $acc2
	mov	8*3($in_ptr), $acc3
	mov	%rax, $acc0
	mov	.Lpoly+8*1(%rip), $t1

	#########################################
	# First iteration
	mov	%rax, $t0
	shl	\$32, $acc0
	mulq	$t2
	shr	\$32, $t0
	add	$acc0, $acc1
	adc	$t0, $acc2
	adc	%rax, $acc3
	mov	$acc1, %rax
	adc	\$0, %rdx

	#########################################
	# Second iteration
	mov	$acc1, $t0
	shl	\$32, $acc1
	mov	%rdx, $acc0
	mulq	$t2
	shr	\$32, $t0
	add	$acc1, $acc2
	adc	$t0, $acc3
	adc	%rax, $acc0
	mov	$acc2, %rax
	adc	\$0, %rdx

	##########################################
	# Third iteration
	mov	$acc2, $t0
	shl	\$32, $acc2
	mov	%rdx, $acc1
	mulq	$t2
	shr	\$32, $t0
	add	$acc2, $acc3
	adc	$t0, $acc0
	adc	%rax, $acc1
	mov	$acc3, %rax
	adc	\$0, %rdx

	###########################################
	# Last iteration
	mov	$acc3, $t0
	shl	\$32, $acc3
	mov	%rdx, $acc2
	mulq	$t2
	shr	\$32, $t0
	add	$acc3, $acc0
	adc	$t0, $acc1
	mov	$acc0, $t0
	adc	%rax, $acc2
	mov	$acc1, $in_ptr
	adc	\$0, %rdx

	###########################################
	# Branch-less conditional subtraction
	sub	\$-1, $acc0
	mov	$acc2, %rax
	sbb	$t1, $acc1
	sbb	\$0, $acc2
	mov	%rdx, $acc3
	sbb	$t2, %rdx
	sbb	$t2, $t2

	cmovnz	$t0, $acc0
	cmovnz	$in_ptr, $acc1
	mov	$acc0, 8*0($r_ptr)
	cmovnz	%rax, $acc2
	mov	$acc1, 8*1($r_ptr)
	cmovz	%rdx, $acc3
	mov	$acc2, 8*2($r_ptr)
	mov	$acc3, 8*3($r_ptr)

	pop	%r13
	pop	%r12
	ret
.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
___
}
{
# Table-select routines (SSE2 path).  The argument registers depend on the
# calling convention; all sixteen xmm registers are used, which is why the
# Win64 build saves and restores %xmm6..%xmm15 around the loop.
#
# Both routines walk the whole table and accumulate every entry under a
# compare mask, so each call reads all entries whatever the index.  The
# running counter starts at 1, so an index of 0 matches no entry and the
# output is all zero bits.
#
# An earlier revision also declared a second alias list over %xmm8..15
# here; it named three variables twice in one "my" and none of the names
# was referenced in this scope, so it has been dropped.
my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));

$code.=<<___;
################################################################################
# void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
.globl	ecp_nistz256_select_w5
.type	ecp_nistz256_select_w5,\@abi-omnipotent
.align	32
ecp_nistz256_select_w5:
___
# When the assembler can encode AVX2, dispatch at run time on bit 5 of the
# capability word at OPENSSL_ia32cap_P+8.  The branch target is not defined
# in this part of the file.
$code.=<<___	if ($avx>1);
	mov	OPENSSL_ia32cap_P+8(%rip), %eax
	test	\$`1<<5`, %eax
	jnz	.Lavx2_select_w5
___
# Win64 prologue: reserve 0xa8 bytes of stack and save %xmm6..%xmm15.  The
# instructions are emitted as raw bytes; the trailing comments give their
# mnemonics.
$code.=<<___	if ($win64);
	lea	-0x88(%rsp), %rax
.LSEH_begin_ecp_nistz256_select_w5:
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
___
# Main loop: 16 iterations, each consuming one 96-byte table entry.
$code.=<<___;
	movdqa	.LOne(%rip), $ONE
	movd	$index, $INDEX

	pxor	$Ra, $Ra
	pxor	$Rb, $Rb
	pxor	$Rc, $Rc
	pxor	$Rd, $Rd
	pxor	$Re, $Re
	pxor	$Rf, $Rf

	movdqa	$ONE, $M0
	pshufd	\$0, $INDEX, $INDEX

	mov	\$16, %rax
.Lselect_loop_sse_w5:

	movdqa	$M0, $TMP0
	paddd	$ONE, $M0
	pcmpeqd $INDEX, $TMP0

	movdqa	16*0($in_t), $T0a
	movdqa	16*1($in_t), $T0b
	movdqa	16*2($in_t), $T0c
	movdqa	16*3($in_t), $T0d
	movdqa	16*4($in_t), $T0e
	movdqa	16*5($in_t), $T0f
	lea	16*6($in_t), $in_t

	pand	$TMP0, $T0a
	pand	$TMP0, $T0b
	por	$T0a, $Ra
	pand	$TMP0, $T0c
	por	$T0b, $Rb
	pand	$TMP0, $T0d
	por	$T0c, $Rc
	pand	$TMP0, $T0e
	por	$T0d, $Rd
	pand	$TMP0, $T0f
	por	$T0e, $Re
	por	$T0f, $Rf

	dec	%rax
	jnz	.Lselect_loop_sse_w5

	movdqu	$Ra, 16*0($val)
	movdqu	$Rb, 16*1($val)
	movdqu	$Rc, 16*2($val)
	movdqu	$Rd, 16*3($val)
	movdqu	$Re, 16*4($val)
	movdqu	$Rf, 16*5($val)
___
# Win64 epilogue: restore the saved xmm registers and release the frame.
$code.=<<___	if ($win64);
	movaps	(%rsp), %xmm6
	movaps	0x10(%rsp), %xmm7
	movaps	0x20(%rsp), %xmm8
	movaps	0x30(%rsp), %xmm9
	movaps	0x40(%rsp), %xmm10
	movaps	0x50(%rsp), %xmm11
	movaps	0x60(%rsp), %xmm12
	movaps	0x70(%rsp), %xmm13
	movaps	0x80(%rsp), %xmm14
	movaps	0x90(%rsp), %xmm15
	lea	0xa8(%rsp), %rsp
.LSEH_end_ecp_nistz256_select_w5:
___
$code.=<<___;
	ret
.size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5

################################################################################
# void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
.globl	ecp_nistz256_select_w7
.type	ecp_nistz256_select_w7,\@abi-omnipotent
.align	32
ecp_nistz256_select_w7:
___
# Same run-time AVX2 dispatch as in select_w5.
$code.=<<___	if ($avx>1);
	mov	OPENSSL_ia32cap_P+8(%rip), %eax
	test	\$`1<<5`, %eax
	jnz	.Lavx2_select_w7
___
# Win64 prologue, identical in layout to the select_w5 one.
$code.=<<___	if ($win64);
	lea	-0x88(%rsp), %rax
.LSEH_begin_ecp_nistz256_select_w7:
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
___
# Main loop: 64 iterations, each consuming one 64-byte table entry.
$code.=<<___;
	movdqa	.LOne(%rip), $M0
	movd	$index, $INDEX

	pxor	$Ra, $Ra
	pxor	$Rb, $Rb
	pxor	$Rc, $Rc
	pxor	$Rd, $Rd

	movdqa	$M0, $ONE
	pshufd	\$0, $INDEX, $INDEX
	mov	\$64, %rax

.Lselect_loop_sse_w7:
	movdqa	$M0, $TMP0
	paddd	$ONE, $M0
	movdqa	16*0($in_t), $T0a
	movdqa	16*1($in_t), $T0b
	pcmpeqd	$INDEX, $TMP0
	movdqa	16*2($in_t), $T0c
	movdqa	16*3($in_t), $T0d
	lea	16*4($in_t), $in_t

	pand	$TMP0, $T0a
	pand	$TMP0, $T0b
	por	$T0a, $Ra
	pand	$TMP0, $T0c
	por	$T0b, $Rb
	pand	$TMP0, $T0d
	por	$T0c, $Rc
	prefetcht0	255($in_t)
	por	$T0d, $Rd

	dec	%rax
	jnz	.Lselect_loop_sse_w7

	movdqu	$Ra, 16*0($val)
	movdqu	$Rb, 16*1($val)
	movdqu	$Rc, 16*2($val)
	movdqu	$Rd, 16*3($val)
___
# Win64 epilogue.
$code.=<<___	if ($win64);
	movaps	(%rsp), %xmm6
	movaps	0x10(%rsp), %xmm7
	movaps	0x20(%rsp), %xmm8
	movaps	0x30(%rsp), %xmm9
	movaps	0x40(%rsp), %xmm10
	movaps	0x50(%rsp), %xmm11
	movaps	0x60(%rsp), %xmm12
	movaps	0x70(%rsp), %xmm13
	movaps	0x80(%rsp), %xmm14
	movaps	0x90(%rsp), %xmm15
	lea	0xa8(%rsp), %rsp
.LSEH_end_ecp_nistz256_select_w7:
___
$code.=<<___;
	ret
.size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
___
}
if ($avx>1) {
my 
($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); 1558my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4)); 1559my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9)); 1560my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14)); 1561 1562$code.=<<___; 1563################################################################################ 1564# void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index); 1565.type ecp_nistz256_avx2_select_w5,\@abi-omnipotent 1566.align 32 1567ecp_nistz256_avx2_select_w5: 1568.Lavx2_select_w5: 1569 vzeroupper 1570___ 1571$code.=<<___ if ($win64); 1572 lea -0x88(%rsp), %rax 1573.LSEH_begin_ecp_nistz256_avx2_select_w5: 1574 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp 1575 .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax) 1576 .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax) 1577 .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8, 0(%rax) 1578 .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9, 0x10(%rax) 1579 .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10, 0x20(%rax) 1580 .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11, 0x30(%rax) 1581 .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12, 0x40(%rax) 1582 .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13, 0x50(%rax) 1583 .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14, 0x60(%rax) 1584 .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15, 0x70(%rax) 1585___ 1586$code.=<<___; 1587 vmovdqa .LTwo(%rip), $TWO # counter stride: two table entries are scanned per iteration 1588 1589 vpxor $Ra, $Ra, $Ra # result accumulators start at zero 1590 vpxor $Rb, $Rb, $Rb 1591 vpxor $Rc, $Rc, $Rc 1592 1593 vmovdqa .LOne(%rip), $M0 # counter of the first entry in each pair, starts at 1 1594 vmovdqa .LTwo(%rip), $M1 # counter of the second entry in each pair, starts at 2 1595 1596 vmovd $index, %xmm1 # %xmm1 is the low half of the index register 1597 vpermd $INDEX, $Ra, $INDEX # broadcast index to all dword lanes; the zeroed accumulator is the permute control 1598 1599 mov \$8, %rax # 8 iterations of 2 entries = 16 table entries 1600.Lselect_loop_avx2_w5: 1601 1602 vmovdqa 32*0($in_t), $T0a # first entry of the pair, 96 bytes 1603 vmovdqa 32*1($in_t), $T0b 1604 vmovdqa 32*2($in_t), $T0c 1605 1606 vmovdqa 32*3($in_t), $T1a # second entry of the pair 1607 vmovdqa 32*4($in_t), $T1b 1608 vmovdqa 32*5($in_t), $T1c 1609 1610 vpcmpeqd $INDEX, $M0, $TMP0 # all-ones mask when the counter equals index, zero otherwise 1611 vpcmpeqd $INDEX, $M1, $TMP1 1612 1613 vpaddd $TWO, $M0, $M0 1614 vpaddd
$TWO, $M1, $M1 1615 lea 32*6($in_t), $in_t 1616 1617 vpand $TMP0, $T0a, $T0a 1618 vpand $TMP0, $T0b, $T0b 1619 vpand $TMP0, $T0c, $T0c 1620 vpand $TMP1, $T1a, $T1a 1621 vpand $TMP1, $T1b, $T1b 1622 vpand $TMP1, $T1c, $T1c 1623 1624 vpxor $T0a, $Ra, $Ra 1625 vpxor $T0b, $Rb, $Rb 1626 vpxor $T0c, $Rc, $Rc 1627 vpxor $T1a, $Ra, $Ra 1628 vpxor $T1b, $Rb, $Rb 1629 vpxor $T1c, $Rc, $Rc 1630 1631 dec %rax 1632 jnz .Lselect_loop_avx2_w5 1633 1634 vmovdqu $Ra, 32*0($val) 1635 vmovdqu $Rb, 32*1($val) 1636 vmovdqu $Rc, 32*2($val) 1637 vzeroupper 1638___ 1639$code.=<<___ if ($win64); 1640 movaps (%rsp), %xmm6 1641 movaps 0x10(%rsp), %xmm7 1642 movaps 0x20(%rsp), %xmm8 1643 movaps 0x30(%rsp), %xmm9 1644 movaps 0x40(%rsp), %xmm10 1645 movaps 0x50(%rsp), %xmm11 1646 movaps 0x60(%rsp), %xmm12 1647 movaps 0x70(%rsp), %xmm13 1648 movaps 0x80(%rsp), %xmm14 1649 movaps 0x90(%rsp), %xmm15 1650 lea 0xa8(%rsp), %rsp 1651.LSEH_end_ecp_nistz256_avx2_select_w5: 1652___ 1653$code.=<<___; 1654 ret 1655.size ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5 1656___ 1657} 1658if ($avx>1) { 1659my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); 1660my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3)); 1661my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7)); 1662my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11)); 1663my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15)); 1664 1665$code.=<<___; 1666 1667################################################################################ 1668# void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index); 1669.globl ecp_nistz256_avx2_select_w7 1670.type ecp_nistz256_avx2_select_w7,\@abi-omnipotent 1671.align 32 1672ecp_nistz256_avx2_select_w7: 1673.Lavx2_select_w7: 1674 vzeroupper 1675___ 1676$code.=<<___ if ($win64); 1677 lea -0x88(%rsp), %rax 1678.LSEH_begin_ecp_nistz256_avx2_select_w7: 1679 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp 1680 .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax) 1681 .byte 
0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax) 1682 .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8, 0(%rax) 1683 .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9, 0x10(%rax) 1684 .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10, 0x20(%rax) 1685 .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11, 0x30(%rax) 1686 .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12, 0x40(%rax) 1687 .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13, 0x50(%rax) 1688 .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14, 0x60(%rax) 1689 .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15, 0x70(%rax) 1690___ 1691$code.=<<___; 1692 vmovdqa .LThree(%rip), $THREE # counter stride: three table entries are scanned per iteration 1693 1694 vpxor $Ra, $Ra, $Ra # result accumulators start at zero 1695 vpxor $Rb, $Rb, $Rb 1696 1697 vmovdqa .LOne(%rip), $M0 # counters of the three entries handled per iteration: 1, 2, 3 1698 vmovdqa .LTwo(%rip), $M1 1699 vmovdqa .LThree(%rip), $M2 1700 1701 vmovd $index, %xmm1 # %xmm1 is the low half of the index register 1702 vpermd $INDEX, $Ra, $INDEX # broadcast index to all dword lanes; the zeroed accumulator is the permute control 1703 # Skip index = 0, because it is implicitly the point at infinity 1704 1705 mov \$21, %rax # 21 iterations of 3 entries = 63 entries; entry 64 is handled after the loop 1706.Lselect_loop_avx2_w7: 1707 1708 vmovdqa 32*0($in_t), $T0a 1709 vmovdqa 32*1($in_t), $T0b 1710 1711 vmovdqa 32*2($in_t), $T1a 1712 vmovdqa 32*3($in_t), $T1b 1713 1714 vmovdqa 32*4($in_t), $T2a 1715 vmovdqa 32*5($in_t), $T2b 1716 1717 vpcmpeqd $INDEX, $M0, $TMP0 # all-ones mask when the counter equals index, zero otherwise 1718 vpcmpeqd $INDEX, $M1, $TMP1 1719 vpcmpeqd $INDEX, $M2, $TMP2 1720 1721 vpaddd $THREE, $M0, $M0 1722 vpaddd $THREE, $M1, $M1 1723 vpaddd $THREE, $M2, $M2 1724 lea 32*6($in_t), $in_t 1725 1726 vpand $TMP0, $T0a, $T0a # keep an entry only where its mask is set 1727 vpand $TMP0, $T0b, $T0b 1728 vpand $TMP1, $T1a, $T1a 1729 vpand $TMP1, $T1b, $T1b 1730 vpand $TMP2, $T2a, $T2a 1731 vpand $TMP2, $T2b, $T2b 1732 1733 vpxor $T0a, $Ra, $Ra # fold the masked entries into the accumulators 1734 vpxor $T0b, $Rb, $Rb 1735 vpxor $T1a, $Ra, $Ra 1736 vpxor $T1b, $Rb, $Rb 1737 vpxor $T2a, $Ra, $Ra 1738 vpxor $T2b, $Rb, $Rb 1739 1740 dec %rax 1741 jnz .Lselect_loop_avx2_w7 1742 1743 1744 vmovdqa 32*0($in_t), $T0a # tail: the one entry left over after the loop 1745 vmovdqa 32*1($in_t), $T0b 1746 1747 vpcmpeqd $INDEX, $M0, $TMP0 1748 1749 vpand $TMP0, $T0a, $T0a 1750 vpand $TMP0, $T0b, $T0b 1751 1752 vpxor $T0a, $Ra, $Ra 1753 vpxor $T0b, $Rb,
$Rb 1754 1755 vmovdqu $Ra, 32*0($val) 1756 vmovdqu $Rb, 32*1($val) 1757 vzeroupper 1758___ 1759$code.=<<___ if ($win64); 1760 movaps (%rsp), %xmm6 1761 movaps 0x10(%rsp), %xmm7 1762 movaps 0x20(%rsp), %xmm8 1763 movaps 0x30(%rsp), %xmm9 1764 movaps 0x40(%rsp), %xmm10 1765 movaps 0x50(%rsp), %xmm11 1766 movaps 0x60(%rsp), %xmm12 1767 movaps 0x70(%rsp), %xmm13 1768 movaps 0x80(%rsp), %xmm14 1769 movaps 0x90(%rsp), %xmm15 1770 lea 0xa8(%rsp), %rsp 1771.LSEH_end_ecp_nistz256_avx2_select_w7: 1772___ 1773$code.=<<___; 1774 ret 1775.size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7 1776___ 1777} else { 1778$code.=<<___; 1779.globl ecp_nistz256_avx2_select_w7 1780.type ecp_nistz256_avx2_select_w7,\@function,3 1781.align 32 1782ecp_nistz256_avx2_select_w7: 1783 .byte 0x0f,0x0b # ud2 1784 ret 1785.size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7 1786___ 1787} 1788{{{ 1789######################################################################## 1790# This block implements higher level point_double, point_add and 1791# point_add_affine. The key to performance in this case is to allow 1792# out-of-order execution logic to overlap computations from next step 1793# with tail processing from current step. By using tailored calling 1794# sequence we minimize inter-step overhead to give processor better 1795# shot at overlapping operations... 1796# 1797# You will notice that input data is copied to the stack. The trouble is 1798# that there are no registers to spare for holding the original pointers, 1799# and reloading those pointers would create undesired dependencies on 1800# effective address calculation paths. In other words, this too is done 1801# to favour out-of-order execution logic.
1802# <appro@openssl.org> 1803 1804my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx"); 1805my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15)); 1806my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4); 1807my ($poly1,$poly3)=($acc6,$acc7); 1808 1809sub load_for_mul () { # returns asm text: first limb of b into src0, b_ptr -> b, four limbs of a into acc1..acc4, a_ptr -> a (biased by -128 unless src0 is %rax) 1810my ($a,$b,$src0) = @_; 1811my $bias = $src0 eq "%rax" ? 0 : -128; 1812 1813" mov $b, $src0 1814 lea $b, $b_ptr 1815 mov 8*0+$a, $acc1 1816 mov 8*1+$a, $acc2 1817 lea $bias+$a, $a_ptr 1818 mov 8*2+$a, $acc3 1819 mov 8*3+$a, $acc4" 1820} 1821 1822sub load_for_sqr () { # returns asm text: limb 0 of a into src0, limbs 1..3 into acc6, acc7, acc0, a_ptr -> a (same bias rule as load_for_mul) 1823my ($a,$src0) = @_; 1824my $bias = $src0 eq "%rax" ? 0 : -128; 1825 1826" mov 8*0+$a, $src0 1827 mov 8*1+$a, $acc6 1828 lea $bias+$a, $a_ptr 1829 mov 8*2+$a, $acc7 1830 mov 8*3+$a, $acc0" 1831} 1832 1833 { 1834######################################################################## 1835# operate in 4-5-0-1 "name space" that matches multiplication output 1836# 1837my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); 1838 1839$code.=<<___; 1840.type __ecp_nistz256_add_toq,\@abi-omnipotent 1841.align 32 1842__ecp_nistz256_add_toq: # (a + b) mod P: a arrives in registers, b is read through the b pointer, the sum stays in registers and is stored through the result pointer 1843 add 8*0($b_ptr), $a0 1844 adc 8*1($b_ptr), $a1 1845 mov $a0, $t0 1846 adc 8*2($b_ptr), $a2 1847 adc 8*3($b_ptr), $a3 1848 mov $a1, $t1 1849 sbb $t4, $t4 # zero or all-ones: minus the carry out of the 256-bit add 1850 1851 sub \$-1, $a0 # tentatively subtract P; the immediates are the constant limbs of P 1852 mov $a2, $t2 1853 sbb $poly1, $a1 1854 sbb \$0, $a2 1855 mov $a3, $t3 1856 sbb $poly3, $a3 1857 sbb \$0, $t4 # borrow survives only if the add did not carry and the sum is below P 1858 1859 cmovc $t0, $a0 # sum below P: take back the unreduced sum, otherwise keep sum - P 1860 cmovc $t1, $a1 1861 mov $a0, 8*0($r_ptr) 1862 cmovc $t2, $a2 1863 mov $a1, 8*1($r_ptr) 1864 cmovc $t3, $a3 1865 mov $a2, 8*2($r_ptr) 1866 mov $a3, 8*3($r_ptr) 1867 1868 ret 1869.size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq 1870 1871.type __ecp_nistz256_sub_fromq,\@abi-omnipotent 1872.align 32 1873__ecp_nistz256_sub_fromq: 1874 sub 8*0($b_ptr), $a0 1875 sbb 8*1($b_ptr), $a1 1876 mov $a0, $t0 1877 sbb 8*2($b_ptr), $a2 1878 sbb 8*3($b_ptr), $a3 1879 mov $a1, $t1 1880 sbb $t4, $t4 1881 1882 add \$-1, $a0 1883 mov $a2, $t2 1884
adc $poly1, $a1 1885 adc \$0, $a2 1886 mov $a3, $t3 1887 adc $poly3, $a3 1888 test $t4, $t4 1889 1890 cmovz $t0, $a0 1891 cmovz $t1, $a1 1892 mov $a0, 8*0($r_ptr) 1893 cmovz $t2, $a2 1894 mov $a1, 8*1($r_ptr) 1895 cmovz $t3, $a3 1896 mov $a2, 8*2($r_ptr) 1897 mov $a3, 8*3($r_ptr) 1898 1899 ret 1900.size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq 1901 1902.type __ecp_nistz256_subq,\@abi-omnipotent 1903.align 32 1904__ecp_nistz256_subq: # difference mod P of two register operands; the result is returned in registers only, nothing is stored 1905 sub $a0, $t0 1906 sbb $a1, $t1 1907 mov $t0, $a0 1908 sbb $a2, $t2 1909 sbb $a3, $t3 1910 mov $t1, $a1 1911 sbb $t4, $t4 # all-ones if the subtraction borrowed 1912 1913 add \$-1, $t0 # tentatively add P back 1914 mov $t2, $a2 1915 adc $poly1, $t1 1916 adc \$0, $t2 1917 mov $t3, $a3 1918 adc $poly3, $t3 1919 test $t4, $t4 1920 1921 cmovnz $t0, $a0 # borrowed: use the value with P added back 1922 cmovnz $t1, $a1 1923 cmovnz $t2, $a2 1924 cmovnz $t3, $a3 1925 1926 ret 1927.size __ecp_nistz256_subq,.-__ecp_nistz256_subq 1928 1929.type __ecp_nistz256_mul_by_2q,\@abi-omnipotent 1930.align 32 1931__ecp_nistz256_mul_by_2q: # (2 * a) mod P: a arrives in registers, the result stays in registers and is stored through the result pointer 1932 add $a0, $a0 # a0:a3+a0:a3 1933 adc $a1, $a1 1934 mov $a0, $t0 1935 adc $a2, $a2 1936 adc $a3, $a3 1937 mov $a1, $t1 1938 sbb $t4, $t4 # zero or all-ones: minus the carry out of the 256-bit doubling 1939 1940 sub \$-1, $a0 # tentatively subtract P; the immediates are the constant limbs of P 1941 mov $a2, $t2 1942 sbb $poly1, $a1 1943 sbb \$0, $a2 1944 mov $a3, $t3 1945 sbb $poly3, $a3 1946 sbb \$0, $t4 # borrow survives only if the doubling did not carry and the value is below P 1947 1948 cmovc $t0, $a0 # value below P: take back the unreduced value, otherwise keep value - P 1949 cmovc $t1, $a1 1950 mov $a0, 8*0($r_ptr) 1951 cmovc $t2, $a2 1952 mov $a1, 8*1($r_ptr) 1953 cmovc $t3, $a3 1954 mov $a2, 8*2($r_ptr) 1955 mov $a3, 8*3($r_ptr) 1956 1957 ret 1958.size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q 1959___ 1960 } 1961sub gen_double () { # appends ecp_nistz256_point_double to the generated code, or ecp_nistz256_point_doublex when called with "x" 1962 my $x = shift; 1963 my ($src0,$sfx,$bias); 1964 my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4)); # stack-frame offsets of the five 32-byte temporaries 1965 1966 if ($x ne "x") { 1967 $src0 = "%rax"; 1968 $sfx = ""; 1969 $bias = 0; 1970 1971$code.=<<___; 1972.globl ecp_nistz256_point_double 1973.type ecp_nistz256_point_double,\@function,2 1974.align 32 1975ecp_nistz256_point_double: 1976___ 1977$code.=<<___ if ($addx); 1978 mov \$0x80100, %ecx 1979 and OPENSSL_ia32cap_P+8(%rip), %ecx 1980 cmp
\$0x80100, %ecx 1981 je .Lpoint_doublex 1982___ 1983 } else { 1984 $src0 = "%rdx"; 1985 $sfx = "x"; 1986 $bias = 128; 1987 1988$code.=<<___; 1989.type ecp_nistz256_point_doublex,\@function,2 1990.align 32 1991ecp_nistz256_point_doublex: 1992.Lpoint_doublex: 1993___ 1994 } 1995$code.=<<___; 1996 push %rbp 1997 push %rbx 1998 push %r12 1999 push %r13 2000 push %r14 2001 push %r15 2002 sub \$32*5+8, %rsp 2003 2004.Lpoint_double_shortcut$x: 2005 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x 2006 mov $a_ptr, $b_ptr # backup copy 2007 movdqu 0x10($a_ptr), %xmm1 2008 mov 0x20+8*0($a_ptr), $acc4 # load in_y in "4-5-0-1" order 2009 mov 0x20+8*1($a_ptr), $acc5 2010 mov 0x20+8*2($a_ptr), $acc0 2011 mov 0x20+8*3($a_ptr), $acc1 2012 mov .Lpoly+8*1(%rip), $poly1 # limbs 1 and 3 of P are kept in registers for the add/sub helpers 2013 mov .Lpoly+8*3(%rip), $poly3 2014 movdqa %xmm0, $in_x(%rsp) 2015 movdqa %xmm1, $in_x+0x10(%rsp) 2016 lea 0x20($r_ptr), $acc2 2017 lea 0x40($r_ptr), $acc3 2018 movq $r_ptr, %xmm0 # park the result pointers in xmm registers: res_x 2019 movq $acc2, %xmm1 # res_y 2020 movq $acc3, %xmm2 # res_z 2021 2022 lea $S(%rsp), $r_ptr 2023 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(S, in_y); 2024 2025 mov 0x40+8*0($a_ptr), $src0 2026 mov 0x40+8*1($a_ptr), $acc6 2027 mov 0x40+8*2($a_ptr), $acc7 2028 mov 0x40+8*3($a_ptr), $acc0 2029 lea 0x40-$bias($a_ptr), $a_ptr 2030 lea $Zsqr(%rsp), $r_ptr 2031 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Zsqr, in_z); 2032 2033 `&load_for_sqr("$S(%rsp)", "$src0")` 2034 lea $S(%rsp), $r_ptr 2035 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(S, S); 2036 2037 mov 0x20($b_ptr), $src0 # $b_ptr is still valid 2038 mov 0x40+8*0($b_ptr), $acc1 2039 mov 0x40+8*1($b_ptr), $acc2 2040 mov 0x40+8*2($b_ptr), $acc3 2041 mov 0x40+8*3($b_ptr), $acc4 2042 lea 0x40-$bias($b_ptr), $a_ptr 2043 lea 0x20($b_ptr), $b_ptr 2044 movq %xmm2, $r_ptr 2045 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, in_z, in_y); 2046 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(res_z, res_z); 2047 2048 mov $in_x+8*0(%rsp), $acc4 # "4-5-0-1" order 2049 mov $in_x+8*1(%rsp), $acc5
2050 lea $Zsqr(%rsp), $b_ptr 2051 mov $in_x+8*2(%rsp), $acc0 2052 mov $in_x+8*3(%rsp), $acc1 2053 lea $M(%rsp), $r_ptr 2054 call __ecp_nistz256_add_to$x # p256_add(M, in_x, Zsqr); 2055 2056 mov $in_x+8*0(%rsp), $acc4 # "4-5-0-1" order 2057 mov $in_x+8*1(%rsp), $acc5 2058 lea $Zsqr(%rsp), $b_ptr 2059 mov $in_x+8*2(%rsp), $acc0 2060 mov $in_x+8*3(%rsp), $acc1 2061 lea $Zsqr(%rsp), $r_ptr 2062 call __ecp_nistz256_sub_from$x # p256_sub(Zsqr, in_x, Zsqr); 2063 2064 `&load_for_sqr("$S(%rsp)", "$src0")` 2065 movq %xmm1, $r_ptr 2066 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S); 2067___ 2068{ 2069######## ecp_nistz256_div_by_2(res_y, res_y); ########################## 2070# operate in 4-5-6-7 "name space" that matches squaring output 2071# 2072my ($poly1,$poly3)=($a_ptr,$t1); 2073my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2); 2074 2075$code.=<<___; 2076 xor $t4, $t4 2077 mov $a0, $t0 2078 add \$-1, $a0 # tentatively add P so that an odd value becomes even 2079 mov $a1, $t1 2080 adc $poly1, $a1 2081 mov $a2, $t2 2082 adc \$0, $a2 2083 mov $a3, $t3 2084 adc $poly3, $a3 2085 adc \$0, $t4 # carry out of the addition becomes bit 256 2086 xor $a_ptr, $a_ptr # borrow $a_ptr 2087 test \$1, $t0 # was the original value even? 2088 2089 cmovz $t0, $a0 # even: discard the addition and restore the original limbs 2090 cmovz $t1, $a1 2091 cmovz $t2, $a2 2092 cmovz $t3, $a3 2093 cmovz $a_ptr, $t4 2094 2095 mov $a1, $t0 # a0:a3>>1 2096 shr \$1, $a0 2097 shl \$63, $t0 2098 mov $a2, $t1 2099 shr \$1, $a1 2100 or $t0, $a0 2101 shl \$63, $t1 2102 mov $a3, $t2 2103 shr \$1, $a2 2104 or $t1, $a1 2105 shl \$63, $t2 2106 mov $a0, 8*0($r_ptr) 2107 shr \$1, $a3 2108 mov $a1, 8*1($r_ptr) 2109 shl \$63, $t4 # bit 256 moves into the top bit of the highest limb 2110 or $t2, $a2 2111 or $t4, $a3 2112 mov $a2, 8*2($r_ptr) 2113 mov $a3, 8*3($r_ptr) 2114___ 2115} 2116$code.=<<___; 2117 `&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")` 2118 lea $M(%rsp), $r_ptr 2119 call __ecp_nistz256_mul_mont$x # p256_mul_mont(M, M, Zsqr); 2120 2121 lea $tmp0(%rsp), $r_ptr 2122 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(tmp0, M); 2123 2124 lea $M(%rsp), $b_ptr 2125 lea $M(%rsp), $r_ptr 2126 call __ecp_nistz256_add_to$x # p256_mul_by_3(M,
M); 2127 2128 `&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")` 2129 lea $S(%rsp), $r_ptr 2130 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, in_x); 2131 2132 lea $tmp0(%rsp), $r_ptr 2133 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(tmp0, S); 2134 2135 `&load_for_sqr("$M(%rsp)", "$src0")` 2136 movq %xmm0, $r_ptr 2137 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_x, M); 2138 2139 lea $tmp0(%rsp), $b_ptr 2140 mov $acc6, $acc0 # harmonize sqr output and sub input 2141 mov $acc7, $acc1 2142 mov $a_ptr, $poly1 2143 mov $t1, $poly3 2144 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, tmp0); 2145 2146 mov $S+8*0(%rsp), $t0 2147 mov $S+8*1(%rsp), $t1 2148 mov $S+8*2(%rsp), $t2 2149 mov $S+8*3(%rsp), $acc2 # "4-5-0-1" order 2150 lea $S(%rsp), $r_ptr 2151 call __ecp_nistz256_sub$x # p256_sub(S, S, res_x); 2152 2153 mov $M(%rsp), $src0 2154 lea $M(%rsp), $b_ptr 2155 mov $acc4, $acc6 # harmonize sub output and mul input 2156 xor %ecx, %ecx 2157 mov $acc4, $S+8*0(%rsp) # have to save:-( 2158 mov $acc5, $acc2 2159 mov $acc5, $S+8*1(%rsp) 2160 cmovz $acc0, $acc3 2161 mov $acc0, $S+8*2(%rsp) 2162 lea $S-$bias(%rsp), $a_ptr 2163 cmovz $acc1, $acc4 2164 mov $acc1, $S+8*3(%rsp) 2165 mov $acc6, $acc1 2166 lea $S(%rsp), $r_ptr 2167 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, M); 2168 2169 movq %xmm1, $b_ptr 2170 movq %xmm1, $r_ptr 2171 call __ecp_nistz256_sub_from$x # p256_sub(res_y, S, res_y); 2172 2173 add \$32*5+8, %rsp 2174 pop %r15 2175 pop %r14 2176 pop %r13 2177 pop %r12 2178 pop %rbx 2179 pop %rbp 2180 ret 2181.size ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx 2182___ 2183} 2184&gen_double("q"); 2185 2186sub gen_add () { 2187 my $x = shift; 2188 my ($src0,$sfx,$bias); 2189 my ($H,$Hsqr,$R,$Rsqr,$Hcub, 2190 $U1,$U2,$S1,$S2, 2191 $res_x,$res_y,$res_z, 2192 $in1_x,$in1_y,$in1_z, 2193 $in2_x,$in2_y,$in2_z)=map(32*$_,(0..17)); 2194 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); 2195 2196 if ($x ne "x") { 2197 $src0 = "%rax"; 2198 $sfx = ""; 
2199 $bias = 0; 2200 2201$code.=<<___; 2202.globl ecp_nistz256_point_add 2203.type ecp_nistz256_point_add,\@function,3 2204.align 32 2205ecp_nistz256_point_add: 2206___ 2207$code.=<<___ if ($addx); 2208 mov \$0x80100, %ecx 2209 and OPENSSL_ia32cap_P+8(%rip), %ecx 2210 cmp \$0x80100, %ecx 2211 je .Lpoint_addx 2212___ 2213 } else { 2214 $src0 = "%rdx"; 2215 $sfx = "x"; 2216 $bias = 128; 2217 2218$code.=<<___; 2219.type ecp_nistz256_point_addx,\@function,3 2220.align 32 2221ecp_nistz256_point_addx: 2222.Lpoint_addx: 2223___ 2224 } 2225$code.=<<___; 2226 push %rbp 2227 push %rbx 2228 push %r12 2229 push %r13 2230 push %r14 2231 push %r15 2232 sub \$32*18+8, %rsp 2233 2234 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr 2235 movdqu 0x10($a_ptr), %xmm1 2236 movdqu 0x20($a_ptr), %xmm2 2237 movdqu 0x30($a_ptr), %xmm3 2238 movdqu 0x40($a_ptr), %xmm4 2239 movdqu 0x50($a_ptr), %xmm5 2240 mov $a_ptr, $b_ptr # reassign 2241 mov $b_org, $a_ptr # reassign 2242 movdqa %xmm0, $in1_x(%rsp) 2243 movdqa %xmm1, $in1_x+0x10(%rsp) 2244 por %xmm0, %xmm1 2245 movdqa %xmm2, $in1_y(%rsp) 2246 movdqa %xmm3, $in1_y+0x10(%rsp) 2247 por %xmm2, %xmm3 2248 movdqa %xmm4, $in1_z(%rsp) 2249 movdqa %xmm5, $in1_z+0x10(%rsp) 2250 por %xmm1, %xmm3 2251 2252 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_ptr 2253 pshufd \$0xb1, %xmm3, %xmm5 2254 movdqu 0x10($a_ptr), %xmm1 2255 movdqu 0x20($a_ptr), %xmm2 2256 por %xmm3, %xmm5 2257 movdqu 0x30($a_ptr), %xmm3 2258 mov 0x40+8*0($a_ptr), $src0 # load original in2_z 2259 mov 0x40+8*1($a_ptr), $acc6 2260 mov 0x40+8*2($a_ptr), $acc7 2261 mov 0x40+8*3($a_ptr), $acc0 2262 movdqa %xmm0, $in2_x(%rsp) 2263 pshufd \$0x1e, %xmm5, %xmm4 2264 movdqa %xmm1, $in2_x+0x10(%rsp) 2265 por %xmm0, %xmm1 2266 movq $r_ptr, %xmm0 # save $r_ptr 2267 movdqa %xmm2, $in2_y(%rsp) 2268 movdqa %xmm3, $in2_y+0x10(%rsp) 2269 por %xmm2, %xmm3 2270 por %xmm4, %xmm5 2271 pxor %xmm4, %xmm4 2272 por %xmm1, %xmm3 2273 2274 lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid 2275 
mov $src0, $in2_z+8*0(%rsp) # make in2_z copy 2276 mov $acc6, $in2_z+8*1(%rsp) 2277 mov $acc7, $in2_z+8*2(%rsp) 2278 mov $acc0, $in2_z+8*3(%rsp) 2279 lea $Z2sqr(%rsp), $r_ptr # Z2^2 2280 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z); 2281 2282 pcmpeqd %xmm4, %xmm5 2283 pshufd \$0xb1, %xmm3, %xmm4 2284 por %xmm3, %xmm4 2285 pshufd \$0, %xmm5, %xmm5 # in1infty 2286 pshufd \$0x1e, %xmm4, %xmm3 2287 por %xmm3, %xmm4 2288 pxor %xmm3, %xmm3 2289 pcmpeqd %xmm3, %xmm4 2290 pshufd \$0, %xmm4, %xmm4 # in2infty 2291 mov 0x40+8*0($b_ptr), $src0 # load original in1_z 2292 mov 0x40+8*1($b_ptr), $acc6 2293 mov 0x40+8*2($b_ptr), $acc7 2294 mov 0x40+8*3($b_ptr), $acc0 2295 movq $b_ptr, %xmm1 2296 2297 lea 0x40-$bias($b_ptr), $a_ptr 2298 lea $Z1sqr(%rsp), $r_ptr # Z1^2 2299 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); 2300 2301 `&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")` 2302 lea $S1(%rsp), $r_ptr # S1 = Z2^3 2303 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, Z2sqr, in2_z); 2304 2305 `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` 2306 lea $S2(%rsp), $r_ptr # S2 = Z1^3 2307 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); 2308 2309 `&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")` 2310 lea $S1(%rsp), $r_ptr # S1 = Y1*Z2^3 2311 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, S1, in1_y); 2312 2313 `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` 2314 lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 2315 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); 2316 2317 lea $S1(%rsp), $b_ptr 2318 lea $R(%rsp), $r_ptr # R = S2 - S1 2319 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, S1); 2320 2321 or $acc5, $acc4 # see if result is zero 2322 movdqa %xmm4, %xmm2 2323 or $acc0, $acc4 2324 or $acc1, $acc4 2325 por %xmm5, %xmm2 # in1infty || in2infty 2326 movq $acc4, %xmm3 2327 2328 `&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")` 2329 lea $U1(%rsp), $r_ptr # U1 = X1*Z2^2 2330 call 
__ecp_nistz256_mul_mont$x # p256_mul_mont(U1, in1_x, Z2sqr); 2331 2332 `&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")` 2333 lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2 2334 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in2_x, Z1sqr); 2335 2336 lea $U1(%rsp), $b_ptr 2337 lea $H(%rsp), $r_ptr # H = U2 - U1 2338 call __ecp_nistz256_sub_from$x # p256_sub(H, U2, U1); 2339 2340 or $acc5, $acc4 # see if result is zero 2341 or $acc0, $acc4 2342 or $acc1, $acc4 2343 2344 .byte 0x3e # predict taken 2345 jnz .Ladd_proceed$x # is_equal(U1,U2)? 2346 movq %xmm2, $acc0 2347 movq %xmm3, $acc1 2348 test $acc0, $acc0 2349 jnz .Ladd_proceed$x # (in1infty || in2infty)? 2350 test $acc1, $acc1 2351 jz .Ladd_double$x # is_equal(S1,S2)? 2352 2353 movq %xmm0, $r_ptr # restore $r_ptr 2354 pxor %xmm0, %xmm0 2355 movdqu %xmm0, 0x00($r_ptr) 2356 movdqu %xmm0, 0x10($r_ptr) 2357 movdqu %xmm0, 0x20($r_ptr) 2358 movdqu %xmm0, 0x30($r_ptr) 2359 movdqu %xmm0, 0x40($r_ptr) 2360 movdqu %xmm0, 0x50($r_ptr) 2361 jmp .Ladd_done$x 2362 2363.align 32 2364.Ladd_double$x: 2365 movq %xmm1, $a_ptr # restore $a_ptr 2366 movq %xmm0, $r_ptr # restore $r_ptr 2367 add \$`32*(18-5)`, %rsp # difference in frame sizes 2368 jmp .Lpoint_double_shortcut$x 2369 2370.align 32 2371.Ladd_proceed$x: 2372 `&load_for_sqr("$R(%rsp)", "$src0")` 2373 lea $Rsqr(%rsp), $r_ptr # R^2 2374 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); 2375 2376 `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` 2377 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 2378 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); 2379 2380 `&load_for_sqr("$H(%rsp)", "$src0")` 2381 lea $Hsqr(%rsp), $r_ptr # H^2 2382 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); 2383 2384 `&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")` 2385 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 2386 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, res_z, in2_z); 2387 2388 `&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")` 2389 lea $Hcub(%rsp), 
$r_ptr # H^3 2390 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); 2391 2392 `&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")` 2393 lea $U2(%rsp), $r_ptr # U1*H^2 2394 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, U1, Hsqr); 2395___ 2396{ 2397####################################################################### 2398# operate in 4-5-0-1 "name space" that matches multiplication output 2399# 2400my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); 2401my ($poly1, $poly3)=($acc6,$acc7); 2402 2403$code.=<<___; 2404 #lea $U2(%rsp), $a_ptr 2405 #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 2406 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); 2407 2408 add $acc0, $acc0 # a0:a3+a0:a3 2409 lea $Rsqr(%rsp), $a_ptr 2410 adc $acc1, $acc1 2411 mov $acc0, $t0 2412 adc $acc2, $acc2 2413 adc $acc3, $acc3 2414 mov $acc1, $t1 2415 sbb $t4, $t4 # zero or all-ones: minus the carry out of the 256-bit doubling 2416 2417 sub \$-1, $acc0 # tentatively subtract P; the immediates are the constant limbs of P 2418 mov $acc2, $t2 2419 sbb $poly1, $acc1 2420 sbb \$0, $acc2 2421 mov $acc3, $t3 2422 sbb $poly3, $acc3 2423 sbb \$0, $t4 # borrow survives only if the doubling did not carry and the value is below P 2424 2425 cmovc $t0, $acc0 # value below P: take back the unreduced value, otherwise keep value - P 2426 mov 8*0($a_ptr), $t0 # the freed temporaries are reloaded with Rsqr for the subtraction below 2427 cmovc $t1, $acc1 2428 mov 8*1($a_ptr), $t1 2429 cmovc $t2, $acc2 2430 mov 8*2($a_ptr), $t2 2431 cmovc $t3, $acc3 2432 mov 8*3($a_ptr), $t3 2433 2434 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); 2435 2436 lea $Hcub(%rsp), $b_ptr 2437 lea $res_x(%rsp), $r_ptr 2438 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub); 2439 2440 mov $U2+8*0(%rsp), $t0 2441 mov $U2+8*1(%rsp), $t1 2442 mov $U2+8*2(%rsp), $t2 2443 mov $U2+8*3(%rsp), $t3 2444 lea $res_y(%rsp), $r_ptr 2445 2446 call __ecp_nistz256_sub$x # p256_sub(res_y, U2, res_x); 2447 2448 mov $acc0, 8*0($r_ptr) # save the result, as 2449 mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't 2450 mov $acc2, 8*2($r_ptr) 2451 mov $acc3, 8*3($r_ptr) 2452___ 2453} 2454$code.=<<___; 2455 `&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")` 2456 lea $S2(%rsp), $r_ptr 2457 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S1,
Hcub); 2458 2459 `&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")` 2460 lea $res_y(%rsp), $r_ptr 2461 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_y, R, res_y); 2462 2463 lea $S2(%rsp), $b_ptr 2464 lea $res_y(%rsp), $r_ptr 2465 call __ecp_nistz256_sub_from$x # p256_sub(res_y, res_y, S2); 2466 2467 movq %xmm0, $r_ptr # restore $r_ptr 2468 2469 movdqa %xmm5, %xmm0 # copy_conditional(res_z, in2_z, in1infty); 2470 movdqa %xmm5, %xmm1 2471 pandn $res_z(%rsp), %xmm0 2472 movdqa %xmm5, %xmm2 2473 pandn $res_z+0x10(%rsp), %xmm1 2474 movdqa %xmm5, %xmm3 2475 pand $in2_z(%rsp), %xmm2 2476 pand $in2_z+0x10(%rsp), %xmm3 2477 por %xmm0, %xmm2 2478 por %xmm1, %xmm3 2479 2480 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); 2481 movdqa %xmm4, %xmm1 2482 pandn %xmm2, %xmm0 2483 movdqa %xmm4, %xmm2 2484 pandn %xmm3, %xmm1 2485 movdqa %xmm4, %xmm3 2486 pand $in1_z(%rsp), %xmm2 2487 pand $in1_z+0x10(%rsp), %xmm3 2488 por %xmm0, %xmm2 2489 por %xmm1, %xmm3 2490 movdqu %xmm2, 0x40($r_ptr) 2491 movdqu %xmm3, 0x50($r_ptr) 2492 2493 movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); 2494 movdqa %xmm5, %xmm1 2495 pandn $res_x(%rsp), %xmm0 2496 movdqa %xmm5, %xmm2 2497 pandn $res_x+0x10(%rsp), %xmm1 2498 movdqa %xmm5, %xmm3 2499 pand $in2_x(%rsp), %xmm2 2500 pand $in2_x+0x10(%rsp), %xmm3 2501 por %xmm0, %xmm2 2502 por %xmm1, %xmm3 2503 2504 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); 2505 movdqa %xmm4, %xmm1 2506 pandn %xmm2, %xmm0 2507 movdqa %xmm4, %xmm2 2508 pandn %xmm3, %xmm1 2509 movdqa %xmm4, %xmm3 2510 pand $in1_x(%rsp), %xmm2 2511 pand $in1_x+0x10(%rsp), %xmm3 2512 por %xmm0, %xmm2 2513 por %xmm1, %xmm3 2514 movdqu %xmm2, 0x00($r_ptr) 2515 movdqu %xmm3, 0x10($r_ptr) 2516 2517 movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); 2518 movdqa %xmm5, %xmm1 2519 pandn $res_y(%rsp), %xmm0 2520 movdqa %xmm5, %xmm2 2521 pandn $res_y+0x10(%rsp), %xmm1 2522 movdqa %xmm5, %xmm3 2523 pand $in2_y(%rsp), %xmm2 2524 pand 
$in2_y+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_y(%rsp), %xmm2
	pand	$in1_y+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x20($r_ptr)
	movdqu	%xmm3, 0x30($r_ptr)

.Ladd_done$x:
	add	\$32*18+8, %rsp
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
.size	ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
___
}
&gen_add("q");

########################################################################
# gen_add_affine($x) appends one point_add_affine routine to $code.
#
# $x selects the code path:
#   "x"        - emits the non-global ecp_nistz256_point_add_affinex
#                entry (label .Lpoint_add_affinex), with $src0 = %rdx
#                and $bias = 128;
#   otherwise  - emits the global ecp_nistz256_point_add_affine with
#                $src0 = %rax and $bias = 0.  When $addx is set, a
#                prologue is added that masks OPENSSL_ia32cap_P+8 with
#                0x80100 and jumps to .Lpoint_add_affinex when both
#                bits are set (presumably the BMI2 and ADX feature
#                bits - confirm against the ia32cap layout).
#
# $x is also pasted onto the internal helper names, so every
# "call __ecp_nistz256_*$x" resolves to the matching flavour.
#
# The stack frame holds fifteen 32-byte slots (offsets 32*0 .. 32*14)
# plus 8 bytes; $Z1sqr deliberately shares its slot with $S2.
#
# NOTE: the empty prototype "()" is only harmless because every caller
# uses the "&gen_add_affine(...)" form, which bypasses prototype
# checking.  Keep the two in sync if either is ever changed.
sub gen_add_affine () {
    my $x = shift;
    my ($src0,$sfx,$bias);
    my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
	$res_x,$res_y,$res_z,
	$in1_x,$in1_y,$in1_z,
	$in2_x,$in2_y)=map(32*$_,(0..14));
    my $Z1sqr = $S2;

    if ($x ne "x") {
	$src0 = "%rax";
	$sfx = "";
	$bias = 0;

$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.type	ecp_nistz256_point_add_affine,\@function,3
.align	32
ecp_nistz256_point_add_affine:
___
$code.=<<___	if ($addx);
	mov	\$0x80100, %ecx
	and	OPENSSL_ia32cap_P+8(%rip), %ecx
	cmp	\$0x80100, %ecx
	je	.Lpoint_add_affinex
___
    } else {
	$src0 = "%rdx";
	$sfx = "x";
	$bias = 128;

$code.=<<___;
.type	ecp_nistz256_point_add_affinex,\@function,3
.align	32
ecp_nistz256_point_add_affinex:
.Lpoint_add_affinex:
___
    }
$code.=<<___;
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	sub	\$32*15+8, %rsp

	movdqu	0x00($a_ptr), %xmm0	# copy *(P256_POINT *)$a_ptr
	mov	$b_org, $b_ptr		# reassign
	movdqu	0x10($a_ptr), %xmm1
	movdqu	0x20($a_ptr), %xmm2
	movdqu	0x30($a_ptr), %xmm3
	movdqu	0x40($a_ptr), %xmm4
	movdqu	0x50($a_ptr), %xmm5
	 mov	0x40+8*0($a_ptr), $src0	# load original in1_z
	 mov	0x40+8*1($a_ptr), $acc6
	 mov	0x40+8*2($a_ptr), $acc7
	 mov	0x40+8*3($a_ptr), $acc0
	movdqa	%xmm0, $in1_x(%rsp)
	movdqa	%xmm1, $in1_x+0x10(%rsp)
	por	%xmm0, %xmm1
	movdqa	%xmm2, $in1_y(%rsp)
	movdqa	%xmm3, $in1_y+0x10(%rsp)
	por	%xmm2, %xmm3
	movdqa	%xmm4, $in1_z(%rsp)
	movdqa	%xmm5, $in1_z+0x10(%rsp)
	por	%xmm1, %xmm3

	movdqu	0x00($b_ptr), %xmm0	# copy *(P256_POINT_AFFINE *)$b_ptr
	 pshufd	\$0xb1, %xmm3, %xmm5
	movdqu	0x10($b_ptr), %xmm1
	movdqu	0x20($b_ptr), %xmm2
	 por	%xmm3, %xmm5
	movdqu	0x30($b_ptr), %xmm3
	movdqa	%xmm0, $in2_x(%rsp)
	 pshufd	\$0x1e, %xmm5, %xmm4
	movdqa	%xmm1, $in2_x+0x10(%rsp)
	por	%xmm0, %xmm1
	 movq	$r_ptr, %xmm0		# save $r_ptr
	movdqa	%xmm2, $in2_y(%rsp)
	movdqa	%xmm3, $in2_y+0x10(%rsp)
	por	%xmm2, %xmm3
	 por	%xmm4, %xmm5
	 pxor	%xmm4, %xmm4
	por	%xmm1, %xmm3

	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);

	pcmpeqd	%xmm4, %xmm5
	pshufd	\$0xb1, %xmm3, %xmm4
	 mov	0x00($b_ptr), $src0		# $b_ptr is still valid
	 #lea	0x00($b_ptr), $b_ptr
	 mov	$acc4, $acc1			# harmonize sqr output and mul input
	por	%xmm3, %xmm4
	pshufd	\$0, %xmm5, %xmm5		# in1infty
	pshufd	\$0x1e, %xmm4, %xmm3
	 mov	$acc5, $acc2
	por	%xmm3, %xmm4
	pxor	%xmm3, %xmm3
	 mov	$acc6, $acc3
	pcmpeqd	%xmm3, %xmm4
	pshufd	\$0, %xmm4, %xmm4		# in2infty

	lea	$Z1sqr-$bias(%rsp), $a_ptr
	mov	$acc7, $acc4
	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, Z1sqr, in2_x);

	lea	$in1_x(%rsp), $b_ptr
	lea	$H(%rsp), $r_ptr		# H = U2 - U1
	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, in1_x);

	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);

	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);

	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);

	lea	$in1_y(%rsp), $b_ptr
	lea	$R(%rsp), $r_ptr		# R = S2 - S1
	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, in1_y);

	`&load_for_sqr("$H(%rsp)", "$src0")`
	lea	$Hsqr(%rsp), $r_ptr		# H^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);

	`&load_for_sqr("$R(%rsp)", "$src0")`
	lea	$Rsqr(%rsp), $r_ptr		# R^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);

	`&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
	lea	$Hcub(%rsp), $r_ptr		# H^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);

	`&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
	lea	$U2(%rsp), $r_ptr		# U1*H^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in1_x, Hsqr);
___
{
#######################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
#
# The register names are re-bound only inside this bare block, so the
# heredoc below can consume the preceding multiplication's result in
# place; the original bindings are back in effect after the closing
# brace.
my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
my ($poly1, $poly3)=($acc6,$acc7);

$code.=<<___;
	#lea	$U2(%rsp), $a_ptr
	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);

	add	$acc0, $acc0		# a0:a3+a0:a3
	lea	$Rsqr(%rsp), $a_ptr
	adc	$acc1, $acc1
	 mov	$acc0, $t0
	adc	$acc2, $acc2
	adc	$acc3, $acc3
	 mov	$acc1, $t1
	sbb	$t4, $t4

	sub	\$-1, $acc0
	 mov	$acc2, $t2
	sbb	$poly1, $acc1
	sbb	\$0, $acc2
	 mov	$acc3, $t3
	sbb	$poly3, $acc3
	test	$t4, $t4

	cmovz	$t0, $acc0
	mov	8*0($a_ptr), $t0
	cmovz	$t1, $acc1
	mov	8*1($a_ptr), $t1
	cmovz	$t2, $acc2
	mov	8*2($a_ptr), $t2
	cmovz	$t3, $acc3
	mov	8*3($a_ptr), $t3

	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);

	lea	$Hcub(%rsp), $b_ptr
	lea	$res_x(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);

	mov	$U2+8*0(%rsp), $t0
	mov	$U2+8*1(%rsp), $t1
	mov	$U2+8*2(%rsp), $t2
	mov	$U2+8*3(%rsp), $t3
	lea	$H(%rsp), $r_ptr

	call	__ecp_nistz256_sub$x		# p256_sub(H, U2, res_x);

	mov	$acc0, 8*0($r_ptr)		# save the result, as
	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't
	mov	$acc2, 8*2($r_ptr)
	mov	$acc3, 8*3($r_ptr)
___
}
$code.=<<___;
	`&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Hcub, in1_y);

	`&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
	lea	$H(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(H, H, R);

	lea	$S2(%rsp), $b_ptr
	lea	$res_y(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, H, S2);

	movq	%xmm0, $r_ptr		# restore $r_ptr

	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, ONE, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_z(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_z+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	.LONE_mont(%rip), %xmm2
	pand	.LONE_mont+0x10(%rip), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_z(%rsp), %xmm2
	pand	$in1_z+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x40($r_ptr)
	movdqu	%xmm3, 0x50($r_ptr)

	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_x(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_x+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	$in2_x(%rsp), %xmm2
	pand	$in2_x+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_x(%rsp), %xmm2
	pand	$in1_x+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x00($r_ptr)
	movdqu	%xmm3, 0x10($r_ptr)

	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_y(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_y+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	$in2_y(%rsp), %xmm2
	pand	$in2_y+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_y(%rsp), %xmm2
	pand	$in1_y+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x20($r_ptr)
	movdqu	%xmm3, 0x30($r_ptr)

	add	\$32*15+8, %rsp
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
.size	ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
___
}
&gen_add_affine("q");

########################################################################
# AD*X magic
#
# Everything in this section is emitted only when $addx is true.  It
# first appends the four "x"-suffixed helpers (add_tox, sub_fromx, subx,
# mul_by_2x) and then re-runs the generators with "x" so that their
# "call __ecp_nistz256_*$x" references resolve to these helpers.
#
if ($addx) {								{
########################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
#
my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);

$code.=<<___;
.type	__ecp_nistz256_add_tox,\@abi-omnipotent
.align	32
__ecp_nistz256_add_tox:
	xor	$t4, $t4
	adc	8*0($b_ptr), $a0
	adc	8*1($b_ptr), $a1
	 mov	$a0, $t0
	adc	8*2($b_ptr), $a2
	adc	8*3($b_ptr), $a3
	 mov	$a1, $t1
	adc	\$0, $t4

	xor	$t3, $t3
	sbb	\$-1, $a0
	 mov	$a2, $t2
	sbb	$poly1, $a1
	sbb	\$0, $a2
	 mov	$a3, $t3
	sbb	$poly3, $a3

	bt	\$0, $t4
	cmovnc	$t0, $a0
	cmovnc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovnc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovnc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox

.type	__ecp_nistz256_sub_fromx,\@abi-omnipotent
.align	32
__ecp_nistz256_sub_fromx:
	xor	$t4, $t4
	sbb	8*0($b_ptr), $a0
	sbb	8*1($b_ptr), $a1
	 mov	$a0, $t0
	sbb	8*2($b_ptr), $a2
	sbb	8*3($b_ptr), $a3
	 mov	$a1, $t1
	sbb	\$0, $t4

	xor	$t3, $t3
	adc	\$-1, $a0
	 mov	$a2, $t2
	adc	$poly1, $a1
	adc	\$0, $a2
	 mov	$a3, $t3
	adc	$poly3, $a3

	bt	\$0, $t4
	cmovnc	$t0, $a0
	cmovnc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovnc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovnc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx

.type	__ecp_nistz256_subx,\@abi-omnipotent
.align	32
__ecp_nistz256_subx:
	xor	$t4, $t4
	sbb	$a0, $t0
	sbb	$a1, $t1
	 mov	$t0, $a0
	sbb	$a2, $t2
	sbb	$a3, $t3
	 mov	$t1, $a1
	sbb	\$0, $t4

	xor	$a3 ,$a3
	adc	\$-1, $t0
	 mov	$t2, $a2
	adc	$poly1, $t1
	adc	\$0, $t2
	 mov	$t3, $a3
	adc	$poly3, $t3

	bt	\$0, $t4
	cmovc	$t0, $a0
	cmovc	$t1, $a1
	cmovc	$t2, $a2
	cmovc	$t3, $a3

	ret
.size	__ecp_nistz256_subx,.-__ecp_nistz256_subx

.type	__ecp_nistz256_mul_by_2x,\@abi-omnipotent
.align	32
__ecp_nistz256_mul_by_2x:
	xor	$t4, $t4
	adc	$a0, $a0		# a0:a3+a0:a3
	adc	$a1, $a1
	 mov	$a0, $t0
	adc	$a2, $a2
	adc	$a3, $a3
	 mov	$a1, $t1
	adc	\$0, $t4

	xor	$t3, $t3
	sbb	\$-1, $a0
	 mov	$a2, $t2
	sbb	$poly1, $a1
	sbb	\$0, $a2
	 mov	$a3, $t3
	sbb	$poly3, $a3

	bt	\$0, $t4
	cmovnc	$t0, $a0
	cmovnc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovnc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovnc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
___
									}
# Emit the "x" flavour of each generator; the "&" call form is required
# because the generators are declared with an empty prototype.
&gen_double("x");
&gen_add("x");
&gen_add_affine("x");
}
}}}

# Expand every backtick-quoted fragment in the accumulated text by
# evaluating it as Perl (this is how the load_for_mul/load_for_sqr calls
# embedded in the heredocs are turned into instructions).
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# STDOUT is a pipe into the x86_64-xlate.pl translator, so close() is the
# only place where a failed translator (non-zero exit status, reported in
# $?) or a late write error (reported in $!) becomes visible.  Fail the
# build instead of silently leaving a truncated output file behind.
close STDOUT or die "error closing STDOUT: $! (exit status $?)";