#!/usr/bin/env perl

##############################################################################
#                                                                            #
# Copyright 2014 Intel Corporation                                           #
#                                                                            #
# Licensed under the Apache License, Version 2.0 (the "License");            #
# you may not use this file except in compliance with the License.           #
# You may obtain a copy of the License at                                    #
#                                                                            #
#    http://www.apache.org/licenses/LICENSE-2.0                              #
#                                                                            #
# Unless required by applicable law or agreed to in writing, software        #
# distributed under the License is distributed on an "AS IS" BASIS,          #
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.   #
# See the License for the specific language governing permissions and        #
# limitations under the License.                                             #
#                                                                            #
##############################################################################
#                                                                            #
# Developers and authors:                                                    #
# Shay Gueron (1, 2), and Vlad Krasnov (1)                                   #
# (1) Intel Corporation, Israel Development Center                           #
# (2) University of Haifa                                                    #
# Reference:                                                                 #
# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with #
# 256 Bit Primes"                                                            #
#                                                                            #
##############################################################################

# Further optimization by <appro@openssl.org>:
#
#		this/original	with/without -DECP_NISTZ256_ASM(*)
# Opteron	+12-49%		+110-150%
# Bulldozer	+14-45%		+175-210%
# P4		+18-46%		n/a :-(
# Westmere	+12-34%		+80-87%
# Sandy Bridge	+9-35%		+110-120%
# Ivy Bridge	+9-35%		+110-125%
# Haswell	+8-37%		+140-160%
# Broadwell	+18-58%		+145-210%
# Atom		+15-50%		+130-180%
# VIA Nano	+43-160%	+300-480%
#
# (*) "without -DECP_NISTZ256_ASM" refers to build with
#     "enable-ec_nistp_64_gcc_128";
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, relatively fastest
# server-side operation. Keep in mind that +100% means 2x improvement.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$avx = ($ver>=3.0) + ($ver>=3.01);
	$addx = ($ver>=3.03);
}

$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

# The polynomial
.align	64
.Lpoly:
.quad	0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001

# 2^512 mod P precomputed for NIST P256 polynomial
.LRR:
.quad	0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd

.LOne:
.long	1,1,1,1,1,1,1,1
.LTwo:
.long	2,2,2,2,2,2,2,2
.LThree:
.long	3,3,3,3,3,3,3,3
.LONE_mont:
.quad	0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
___

{
################################################################################
# void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]);

my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");

$code.=<<___;

.globl	ecp_nistz256_mul_by_2
.type	ecp_nistz256_mul_by_2,\@function,2
.align	64
ecp_nistz256_mul_by_2:
	push	%r12
	push	%r13

	mov	8*0($a_ptr), $a0
	mov	8*1($a_ptr), $a1
	add	$a0, $a0		# a0:a3+a0:a3
	mov	8*2($a_ptr), $a2
	adc	$a1, $a1
	mov	8*3($a_ptr), $a3
	lea	.Lpoly(%rip), $a_ptr
	mov	$a0, $t0
	adc	$a2, $a2
	adc	$a3, $a3
	mov	$a1, $t1
	sbb	$t4, $t4

	sub	8*0($a_ptr), $a0
	mov	$a2, $t2
	sbb	8*1($a_ptr), $a1
	sbb	8*2($a_ptr), $a2
	mov	$a3, $t3
	sbb	8*3($a_ptr), $a3
	test	$t4, $t4

	cmovz	$t0, $a0
	cmovz	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovz	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovz	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	pop	%r13
	pop	%r12
	ret
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
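
# Note (editorial addition): the idiom above is this module's standard
# branch-free modular reduction: compute 2*a with add/adc, capture the
# carry out of bit 255 as an all-ones/all-zero mask with sbb, subtract p
# unconditionally, then use cmovz keyed on that mask to keep whichever
# candidate is the reduced value.  No data-dependent branch is taken.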

################################################################################
# void ecp_nistz256_div_by_2(uint64_t res[4], uint64_t a[4]);
.globl	ecp_nistz256_div_by_2
.type	ecp_nistz256_div_by_2,\@function,2
.align	32
ecp_nistz256_div_by_2:
	push	%r12
	push	%r13

	mov	8*0($a_ptr), $a0
	mov	8*1($a_ptr), $a1
	mov	8*2($a_ptr), $a2
	mov	$a0, $t0
	mov	8*3($a_ptr), $a3
	lea	.Lpoly(%rip), $a_ptr

	mov	$a1, $t1
	xor	$t4, $t4
	add	8*0($a_ptr), $a0
	mov	$a2, $t2
	adc	8*1($a_ptr), $a1
	adc	8*2($a_ptr), $a2
	mov	$a3, $t3
	adc	8*3($a_ptr), $a3
	adc	\$0, $t4
	xor	$a_ptr, $a_ptr		# borrow $a_ptr
	test	\$1, $t0

	cmovz	$t0, $a0
	cmovz	$t1, $a1
	cmovz	$t2, $a2
	cmovz	$t3, $a3
	cmovz	$a_ptr, $t4

	mov	$a1, $t0		# a0:a3>>1
	shr	\$1, $a0
	shl	\$63, $t0
	mov	$a2, $t1
	shr	\$1, $a1
	or	$t0, $a0
	shl	\$63, $t1
	mov	$a3, $t2
	shr	\$1, $a2
	or	$t1, $a1
	shl	\$63, $t2
	shr	\$1, $a3
	shl	\$63, $t4
	or	$t2, $a2
	or	$t4, $a3

	mov	$a0, 8*0($r_ptr)
	mov	$a1, 8*1($r_ptr)
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	pop	%r13
	pop	%r12
	ret
.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
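
# Note (editorial addition): halving works because p is odd.  When a is
# odd, a+p is even and (a+p)/2 == a*2^-1 mod p; when a is even the sum is
# discarded via the cmovz block and a itself is shifted.  The carry
# collected in $t4 supplies bit 255 of the shifted result.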

################################################################################
# void ecp_nistz256_mul_by_3(uint64_t res[4], uint64_t a[4]);
.globl	ecp_nistz256_mul_by_3
.type	ecp_nistz256_mul_by_3,\@function,2
.align	32
ecp_nistz256_mul_by_3:
	push	%r12
	push	%r13

	mov	8*0($a_ptr), $a0
	xor	$t4, $t4
	mov	8*1($a_ptr), $a1
	add	$a0, $a0		# a0:a3+a0:a3
	mov	8*2($a_ptr), $a2
	adc	$a1, $a1
	mov	8*3($a_ptr), $a3
	mov	$a0, $t0
	adc	$a2, $a2
	adc	$a3, $a3
	mov	$a1, $t1
	adc	\$0, $t4

	sub	\$-1, $a0
	mov	$a2, $t2
	sbb	.Lpoly+8*1(%rip), $a1
	sbb	\$0, $a2
	mov	$a3, $t3
	sbb	.Lpoly+8*3(%rip), $a3
	test	$t4, $t4

	cmovz	$t0, $a0
	cmovz	$t1, $a1
	cmovz	$t2, $a2
	cmovz	$t3, $a3

	xor	$t4, $t4
	add	8*0($a_ptr), $a0	# a0:a3+=a_ptr[0:3]
	adc	8*1($a_ptr), $a1
	mov	$a0, $t0
	adc	8*2($a_ptr), $a2
	adc	8*3($a_ptr), $a3
	mov	$a1, $t1
	adc	\$0, $t4

	sub	\$-1, $a0
	mov	$a2, $t2
	sbb	.Lpoly+8*1(%rip), $a1
	sbb	\$0, $a2
	mov	$a3, $t3
	sbb	.Lpoly+8*3(%rip), $a3
	test	$t4, $t4

	cmovz	$t0, $a0
	cmovz	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovz	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovz	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	pop	%r13
	pop	%r12
	ret
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

################################################################################
# void ecp_nistz256_add(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
.globl	ecp_nistz256_add
.type	ecp_nistz256_add,\@function,3
.align	32
ecp_nistz256_add:
	push	%r12
	push	%r13

	mov	8*0($a_ptr), $a0
	xor	$t4, $t4
	mov	8*1($a_ptr), $a1
	mov	8*2($a_ptr), $a2
	mov	8*3($a_ptr), $a3
	lea	.Lpoly(%rip), $a_ptr

	add	8*0($b_ptr), $a0
	adc	8*1($b_ptr), $a1
	mov	$a0, $t0
	adc	8*2($b_ptr), $a2
	adc	8*3($b_ptr), $a3
	mov	$a1, $t1
	adc	\$0, $t4

	sub	8*0($a_ptr), $a0
	mov	$a2, $t2
	sbb	8*1($a_ptr), $a1
	sbb	8*2($a_ptr), $a2
	mov	$a3, $t3
	sbb	8*3($a_ptr), $a3
	test	$t4, $t4

	cmovz	$t0, $a0
	cmovz	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovz	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovz	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	pop	%r13
	pop	%r12
	ret
.size	ecp_nistz256_add,.-ecp_nistz256_add

################################################################################
# void ecp_nistz256_sub(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
.globl	ecp_nistz256_sub
.type	ecp_nistz256_sub,\@function,3
.align	32
ecp_nistz256_sub:
	push	%r12
	push	%r13

	mov	8*0($a_ptr), $a0
	xor	$t4, $t4
	mov	8*1($a_ptr), $a1
	mov	8*2($a_ptr), $a2
	mov	8*3($a_ptr), $a3
	lea	.Lpoly(%rip), $a_ptr

	sub	8*0($b_ptr), $a0
	sbb	8*1($b_ptr), $a1
	mov	$a0, $t0
	sbb	8*2($b_ptr), $a2
	sbb	8*3($b_ptr), $a3
	mov	$a1, $t1
	sbb	\$0, $t4

	add	8*0($a_ptr), $a0
	mov	$a2, $t2
	adc	8*1($a_ptr), $a1
	adc	8*2($a_ptr), $a2
	mov	$a3, $t3
	adc	8*3($a_ptr), $a3
	test	$t4, $t4

	cmovz	$t0, $a0
	cmovz	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovz	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovz	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	pop	%r13
	pop	%r12
	ret
.size	ecp_nistz256_sub,.-ecp_nistz256_sub

################################################################################
# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
.globl	ecp_nistz256_neg
.type	ecp_nistz256_neg,\@function,2
.align	32
ecp_nistz256_neg:
	push	%r12
	push	%r13

	xor	$a0, $a0
	xor	$a1, $a1
	xor	$a2, $a2
	xor	$a3, $a3
	xor	$t4, $t4

	sub	8*0($a_ptr), $a0
	sbb	8*1($a_ptr), $a1
	sbb	8*2($a_ptr), $a2
	mov	$a0, $t0
	sbb	8*3($a_ptr), $a3
	lea	.Lpoly(%rip), $a_ptr
	mov	$a1, $t1
	sbb	\$0, $t4

	add	8*0($a_ptr), $a0
	mov	$a2, $t2
	adc	8*1($a_ptr), $a1
	adc	8*2($a_ptr), $a2
	mov	$a3, $t3
	adc	8*3($a_ptr), $a3
	test	$t4, $t4

	cmovz	$t0, $a0
	cmovz	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovz	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovz	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	pop	%r13
	pop	%r12
	ret
.size	ecp_nistz256_neg,.-ecp_nistz256_neg
___
}
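
# Editorial self-check (an addition, not upstream code): confirm the .LRR
# and .LONE_mont table constants above against their definitions.  It
# writes nothing to STDOUT (which is piped into the xlate translator) and
# only dies on a mismatch, so it cannot disturb the generated assembly.
{
	use Math::BigInt;
	my $p   = Math::BigInt->from_hex("ffffffff00000001000000000000000000000000ffffffffffffffffffffffff");
	my $rr  = Math::BigInt->bone->blsft(512)->bmod($p);	# 2^512 mod p
	my $one = Math::BigInt->bone->blsft(256)->bmod($p);	# 2^256 mod p
	die "bad .LRR"       if $rr->as_hex  ne "0x4fffffffdfffffffffffffffefffffffbffffffff0000000000000003";
	die "bad .LONE_mont" if $one->as_hex ne "0xfffffffeffffffffffffffffffffffff000000000000000000000001";
}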
{
my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
my ($poly1,$poly3)=($acc6,$acc7);

$code.=<<___;
################################################################################
# void ecp_nistz256_to_mont(
#	uint64_t res[4],
#	uint64_t in[4]);
.globl	ecp_nistz256_to_mont
.type	ecp_nistz256_to_mont,\@function,2
.align	32
ecp_nistz256_to_mont:
___
$code.=<<___ if ($addx);
	mov	\$0x80100, %ecx
	and	OPENSSL_ia32cap_P+8(%rip), %ecx
___
$code.=<<___;
	lea	.LRR(%rip), $b_org
	jmp	.Lmul_mont
.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont

################################################################################
# void ecp_nistz256_mul_mont(
#	uint64_t res[4],
#	uint64_t a[4],
#	uint64_t b[4]);

.globl	ecp_nistz256_mul_mont
.type	ecp_nistz256_mul_mont,\@function,3
.align	32
ecp_nistz256_mul_mont:
___
$code.=<<___ if ($addx);
	mov	\$0x80100, %ecx
	and	OPENSSL_ia32cap_P+8(%rip), %ecx
___
$code.=<<___;
.Lmul_mont:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___ if ($addx);
	cmp	\$0x80100, %ecx
	je	.Lmul_montx
___
$code.=<<___;
	mov	$b_org, $b_ptr
	mov	8*0($b_org), %rax
	mov	8*0($a_ptr), $acc1
	mov	8*1($a_ptr), $acc2
	mov	8*2($a_ptr), $acc3
	mov	8*3($a_ptr), $acc4

	call	__ecp_nistz256_mul_montq
___
$code.=<<___ if ($addx);
	jmp	.Lmul_mont_done

.align	32
.Lmul_montx:
	mov	$b_org, $b_ptr
	mov	8*0($b_org), %rdx
	mov	8*0($a_ptr), $acc1
	mov	8*1($a_ptr), $acc2
	mov	8*2($a_ptr), $acc3
	mov	8*3($a_ptr), $acc4
	lea	-128($a_ptr), $a_ptr	# control u-op density

	call	__ecp_nistz256_mul_montx
___
$code.=<<___;
.Lmul_mont_done:
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

.type	__ecp_nistz256_mul_montq,\@abi-omnipotent
.align	32
__ecp_nistz256_mul_montq:
	########################################################################
	# Multiply a by b[0]
	mov	%rax, $t1
	mulq	$acc1
	mov	.Lpoly+8*1(%rip),$poly1
	mov	%rax, $acc0
	mov	$t1, %rax
	mov	%rdx, $acc1

	mulq	$acc2
	mov	.Lpoly+8*3(%rip),$poly3
	add	%rax, $acc1
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc2

	mulq	$acc3
	add	%rax, $acc2
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc3

	mulq	$acc4
	add	%rax, $acc3
	mov	$acc0, %rax
	adc	\$0, %rdx
	xor	$acc5, $acc5
	mov	%rdx, $acc4

	########################################################################
	# First reduction step
	# Basically now we want to multiply acc[0] by p256,
	# and add the result to the acc.
	# Due to the special form of p256 we do some optimizations
	#
	# acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
	# then we add acc[0] and get acc[0] x 2^96

	mov	$acc0, $t1
	shl	\$32, $acc0
	mulq	$poly3
	shr	\$32, $t1
	add	$acc0, $acc1		# +=acc[0]<<96
	adc	$t1, $acc2
	adc	%rax, $acc3
	mov	8*1($b_ptr), %rax
	adc	%rdx, $acc4
	adc	\$0, $acc5
	xor	$acc0, $acc0

	########################################################################
	# Multiply by b[1]
	mov	%rax, $t1
	mulq	8*0($a_ptr)
	add	%rax, $acc1
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*1($a_ptr)
	add	$t0, $acc2
	adc	\$0, %rdx
	add	%rax, $acc2
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*2($a_ptr)
	add	$t0, $acc3
	adc	\$0, %rdx
	add	%rax, $acc3
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*3($a_ptr)
	add	$t0, $acc4
	adc	\$0, %rdx
	add	%rax, $acc4
	mov	$acc1, %rax
	adc	%rdx, $acc5
	adc	\$0, $acc0

	########################################################################
	# Second reduction step
	mov	$acc1, $t1
	shl	\$32, $acc1
	mulq	$poly3
	shr	\$32, $t1
	add	$acc1, $acc2
	adc	$t1, $acc3
	adc	%rax, $acc4
	mov	8*2($b_ptr), %rax
	adc	%rdx, $acc5
	adc	\$0, $acc0
	xor	$acc1, $acc1

	########################################################################
	# Multiply by b[2]
	mov	%rax, $t1
	mulq	8*0($a_ptr)
	add	%rax, $acc2
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*1($a_ptr)
	add	$t0, $acc3
	adc	\$0, %rdx
	add	%rax, $acc3
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*2($a_ptr)
	add	$t0, $acc4
	adc	\$0, %rdx
	add	%rax, $acc4
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*3($a_ptr)
	add	$t0, $acc5
	adc	\$0, %rdx
	add	%rax, $acc5
	mov	$acc2, %rax
	adc	%rdx, $acc0
	adc	\$0, $acc1

	########################################################################
	# Third reduction step
	mov	$acc2, $t1
	shl	\$32, $acc2
	mulq	$poly3
	shr	\$32, $t1
	add	$acc2, $acc3
	adc	$t1, $acc4
	adc	%rax, $acc5
	mov	8*3($b_ptr), %rax
	adc	%rdx, $acc0
	adc	\$0, $acc1
	xor	$acc2, $acc2

	########################################################################
	# Multiply by b[3]
	mov	%rax, $t1
	mulq	8*0($a_ptr)
	add	%rax, $acc3
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*1($a_ptr)
	add	$t0, $acc4
	adc	\$0, %rdx
	add	%rax, $acc4
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*2($a_ptr)
	add	$t0, $acc5
	adc	\$0, %rdx
	add	%rax, $acc5
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*3($a_ptr)
	add	$t0, $acc0
	adc	\$0, %rdx
	add	%rax, $acc0
	mov	$acc3, %rax
	adc	%rdx, $acc1
	adc	\$0, $acc2

	########################################################################
	# Final reduction step
	mov	$acc3, $t1
	shl	\$32, $acc3
	mulq	$poly3
	shr	\$32, $t1
	add	$acc3, $acc4
	adc	$t1, $acc5
	mov	$acc4, $t0
	adc	%rax, $acc0
	adc	%rdx, $acc1
	mov	$acc5, $t1
	adc	\$0, $acc2

	########################################################################
	# Branch-less conditional subtraction of P
	sub	\$-1, $acc4		# .Lpoly[0]
	mov	$acc0, $t2
	sbb	$poly1, $acc5		# .Lpoly[1]
	sbb	\$0, $acc0		# .Lpoly[2]
	mov	$acc1, $t3
	sbb	$poly3, $acc1		# .Lpoly[3]
	sbb	\$0, $acc2

	cmovc	$t0, $acc4
	cmovc	$t1, $acc5
	mov	$acc4, 8*0($r_ptr)
	cmovc	$t2, $acc0
	mov	$acc5, 8*1($r_ptr)
	cmovc	$t3, $acc1
	mov	$acc0, 8*2($r_ptr)
	mov	$acc1, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
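
# Note (editorial addition): the routine above interleaves the four
# word-by-word multiplications with four Montgomery reduction steps.
# Because .Lpoly[0] is 2^64-1, -p^-1 mod 2^64 equals 1 and the per-step
# Montgomery multiplier is simply the current low limb; and because
# .Lpoly[0..1] packs to 2^96-1, the limb-times-p product is realized with
# one shl/shr pair plus a single mulq by .Lpoly[3].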

################################################################################
# void ecp_nistz256_sqr_mont(
#	uint64_t res[4],
#	uint64_t a[4]);

# we optimize the square according to S.Gueron and V.Krasnov,
# "Speeding up Big-Number Squaring"
.globl	ecp_nistz256_sqr_mont
.type	ecp_nistz256_sqr_mont,\@function,2
.align	32
ecp_nistz256_sqr_mont:
___
$code.=<<___ if ($addx);
	mov	\$0x80100, %ecx
	and	OPENSSL_ia32cap_P+8(%rip), %ecx
___
$code.=<<___;
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___ if ($addx);
	cmp	\$0x80100, %ecx
	je	.Lsqr_montx
___
$code.=<<___;
	mov	8*0($a_ptr), %rax
	mov	8*1($a_ptr), $acc6
	mov	8*2($a_ptr), $acc7
	mov	8*3($a_ptr), $acc0

	call	__ecp_nistz256_sqr_montq
___
$code.=<<___ if ($addx);
	jmp	.Lsqr_mont_done

.align	32
.Lsqr_montx:
	mov	8*0($a_ptr), %rdx
	mov	8*1($a_ptr), $acc6
	mov	8*2($a_ptr), $acc7
	mov	8*3($a_ptr), $acc0
	lea	-128($a_ptr), $a_ptr	# control u-op density

	call	__ecp_nistz256_sqr_montx
___
$code.=<<___;
.Lsqr_mont_done:
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

.type	__ecp_nistz256_sqr_montq,\@abi-omnipotent
.align	32
__ecp_nistz256_sqr_montq:
	mov	%rax, $acc5
	mulq	$acc6			# a[1]*a[0]
	mov	%rax, $acc1
	mov	$acc7, %rax
	mov	%rdx, $acc2

	mulq	$acc5			# a[0]*a[2]
	add	%rax, $acc2
	mov	$acc0, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc3

	mulq	$acc5			# a[0]*a[3]
	add	%rax, $acc3
	mov	$acc7, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc4

	#################################
	mulq	$acc6			# a[1]*a[2]
	add	%rax, $acc3
	mov	$acc0, %rax
	adc	\$0, %rdx
	mov	%rdx, $t1

	mulq	$acc6			# a[1]*a[3]
	add	%rax, $acc4
	mov	$acc0, %rax
	adc	\$0, %rdx
	add	$t1, $acc4
	mov	%rdx, $acc5
	adc	\$0, $acc5

	#################################
	mulq	$acc7			# a[2]*a[3]
	xor	$acc7, $acc7
	add	%rax, $acc5
	mov	8*0($a_ptr), %rax
	mov	%rdx, $acc6
	adc	\$0, $acc6

	add	$acc1, $acc1		# acc1:6<<1
	adc	$acc2, $acc2
	adc	$acc3, $acc3
	adc	$acc4, $acc4
	adc	$acc5, $acc5
	adc	$acc6, $acc6
	adc	\$0, $acc7

	mulq	%rax
	mov	%rax, $acc0
	mov	8*1($a_ptr), %rax
	mov	%rdx, $t0

	mulq	%rax
	add	$t0, $acc1
	adc	%rax, $acc2
	mov	8*2($a_ptr), %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	%rax
	add	$t0, $acc3
	adc	%rax, $acc4
	mov	8*3($a_ptr), %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	%rax
	add	$t0, $acc5
	adc	%rax, $acc6
	mov	$acc0, %rax
	adc	%rdx, $acc7

	mov	.Lpoly+8*1(%rip), $a_ptr
	mov	.Lpoly+8*3(%rip), $t1

	##########################################
	# Now the reduction
	# First iteration
	mov	$acc0, $t0
	shl	\$32, $acc0
	mulq	$t1
	shr	\$32, $t0
	add	$acc0, $acc1		# +=acc[0]<<96
	adc	$t0, $acc2
	adc	%rax, $acc3
	mov	$acc1, %rax
	adc	\$0, %rdx

	##########################################
	# Second iteration
	mov	$acc1, $t0
	shl	\$32, $acc1
	mov	%rdx, $acc0
	mulq	$t1
	shr	\$32, $t0
	add	$acc1, $acc2
	adc	$t0, $acc3
	adc	%rax, $acc0
	mov	$acc2, %rax
	adc	\$0, %rdx

	##########################################
	# Third iteration
	mov	$acc2, $t0
	shl	\$32, $acc2
	mov	%rdx, $acc1
	mulq	$t1
	shr	\$32, $t0
	add	$acc2, $acc3
	adc	$t0, $acc0
	adc	%rax, $acc1
	mov	$acc3, %rax
	adc	\$0, %rdx

	###########################################
	# Last iteration
	mov	$acc3, $t0
	shl	\$32, $acc3
	mov	%rdx, $acc2
	mulq	$t1
	shr	\$32, $t0
	add	$acc3, $acc0
	adc	$t0, $acc1
	adc	%rax, $acc2
	adc	\$0, %rdx
	xor	$acc3, $acc3

	############################################
	# Add the rest of the acc
	add	$acc0, $acc4
	adc	$acc1, $acc5
	mov	$acc4, $acc0
	adc	$acc2, $acc6
	adc	%rdx, $acc7
	mov	$acc5, $acc1
	adc	\$0, $acc3

	sub	\$-1, $acc4		# .Lpoly[0]
	mov	$acc6, $acc2
	sbb	$a_ptr, $acc5		# .Lpoly[1]
	sbb	\$0, $acc6		# .Lpoly[2]
	mov	$acc7, $t0
	sbb	$t1, $acc7		# .Lpoly[3]
	sbb	\$0, $acc3

	cmovc	$acc0, $acc4
	cmovc	$acc1, $acc5
	mov	$acc4, 8*0($r_ptr)
	cmovc	$acc2, $acc6
	mov	$acc5, 8*1($r_ptr)
	cmovc	$t0, $acc7
	mov	$acc6, 8*2($r_ptr)
	mov	$acc7, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
___

if ($addx) {
$code.=<<___;
.type	__ecp_nistz256_mul_montx,\@abi-omnipotent
.align	32
__ecp_nistz256_mul_montx:
	########################################################################
	# Multiply by b[0]
	mulx	$acc1, $acc0, $acc1
	mulx	$acc2, $t0, $acc2
	mov	\$32, $poly1
	xor	$acc5, $acc5		# cf=0
	mulx	$acc3, $t1, $acc3
	mov	.Lpoly+8*3(%rip), $poly3
	adc	$t0, $acc1
	mulx	$acc4, $t0, $acc4
	mov	$acc0, %rdx
	adc	$t1, $acc2
	shlx	$poly1,$acc0,$t1
	adc	$t0, $acc3
	shrx	$poly1,$acc0,$t0
	adc	\$0, $acc4

	########################################################################
	# First reduction step
	add	$t1, $acc1
	adc	$t0, $acc2

	mulx	$poly3, $t0, $t1
	mov	8*1($b_ptr), %rdx
	adc	$t0, $acc3
	adc	$t1, $acc4
	adc	\$0, $acc5
	xor	$acc0, $acc0		# $acc0=0,cf=0,of=0

	########################################################################
	# Multiply by b[1]
	mulx	8*0+128($a_ptr), $t0, $t1
	adcx	$t0, $acc1
	adox	$t1, $acc2

	mulx	8*1+128($a_ptr), $t0, $t1
	adcx	$t0, $acc2
	adox	$t1, $acc3

	mulx	8*2+128($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4
	mulx	8*3+128($a_ptr), $t0, $t1
	mov	$acc1, %rdx
	adcx	$t0, $acc4
	shlx	$poly1, $acc1, $t0
	adox	$t1, $acc5
	shrx	$poly1, $acc1, $t1

	adcx	$acc0, $acc5
	adox	$acc0, $acc0
	adc	\$0, $acc0

	########################################################################
	# Second reduction step
	add	$t0, $acc2
	adc	$t1, $acc3

	mulx	$poly3, $t0, $t1
	mov	8*2($b_ptr), %rdx
	adc	$t0, $acc4
	adc	$t1, $acc5
	adc	\$0, $acc0
	xor	$acc1 ,$acc1		# $acc1=0,cf=0,of=0

	########################################################################
	# Multiply by b[2]
	mulx	8*0+128($a_ptr), $t0, $t1
	adcx	$t0, $acc2
	adox	$t1, $acc3

	mulx	8*1+128($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*2+128($a_ptr), $t0, $t1
	adcx	$t0, $acc4
	adox	$t1, $acc5

	mulx	8*3+128($a_ptr), $t0, $t1
	mov	$acc2, %rdx
	adcx	$t0, $acc5
	shlx	$poly1, $acc2, $t0
	adox	$t1, $acc0
	shrx	$poly1, $acc2, $t1

	adcx	$acc1, $acc0
	adox	$acc1, $acc1
	adc	\$0, $acc1

	########################################################################
	# Third reduction step
	add	$t0, $acc3
	adc	$t1, $acc4

	mulx	$poly3, $t0, $t1
	mov	8*3($b_ptr), %rdx
	adc	$t0, $acc5
	adc	$t1, $acc0
	adc	\$0, $acc1
	xor	$acc2, $acc2		# $acc2=0,cf=0,of=0

	########################################################################
	# Multiply by b[3]
	mulx	8*0+128($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*1+128($a_ptr), $t0, $t1
	adcx	$t0, $acc4
	adox	$t1, $acc5

	mulx	8*2+128($a_ptr), $t0, $t1
	adcx	$t0, $acc5
	adox	$t1, $acc0

	mulx	8*3+128($a_ptr), $t0, $t1
	mov	$acc3, %rdx
	adcx	$t0, $acc0
	shlx	$poly1, $acc3, $t0
	adox	$t1, $acc1
	shrx	$poly1, $acc3, $t1

	adcx	$acc2, $acc1
	adox	$acc2, $acc2
	adc	\$0, $acc2

	########################################################################
	# Fourth reduction step
	add	$t0, $acc4
	adc	$t1, $acc5

	mulx	$poly3, $t0, $t1
	mov	$acc4, $t2
	mov	.Lpoly+8*1(%rip), $poly1
	adc	$t0, $acc0
	mov	$acc5, $t3
	adc	$t1, $acc1
	adc	\$0, $acc2

	########################################################################
	# Branch-less conditional subtraction of P
	xor	%eax, %eax
	mov	$acc0, $t0
	sbb	\$-1, $acc4		# .Lpoly[0]
	sbb	$poly1, $acc5		# .Lpoly[1]
	sbb	\$0, $acc0		# .Lpoly[2]
	mov	$acc1, $t1
	sbb	$poly3, $acc1		# .Lpoly[3]
	sbb	\$0, $acc2

	cmovc	$t2, $acc4
	cmovc	$t3, $acc5
	mov	$acc4, 8*0($r_ptr)
	cmovc	$t0, $acc0
	mov	$acc5, 8*1($r_ptr)
	cmovc	$t1, $acc1
	mov	$acc0, 8*2($r_ptr)
	mov	$acc1, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
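
# Note (editorial addition): this MULX-based path leans on ADCX/ADOX
# maintaining two independent carry chains (CF and OF), so the column
# additions of consecutive partial products need not serialize on a
# single carry flag; the "xor" instructions above both zero a register
# and reset cf/of to start each fresh pair of chains.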

.type	__ecp_nistz256_sqr_montx,\@abi-omnipotent
.align	32
__ecp_nistz256_sqr_montx:
	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
	xor	%eax, %eax
	adc	$t0, $acc2
	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
	mov	$acc6, %rdx
	adc	$t1, $acc3
	adc	\$0, $acc4
	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0

	#################################
	mulx	$acc7, $t0, $t1		# a[1]*a[2]
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	$acc0, $t0, $t1		# a[1]*a[3]
	mov	$acc7, %rdx
	adcx	$t0, $acc4
	adox	$t1, $acc5
	adc	\$0, $acc5

	#################################
	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
	mov	8*0+128($a_ptr), %rdx
	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
	adcx	$acc1, $acc1		# acc1:6<<1
	adox	$t0, $acc5
	adcx	$acc2, $acc2
	adox	$acc7, $acc6		# of=0

	mulx	%rdx, $acc0, $t1
	mov	8*1+128($a_ptr), %rdx
	adcx	$acc3, $acc3
	adox	$t1, $acc1
	adcx	$acc4, $acc4
	mulx	%rdx, $t0, $t4
	mov	8*2+128($a_ptr), %rdx
	adcx	$acc5, $acc5
	adox	$t0, $acc2
	adcx	$acc6, $acc6
	.byte	0x67
	mulx	%rdx, $t0, $t1
	mov	8*3+128($a_ptr), %rdx
	adox	$t4, $acc3
	adcx	$acc7, $acc7
	adox	$t0, $acc4
	mov	\$32, $a_ptr
	adox	$t1, $acc5
	.byte	0x67,0x67
	mulx	%rdx, $t0, $t4
	mov	$acc0, %rdx
	adox	$t0, $acc6
	shlx	$a_ptr, $acc0, $t0
	adox	$t4, $acc7
	shrx	$a_ptr, $acc0, $t4
	mov	.Lpoly+8*3(%rip), $t1

	# reduction step 1
	add	$t0, $acc1
	adc	$t4, $acc2

	mulx	$t1, $t0, $acc0
	mov	$acc1, %rdx
	adc	$t0, $acc3
	shlx	$a_ptr, $acc1, $t0
	adc	\$0, $acc0
	shrx	$a_ptr, $acc1, $t4

	# reduction step 2
	add	$t0, $acc2
	adc	$t4, $acc3

	mulx	$t1, $t0, $acc1
	mov	$acc2, %rdx
	adc	$t0, $acc0
	shlx	$a_ptr, $acc2, $t0
	adc	\$0, $acc1
	shrx	$a_ptr, $acc2, $t4

	# reduction step 3
	add	$t0, $acc3
	adc	$t4, $acc0

	mulx	$t1, $t0, $acc2
	mov	$acc3, %rdx
	adc	$t0, $acc1
	shlx	$a_ptr, $acc3, $t0
	adc	\$0, $acc2
	shrx	$a_ptr, $acc3, $t4

	# reduction step 4
	add	$t0, $acc0
	adc	$t4, $acc1

	mulx	$t1, $t0, $acc3
	adc	$t0, $acc2
	adc	\$0, $acc3

	xor	$t3, $t3		# cf=0
	adc	$acc0, $acc4		# accumulate upper half
	mov	.Lpoly+8*1(%rip), $a_ptr
	adc	$acc1, $acc5
	mov	$acc4, $acc0
	adc	$acc2, $acc6
	adc	$acc3, $acc7
	mov	$acc5, $acc1
	adc	\$0, $t3

	xor	%eax, %eax		# cf=0
	sbb	\$-1, $acc4		# .Lpoly[0]
	mov	$acc6, $acc2
	sbb	$a_ptr, $acc5		# .Lpoly[1]
	sbb	\$0, $acc6		# .Lpoly[2]
	mov	$acc7, $acc3
	sbb	$t1, $acc7		# .Lpoly[3]
	sbb	\$0, $t3

	cmovc	$acc0, $acc4
	cmovc	$acc1, $acc5
	mov	$acc4, 8*0($r_ptr)
	cmovc	$acc2, $acc6
	mov	$acc5, 8*1($r_ptr)
	cmovc	$acc3, $acc7
	mov	$acc6, 8*2($r_ptr)
	mov	$acc7, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
___
}
}
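
# Editorial sketch (an addition, not upstream code): a Math::BigInt model
# of the Montgomery convention used throughout, with R = 2^256.
# mul_mont(a,b) returns a*b*R^-1 mod p, so mul_mont(a,RR) converts into
# Montgomery form and mul_mont(a,1) converts back out.  Dies only if the
# model is inconsistent; prints nothing.
{
	use Math::BigInt;
	my $p    = Math::BigInt->from_hex("ffffffff00000001000000000000000000000000ffffffffffffffffffffffff");
	my $R    = Math::BigInt->bone->blsft(256);
	my $Rinv = $R->copy->bmodinv($p);
	my $a    = Math::BigInt->new(12345);
	my $am   = $a->copy->bmul($R)->bmod($p);		# to_mont(a)
	die "round trip" if $am->bmul($Rinv)->bmod($p) != $a;	# from_mont
}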
{
my ($r_ptr,$in_ptr)=("%rdi","%rsi");
my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11));
my ($t0,$t1,$t2)=("%rcx","%r12","%r13");

$code.=<<___;
################################################################################
# void ecp_nistz256_from_mont(
#	uint64_t res[4],
#	uint64_t in[4]);
# This one performs Montgomery multiplication by 1, so we only need the reduction

.globl	ecp_nistz256_from_mont
.type	ecp_nistz256_from_mont,\@function,2
.align	32
ecp_nistz256_from_mont:
	push	%r12
	push	%r13

	mov	8*0($in_ptr), %rax
	mov	.Lpoly+8*3(%rip), $t2
	mov	8*1($in_ptr), $acc1
	mov	8*2($in_ptr), $acc2
	mov	8*3($in_ptr), $acc3
	mov	%rax, $acc0
	mov	.Lpoly+8*1(%rip), $t1

	#########################################
	# First iteration
	mov	%rax, $t0
	shl	\$32, $acc0
	mulq	$t2
	shr	\$32, $t0
	add	$acc0, $acc1
	adc	$t0, $acc2
	adc	%rax, $acc3
	mov	$acc1, %rax
	adc	\$0, %rdx

	#########################################
	# Second iteration
	mov	$acc1, $t0
	shl	\$32, $acc1
	mov	%rdx, $acc0
	mulq	$t2
	shr	\$32, $t0
	add	$acc1, $acc2
	adc	$t0, $acc3
	adc	%rax, $acc0
	mov	$acc2, %rax
	adc	\$0, %rdx

	##########################################
	# Third iteration
	mov	$acc2, $t0
	shl	\$32, $acc2
	mov	%rdx, $acc1
	mulq	$t2
	shr	\$32, $t0
	add	$acc2, $acc3
	adc	$t0, $acc0
	adc	%rax, $acc1
	mov	$acc3, %rax
	adc	\$0, %rdx

	###########################################
	# Last iteration
	mov	$acc3, $t0
	shl	\$32, $acc3
	mov	%rdx, $acc2
	mulq	$t2
	shr	\$32, $t0
	add	$acc3, $acc0
	adc	$t0, $acc1
	mov	$acc0, $t0
	adc	%rax, $acc2
	mov	$acc1, $in_ptr
	adc	\$0, %rdx

	###########################################
	# Branch-less conditional subtraction
	sub	\$-1, $acc0
	mov	$acc2, %rax
	sbb	$t1, $acc1
	sbb	\$0, $acc2
	mov	%rdx, $acc3
	sbb	$t2, %rdx
	sbb	$t2, $t2

	cmovnz	$t0, $acc0
	cmovnz	$in_ptr, $acc1
	mov	$acc0, 8*0($r_ptr)
	cmovnz	%rax, $acc2
	mov	$acc1, 8*1($r_ptr)
	cmovz	%rdx, $acc3
	mov	$acc2, 8*2($r_ptr)
	mov	$acc3, 8*3($r_ptr)

	pop	%r13
	pop	%r12
	ret
.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
___
}
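
# Editorial sketch (an addition, not upstream code; the helper name is
# made up for illustration) of the scatter-free lookup implemented by the
# ecp_nistz256_select_w5/w7 routines below: every table entry is read and
# masked, so the memory access pattern is independent of the secret index,
# and index 0 selects nothing, matching the implicit point at infinity.
sub __editorial_ct_select {
	my ($index, @table) = @_;
	my $acc = 0;
	for my $i (1 .. @table) {
		my $mask = ($i == $index) ? ~0 : 0;	# pcmpeqd analogue
		$acc |= $table[$i-1] & $mask;		# pand/por analogue
	}
	return $acc;
}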
{
my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
my ($M1,$T1a,$T1b,$TMP1,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));

$code.=<<___;
################################################################################
# void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
.globl	ecp_nistz256_select_w5
.type	ecp_nistz256_select_w5,\@abi-omnipotent
.align	32
ecp_nistz256_select_w5:
___
$code.=<<___ if ($avx>1);
	mov	OPENSSL_ia32cap_P+8(%rip), %eax
	test	\$`1<<5`, %eax
	jnz	.Lavx2_select_w5
___
$code.=<<___ if ($win64);
	lea	-0x88(%rsp), %rax
.LSEH_begin_ecp_nistz256_select_w5:
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
___
$code.=<<___;
	movdqa	.LOne(%rip), $ONE
	movd	$index, $INDEX

	pxor	$Ra, $Ra
	pxor	$Rb, $Rb
	pxor	$Rc, $Rc
	pxor	$Rd, $Rd
	pxor	$Re, $Re
	pxor	$Rf, $Rf

	movdqa	$ONE, $M0
	pshufd	\$0, $INDEX, $INDEX

	mov	\$16, %rax
.Lselect_loop_sse_w5:

	movdqa	$M0, $TMP0
	paddd	$ONE, $M0
	pcmpeqd	$INDEX, $TMP0

	movdqa	16*0($in_t), $T0a
	movdqa	16*1($in_t), $T0b
	movdqa	16*2($in_t), $T0c
	movdqa	16*3($in_t), $T0d
	movdqa	16*4($in_t), $T0e
	movdqa	16*5($in_t), $T0f
	lea	16*6($in_t), $in_t

	pand	$TMP0, $T0a
	pand	$TMP0, $T0b
	por	$T0a, $Ra
	pand	$TMP0, $T0c
	por	$T0b, $Rb
	pand	$TMP0, $T0d
	por	$T0c, $Rc
	pand	$TMP0, $T0e
	por	$T0d, $Rd
	pand	$TMP0, $T0f
	por	$T0e, $Re
	por	$T0f, $Rf

	dec	%rax
	jnz	.Lselect_loop_sse_w5

	movdqu	$Ra, 16*0($val)
	movdqu	$Rb, 16*1($val)
	movdqu	$Rc, 16*2($val)
	movdqu	$Rd, 16*3($val)
	movdqu	$Re, 16*4($val)
	movdqu	$Rf, 16*5($val)
___
$code.=<<___ if ($win64);
	movaps	(%rsp), %xmm6
	movaps	0x10(%rsp), %xmm7
	movaps	0x20(%rsp), %xmm8
	movaps	0x30(%rsp), %xmm9
	movaps	0x40(%rsp), %xmm10
	movaps	0x50(%rsp), %xmm11
	movaps	0x60(%rsp), %xmm12
	movaps	0x70(%rsp), %xmm13
	movaps	0x80(%rsp), %xmm14
	movaps	0x90(%rsp), %xmm15
	lea	0xa8(%rsp), %rsp
.LSEH_end_ecp_nistz256_select_w5:
___
$code.=<<___;
	ret
.size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5

################################################################################
# void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
.globl	ecp_nistz256_select_w7
.type	ecp_nistz256_select_w7,\@abi-omnipotent
.align	32
ecp_nistz256_select_w7:
___
$code.=<<___ if ($avx>1);
	mov	OPENSSL_ia32cap_P+8(%rip), %eax
	test	\$`1<<5`, %eax
	jnz	.Lavx2_select_w7
___
$code.=<<___ if ($win64);
	lea	-0x88(%rsp), %rax
.LSEH_begin_ecp_nistz256_select_w7:
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
___
$code.=<<___;
	movdqa	.LOne(%rip), $M0
	movd	$index, $INDEX

	pxor	$Ra, $Ra
	pxor	$Rb, $Rb
	pxor	$Rc, $Rc
	pxor	$Rd, $Rd

	movdqa	$M0, $ONE
	pshufd	\$0, $INDEX, $INDEX
	mov	\$64, %rax

.Lselect_loop_sse_w7:
	movdqa	$M0, $TMP0
	paddd	$ONE, $M0
	movdqa	16*0($in_t), $T0a
	movdqa	16*1($in_t), $T0b
	pcmpeqd	$INDEX, $TMP0
	movdqa	16*2($in_t), $T0c
	movdqa	16*3($in_t), $T0d
	lea	16*4($in_t), $in_t

	pand	$TMP0, $T0a
	pand	$TMP0, $T0b
	por	$T0a, $Ra
	pand	$TMP0, $T0c
	por	$T0b, $Rb
	pand	$TMP0, $T0d
	por	$T0c, $Rc
	prefetcht0	255($in_t)
	por	$T0d, $Rd

	dec	%rax
	jnz	.Lselect_loop_sse_w7

	movdqu	$Ra, 16*0($val)
	movdqu	$Rb, 16*1($val)
	movdqu	$Rc, 16*2($val)
	movdqu	$Rd, 16*3($val)
___
$code.=<<___ if ($win64);
	movaps	(%rsp), %xmm6
	movaps	0x10(%rsp), %xmm7
	movaps	0x20(%rsp), %xmm8
	movaps	0x30(%rsp), %xmm9
	movaps	0x40(%rsp), %xmm10
	movaps	0x50(%rsp), %xmm11
	movaps	0x60(%rsp), %xmm12
	movaps	0x70(%rsp), %xmm13
	movaps	0x80(%rsp), %xmm14
	movaps	0x90(%rsp), %xmm15
	lea	0xa8(%rsp), %rsp
.LSEH_end_ecp_nistz256_select_w7:
___
$code.=<<___;
	ret
.size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
___
}
if ($avx>1) {
my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4));
my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9));
my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14));

$code.=<<___;
################################################################################
# void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index);
.type	ecp_nistz256_avx2_select_w5,\@abi-omnipotent
.align	32
ecp_nistz256_avx2_select_w5:
.Lavx2_select_w5:
	vzeroupper
___
$code.=<<___ if ($win64);
	lea	-0x88(%rsp), %rax
.LSEH_begin_ecp_nistz256_avx2_select_w5:
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
	.byte	0xc5,0xf8,0x29,0x70,0xe0	#vmovaps %xmm6, -0x20(%rax)
	.byte	0xc5,0xf8,0x29,0x78,0xf0	#vmovaps %xmm7, -0x10(%rax)
	.byte	0xc5,0x78,0x29,0x40,0x00	#vmovaps %xmm8, 0(%rax)
	.byte	0xc5,0x78,0x29,0x48,0x10	#vmovaps %xmm9, 0x10(%rax)
	.byte	0xc5,0x78,0x29,0x50,0x20	#vmovaps %xmm10, 0x20(%rax)
	.byte	0xc5,0x78,0x29,0x58,0x30	#vmovaps %xmm11, 0x30(%rax)
	.byte	0xc5,0x78,0x29,0x60,0x40	#vmovaps %xmm12, 0x40(%rax)
	.byte	0xc5,0x78,0x29,0x68,0x50	#vmovaps %xmm13, 0x50(%rax)
	.byte	0xc5,0x78,0x29,0x70,0x60	#vmovaps %xmm14, 0x60(%rax)
	.byte	0xc5,0x78,0x29,0x78,0x70	#vmovaps %xmm15, 0x70(%rax)
___
$code.=<<___;
	vmovdqa	.LTwo(%rip), $TWO

	vpxor	$Ra, $Ra, $Ra
	vpxor	$Rb, $Rb, $Rb
	vpxor	$Rc, $Rc, $Rc

	vmovdqa	.LOne(%rip), $M0
	vmovdqa	.LTwo(%rip), $M1

	vmovd	$index, %xmm1
	vpermd	$INDEX, $Ra, $INDEX

	mov	\$8, %rax
.Lselect_loop_avx2_w5:

	vmovdqa	32*0($in_t), $T0a
	vmovdqa	32*1($in_t), $T0b
	vmovdqa	32*2($in_t), $T0c

	vmovdqa	32*3($in_t), $T1a
	vmovdqa	32*4($in_t), $T1b
	vmovdqa	32*5($in_t), $T1c

	vpcmpeqd	$INDEX, $M0, $TMP0
	vpcmpeqd	$INDEX, $M1, $TMP1

	vpaddd	$TWO, $M0, $M0
	vpaddd	$TWO, $M1, $M1
	lea	32*6($in_t), $in_t

	vpand	$TMP0, $T0a, $T0a
	vpand	$TMP0, $T0b, $T0b
	vpand	$TMP0, $T0c, $T0c
	vpand	$TMP1, $T1a, $T1a
	vpand	$TMP1, $T1b, $T1b
	vpand	$TMP1, $T1c, $T1c

	vpxor	$T0a, $Ra, $Ra
	vpxor	$T0b, $Rb, $Rb
	vpxor	$T0c, $Rc, $Rc
	vpxor	$T1a, $Ra, $Ra
	vpxor	$T1b, $Rb, $Rb
	vpxor	$T1c, $Rc, $Rc

	dec	%rax
	jnz	.Lselect_loop_avx2_w5

	vmovdqu	$Ra, 32*0($val)
	vmovdqu	$Rb, 32*1($val)
	vmovdqu	$Rc, 32*2($val)
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	(%rsp), %xmm6
	movaps	0x10(%rsp), %xmm7
	movaps	0x20(%rsp), %xmm8
	movaps	0x30(%rsp), %xmm9
	movaps	0x40(%rsp), %xmm10
	movaps	0x50(%rsp), %xmm11
	movaps	0x60(%rsp), %xmm12
	movaps	0x70(%rsp), %xmm13
	movaps	0x80(%rsp), %xmm14
	movaps	0x90(%rsp), %xmm15
	lea	0xa8(%rsp), %rsp
.LSEH_end_ecp_nistz256_avx2_select_w5:
___
$code.=<<___;
	ret
.size	ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
___
}
if ($avx>1) {
my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3));
my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7));
my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11));
my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15));

$code.=<<___;

################################################################################
# void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index);
.globl	ecp_nistz256_avx2_select_w7
.type	ecp_nistz256_avx2_select_w7,\@abi-omnipotent
.align	32
ecp_nistz256_avx2_select_w7:
.Lavx2_select_w7:
	vzeroupper
___
$code.=<<___ if ($win64);
	lea	-0x88(%rsp), %rax
.LSEH_begin_ecp_nistz256_avx2_select_w7:
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
	.byte	0xc5,0xf8,0x29,0x70,0xe0	#vmovaps %xmm6, -0x20(%rax)
	.byte	0xc5,0xf8,0x29,0x78,0xf0	#vmovaps %xmm7, -0x10(%rax)
	.byte	0xc5,0x78,0x29,0x40,0x00	#vmovaps %xmm8, 0(%rax)
	.byte	0xc5,0x78,0x29,0x48,0x10	#vmovaps %xmm9, 0x10(%rax)
	.byte	0xc5,0x78,0x29,0x50,0x20	#vmovaps %xmm10, 0x20(%rax)
	.byte	0xc5,0x78,0x29,0x58,0x30	#vmovaps %xmm11, 0x30(%rax)
	.byte	0xc5,0x78,0x29,0x60,0x40	#vmovaps %xmm12, 0x40(%rax)
	.byte	0xc5,0x78,0x29,0x68,0x50	#vmovaps %xmm13, 0x50(%rax)
	.byte	0xc5,0x78,0x29,0x70,0x60	#vmovaps %xmm14, 0x60(%rax)
	.byte	0xc5,0x78,0x29,0x78,0x70	#vmovaps %xmm15, 0x70(%rax)
___
$code.=<<___;
	vmovdqa	.LThree(%rip), $THREE

	vpxor	$Ra, $Ra, $Ra
	vpxor	$Rb, $Rb, $Rb

	vmovdqa	.LOne(%rip), $M0
	vmovdqa	.LTwo(%rip), $M1
	vmovdqa	.LThree(%rip), $M2

	vmovd	$index, %xmm1
	vpermd	$INDEX, $Ra, $INDEX
	# Skip index = 0, because it is implicitly the point at infinity

	mov	\$21, %rax
.Lselect_loop_avx2_w7:

	vmovdqa	32*0($in_t), $T0a
	vmovdqa	32*1($in_t), $T0b

	vmovdqa	32*2($in_t), $T1a
	vmovdqa	32*3($in_t), $T1b

	vmovdqa	32*4($in_t), $T2a
	vmovdqa	32*5($in_t), $T2b

	vpcmpeqd	$INDEX, $M0, $TMP0
	vpcmpeqd	$INDEX, $M1, $TMP1
	vpcmpeqd	$INDEX, $M2, $TMP2

	vpaddd	$THREE, $M0, $M0
	vpaddd	$THREE, $M1, $M1
	vpaddd	$THREE, $M2, $M2
	lea	32*6($in_t), $in_t

	vpand	$TMP0, $T0a, $T0a
	vpand	$TMP0, $T0b, $T0b
	vpand	$TMP1, $T1a, $T1a
	vpand	$TMP1, $T1b, $T1b
	vpand	$TMP2, $T2a, $T2a
	vpand	$TMP2, $T2b, $T2b

	vpxor	$T0a, $Ra, $Ra
	vpxor	$T0b, $Rb, $Rb
	vpxor	$T1a, $Ra, $Ra
	vpxor	$T1b, $Rb, $Rb
	vpxor	$T2a, $Ra, $Ra
	vpxor	$T2b, $Rb, $Rb

	dec	%rax
	jnz	.Lselect_loop_avx2_w7


	vmovdqa	32*0($in_t), $T0a
	vmovdqa	32*1($in_t), $T0b

	vpcmpeqd	$INDEX, $M0, $TMP0

	vpand	$TMP0, $T0a, $T0a
	vpand	$TMP0, $T0b, $T0b

	vpxor	$T0a, $Ra, $Ra
	vpxor	$T0b, $Rb, $Rb

	vmovdqu	$Ra, 32*0($val)
	vmovdqu	$Rb, 32*1($val)
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	(%rsp), %xmm6
	movaps	0x10(%rsp), %xmm7
	movaps	0x20(%rsp), %xmm8
	movaps	0x30(%rsp), %xmm9
	movaps	0x40(%rsp), %xmm10
	movaps	0x50(%rsp), %xmm11
	movaps	0x60(%rsp), %xmm12
	movaps	0x70(%rsp), %xmm13
	movaps	0x80(%rsp), %xmm14
	movaps	0x90(%rsp), %xmm15
	lea	0xa8(%rsp), %rsp
.LSEH_end_ecp_nistz256_avx2_select_w7:
___
$code.=<<___;
	ret
.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
___
} else {
$code.=<<___;
.globl	ecp_nistz256_avx2_select_w7
.type	ecp_nistz256_avx2_select_w7,\@function,3
.align	32
ecp_nistz256_avx2_select_w7:
	.byte	0x0f,0x0b	# ud2
	ret
.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
___
}
{{{
########################################################################
# This block implements higher level point_double, point_add and
# point_add_affine. The key to performance in this case is to allow
# out-of-order execution logic to overlap computations from next step
# with tail processing from current step. By using tailored calling
# sequence we minimize inter-step overhead to give processor better
# shot at overlapping operations...
#
# You will notice that input data is copied to stack. Trouble is that
# there are no registers to spare for holding original pointers and
# reloading them, pointers, would create undesired dependencies on
# effective addresses calculation paths. In other words the copying
# is done to favour out-of-order execution logic.
#						<appro@openssl.org>

my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
my ($poly1,$poly3)=($acc6,$acc7);

sub load_for_mul () {
my ($a,$b,$src0) = @_;
my $bias = $src0 eq "%rax" ? 0 : -128;

"	mov	$b, $src0
	lea	$b, $b_ptr
	mov	8*0+$a, $acc1
	mov	8*1+$a, $acc2
	lea	$bias+$a, $a_ptr
	mov	8*2+$a, $acc3
	mov	8*3+$a, $acc4"
}

sub load_for_sqr () {
my ($a,$src0) = @_;
my $bias = $src0 eq "%rax" ? 0 : -128;

"	mov	8*0+$a, $src0
	mov	8*1+$a, $acc6
	lea	$bias+$a, $a_ptr
	mov	8*2+$a, $acc7
	mov	8*3+$a, $acc0"
}

 {
########################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
#
my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);

$code.=<<___;
.type	__ecp_nistz256_add_toq,\@abi-omnipotent
.align	32
__ecp_nistz256_add_toq:
	add	8*0($b_ptr), $a0
	adc	8*1($b_ptr), $a1
	mov	$a0, $t0
	adc	8*2($b_ptr), $a2
	adc	8*3($b_ptr), $a3
	mov	$a1, $t1
	sbb	$t4, $t4

	sub	\$-1, $a0
	mov	$a2, $t2
	sbb	$poly1, $a1
	sbb	\$0, $a2
	mov	$a3, $t3
	sbb	$poly3, $a3
	test	$t4, $t4

	cmovz	$t0, $a0
	cmovz	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovz	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovz	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq

.type	__ecp_nistz256_sub_fromq,\@abi-omnipotent
.align	32
__ecp_nistz256_sub_fromq:
	sub	8*0($b_ptr), $a0
	sbb	8*1($b_ptr), $a1
	mov	$a0, $t0
	sbb	8*2($b_ptr), $a2
	sbb	8*3($b_ptr), $a3
	mov	$a1, $t1
	sbb	$t4, $t4

	add	\$-1, $a0
	mov	$a2, $t2
	adc	$poly1, $a1
	adc	\$0, $a2
	mov	$a3, $t3
	adc	$poly3, $a3
	test	$t4, $t4

	cmovz	$t0, $a0
	cmovz	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovz	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovz	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq

.type	__ecp_nistz256_subq,\@abi-omnipotent
.align	32
__ecp_nistz256_subq:
	sub	$a0, $t0
	sbb	$a1, $t1
	mov	$t0, $a0
	sbb	$a2, $t2
	sbb	$a3, $t3
	mov	$t1, $a1
	sbb	$t4, $t4

	add	\$-1, $t0
	mov	$t2, $a2
	adc	$poly1, $t1
	adc	\$0, $t2
	mov	$t3, $a3
	adc	$poly3, $t3
	test	$t4, $t4

	cmovnz	$t0, $a0
	cmovnz	$t1, $a1
	cmovnz	$t2, $a2
	cmovnz	$t3, $a3

	ret
.size	__ecp_nistz256_subq,.-__ecp_nistz256_subq
.type	__ecp_nistz256_mul_by_2q,\@abi-omnipotent
.align	32
__ecp_nistz256_mul_by_2q:
	add	$a0, $a0		# a0:a3+a0:a3
	adc	$a1, $a1
	mov	$a0, $t0
	adc	$a2, $a2
	adc	$a3, $a3
	mov	$a1, $t1
	sbb	$t4, $t4

	sub	\$-1, $a0
	mov	$a2, $t2
	sbb	$poly1, $a1
	sbb	\$0, $a2
	mov	$a3, $t3
	sbb	$poly3, $a3
	test	$t4, $t4

	cmovz	$t0, $a0
	cmovz	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovz	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovz	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
___
 }
sub gen_double () {
    my $x = shift;
    my ($src0,$sfx,$bias);
    my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));

    if ($x ne "x") {
	$src0 = "%rax";
	$sfx = "";
	$bias = 0;

$code.=<<___;
.globl	ecp_nistz256_point_double
.type	ecp_nistz256_point_double,\@function,2
.align	32
ecp_nistz256_point_double:
___
$code.=<<___ if ($addx);
	mov	\$0x80100, %ecx
	and	OPENSSL_ia32cap_P+8(%rip), %ecx
	cmp	\$0x80100, %ecx
	je	.Lpoint_doublex
___
    } else {
	$src0 = "%rdx";
	$sfx = "x";
	$bias = 128;

$code.=<<___;
.type	ecp_nistz256_point_doublex,\@function,2
.align	32
ecp_nistz256_point_doublex:
.Lpoint_doublex:
___
    }
$code.=<<___;
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	sub	\$32*5+8, %rsp

	movdqu	0x00($a_ptr), %xmm0	# copy	*(P256_POINT *)$a_ptr.x
	mov	$a_ptr, $b_ptr		# backup copy
	movdqu	0x10($a_ptr), %xmm1
	mov	0x20+8*0($a_ptr), $acc4	# load in_y in "5-4-0-1" order
	mov	0x20+8*1($a_ptr), $acc5
	mov	0x20+8*2($a_ptr), $acc0
	mov	0x20+8*3($a_ptr), $acc1
	mov	.Lpoly+8*1(%rip), $poly1
	mov	.Lpoly+8*3(%rip), $poly3
	movdqa	%xmm0, $in_x(%rsp)
	movdqa	%xmm1, $in_x+0x10(%rsp)
	lea	0x20($r_ptr), $acc2
	lea	0x40($r_ptr), $acc3
	movq	$r_ptr, %xmm0
	movq	$acc2, %xmm1
	movq	$acc3, %xmm2

	lea	$S(%rsp), $r_ptr
	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(S, in_y);

	mov	0x40+8*0($a_ptr), $src0
	mov	0x40+8*1($a_ptr), $acc6
	mov	0x40+8*2($a_ptr), $acc7
	mov	0x40+8*3($a_ptr), $acc0
	lea	0x40-$bias($a_ptr), $a_ptr
	lea	$Zsqr(%rsp), $r_ptr
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Zsqr, in_z);

	`&load_for_sqr("$S(%rsp)", "$src0")`
	lea	$S(%rsp), $r_ptr
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(S, S);

	mov	0x20($b_ptr), $src0	# $b_ptr is still valid
	mov	0x40+8*0($b_ptr), $acc1
	mov	0x40+8*1($b_ptr), $acc2
	mov	0x40+8*2($b_ptr), $acc3
	mov	0x40+8*3($b_ptr), $acc4
	lea	0x40-$bias($b_ptr), $a_ptr
	lea	0x20($b_ptr), $b_ptr
	movq	%xmm2, $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, in_z, in_y);
	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(res_z, res_z);

	mov	$in_x+8*0(%rsp), $acc4	# "5-4-0-1" order
	mov	$in_x+8*1(%rsp), $acc5
	lea	$Zsqr(%rsp), $b_ptr
	mov	$in_x+8*2(%rsp), $acc0
	mov	$in_x+8*3(%rsp), $acc1
	lea	$M(%rsp), $r_ptr
	call	__ecp_nistz256_add_to$x		# p256_add(M, in_x, Zsqr);

	mov	$in_x+8*0(%rsp), $acc4	# "5-4-0-1" order
	mov	$in_x+8*1(%rsp), $acc5
	lea	$Zsqr(%rsp), $b_ptr
	mov	$in_x+8*2(%rsp), $acc0
	mov	$in_x+8*3(%rsp), $acc1
	lea	$Zsqr(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(Zsqr, in_x, Zsqr);

	`&load_for_sqr("$S(%rsp)", "$src0")`
	movq	%xmm1, $r_ptr
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_y, S);
___
{
######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
# operate in 4-5-6-7 "name space" that matches squaring output
#
my ($poly1,$poly3)=($a_ptr,$t1);
my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2);

$code.=<<___;
	xor	$t4, $t4
	mov	$a0, $t0
	add	\$-1, $a0
	mov	$a1, $t1
	adc	$poly1, $a1
	mov	$a2, $t2
	adc	\$0, $a2
	mov	$a3, $t3
	adc	$poly3, $a3
	adc	\$0, $t4
	xor	$a_ptr, $a_ptr		# borrow $a_ptr
	test	\$1, $t0

	cmovz	$t0, $a0
	cmovz	$t1, $a1
	cmovz	$t2, $a2
	cmovz	$t3, $a3
	cmovz	$a_ptr, $t4

	mov	$a1, $t0		# a0:a3>>1
	shr	\$1, $a0
	shl	\$63, $t0
	mov	$a2, $t1
	shr	\$1, $a1
	or	$t0, $a0
	shl	\$63, $t1
	mov	$a3, $t2
	shr	\$1, $a2
	or	$t1, $a1
	shl	\$63, $t2
	mov	$a0, 8*0($r_ptr)
	shr	\$1, $a3
	mov	$a1, 8*1($r_ptr)
	shl	\$63, $t4
	or	$t2, $a2
	or	$t4, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)
___
}
$code.=<<___;
	`&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`
	lea	$M(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(M, M, Zsqr);

	lea	$tmp0(%rsp), $r_ptr
	call	__ecp_nistz256_mul_by_2$x

	lea	$M(%rsp), $b_ptr
	lea	$M(%rsp), $r_ptr
	call	__ecp_nistz256_add_to$x		# p256_mul_by_3(M, M);

	`&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")`
	lea	$S(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, in_x);

	lea	$tmp0(%rsp), $r_ptr
	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(tmp0, S);

	`&load_for_sqr("$M(%rsp)", "$src0")`
	movq	%xmm0, $r_ptr
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_x, M);

	lea	$tmp0(%rsp), $b_ptr
	mov	$acc6, $acc0			# harmonize sqr output and sub input
	mov	$acc7, $acc1
	mov	$a_ptr, $poly1
	mov	$t1, $poly3
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, tmp0);

	mov	$S+8*0(%rsp), $t0
	mov	$S+8*1(%rsp), $t1
	mov	$S+8*2(%rsp), $t2
	mov	$S+8*3(%rsp), $acc2		# "4-5-0-1" order
	lea	$S(%rsp), $r_ptr
	call	__ecp_nistz256_sub$x		# p256_sub(S, S, res_x);

	mov	$M(%rsp), $src0
	lea	$M(%rsp), $b_ptr
	mov	$acc4, $acc6			# harmonize sub output and mul input
	xor	%ecx, %ecx
	mov	$acc4, $S+8*0(%rsp)		# have to save:-(
	mov	$acc5, $acc2
	mov	$acc5, $S+8*1(%rsp)
	cmovz	$acc0, $acc3
	mov	$acc0, $S+8*2(%rsp)
	lea	$S-$bias(%rsp), $a_ptr
	cmovz	$acc1, $acc4
	mov	$acc1, $S+8*3(%rsp)
	mov	$acc6, $acc1
	lea	$S(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, M);

	movq	%xmm1, $b_ptr
	movq	%xmm1, $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, S, res_y);

	add	\$32*5+8, %rsp
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
.size	ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
___
}
&gen_double("q");
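
# Editorial summary (drawn from the per-call comments in gen_double above):
# with all values in Montgomery form, the doubling computes
#	S = (2*in_y)^2,  Zsqr = in_z^2,  res_z = 2*in_z*in_y,
#	M = 3*(in_x + Zsqr)*(in_x - Zsqr),  res_y' = (S^2)/2,
#	S = S*in_x,  res_x = M^2 - 2*S,  res_y = (S - res_x)*M - res_y'
# i.e. standard Jacobian doubling with the usual S/M intermediates.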

sub gen_add () {
    my $x = shift;
    my ($src0,$sfx,$bias);
    my ($H,$Hsqr,$R,$Rsqr,$Hcub,
	$U1,$U2,$S1,$S2,
	$res_x,$res_y,$res_z,
	$in1_x,$in1_y,$in1_z,
	$in2_x,$in2_y,$in2_z)=map(32*$_,(0..17));
    my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);

    if ($x ne "x") {
	$src0 = "%rax";
	$sfx = "";
	$bias = 0;

$code.=<<___;
.globl	ecp_nistz256_point_add
.type	ecp_nistz256_point_add,\@function,3
.align	32
ecp_nistz256_point_add:
___
$code.=<<___ if ($addx);
	mov	\$0x80100, %ecx
	and	OPENSSL_ia32cap_P+8(%rip), %ecx
	cmp	\$0x80100, %ecx
	je	.Lpoint_addx
___
    } else {
	$src0 = "%rdx";
	$sfx = "x";
	$bias = 128;

$code.=<<___;
.type	ecp_nistz256_point_addx,\@function,3
.align	32
ecp_nistz256_point_addx:
.Lpoint_addx:
___
    }
$code.=<<___;
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	sub	\$32*18+8, %rsp

	movdqu	0x00($a_ptr), %xmm0	# copy	*(P256_POINT *)$a_ptr
	movdqu	0x10($a_ptr), %xmm1
	movdqu	0x20($a_ptr), %xmm2
	movdqu	0x30($a_ptr), %xmm3
	movdqu	0x40($a_ptr), %xmm4
	movdqu	0x50($a_ptr), %xmm5
	mov	$a_ptr, $b_ptr		# reassign
	mov	$b_org, $a_ptr		# reassign
	movdqa	%xmm0, $in1_x(%rsp)
	movdqa	%xmm1, $in1_x+0x10(%rsp)
	por	%xmm0, %xmm1
	movdqa	%xmm2, $in1_y(%rsp)
	movdqa	%xmm3, $in1_y+0x10(%rsp)
	por	%xmm2, %xmm3
	movdqa	%xmm4, $in1_z(%rsp)
	movdqa	%xmm5, $in1_z+0x10(%rsp)
	por	%xmm1, %xmm3

	movdqu	0x00($a_ptr), %xmm0	# copy	*(P256_POINT *)$b_ptr
	pshufd	\$0xb1, %xmm3, %xmm5
	movdqu	0x10($a_ptr), %xmm1
	movdqu	0x20($a_ptr), %xmm2
	por	%xmm3, %xmm5
	movdqu	0x30($a_ptr), %xmm3
	mov	0x40+8*0($a_ptr), $src0	# load original in2_z
	mov	0x40+8*1($a_ptr), $acc6
	mov	0x40+8*2($a_ptr), $acc7
	mov	0x40+8*3($a_ptr), $acc0
	movdqa	%xmm0, $in2_x(%rsp)
	pshufd	\$0x1e, %xmm5, %xmm4
	movdqa	%xmm1, $in2_x+0x10(%rsp)
	por	%xmm0, %xmm1
	movq	$r_ptr, %xmm0		# save $r_ptr
	movdqa	%xmm2, $in2_y(%rsp)
	movdqa	%xmm3, $in2_y+0x10(%rsp)
	por	%xmm2, %xmm3
	por	%xmm4, %xmm5
	pxor	%xmm4, %xmm4
	por	%xmm1, %xmm3

	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
	mov	$src0, $in2_z+8*0(%rsp)		# make in2_z copy
	mov	$acc6, $in2_z+8*1(%rsp)
	mov	$acc7, $in2_z+8*2(%rsp)
	mov	$acc0, $in2_z+8*3(%rsp)
	lea	$Z2sqr(%rsp), $r_ptr		# Z2^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z2sqr, in2_z);

	pcmpeqd	%xmm4, %xmm5
	pshufd	\$0xb1, %xmm3, %xmm4
	por	%xmm3, %xmm4
	pshufd	\$0, %xmm5, %xmm5		# in1infty
	pshufd	\$0x1e, %xmm4, %xmm3
	por	%xmm3, %xmm4
	pxor	%xmm3, %xmm3
	pcmpeqd	%xmm3, %xmm4
	pshufd	\$0, %xmm4, %xmm4		# in2infty
	mov	0x40+8*0($b_ptr), $src0		# load original in1_z
	mov	0x40+8*1($b_ptr), $acc6
	mov	0x40+8*2($b_ptr), $acc7
	mov	0x40+8*3($b_ptr), $acc0

	lea	0x40-$bias($b_ptr), $a_ptr
	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);

	`&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")`
	lea	$S1(%rsp), $r_ptr		# S1 = Z2^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, Z2sqr, in2_z);

	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);

	`&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
	lea	$S1(%rsp), $r_ptr		# S1 = Y1*Z2^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, S1, in1_y);

	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);

	lea	$S1(%rsp), $b_ptr
	lea	$R(%rsp), $r_ptr		# R = S2 - S1
	lea	$R(%rsp), $r_ptr		# R = S2 - S1
	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, S1);

	or	$acc5, $acc4			# see if result is zero
	movdqa	%xmm4, %xmm2
	or	$acc0, $acc4
	or	$acc1, $acc4
	por	%xmm5, %xmm2			# in1infty || in2infty
	movq	$acc4, %xmm3

	`&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
	lea	$U1(%rsp), $r_ptr		# U1 = X1*Z2^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U1, in1_x, Z2sqr);

	`&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in2_x, Z1sqr);

	lea	$U1(%rsp), $b_ptr
	lea	$H(%rsp), $r_ptr		# H = U2 - U1
	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, U1);

	or	$acc5, $acc4			# see if result is zero
	or	$acc0, $acc4
	or	$acc1, $acc4

	.byte	0x3e				# predict taken
	jnz	.Ladd_proceed$x			# is_equal(U1,U2)?
	movq	%xmm2, $acc0
	movq	%xmm3, $acc1
	test	$acc0, $acc0
	jnz	.Ladd_proceed$x			# (in1infty || in2infty)?
	test	$acc1, $acc1
	jz	.Ladd_proceed$x			# is_equal(S1,S2)?

	movq	%xmm0, $r_ptr			# restore $r_ptr
	pxor	%xmm0, %xmm0			# H==0, inputs finite, S1!=S2:
	movdqu	%xmm0, 0x00($r_ptr)		# P == -Q, so return the
	movdqu	%xmm0, 0x10($r_ptr)		# point at infinity
	movdqu	%xmm0, 0x20($r_ptr)
	movdqu	%xmm0, 0x30($r_ptr)
	movdqu	%xmm0, 0x40($r_ptr)
	movdqu	%xmm0, 0x50($r_ptr)
	jmp	.Ladd_done$x

.align	32
.Ladd_proceed$x:
	`&load_for_sqr("$R(%rsp)", "$src0")`
	lea	$Rsqr(%rsp), $r_ptr		# R^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);

	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);

	`&load_for_sqr("$H(%rsp)", "$src0")`
	lea	$Hsqr(%rsp), $r_ptr		# H^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);

	`&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, res_z, in2_z);

	`&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
	lea	$Hcub(%rsp), $r_ptr		# H^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);

	`&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
	lea	$U2(%rsp), $r_ptr		# U1*H^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, U1, Hsqr);
___
{
#######################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
#
my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
my ($poly1, $poly3)=($acc6,$acc7);

$code.=<<___;
	#lea	$U2(%rsp), $a_ptr
	#lea	$Hsqr(%rsp), $r_ptr		# 2*U1*H^2
	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);

	add	$acc0, $acc0			# a0:a3+a0:a3
	lea	$Rsqr(%rsp), $a_ptr
	adc	$acc1, $acc1
	mov	$acc0, $t0
	adc	$acc2, $acc2
	adc	$acc3, $acc3
	mov	$acc1, $t1
	sbb	$t4, $t4

	sub	\$-1, $acc0
	mov	$acc2, $t2
	sbb	$poly1, $acc1
	sbb	\$0, $acc2
	mov	$acc3, $t3
	sbb	$poly3, $acc3
	test	$t4, $t4

	cmovz	$t0, $acc0
	mov	8*0($a_ptr), $t0
	cmovz	$t1, $acc1
	mov	8*1($a_ptr), $t1
	cmovz	$t2, $acc2
	mov	8*2($a_ptr), $t2
	cmovz	$t3, $acc3
	mov	8*3($a_ptr), $t3

	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);

	lea	$Hcub(%rsp), $b_ptr
	lea	$res_x(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);
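
	# The doubling inlined above is a branch-free conditional subtraction;
	# in C-like terms (illustrative only, not the exact register flow):
	#	t = 2*u2; u = t - p;		p = 2^256-2^224+2^192+2^96-1
	#	res = carry_from_doubling ? u : t;	selected with cmovz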

	mov	$U2+8*0(%rsp), $t0
	mov	$U2+8*1(%rsp), $t1
	mov	$U2+8*2(%rsp), $t2
	mov	$U2+8*3(%rsp), $t3
	lea	$res_y(%rsp), $r_ptr

	call	__ecp_nistz256_sub$x		# p256_sub(res_y, U2, res_x);

	mov	$acc0, 8*0($r_ptr)		# save the result, as
	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't store it
	mov	$acc2, 8*2($r_ptr)
	mov	$acc3, 8*3($r_ptr)
___
}
$code.=<<___;
	`&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S1, Hcub);

	`&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
	lea	$res_y(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_y, R, res_y);

	lea	$S2(%rsp), $b_ptr
	lea	$res_y(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, res_y, S2);

	movq	%xmm0, $r_ptr			# restore $r_ptr

	movdqa	%xmm5, %xmm0			# copy_conditional(res_z, in2_z, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_z(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_z+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	$in2_z(%rsp), %xmm2
	pand	$in2_z+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0			# copy_conditional(res_z, in1_z, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_z(%rsp), %xmm2
	pand	$in1_z+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x40($r_ptr)
	movdqu	%xmm3, 0x50($r_ptr)

	movdqa	%xmm5, %xmm0			# copy_conditional(res_x, in2_x, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_x(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_x+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	$in2_x(%rsp), %xmm2
	pand	$in2_x+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0			# copy_conditional(res_x, in1_x, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_x(%rsp), %xmm2
	pand	$in1_x+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x00($r_ptr)
	movdqu	%xmm3, 0x10($r_ptr)

	movdqa	%xmm5, %xmm0			# copy_conditional(res_y, in2_y, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_y(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_y+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	$in2_y(%rsp), %xmm2
	pand	$in2_y+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0			# copy_conditional(res_y, in1_y, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_y(%rsp), %xmm2
	pand	$in1_y+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x20($r_ptr)
	movdqu	%xmm3, 0x30($r_ptr)

.Ladd_done$x:
	add	\$32*18+8, %rsp
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
.size	ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
___
}
&gen_add("q");

sub gen_add_affine () {
    my $x = shift;
    my ($src0,$sfx,$bias);
    my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
	$res_x,$res_y,$res_z,
	$in1_x,$in1_y,$in1_z,
	$in2_x,$in2_y)=map(32*$_,(0..14));
    my $Z1sqr = $S2;

    if ($x ne "x") {
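	# The second input is affine (Z2 == 1), so U1 == X1 and S1 == Y1, and
	# the Z2^2/Z2^3 multiplications of the full addition disappear.
	# Illustrative pseudocode only, not the exact instruction schedule:
	#
	#	Z1Z1 = Z1^2;		U2 = X2*Z1Z1;	H = U2 - X1;
	#	S2 = Y2*Z1*Z1Z1;	R = S2 - Y1;
	#	X3 = R^2 - H^3 - 2*X1*H^2;
	#	Y3 = R*(X1*H^2 - X3) - Y1*H^3;
	#	Z3 = H*Z1;
	#
	# The tail substitutes .LONE_mont, i.e. 1 in Montgomery form, for Z3
	# when the first input is the point at infinity.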
	$src0 = "%rax";
	$sfx  = "";
	$bias = 0;

$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.type	ecp_nistz256_point_add_affine,\@function,3
.align	32
ecp_nistz256_point_add_affine:
___
$code.=<<___	if ($addx);
	mov	\$0x80100, %ecx
	and	OPENSSL_ia32cap_P+8(%rip), %ecx
	cmp	\$0x80100, %ecx
	je	.Lpoint_add_affinex
___
    } else {
	$src0 = "%rdx";
	$sfx  = "x";
	$bias = 128;

$code.=<<___;
.type	ecp_nistz256_point_add_affinex,\@function,3
.align	32
ecp_nistz256_point_add_affinex:
.Lpoint_add_affinex:
___
    }
$code.=<<___;
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	sub	\$32*15+8, %rsp

	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr
	mov	$b_org, $b_ptr			# reassign
	movdqu	0x10($a_ptr), %xmm1
	movdqu	0x20($a_ptr), %xmm2
	movdqu	0x30($a_ptr), %xmm3
	movdqu	0x40($a_ptr), %xmm4
	movdqu	0x50($a_ptr), %xmm5
	mov	0x40+8*0($a_ptr), $src0		# load original in1_z
	mov	0x40+8*1($a_ptr), $acc6
	mov	0x40+8*2($a_ptr), $acc7
	mov	0x40+8*3($a_ptr), $acc0
	movdqa	%xmm0, $in1_x(%rsp)
	movdqa	%xmm1, $in1_x+0x10(%rsp)
	por	%xmm0, %xmm1
	movdqa	%xmm2, $in1_y(%rsp)
	movdqa	%xmm3, $in1_y+0x10(%rsp)
	por	%xmm2, %xmm3
	movdqa	%xmm4, $in1_z(%rsp)
	movdqa	%xmm5, $in1_z+0x10(%rsp)
	por	%xmm1, %xmm3

	movdqu	0x00($b_ptr), %xmm0		# copy	*(P256_POINT_AFFINE *)$b_ptr
	pshufd	\$0xb1, %xmm3, %xmm5
	movdqu	0x10($b_ptr), %xmm1
	movdqu	0x20($b_ptr), %xmm2
	por	%xmm3, %xmm5
	movdqu	0x30($b_ptr), %xmm3
	movdqa	%xmm0, $in2_x(%rsp)
	pshufd	\$0x1e, %xmm5, %xmm4
	movdqa	%xmm1, $in2_x+0x10(%rsp)
	por	%xmm0, %xmm1
	movq	$r_ptr, %xmm0			# save $r_ptr
	movdqa	%xmm2, $in2_y(%rsp)
	movdqa	%xmm3, $in2_y+0x10(%rsp)
	por	%xmm2, %xmm3
	por	%xmm4, %xmm5
	pxor	%xmm4, %xmm4
	por	%xmm1, %xmm3

	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);

	pcmpeqd	%xmm4, %xmm5
	pshufd	\$0xb1, %xmm3, %xmm4
	mov	0x00($b_ptr), $src0		# $b_ptr is still valid
	#lea	0x00($b_ptr), $b_ptr
	mov	$acc4, $acc1			# harmonize sqr output and mul input
	por	%xmm3, %xmm4
	pshufd	\$0, %xmm5, %xmm5		# in1infty
	pshufd	\$0x1e, %xmm4, %xmm3
	mov	$acc5, $acc2
	por	%xmm3, %xmm4
	pxor	%xmm3, %xmm3
	mov	$acc6, $acc3
	pcmpeqd	%xmm3, %xmm4
	pshufd	\$0, %xmm4, %xmm4		# in2infty

	lea	$Z1sqr-$bias(%rsp), $a_ptr
	mov	$acc7, $acc4
	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, Z1sqr, in2_x);

	lea	$in1_x(%rsp), $b_ptr
	lea	$H(%rsp), $r_ptr		# H = U2 - U1
	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, in1_x);

	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);

	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);

	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);

	lea	$in1_y(%rsp), $b_ptr
	lea	$R(%rsp), $r_ptr		# R = S2 - S1
	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, in1_y);
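
	# load_for_sqr/load_for_mul (defined earlier) preload one operand into
	# registers, leaving the leading limb in src0: rax for the mulq-based
	# path, rdx for the mulx/ADX path, matching the implicit source
	# operand of each multiply instruction.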

	`&load_for_sqr("$H(%rsp)", "$src0")`
	lea	$Hsqr(%rsp), $r_ptr		# H^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);

	`&load_for_sqr("$R(%rsp)", "$src0")`
	lea	$Rsqr(%rsp), $r_ptr		# R^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);

	`&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
	lea	$Hcub(%rsp), $r_ptr		# H^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);

	`&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
	lea	$U2(%rsp), $r_ptr		# U1*H^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in1_x, Hsqr);
___
{
#######################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
#
my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
my ($poly1, $poly3)=($acc6,$acc7);

$code.=<<___;
	#lea	$U2(%rsp), $a_ptr
	#lea	$Hsqr(%rsp), $r_ptr		# 2*U1*H^2
	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);

	add	$acc0, $acc0			# a0:a3+a0:a3
	lea	$Rsqr(%rsp), $a_ptr
	adc	$acc1, $acc1
	mov	$acc0, $t0
	adc	$acc2, $acc2
	adc	$acc3, $acc3
	mov	$acc1, $t1
	sbb	$t4, $t4

	sub	\$-1, $acc0
	mov	$acc2, $t2
	sbb	$poly1, $acc1
	sbb	\$0, $acc2
	mov	$acc3, $t3
	sbb	$poly3, $acc3
	test	$t4, $t4

	cmovz	$t0, $acc0
	mov	8*0($a_ptr), $t0
	cmovz	$t1, $acc1
	mov	8*1($a_ptr), $t1
	cmovz	$t2, $acc2
	mov	8*2($a_ptr), $t2
	cmovz	$t3, $acc3
	mov	8*3($a_ptr), $t3

	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);

	lea	$Hcub(%rsp), $b_ptr
	lea	$res_x(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);

	mov	$U2+8*0(%rsp), $t0
	mov	$U2+8*1(%rsp), $t1
	mov	$U2+8*2(%rsp), $t2
	mov	$U2+8*3(%rsp), $t3
	lea	$H(%rsp), $r_ptr

	call	__ecp_nistz256_sub$x		# p256_sub(H, U2, res_x);

	mov	$acc0, 8*0($r_ptr)		# save the result, as
	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't store it
	mov	$acc2, 8*2($r_ptr)
	mov	$acc3, 8*3($r_ptr)
___
}
$code.=<<___;
	`&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Hcub, in1_y);

	`&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
	lea	$H(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(H, H, R);

	lea	$S2(%rsp), $b_ptr
	lea	$res_y(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, H, S2);

	movq	%xmm0, $r_ptr			# restore $r_ptr

	movdqa	%xmm5, %xmm0			# copy_conditional(res_z, ONE, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_z(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_z+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	.LONE_mont(%rip), %xmm2
	pand	.LONE_mont+0x10(%rip), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0			# copy_conditional(res_z, in1_z, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_z(%rsp), %xmm2
	pand	$in1_z+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x40($r_ptr)
	movdqu	%xmm3, 0x50($r_ptr)
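
	# Each block above/below is copy_conditional() in SIMD form; with
	# mask = all-ones when the corresponding input is infinity, roughly:
	#	res = (mask AND replacement) OR (NOT mask AND res)
	# computed limb-by-limb with pand/pandn/por, so no data-dependent
	# branch is taken.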

	movdqa	%xmm5, %xmm0			# copy_conditional(res_x, in2_x, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_x(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_x+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	$in2_x(%rsp), %xmm2
	pand	$in2_x+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0			# copy_conditional(res_x, in1_x, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_x(%rsp), %xmm2
	pand	$in1_x+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x00($r_ptr)
	movdqu	%xmm3, 0x10($r_ptr)

	movdqa	%xmm5, %xmm0			# copy_conditional(res_y, in2_y, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_y(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_y+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	$in2_y(%rsp), %xmm2
	pand	$in2_y+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0			# copy_conditional(res_y, in1_y, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_y(%rsp), %xmm2
	pand	$in1_y+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x20($r_ptr)
	movdqu	%xmm3, 0x30($r_ptr)

	add	\$32*15+8, %rsp
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
.size	ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
___
}
&gen_add_affine("q");

########################################################################
# AD*X magic
#
if ($addx) { {
########################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
#
my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);

$code.=<<___;
.type	__ecp_nistz256_add_tox,\@abi-omnipotent
.align	32
__ecp_nistz256_add_tox:
	xor	$t4, $t4
	adc	8*0($b_ptr), $a0
	adc	8*1($b_ptr), $a1
	mov	$a0, $t0
	adc	8*2($b_ptr), $a2
	adc	8*3($b_ptr), $a3
	mov	$a1, $t1
	adc	\$0, $t4

	xor	$t3, $t3
	sbb	\$-1, $a0
	mov	$a2, $t2
	sbb	$poly1, $a1
	sbb	\$0, $a2
	mov	$a3, $t3
	sbb	$poly3, $a3

	bt	\$0, $t4
	cmovnc	$t0, $a0
	cmovnc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovnc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovnc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox

.type	__ecp_nistz256_sub_fromx,\@abi-omnipotent
.align	32
__ecp_nistz256_sub_fromx:
	xor	$t4, $t4
	sbb	8*0($b_ptr), $a0
	sbb	8*1($b_ptr), $a1
	mov	$a0, $t0
	sbb	8*2($b_ptr), $a2
	sbb	8*3($b_ptr), $a3
	mov	$a1, $t1
	sbb	\$0, $t4

	xor	$t3, $t3
	adc	\$-1, $a0
	mov	$a2, $t2
	adc	$poly1, $a1
	adc	\$0, $a2
	mov	$a3, $t3
	adc	$poly3, $a3

	bt	\$0, $t4
	cmovnc	$t0, $a0
	cmovnc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovnc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovnc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx

.type	__ecp_nistz256_subx,\@abi-omnipotent
.align	32
__ecp_nistz256_subx:
	xor	$t4, $t4
	sbb	$a0, $t0
	sbb	$a1, $t1
	mov	$t0, $a0
	sbb	$a2, $t2
	sbb	$a3, $t3
	mov	$t1, $a1
	sbb	\$0, $t4
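
	# t4 is now 0 or -1, recording the borrow; below, p is added back and
	# the corrected value is kept, via bt/cmovc, only if the subtraction
	# underflowed, keeping the reduction branch-free.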
	xor	$a3, $a3
	adc	\$-1, $t0
	mov	$t2, $a2
	adc	$poly1, $t1
	adc	\$0, $t2
	mov	$t3, $a3
	adc	$poly3, $t3

	bt	\$0, $t4
	cmovc	$t0, $a0
	cmovc	$t1, $a1
	cmovc	$t2, $a2
	cmovc	$t3, $a3

	ret
.size	__ecp_nistz256_subx,.-__ecp_nistz256_subx

.type	__ecp_nistz256_mul_by_2x,\@abi-omnipotent
.align	32
__ecp_nistz256_mul_by_2x:
	xor	$t4, $t4
	adc	$a0, $a0		# a0:a3+a0:a3
	adc	$a1, $a1
	mov	$a0, $t0
	adc	$a2, $a2
	adc	$a3, $a3
	mov	$a1, $t1
	adc	\$0, $t4

	xor	$t3, $t3
	sbb	\$-1, $a0
	mov	$a2, $t2
	sbb	$poly1, $a1
	sbb	\$0, $a2
	mov	$a3, $t3
	sbb	$poly3, $a3

	bt	\$0, $t4
	cmovnc	$t0, $a0
	cmovnc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovnc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovnc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
___
    }
&gen_double("x");
&gen_add("x");
&gen_add_affine("x");
}
}}}

# expand the backquoted snippets (the &load_for_mul/&load_for_sqr calls) via eval
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;