#!/usr/bin/env perl

##############################################################################
#                                                                            #
#  Copyright 2014 Intel Corporation                                          #
#                                                                            #
#  Licensed under the Apache License, Version 2.0 (the "License");           #
#  you may not use this file except in compliance with the License.          #
#  You may obtain a copy of the License at                                   #
#                                                                            #
#     http://www.apache.org/licenses/LICENSE-2.0                             #
#                                                                            #
#  Unless required by applicable law or agreed to in writing, software       #
#  distributed under the License is distributed on an "AS IS" BASIS,         #
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
#  See the License for the specific language governing permissions and       #
#  limitations under the License.                                            #
#                                                                            #
##############################################################################
#                                                                            #
#  Developers and authors:                                                   #
#  Shay Gueron (1, 2), and Vlad Krasnov (1)                                  #
#  (1) Intel Corporation, Israel Development Center                          #
#  (2) University of Haifa                                                   #
#  Reference:                                                                #
#  S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography     #
#  with 256 Bit Primes"                                                      #
#                                                                            #
##############################################################################

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$avx = ($ver>=3.0) + ($ver>=3.01);
	$addx = ($ver>=3.03);
}

if ($avx>=2) {{
$digit_size = "\$29";
$n_digits = "\$9";

$code.=<<___;
.text

.align 64
.LAVX2_AND_MASK:
.LAVX2_POLY:
.quad	0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
.quad	0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
.quad	0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
.quad	0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff
.quad	0x00000000, 0x00000000, 0x00000000, 0x00000000
.quad	0x00000000, 0x00000000, 0x00000000, 0x00000000
.quad	0x00040000, 0x00040000, 0x00040000, 0x00040000
.quad	0x1fe00000, 0x1fe00000, 0x1fe00000, 0x1fe00000
.quad	0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff

.LAVX2_POLY_x2:
.quad	0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
.quad	0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
.quad	0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
.quad	0x400007FC, 0x400007FC, 0x400007FC, 0x400007FC
.quad	0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE
.quad	0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE
.quad	0x400FFFFE, 0x400FFFFE, 0x400FFFFE, 0x400FFFFE
.quad	0x7F7FFFFE, 0x7F7FFFFE, 0x7F7FFFFE, 0x7F7FFFFE
.quad	0x03FFFFFC, 0x03FFFFFC, 0x03FFFFFC, 0x03FFFFFC
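
# Field elements are kept in a redundant representation: nine 29-bit digits,
# one digit per 64-bit qword, with four independent values processed side by
# side in the four qword lanes of each ymm vector.  .LAVX2_AND_MASK is the
# per-digit mask 2^29-1 and deliberately aliases the first row of
# .LAVX2_POLY, the P-256 prime in this format.  .LAVX2_POLY_x2 above and
# .LAVX2_POLY_x8 below hold pre-biased multiples of the prime: they are
# added before a digit-wise subtraction (see avx2_sub_x4 and the inlined X3
# computation) so that no digit ever underflows.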

.LAVX2_POLY_x8:
.quad	0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
.quad	0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
.quad	0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
.quad	0x80000FF8, 0x80000FF8, 0x80000FF8, 0x80000FF8
.quad	0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
.quad	0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
.quad	0x801FFFFC, 0x801FFFFC, 0x801FFFFC, 0x801FFFFC
.quad	0xFEFFFFFC, 0xFEFFFFFC, 0xFEFFFFFC, 0xFEFFFFFC
.quad	0x07FFFFF8, 0x07FFFFF8, 0x07FFFFF8, 0x07FFFFF8

.LONE:
.quad	0x00000020, 0x00000020, 0x00000020, 0x00000020
.quad	0x00000000, 0x00000000, 0x00000000, 0x00000000
.quad	0x00000000, 0x00000000, 0x00000000, 0x00000000
.quad	0x1fffc000, 0x1fffc000, 0x1fffc000, 0x1fffc000
.quad	0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
.quad	0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
.quad	0x1f7fffff, 0x1f7fffff, 0x1f7fffff, 0x1f7fffff
.quad	0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff
.quad	0x00000000, 0x00000000, 0x00000000, 0x00000000

# RR = 2^266 mod p in AVX2 format, to transform from the native OpenSSL
# Montgomery form (*2^256) to our format (*2^261)

.LTO_MONT_AVX2:
.quad	0x00000400, 0x00000400, 0x00000400, 0x00000400
.quad	0x00000000, 0x00000000, 0x00000000, 0x00000000
.quad	0x00000000, 0x00000000, 0x00000000, 0x00000000
.quad	0x1ff80000, 0x1ff80000, 0x1ff80000, 0x1ff80000
.quad	0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
.quad	0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
.quad	0x0fffffff, 0x0fffffff, 0x0fffffff, 0x0fffffff
.quad	0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
.quad	0x00000003, 0x00000003, 0x00000003, 0x00000003

.LFROM_MONT_AVX2:
.quad	0x00000001, 0x00000001, 0x00000001, 0x00000001
.quad	0x00000000, 0x00000000, 0x00000000, 0x00000000
.quad	0x00000000, 0x00000000, 0x00000000, 0x00000000
.quad	0x1ffffe00, 0x1ffffe00, 0x1ffffe00, 0x1ffffe00
.quad	0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
.quad	0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
.quad	0x1ffbffff, 0x1ffbffff, 0x1ffbffff, 0x1ffbffff
.quad	0x001fffff, 0x001fffff, 0x001fffff, 0x001fffff
.quad	0x00000000, 0x00000000, 0x00000000, 0x00000000

.LIntOne:
.long	1,1,1,1,1,1,1,1
___

{
# This function receives a pointer to an array of four affine points
# (X, Y, <1>) and rearranges the data for AVX2 execution, while
# converting it to redundant radix-2^29 form.

my ($X0,$X1,$X2,$X3, $Y0,$Y1,$Y2,$Y3,
    $T0,$T1,$T2,$T3, $T4,$T5,$T6,$T7)=map("%ymm$_",(0..15));

$code.=<<___;
.globl	ecp_nistz256_avx2_transpose_convert
.type	ecp_nistz256_avx2_transpose_convert,\@function,2
.align	64
ecp_nistz256_avx2_transpose_convert:
	vzeroupper
___
$code.=<<___	if ($win64);
	mov	%rsp, %rax		# spill slots are addressed off the entry %rsp
	lea	-8-16*10(%rsp), %rsp
	vmovaps	%xmm6, -8-16*10(%rax)
	vmovaps	%xmm7, -8-16*9(%rax)
	vmovaps	%xmm8, -8-16*8(%rax)
	vmovaps	%xmm9, -8-16*7(%rax)
	vmovaps	%xmm10, -8-16*6(%rax)
	vmovaps	%xmm11, -8-16*5(%rax)
	vmovaps	%xmm12, -8-16*4(%rax)
	vmovaps	%xmm13, -8-16*3(%rax)
	vmovaps	%xmm14, -8-16*2(%rax)
	vmovaps	%xmm15, -8-16*1(%rax)
___
$code.=<<___;
	# Load the data
	vmovdqa	32*0(%rsi), $X0
	lea	112(%rsi), %rax		# size optimization
	vmovdqa	32*1(%rsi), $Y0
	lea	.LAVX2_AND_MASK(%rip), %rdx
	vmovdqa	32*2(%rsi), $X1
	vmovdqa	32*3(%rsi), $Y1
	vmovdqa	32*4-112(%rax), $X2
	vmovdqa	32*5-112(%rax), $Y2
	vmovdqa	32*6-112(%rax), $X3
	vmovdqa	32*7-112(%rax), $Y3
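
	# $X0..$X3 now hold the X coordinates of the four points (one point
	# per register) and $Y0..$Y3 the Y coordinates.  The unpack/permute
	# sequence below transposes them so that vector i instead holds
	# qword i of all four points, one point per 64-bit lane, ready for
	# the digit split.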

	# Transpose X and Y independently
	vpunpcklqdq	$X1, $X0, $T0	# T0 = [B2 A2 B0 A0]
	vpunpcklqdq	$X3, $X2, $T1	# T1 = [D2 C2 D0 C0]
	vpunpckhqdq	$X1, $X0, $T2	# T2 = [B3 A3 B1 A1]
	vpunpckhqdq	$X3, $X2, $T3	# T3 = [D3 C3 D1 C1]

	vpunpcklqdq	$Y1, $Y0, $T4
	vpunpcklqdq	$Y3, $Y2, $T5
	vpunpckhqdq	$Y1, $Y0, $T6
	vpunpckhqdq	$Y3, $Y2, $T7

	vperm2i128	\$0x20, $T1, $T0, $X0	# X0 = [D0 C0 B0 A0]
	vperm2i128	\$0x20, $T3, $T2, $X1	# X1 = [D1 C1 B1 A1]
	vperm2i128	\$0x31, $T1, $T0, $X2	# X2 = [D2 C2 B2 A2]
	vperm2i128	\$0x31, $T3, $T2, $X3	# X3 = [D3 C3 B3 A3]

	vperm2i128	\$0x20, $T5, $T4, $Y0
	vperm2i128	\$0x20, $T7, $T6, $Y1
	vperm2i128	\$0x31, $T5, $T4, $Y2
	vperm2i128	\$0x31, $T7, $T6, $Y3
	vmovdqa	(%rdx), $T7

	vpand	(%rdx), $X0, $T0	# out[0] = in[0] & mask;
	vpsrlq	\$29, $X0, $X0
	vpand	$T7, $X0, $T1		# out[1] = (in[0] >> shift) & mask;
	vpsrlq	\$29, $X0, $X0
	vpsllq	\$6, $X1, $T2
	vpxor	$X0, $T2, $T2
	vpand	$T7, $T2, $T2		# out[2] = ((in[0] >> (shift*2)) ^ (in[1] << (64-shift*2))) & mask;
	vpsrlq	\$23, $X1, $X1
	vpand	$T7, $X1, $T3		# out[3] = (in[1] >> ((shift*3)%64)) & mask;
	vpsrlq	\$29, $X1, $X1
	vpsllq	\$12, $X2, $T4
	vpxor	$X1, $T4, $T4
	vpand	$T7, $T4, $T4		# out[4] = ((in[1] >> ((shift*4)%64)) ^ (in[2] << (64*2-shift*4))) & mask;
	vpsrlq	\$17, $X2, $X2
	vpand	$T7, $X2, $T5		# out[5] = (in[2] >> ((shift*5)%64)) & mask;
	vpsrlq	\$29, $X2, $X2
	vpsllq	\$18, $X3, $T6
	vpxor	$X2, $T6, $T6
	vpand	$T7, $T6, $T6		# out[6] = ((in[2] >> ((shift*6)%64)) ^ (in[3] << (64*3-shift*6))) & mask;
	vpsrlq	\$11, $X3, $X3
	vmovdqa	$T0, 32*0(%rdi)
	lea	112(%rdi), %rax		# size optimization
	vpand	$T7, $X3, $T0		# out[7] = (in[3] >> ((shift*7)%64)) & mask;
	vpsrlq	\$29, $X3, $X3		# out[8] = (in[3] >> ((shift*8)%64)) & mask;

	vmovdqa	$T1, 32*1(%rdi)
	vmovdqa	$T2, 32*2(%rdi)
	vmovdqa	$T3, 32*3(%rdi)
	vmovdqa	$T4, 32*4-112(%rax)
	vmovdqa	$T5, 32*5-112(%rax)
	vmovdqa	$T6, 32*6-112(%rax)
	vmovdqa	$T0, 32*7-112(%rax)
	vmovdqa	$X3, 32*8-112(%rax)
	lea	448(%rdi), %rax		# size optimization

	vpand	$T7, $Y0, $T0		# out[0] = in[0] & mask;
	vpsrlq	\$29, $Y0, $Y0
	vpand	$T7, $Y0, $T1		# out[1] = (in[0] >> shift) & mask;
	vpsrlq	\$29, $Y0, $Y0
	vpsllq	\$6, $Y1, $T2
	vpxor	$Y0, $T2, $T2
	vpand	$T7, $T2, $T2		# out[2] = ((in[0] >> (shift*2)) ^ (in[1] << (64-shift*2))) & mask;
	vpsrlq	\$23, $Y1, $Y1
	vpand	$T7, $Y1, $T3		# out[3] = (in[1] >> ((shift*3)%64)) & mask;
	vpsrlq	\$29, $Y1, $Y1
	vpsllq	\$12, $Y2, $T4
	vpxor	$Y1, $T4, $T4
	vpand	$T7, $T4, $T4		# out[4] = ((in[1] >> ((shift*4)%64)) ^ (in[2] << (64*2-shift*4))) & mask;
	vpsrlq	\$17, $Y2, $Y2
	vpand	$T7, $Y2, $T5		# out[5] = (in[2] >> ((shift*5)%64)) & mask;
	vpsrlq	\$29, $Y2, $Y2
	vpsllq	\$18, $Y3, $T6
	vpxor	$Y2, $T6, $T6
	vpand	$T7, $T6, $T6		# out[6] = ((in[2] >> ((shift*6)%64)) ^ (in[3] << (64*3-shift*6))) & mask;
	vpsrlq	\$11, $Y3, $Y3
	vmovdqa	$T0, 32*9-448(%rax)
	vpand	$T7, $Y3, $T0		# out[7] = (in[3] >> ((shift*7)%64)) & mask;
	vpsrlq	\$29, $Y3, $Y3		# out[8] = (in[3] >> ((shift*8)%64)) & mask;

	vmovdqa	$T1, 32*10-448(%rax)
	vmovdqa	$T2, 32*11-448(%rax)
	vmovdqa	$T3, 32*12-448(%rax)
	vmovdqa	$T4, 32*13-448(%rax)
	vmovdqa	$T5, 32*14-448(%rax)
	vmovdqa	$T6, 32*15-448(%rax)
	vmovdqa	$T0, 32*16-448(%rax)
	vmovdqa	$Y3, 32*17-448(%rax)

	vzeroupper
___
$code.=<<___	if ($win64);
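	# restore the callee-saved xmm registers spilled by the prologue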
	movaps	16*0(%rsp), %xmm6
	movaps	16*1(%rsp), %xmm7
	movaps	16*2(%rsp), %xmm8
	movaps	16*3(%rsp), %xmm9
	movaps	16*4(%rsp), %xmm10
	movaps	16*5(%rsp), %xmm11
	movaps	16*6(%rsp), %xmm12
	movaps	16*7(%rsp), %xmm13
	movaps	16*8(%rsp), %xmm14
	movaps	16*9(%rsp), %xmm15
	lea	8+16*10(%rsp), %rsp
___
$code.=<<___;
	ret
.size	ecp_nistz256_avx2_transpose_convert,.-ecp_nistz256_avx2_transpose_convert
___
}
{
################################################################################
# This function receives a pointer to an array of four points in AVX2 format
# (X, Y, Z), converts the data back to the normal representation, and
# rearranges it into four separate points.

my ($D0,$D1,$D2,$D3, $D4,$D5,$D6,$D7, $D8)=map("%ymm$_",(0..8));
my ($T0,$T1,$T2,$T3, $T4,$T5,$T6)=map("%ymm$_",(9..15));

$code.=<<___;

.globl	ecp_nistz256_avx2_convert_transpose_back
.type	ecp_nistz256_avx2_convert_transpose_back,\@function,2
.align	32
ecp_nistz256_avx2_convert_transpose_back:
	vzeroupper
___
$code.=<<___	if ($win64);
	mov	%rsp, %rax		# spill slots are addressed off the entry %rsp
	lea	-8-16*10(%rsp), %rsp
	vmovaps	%xmm6, -8-16*10(%rax)
	vmovaps	%xmm7, -8-16*9(%rax)
	vmovaps	%xmm8, -8-16*8(%rax)
	vmovaps	%xmm9, -8-16*7(%rax)
	vmovaps	%xmm10, -8-16*6(%rax)
	vmovaps	%xmm11, -8-16*5(%rax)
	vmovaps	%xmm12, -8-16*4(%rax)
	vmovaps	%xmm13, -8-16*3(%rax)
	vmovaps	%xmm14, -8-16*2(%rax)
	vmovaps	%xmm15, -8-16*1(%rax)
___
$code.=<<___;
	mov	\$3, %ecx

.Lconv_loop:
	vmovdqa	32*0(%rsi), $D0
	lea	160(%rsi), %rax		# size optimization
	vmovdqa	32*1(%rsi), $D1
	vmovdqa	32*2(%rsi), $D2
	vmovdqa	32*3(%rsi), $D3
	vmovdqa	32*4-160(%rax), $D4
	vmovdqa	32*5-160(%rax), $D5
	vmovdqa	32*6-160(%rax), $D6
	vmovdqa	32*7-160(%rax), $D7
	vmovdqa	32*8-160(%rax), $D8

	vpsllq	\$29, $D1, $D1
	vpsllq	\$58, $D2, $T0
	vpaddq	$D1, $D0, $D0
	vpaddq	$T0, $D0, $D0		# out[0] = (in[0]) ^ (in[1] << shift*1) ^ (in[2] << shift*2);

	vpsrlq	\$6, $D2, $D2
	vpsllq	\$23, $D3, $D3
	vpsllq	\$52, $D4, $T1
	vpaddq	$D2, $D3, $D3
	vpaddq	$D3, $T1, $D1		# out[1] = (in[2] >> (64*1-shift*2)) ^ (in[3] << shift*3%64) ^ (in[4] << shift*4%64);

	vpsrlq	\$12, $D4, $D4
	vpsllq	\$17, $D5, $D5
	vpsllq	\$46, $D6, $T2
	vpaddq	$D4, $D5, $D5
	vpaddq	$D5, $T2, $D2		# out[2] = (in[4] >> (64*2-shift*4)) ^ (in[5] << shift*5%64) ^ (in[6] << shift*6%64);

	vpsrlq	\$18, $D6, $D6
	vpsllq	\$11, $D7, $D7
	vpsllq	\$40, $D8, $T3
	vpaddq	$D6, $D7, $D7
	vpaddq	$D7, $T3, $D3		# out[3] = (in[6] >> (64*3-shift*6)) ^ (in[7] << shift*7%64) ^ (in[8] << shift*8%64);

	vpunpcklqdq	$D1, $D0, $T0	# T0 = [B2 A2 B0 A0]
	vpunpcklqdq	$D3, $D2, $T1	# T1 = [D2 C2 D0 C0]
	vpunpckhqdq	$D1, $D0, $T2	# T2 = [B3 A3 B1 A1]
	vpunpckhqdq	$D3, $D2, $T3	# T3 = [D3 C3 D1 C1]

	vperm2i128	\$0x20, $T1, $T0, $D0	# X0 = [D0 C0 B0 A0]
	vperm2i128	\$0x20, $T3, $T2, $D1	# X1 = [D1 C1 B1 A1]
	vperm2i128	\$0x31, $T1, $T0, $D2	# X2 = [D2 C2 B2 A2]
	vperm2i128	\$0x31, $T3, $T2, $D3	# X3 = [D3 C3 B3 A3]

	vmovdqa	$D0, 32*0(%rdi)
	vmovdqa	$D1, 32*3(%rdi)
	vmovdqa	$D2, 32*6(%rdi)
	vmovdqa	$D3, 32*9(%rdi)

	lea	32*9(%rsi), %rsi
	lea	32*1(%rdi), %rdi

	dec	%ecx
	jnz	.Lconv_loop

	vzeroupper
___
$code.=<<___	if ($win64);
	movaps	16*0(%rsp), %xmm6
	movaps	16*1(%rsp), %xmm7
	movaps	16*2(%rsp), %xmm8
	movaps	16*3(%rsp), %xmm9
	movaps	16*4(%rsp), %xmm10
	movaps	16*5(%rsp), %xmm11
	movaps	16*6(%rsp), %xmm12
	movaps	16*7(%rsp), %xmm13
	movaps	16*8(%rsp), %xmm14
	movaps	16*9(%rsp), %xmm15
	lea	8+16*10(%rsp), %rsp
___
$code.=<<___;
	ret
.size	ecp_nistz256_avx2_convert_transpose_back,.-ecp_nistz256_avx2_convert_transpose_back
___
}
{
my ($r_ptr,$a_ptr,$b_ptr,$itr)=("%rdi","%rsi","%rdx","%ecx");
my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4,$ACC5,$ACC6,$ACC7,$ACC8)=map("%ymm$_",(0..8));
my ($B,$Y,$T0,$AND_MASK,$OVERFLOW)=map("%ymm$_",(9..13));

# One carry-propagation pass: bring each digit back below 2^29, pushing the
# carry into the next digit (the top digit is left unmasked).
sub NORMALIZE {
my $ret=<<___;
	vpsrlq	$digit_size, $ACC0, $T0
	vpand	$AND_MASK, $ACC0, $ACC0
	vpaddq	$T0, $ACC1, $ACC1

	vpsrlq	$digit_size, $ACC1, $T0
	vpand	$AND_MASK, $ACC1, $ACC1
	vpaddq	$T0, $ACC2, $ACC2

	vpsrlq	$digit_size, $ACC2, $T0
	vpand	$AND_MASK, $ACC2, $ACC2
	vpaddq	$T0, $ACC3, $ACC3

	vpsrlq	$digit_size, $ACC3, $T0
	vpand	$AND_MASK, $ACC3, $ACC3
	vpaddq	$T0, $ACC4, $ACC4

	vpsrlq	$digit_size, $ACC4, $T0
	vpand	$AND_MASK, $ACC4, $ACC4
	vpaddq	$T0, $ACC5, $ACC5

	vpsrlq	$digit_size, $ACC5, $T0
	vpand	$AND_MASK, $ACC5, $ACC5
	vpaddq	$T0, $ACC6, $ACC6

	vpsrlq	$digit_size, $ACC6, $T0
	vpand	$AND_MASK, $ACC6, $ACC6
	vpaddq	$T0, $ACC7, $ACC7

	vpsrlq	$digit_size, $ACC7, $T0
	vpand	$AND_MASK, $ACC7, $ACC7
	vpaddq	$T0, $ACC8, $ACC8
	#vpand	$AND_MASK, $ACC8, $ACC8
___
	$ret;
}

# Store $ACC0..$ACC8 to 32*0..32*8(%rdi).
sub STORE {
my $ret=<<___;
	vmovdqa	$ACC0, 32*0(%rdi)
	lea	160(%rdi), %rax		# size optimization
	vmovdqa	$ACC1, 32*1(%rdi)
	vmovdqa	$ACC2, 32*2(%rdi)
	vmovdqa	$ACC3, 32*3(%rdi)
	vmovdqa	$ACC4, 32*4-160(%rax)
	vmovdqa	$ACC5, 32*5-160(%rax)
	vmovdqa	$ACC6, 32*6-160(%rax)
	vmovdqa	$ACC7, 32*7-160(%rax)
	vmovdqa	$ACC8, 32*8-160(%rax)
___
	$ret;
}

$code.=<<___;
.type	avx2_normalize,\@abi-omnipotent
.align	32
avx2_normalize:
	# one carry-propagation pass, see NORMALIZE above
	vpsrlq	$digit_size, $ACC0, $T0
	vpand	$AND_MASK, $ACC0, $ACC0
	vpaddq	$T0, $ACC1, $ACC1

	vpsrlq	$digit_size, $ACC1, $T0
	vpand	$AND_MASK, $ACC1, $ACC1
	vpaddq	$T0, $ACC2, $ACC2

	vpsrlq	$digit_size, $ACC2, $T0
	vpand	$AND_MASK, $ACC2, $ACC2
	vpaddq	$T0, $ACC3, $ACC3

	vpsrlq	$digit_size, $ACC3, $T0
	vpand	$AND_MASK, $ACC3, $ACC3
	vpaddq	$T0, $ACC4, $ACC4

	vpsrlq	$digit_size, $ACC4, $T0
	vpand	$AND_MASK, $ACC4, $ACC4
	vpaddq	$T0, $ACC5, $ACC5

	vpsrlq	$digit_size, $ACC5, $T0
	vpand	$AND_MASK, $ACC5, $ACC5
	vpaddq	$T0, $ACC6, $ACC6

	vpsrlq	$digit_size, $ACC6, $T0
	vpand	$AND_MASK, $ACC6, $ACC6
	vpaddq	$T0, $ACC7, $ACC7

	vpsrlq	$digit_size, $ACC7, $T0
	vpand	$AND_MASK, $ACC7, $ACC7
	vpaddq	$T0, $ACC8, $ACC8
	#vpand	$AND_MASK, $ACC8, $ACC8

	ret
.size	avx2_normalize,.-avx2_normalize

.type	avx2_normalize_n_store,\@abi-omnipotent
.align	32
avx2_normalize_n_store:
	vpsrlq	$digit_size, $ACC0, $T0
	vpand	$AND_MASK, $ACC0, $ACC0
	vpaddq	$T0, $ACC1, $ACC1

	vpsrlq	$digit_size, $ACC1, $T0
	vpand	$AND_MASK, $ACC1, $ACC1
	vmovdqa	$ACC0, 32*0(%rdi)
	lea	160(%rdi), %rax		# size optimization
	vpaddq	$T0, $ACC2, $ACC2

	vpsrlq	$digit_size, $ACC2, $T0
	vpand	$AND_MASK, $ACC2, $ACC2
	vmovdqa	$ACC1, 32*1(%rdi)
	vpaddq	$T0, $ACC3, $ACC3

	vpsrlq	$digit_size, $ACC3, $T0
	vpand	$AND_MASK, $ACC3, $ACC3
	vmovdqa	$ACC2, 32*2(%rdi)
	vpaddq	$T0, $ACC4, $ACC4

	vpsrlq	$digit_size, $ACC4, $T0
	vpand	$AND_MASK, $ACC4, $ACC4
	vmovdqa	$ACC3, 32*3(%rdi)
	vpaddq	$T0, $ACC5, $ACC5

	vpsrlq	$digit_size, $ACC5, $T0
	vpand	$AND_MASK, $ACC5, $ACC5
	vmovdqa	$ACC4, 32*4-160(%rax)
	vpaddq	$T0, $ACC6, $ACC6

	vpsrlq	$digit_size, $ACC6, $T0
	vpand	$AND_MASK, $ACC6, $ACC6
	vmovdqa	$ACC5, 32*5-160(%rax)
	vpaddq	$T0, $ACC7, $ACC7

	vpsrlq	$digit_size, $ACC7, $T0
	vpand	$AND_MASK, $ACC7, $ACC7
	vmovdqa	$ACC6, 32*6-160(%rax)
	vpaddq	$T0, $ACC8, $ACC8
	#vpand	$AND_MASK, $ACC8, $ACC8
	vmovdqa	$ACC7, 32*7-160(%rax)
	vmovdqa	$ACC8, 32*8-160(%rax)

	ret
.size	avx2_normalize_n_store,.-avx2_normalize_n_store

################################################################################
# void avx2_mul_x4(void* RESULTx4, void *Ax4, void *Bx4);
.type	avx2_mul_x4,\@abi-omnipotent
.align	32
avx2_mul_x4:
	lea	.LAVX2_POLY(%rip), %rax

	vpxor	$ACC0, $ACC0, $ACC0
	vpxor	$ACC1, $ACC1, $ACC1
	vpxor	$ACC2, $ACC2, $ACC2
	vpxor	$ACC3, $ACC3, $ACC3
	vpxor	$ACC4, $ACC4, $ACC4
	vpxor	$ACC5, $ACC5, $ACC5
	vpxor	$ACC6, $ACC6, $ACC6
	vpxor	$ACC7, $ACC7, $ACC7

	vmovdqa	32*7(%rax), %ymm14
	vmovdqa	32*8(%rax), %ymm15

	mov	$n_digits, $itr
	lea	-512($a_ptr), $a_ptr	# strategic bias to control u-op density
	jmp	.Lavx2_mul_x4_loop

.align	32
.Lavx2_mul_x4_loop:
	vmovdqa	32*0($b_ptr), $B
	lea	32*1($b_ptr), $b_ptr

	vpmuludq	32*0+512($a_ptr), $B, $T0
	vpmuludq	32*1+512($a_ptr), $B, $OVERFLOW	# borrow $OVERFLOW
	vpaddq	$T0, $ACC0, $ACC0
	vpmuludq	32*2+512($a_ptr), $B, $T0
	vpaddq	$OVERFLOW, $ACC1, $ACC1
	vpand	$AND_MASK, $ACC0, $Y
	vpmuludq	32*3+512($a_ptr), $B, $OVERFLOW
	vpaddq	$T0, $ACC2, $ACC2
	vpmuludq	32*4+512($a_ptr), $B, $T0
	vpaddq	$OVERFLOW, $ACC3, $ACC3
	vpmuludq	32*5+512($a_ptr), $B, $OVERFLOW
	vpaddq	$T0, $ACC4, $ACC4
	vpmuludq	32*6+512($a_ptr), $B, $T0
	vpaddq	$OVERFLOW, $ACC5, $ACC5
	vpmuludq	32*7+512($a_ptr), $B, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC6

	# Skip some multiplications, optimizing for the constant poly
	vpmuludq	$AND_MASK, $Y, $T0
	vpaddq	$OVERFLOW, $ACC7, $ACC7
	vpmuludq	32*8+512($a_ptr), $B, $ACC8
	vpaddq	$T0, $ACC0, $OVERFLOW
	vpaddq	$T0, $ACC1, $ACC0
	vpsrlq	$digit_size, $OVERFLOW, $OVERFLOW
	vpaddq	$T0, $ACC2, $ACC1
	vpmuludq	32*3(%rax), $Y, $T0
	vpaddq	$OVERFLOW, $ACC0, $ACC0
	vpaddq	$T0, $ACC3, $ACC2
	.byte	0x67
	vmovdqa	$ACC4, $ACC3
	vpsllq	\$18, $Y, $OVERFLOW
	.byte	0x67
	vmovdqa	$ACC5, $ACC4
	vpmuludq	%ymm14, $Y, $T0
	vpaddq	$OVERFLOW, $ACC6, $ACC5
	vpmuludq	%ymm15, $Y, $OVERFLOW
	vpaddq	$T0, $ACC7, $ACC6
	vpaddq	$OVERFLOW, $ACC8, $ACC7

	dec	$itr
	jnz	.Lavx2_mul_x4_loop

	vpxor	$ACC8, $ACC8, $ACC8

	ret
.size	avx2_mul_x4,.-avx2_mul_x4

# Function optimized for the constant 1
################################################################################
# void avx2_mul_by1_x4(void* RESULTx4, void *Ax4);
.type	avx2_mul_by1_x4,\@abi-omnipotent
.align	32
avx2_mul_by1_x4:
	lea	.LAVX2_POLY(%rip), %rax

	vpxor	$ACC0, $ACC0, $ACC0
	vpxor	$ACC1, $ACC1, $ACC1
	vpxor	$ACC2, $ACC2, $ACC2
	vpxor	$ACC3, $ACC3, $ACC3
	vpxor	$ACC4, $ACC4, $ACC4
	vpxor	$ACC5, $ACC5, $ACC5
	vpxor	$ACC6, $ACC6, $ACC6
	vpxor	$ACC7, $ACC7, $ACC7
	vpxor	$ACC8, $ACC8, $ACC8

	vmovdqa	32*3+.LONE(%rip), %ymm14
	vmovdqa	32*7+.LONE(%rip), %ymm15

	mov	$n_digits, $itr
	jmp	.Lavx2_mul_by1_x4_loop

.align	32
.Lavx2_mul_by1_x4_loop:
	vmovdqa	32*0($a_ptr), $B
	.byte	0x48,0x8d,0xb6,0x20,0,0,0	# lea 32*1($a_ptr), $a_ptr

	vpsllq	\$5, $B, $OVERFLOW
	vpmuludq	%ymm14, $B, $T0
	vpaddq	$OVERFLOW, $ACC0, $ACC0
	vpaddq	$T0, $ACC3, $ACC3
	.byte	0x67
	vpmuludq	$AND_MASK, $B, $T0
	vpand	$AND_MASK, $ACC0, $Y
	vpaddq	$T0, $ACC4, $ACC4
	vpaddq	$T0, $ACC5, $ACC5
	vpaddq	$T0, $ACC6, $ACC6
	vpsllq	\$23, $B, $T0

	.byte	0x67,0x67
	vpmuludq	%ymm15, $B, $OVERFLOW
	vpsubq	$T0, $ACC6, $ACC6

	vpmuludq	$AND_MASK, $Y, $T0
	vpaddq	$OVERFLOW, $ACC7, $ACC7
	vpaddq	$T0, $ACC0, $OVERFLOW
	vpaddq	$T0, $ACC1, $ACC0
	.byte	0x67,0x67
	vpsrlq	$digit_size, $OVERFLOW, $OVERFLOW
	vpaddq	$T0, $ACC2, $ACC1
	vpmuludq	32*3(%rax), $Y, $T0
	vpaddq	$OVERFLOW, $ACC0, $ACC0
	vpaddq	$T0, $ACC3, $ACC2
	vmovdqa	$ACC4, $ACC3
	vpsllq	\$18, $Y, $OVERFLOW
	vmovdqa	$ACC5, $ACC4
	vpmuludq	32*7(%rax), $Y, $T0
	vpaddq	$OVERFLOW, $ACC6, $ACC5
	vpaddq	$T0, $ACC7, $ACC6
	vpmuludq	32*8(%rax), $Y, $ACC7

	dec	$itr
	jnz	.Lavx2_mul_by1_x4_loop

	ret
.size	avx2_mul_by1_x4,.-avx2_mul_by1_x4

################################################################################
# void avx2_sqr_x4(void* RESULTx4, void *Ax4, void *Bx4);
# The third pointer, passed in %rcx, must point to 32*8 bytes of scratch
# space used for the doubled digits of A.
.type	avx2_sqr_x4,\@abi-omnipotent
.align	32
avx2_sqr_x4:
	lea	.LAVX2_POLY(%rip), %rax

	vmovdqa	32*7(%rax), %ymm14
	vmovdqa	32*8(%rax), %ymm15

	vmovdqa	32*0($a_ptr), $B
	vmovdqa	32*1($a_ptr), $ACC1
	vmovdqa	32*2($a_ptr), $ACC2
	vmovdqa	32*3($a_ptr), $ACC3
	vmovdqa	32*4($a_ptr), $ACC4
	vmovdqa	32*5($a_ptr), $ACC5
	vmovdqa	32*6($a_ptr), $ACC6
	vmovdqa	32*7($a_ptr), $ACC7
	vpaddq	$ACC1, $ACC1, $ACC1	# 2*$ACC0..7
	vmovdqa	32*8($a_ptr), $ACC8
	vpaddq	$ACC2, $ACC2, $ACC2
	vmovdqa	$ACC1, 32*0(%rcx)
	vpaddq	$ACC3, $ACC3, $ACC3
	vmovdqa	$ACC2, 32*1(%rcx)
	vpaddq	$ACC4, $ACC4, $ACC4
	vmovdqa	$ACC3, 32*2(%rcx)
	vpaddq	$ACC5, $ACC5, $ACC5
	vmovdqa	$ACC4, 32*3(%rcx)
	vpaddq	$ACC6, $ACC6, $ACC6
	vmovdqa	$ACC5, 32*4(%rcx)
	vpaddq	$ACC7, $ACC7, $ACC7
	vmovdqa	$ACC6, 32*5(%rcx)
	vpaddq	$ACC8, $ACC8, $ACC8
	vmovdqa	$ACC7, 32*6(%rcx)
	vmovdqa	$ACC8, 32*7(%rcx)

	#itr 1
	vpmuludq	$B, $B, $ACC0
	vpmuludq	$B, $ACC1, $ACC1
	vpand	$AND_MASK, $ACC0, $Y
	vpmuludq	$B, $ACC2, $ACC2
	vpmuludq	$B, $ACC3, $ACC3
	vpmuludq	$B, $ACC4, $ACC4
	vpmuludq	$B, $ACC5, $ACC5
	vpmuludq	$B, $ACC6, $ACC6
	vpmuludq	$AND_MASK, $Y, $T0
	vpmuludq	$B, $ACC7, $ACC7
	vpmuludq	$B, $ACC8, $ACC8
	vmovdqa	32*1($a_ptr), $B

	vpaddq	$T0, $ACC0, $OVERFLOW
	vpaddq	$T0, $ACC1, $ACC0
	vpsrlq	$digit_size, $OVERFLOW, $OVERFLOW
	vpaddq	$T0, $ACC2, $ACC1
	vpmuludq	32*3(%rax), $Y, $T0
	vpaddq	$OVERFLOW, $ACC0, $ACC0
	vpaddq	$T0, $ACC3, $ACC2
	vmovdqa	$ACC4, $ACC3
	vpsllq	\$18, $Y, $T0
	vmovdqa	$ACC5, $ACC4
	vpmuludq	%ymm14, $Y, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC5
	vpmuludq	%ymm15, $Y, $T0
	vpaddq	$OVERFLOW, $ACC7, $ACC6
	vpaddq	$T0, $ACC8, $ACC7

	#itr 2
	vpmuludq	$B, $B, $OVERFLOW
	vpand	$AND_MASK, $ACC0, $Y
	vpmuludq	32*1(%rcx), $B, $T0
	vpaddq	$OVERFLOW, $ACC1, $ACC1
	vpmuludq	32*2(%rcx), $B, $OVERFLOW
	vpaddq	$T0, $ACC2, $ACC2
	vpmuludq	32*3(%rcx), $B, $T0
	vpaddq	$OVERFLOW, $ACC3, $ACC3
	vpmuludq	32*4(%rcx), $B, $OVERFLOW
	vpaddq	$T0, $ACC4, $ACC4
	vpmuludq	32*5(%rcx), $B, $T0
	vpaddq	$OVERFLOW, $ACC5, $ACC5
	vpmuludq	32*6(%rcx), $B, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC6

	vpmuludq	$AND_MASK, $Y, $T0
	vpaddq	$OVERFLOW, $ACC7, $ACC7
	vpmuludq	32*7(%rcx), $B, $ACC8
	vmovdqa	32*2($a_ptr), $B
	vpaddq	$T0, $ACC0, $OVERFLOW
	vpaddq	$T0, $ACC1, $ACC0
	vpsrlq	$digit_size, $OVERFLOW, $OVERFLOW
	vpaddq	$T0, $ACC2, $ACC1
	vpmuludq	32*3(%rax), $Y, $T0
	vpaddq	$OVERFLOW, $ACC0, $ACC0
	vpaddq	$T0, $ACC3, $ACC2
	vmovdqa	$ACC4, $ACC3
	vpsllq	\$18, $Y, $T0
	vmovdqa	$ACC5, $ACC4
	vpmuludq	%ymm14, $Y, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC5
	vpmuludq	%ymm15, $Y, $T0
	vpaddq	$OVERFLOW, $ACC7, $ACC6
	vpaddq	$T0, $ACC8, $ACC7

	#itr 3
	vpmuludq	$B, $B, $T0
	vpand	$AND_MASK, $ACC0, $Y
	vpmuludq	32*2(%rcx), $B, $OVERFLOW
	vpaddq	$T0, $ACC2, $ACC2
	vpmuludq	32*3(%rcx), $B, $T0
	vpaddq	$OVERFLOW, $ACC3, $ACC3
	vpmuludq	32*4(%rcx), $B, $OVERFLOW
	vpaddq	$T0, $ACC4, $ACC4
	vpmuludq	32*5(%rcx), $B, $T0
	vpaddq	$OVERFLOW, $ACC5, $ACC5
	vpmuludq	32*6(%rcx), $B, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC6

	vpmuludq	$AND_MASK, $Y, $T0
	vpaddq	$OVERFLOW, $ACC7, $ACC7
	vpmuludq	32*7(%rcx), $B, $ACC8
	vmovdqa	32*3($a_ptr), $B
	vpaddq	$T0, $ACC0, $OVERFLOW
	vpaddq	$T0, $ACC1, $ACC0
	vpsrlq	$digit_size, $OVERFLOW, $OVERFLOW
	vpaddq	$T0, $ACC2, $ACC1
	vpmuludq	32*3(%rax), $Y, $T0
	vpaddq	$OVERFLOW, $ACC0, $ACC0
	vpaddq	$T0, $ACC3, $ACC2
	vmovdqa	$ACC4, $ACC3
	vpsllq	\$18, $Y, $T0
	vmovdqa	$ACC5, $ACC4
	vpmuludq	%ymm14, $Y, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC5
	vpmuludq	%ymm15, $Y, $T0
	vpand	$AND_MASK, $ACC0, $Y
	vpaddq	$OVERFLOW, $ACC7, $ACC6
	vpaddq	$T0, $ACC8, $ACC7

	#itr 4
	vpmuludq	$B, $B, $OVERFLOW
	vpmuludq	32*3(%rcx), $B, $T0
	vpaddq	$OVERFLOW, $ACC3, $ACC3
	vpmuludq	32*4(%rcx), $B, $OVERFLOW
	vpaddq	$T0, $ACC4, $ACC4
	vpmuludq	32*5(%rcx), $B, $T0
	vpaddq	$OVERFLOW, $ACC5, $ACC5
	vpmuludq	32*6(%rcx), $B, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC6

	vpmuludq	$AND_MASK, $Y, $T0
	vpaddq	$OVERFLOW, $ACC7, $ACC7
	vpmuludq	32*7(%rcx), $B, $ACC8
	vmovdqa	32*4($a_ptr), $B
	vpaddq	$T0, $ACC0, $OVERFLOW
	vpaddq	$T0, $ACC1, $ACC0
	vpsrlq	$digit_size, $OVERFLOW, $OVERFLOW
	vpaddq	$T0, $ACC2, $ACC1
	vpmuludq	32*3(%rax), $Y, $T0
	vpaddq	$OVERFLOW, $ACC0, $ACC0
	vpaddq	$T0, $ACC3, $ACC2
	vmovdqa	$ACC4, $ACC3
	vpsllq	\$18, $Y, $T0
	vmovdqa	$ACC5, $ACC4
	vpmuludq	%ymm14, $Y, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC5
	vpmuludq	%ymm15, $Y, $T0
	vpand	$AND_MASK, $ACC0, $Y
	vpaddq	$OVERFLOW, $ACC7, $ACC6
	vpaddq	$T0, $ACC8, $ACC7

	#itr 5
	vpmuludq	$B, $B, $T0
	vpmuludq	32*4(%rcx), $B, $OVERFLOW
	vpaddq	$T0, $ACC4, $ACC4
	vpmuludq	32*5(%rcx), $B, $T0
	vpaddq	$OVERFLOW, $ACC5, $ACC5
	vpmuludq	32*6(%rcx), $B, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC6

	vpmuludq	$AND_MASK, $Y, $T0
	vpaddq	$OVERFLOW, $ACC7, $ACC7
	vpmuludq	32*7(%rcx), $B, $ACC8
	vmovdqa	32*5($a_ptr), $B
	vpaddq	$T0, $ACC0, $OVERFLOW
	vpsrlq	$digit_size, $OVERFLOW, $OVERFLOW
	vpaddq	$T0, $ACC1, $ACC0
	vpaddq	$T0, $ACC2, $ACC1
	vpmuludq	32*3+.LAVX2_POLY(%rip), $Y, $T0
	vpaddq	$OVERFLOW, $ACC0, $ACC0
	vpaddq	$T0, $ACC3, $ACC2
	vmovdqa	$ACC4, $ACC3
	vpsllq	\$18, $Y, $T0
	vmovdqa	$ACC5, $ACC4
	vpmuludq	%ymm14, $Y, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC5
	vpmuludq	%ymm15, $Y, $T0
	vpand	$AND_MASK, $ACC0, $Y
	vpaddq	$OVERFLOW, $ACC7, $ACC6
	vpaddq	$T0, $ACC8, $ACC7

	#itr 6
	vpmuludq	$B, $B, $OVERFLOW
	vpmuludq	32*5(%rcx), $B, $T0
	vpaddq	$OVERFLOW, $ACC5, $ACC5
	vpmuludq	32*6(%rcx), $B, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC6

	vpmuludq	$AND_MASK, $Y, $T0
	vpaddq	$OVERFLOW, $ACC7, $ACC7
	vpmuludq	32*7(%rcx), $B, $ACC8
	vmovdqa	32*6($a_ptr), $B
	vpaddq	$T0, $ACC0, $OVERFLOW
	vpaddq	$T0, $ACC1, $ACC0
	vpsrlq	$digit_size, $OVERFLOW, $OVERFLOW
	vpaddq	$T0, $ACC2, $ACC1
	vpmuludq	32*3(%rax), $Y, $T0
	vpaddq	$OVERFLOW, $ACC0, $ACC0
	vpaddq	$T0, $ACC3, $ACC2
	vmovdqa	$ACC4, $ACC3
	vpsllq	\$18, $Y, $T0
	vmovdqa	$ACC5, $ACC4
	vpmuludq	%ymm14, $Y, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC5
	vpmuludq	%ymm15, $Y, $T0
	vpand	$AND_MASK, $ACC0, $Y
	vpaddq	$OVERFLOW, $ACC7, $ACC6
	vpaddq	$T0, $ACC8, $ACC7

	#itr 7
	vpmuludq	$B, $B, $T0
	vpmuludq	32*6(%rcx), $B, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC6

	vpmuludq	$AND_MASK, $Y, $T0
	vpaddq	$OVERFLOW, $ACC7, $ACC7
	vpmuludq	32*7(%rcx), $B, $ACC8
	vmovdqa	32*7($a_ptr), $B
	vpaddq	$T0, $ACC0, $OVERFLOW
	vpsrlq	$digit_size, $OVERFLOW, $OVERFLOW
	vpaddq	$T0, $ACC1, $ACC0
	vpaddq	$T0, $ACC2, $ACC1
	vpmuludq	32*3(%rax), $Y, $T0
	vpaddq	$OVERFLOW, $ACC0, $ACC0
	vpaddq	$T0, $ACC3, $ACC2
	vmovdqa	$ACC4, $ACC3
	vpsllq	\$18, $Y, $T0
	vmovdqa	$ACC5, $ACC4
	vpmuludq	%ymm14, $Y, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC5
	vpmuludq	%ymm15, $Y, $T0
	vpand	$AND_MASK, $ACC0, $Y
	vpaddq	$OVERFLOW, $ACC7, $ACC6
	vpaddq	$T0, $ACC8, $ACC7

	#itr 8
	vpmuludq	$B, $B, $OVERFLOW

	vpmuludq	$AND_MASK, $Y, $T0
	vpaddq	$OVERFLOW, $ACC7, $ACC7
	vpmuludq	32*7(%rcx), $B, $ACC8
	vmovdqa	32*8($a_ptr), $B
	vpaddq	$T0, $ACC0, $OVERFLOW
	vpsrlq	$digit_size, $OVERFLOW, $OVERFLOW
	vpaddq	$T0, $ACC1, $ACC0
	vpaddq	$T0, $ACC2, $ACC1
	vpmuludq	32*3(%rax), $Y, $T0
	vpaddq	$OVERFLOW, $ACC0, $ACC0
	vpaddq	$T0, $ACC3, $ACC2
	vmovdqa	$ACC4, $ACC3
	vpsllq	\$18, $Y, $T0
	vmovdqa	$ACC5, $ACC4
	vpmuludq	%ymm14, $Y, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC5
	vpmuludq	%ymm15, $Y, $T0
	vpand	$AND_MASK, $ACC0, $Y
	vpaddq	$OVERFLOW, $ACC7, $ACC6
	vpaddq	$T0, $ACC8, $ACC7

	#itr 9
	vpmuludq	$B, $B, $ACC8

	vpmuludq	$AND_MASK, $Y, $T0
	vpaddq	$T0, $ACC0, $OVERFLOW
	vpsrlq	$digit_size, $OVERFLOW, $OVERFLOW
	vpaddq	$T0, $ACC1, $ACC0
	vpaddq	$T0, $ACC2, $ACC1
	vpmuludq	32*3(%rax), $Y, $T0
	vpaddq	$OVERFLOW, $ACC0, $ACC0
	vpaddq	$T0, $ACC3, $ACC2
	vmovdqa	$ACC4, $ACC3
	vpsllq	\$18, $Y, $T0
	vmovdqa	$ACC5, $ACC4
	vpmuludq	%ymm14, $Y, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC5
	vpmuludq	%ymm15, $Y, $T0
	vpaddq	$OVERFLOW, $ACC7, $ACC6
	vpaddq	$T0, $ACC8, $ACC7

	vpxor	$ACC8, $ACC8, $ACC8

	ret
.size	avx2_sqr_x4,.-avx2_sqr_x4

################################################################################
# void avx2_sub_x4(void* RESULTx4, void *Ax4, void *Bx4);
.type	avx2_sub_x4,\@abi-omnipotent
.align	32
avx2_sub_x4:
	# Compute a + .LAVX2_POLY_x8 - b; the pre-biased table keeps every
	# digit non-negative.
	vmovdqa	32*0($a_ptr), $ACC0
	lea	160($a_ptr), $a_ptr
	lea	.LAVX2_POLY_x8+128(%rip), %rax
	lea	128($b_ptr), $b_ptr
	vmovdqa	32*1-160($a_ptr), $ACC1
	vmovdqa	32*2-160($a_ptr), $ACC2
	vmovdqa	32*3-160($a_ptr), $ACC3
	vmovdqa	32*4-160($a_ptr), $ACC4
	vmovdqa	32*5-160($a_ptr), $ACC5
	vmovdqa	32*6-160($a_ptr), $ACC6
	vmovdqa	32*7-160($a_ptr), $ACC7
	vmovdqa	32*8-160($a_ptr), $ACC8

	vpaddq	32*0-128(%rax), $ACC0, $ACC0
	vpaddq	32*1-128(%rax), $ACC1, $ACC1
	vpaddq	32*2-128(%rax), $ACC2, $ACC2
	vpaddq	32*3-128(%rax), $ACC3, $ACC3
	vpaddq	32*4-128(%rax), $ACC4, $ACC4
	vpaddq	32*5-128(%rax), $ACC5, $ACC5
	vpaddq	32*6-128(%rax), $ACC6, $ACC6
	vpaddq	32*7-128(%rax), $ACC7, $ACC7
	vpaddq	32*8-128(%rax), $ACC8, $ACC8

	vpsubq	32*0-128($b_ptr), $ACC0, $ACC0
	vpsubq	32*1-128($b_ptr), $ACC1, $ACC1
	vpsubq	32*2-128($b_ptr), $ACC2, $ACC2
	vpsubq	32*3-128($b_ptr), $ACC3, $ACC3
	vpsubq	32*4-128($b_ptr), $ACC4, $ACC4
	vpsubq	32*5-128($b_ptr), $ACC5, $ACC5
	vpsubq	32*6-128($b_ptr), $ACC6, $ACC6
	vpsubq	32*7-128($b_ptr), $ACC7, $ACC7
	vpsubq	32*8-128($b_ptr), $ACC8, $ACC8

	ret
.size	avx2_sub_x4,.-avx2_sub_x4

.type	avx2_select_n_store,\@abi-omnipotent
.align	32
avx2_select_n_store:
	# Override the computed result lane-wise, according to the infinity
	# masks on the stack: take the value at (%rsi) where the first input
	# was infinity, and the value at (%rdx) where the second one was.
	vmovdqa	`8+32*9*8`(%rsp), $Y
	vpor	`8+32*9*8+32`(%rsp), $Y, $Y

	vpandn	$ACC0, $Y, $ACC0
	vpandn	$ACC1, $Y, $ACC1
	vpandn	$ACC2, $Y, $ACC2
	vpandn	$ACC3, $Y, $ACC3
	vpandn	$ACC4, $Y, $ACC4
	vpandn	$ACC5, $Y, $ACC5
	vpandn	$ACC6, $Y, $ACC6
	vmovdqa	`8+32*9*8+32`(%rsp), $B
	vpandn	$ACC7, $Y, $ACC7
	vpandn	`8+32*9*8`(%rsp), $B, $B
	vpandn	$ACC8, $Y, $ACC8

	vpand	32*0(%rsi), $B, $T0
	lea	160(%rsi), %rax
	vpand	32*1(%rsi), $B, $Y
	vpxor	$T0, $ACC0, $ACC0
	vpand	32*2(%rsi), $B, $T0
	vpxor	$Y, $ACC1, $ACC1
	vpand	32*3(%rsi), $B, $Y
	vpxor	$T0, $ACC2, $ACC2
	vpand	32*4-160(%rax), $B, $T0
	vpxor	$Y, $ACC3, $ACC3
	vpand	32*5-160(%rax), $B, $Y
	vpxor	$T0, $ACC4, $ACC4
	vpand	32*6-160(%rax), $B, $T0
	vpxor	$Y, $ACC5, $ACC5
	vpand	32*7-160(%rax), $B, $Y
	vpxor	$T0, $ACC6, $ACC6
	vpand	32*8-160(%rax), $B, $T0
	vmovdqa	`8+32*9*8+32`(%rsp), $B
	vpxor	$Y, $ACC7, $ACC7

	vpand	32*0(%rdx), $B, $Y
	lea	160(%rdx), %rax
	vpxor	$T0, $ACC8, $ACC8
	vpand	32*1(%rdx), $B, $T0
	vpxor	$Y, $ACC0, $ACC0
	vpand	32*2(%rdx), $B, $Y
	vpxor	$T0, $ACC1, $ACC1
	vpand	32*3(%rdx), $B, $T0
	vpxor	$Y, $ACC2, $ACC2
	vpand	32*4-160(%rax), $B, $Y
	vpxor	$T0, $ACC3, $ACC3
	vpand	32*5-160(%rax), $B, $T0
	vpxor	$Y, $ACC4, $ACC4
	vpand	32*6-160(%rax), $B, $Y
	vpxor	$T0, $ACC5, $ACC5
	vpand	32*7-160(%rax), $B, $T0
	vpxor	$Y, $ACC6, $ACC6
	vpand	32*8-160(%rax), $B, $Y
	vpxor	$T0, $ACC7, $ACC7
	vpxor	$Y, $ACC8, $ACC8
	`&STORE`

	ret
.size	avx2_select_n_store,.-avx2_select_n_store
___
$code.=<<___ if (0);				# inlined
################################################################################
# void avx2_mul_by2_x4(void* RESULTx4, void *Ax4);
.type	avx2_mul_by2_x4,\@abi-omnipotent
.align	32
avx2_mul_by2_x4:
	vmovdqa	32*0($a_ptr), $ACC0
	lea	160($a_ptr), %rax
	vmovdqa	32*1($a_ptr), $ACC1
	vmovdqa	32*2($a_ptr), $ACC2
	vmovdqa	32*3($a_ptr), $ACC3
	vmovdqa	32*4-160(%rax), $ACC4
	vmovdqa	32*5-160(%rax), $ACC5
	vmovdqa	32*6-160(%rax), $ACC6
	vmovdqa	32*7-160(%rax), $ACC7
	vmovdqa	32*8-160(%rax), $ACC8

	vpaddq	$ACC0, $ACC0, $ACC0
	vpaddq	$ACC1, $ACC1, $ACC1
	vpaddq	$ACC2, $ACC2, $ACC2
	vpaddq	$ACC3, $ACC3, $ACC3
	vpaddq	$ACC4, $ACC4, $ACC4
	vpaddq	$ACC5, $ACC5, $ACC5
	vpaddq	$ACC6, $ACC6, $ACC6
	vpaddq	$ACC7, $ACC7, $ACC7
	vpaddq	$ACC8, $ACC8, $ACC8

	ret
.size	avx2_mul_by2_x4,.-avx2_mul_by2_x4
___
my ($r_ptr_in,$a_ptr_in,$b_ptr_in)=("%rdi","%rsi","%rdx");
my ($r_ptr,$a_ptr,$b_ptr)=("%r8","%r9","%r10");

$code.=<<___;
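# The point-addition routines below perform four independent additions at
# once, one in each 64-bit lane.  The vpor/vpcmpeqq chains at their entry
# derive per-lane all-ones masks for every input whose 18 stored vectors are
# all zero (the encoding of the point at infinity); the two masks live at
# 32*9*8(%rsp) and 32*9*8+32(%rsp) and are consumed by avx2_select_n_store,
# which overrides the computed result lane-wise with the other operand (or
# with .LONE for Z3) whenever one of the inputs was infinity.
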
################################################################################
# void ecp_nistz256_avx2_point_add_affine_x4(void* RESULTx4, void *Ax4, void *Bx4);
.globl	ecp_nistz256_avx2_point_add_affine_x4
.type	ecp_nistz256_avx2_point_add_affine_x4,\@function,3
.align	32
ecp_nistz256_avx2_point_add_affine_x4:
	mov	%rsp, %rax
	push	%rbp
	vzeroupper
___
$code.=<<___	if ($win64);
	lea	-16*10(%rsp), %rsp
	vmovaps	%xmm6, -8-16*10(%rax)
	vmovaps	%xmm7, -8-16*9(%rax)
	vmovaps	%xmm8, -8-16*8(%rax)
	vmovaps	%xmm9, -8-16*7(%rax)
	vmovaps	%xmm10, -8-16*6(%rax)
	vmovaps	%xmm11, -8-16*5(%rax)
	vmovaps	%xmm12, -8-16*4(%rax)
	vmovaps	%xmm13, -8-16*3(%rax)
	vmovaps	%xmm14, -8-16*2(%rax)
	vmovaps	%xmm15, -8-16*1(%rax)
___
$code.=<<___;
	lea	-8(%rax), %rbp

# Result + 32*0 = Result.X
# Result + 32*9 = Result.Y
# Result + 32*18 = Result.Z

# A + 32*0 = A.X
# A + 32*9 = A.Y
# A + 32*18 = A.Z

# B + 32*0 = B.X
# B + 32*9 = B.Y

	sub	\$`32*9*8+32*2+32*8`, %rsp
	and	\$-64, %rsp

	mov	$r_ptr_in, $r_ptr
	mov	$a_ptr_in, $a_ptr
	mov	$b_ptr_in, $b_ptr

	vmovdqa	32*0($a_ptr_in), %ymm0
	vmovdqa	.LAVX2_AND_MASK(%rip), $AND_MASK
	vpxor	%ymm1, %ymm1, %ymm1
	lea	256($a_ptr_in), %rax	# size optimization
	vpor	32*1($a_ptr_in), %ymm0, %ymm0
	vpor	32*2($a_ptr_in), %ymm0, %ymm0
	vpor	32*3($a_ptr_in), %ymm0, %ymm0
	vpor	32*4-256(%rax), %ymm0, %ymm0
	lea	256(%rax), %rcx		# size optimization
	vpor	32*5-256(%rax), %ymm0, %ymm0
	vpor	32*6-256(%rax), %ymm0, %ymm0
	vpor	32*7-256(%rax), %ymm0, %ymm0
	vpor	32*8-256(%rax), %ymm0, %ymm0
	vpor	32*9-256(%rax), %ymm0, %ymm0
	vpor	32*10-256(%rax), %ymm0, %ymm0
	vpor	32*11-256(%rax), %ymm0, %ymm0
	vpor	32*12-512(%rcx), %ymm0, %ymm0
	vpor	32*13-512(%rcx), %ymm0, %ymm0
	vpor	32*14-512(%rcx), %ymm0, %ymm0
	vpor	32*15-512(%rcx), %ymm0, %ymm0
	vpor	32*16-512(%rcx), %ymm0, %ymm0
	vpor	32*17-512(%rcx), %ymm0, %ymm0
	vpcmpeqq	%ymm1, %ymm0, %ymm0
	vmovdqa	%ymm0, `32*9*8`(%rsp)

	vpxor	%ymm1, %ymm1, %ymm1
	vmovdqa	32*0($b_ptr), %ymm0
	lea	256($b_ptr), %rax	# size optimization
	vpor	32*1($b_ptr), %ymm0, %ymm0
	vpor	32*2($b_ptr), %ymm0, %ymm0
	vpor	32*3($b_ptr), %ymm0, %ymm0
	vpor	32*4-256(%rax), %ymm0, %ymm0
	lea	256(%rax), %rcx		# size optimization
	vpor	32*5-256(%rax), %ymm0, %ymm0
	vpor	32*6-256(%rax), %ymm0, %ymm0
	vpor	32*7-256(%rax), %ymm0, %ymm0
	vpor	32*8-256(%rax), %ymm0, %ymm0
	vpor	32*9-256(%rax), %ymm0, %ymm0
	vpor	32*10-256(%rax), %ymm0, %ymm0
	vpor	32*11-256(%rax), %ymm0, %ymm0
	vpor	32*12-512(%rcx), %ymm0, %ymm0
	vpor	32*13-512(%rcx), %ymm0, %ymm0
	vpor	32*14-512(%rcx), %ymm0, %ymm0
	vpor	32*15-512(%rcx), %ymm0, %ymm0
	vpor	32*16-512(%rcx), %ymm0, %ymm0
	vpor	32*17-512(%rcx), %ymm0, %ymm0
	vpcmpeqq	%ymm1, %ymm0, %ymm0
	vmovdqa	%ymm0, `32*9*8+32`(%rsp)

	# Z1^2 = Z1*Z1
	lea	`32*9*2`($a_ptr), %rsi
	lea	`32*9*2`(%rsp), %rdi
	lea	`32*9*8+32*2`(%rsp), %rcx	# temporary vector
	call	avx2_sqr_x4
	call	avx2_normalize_n_store

	# U2 = X2*Z1^2
	lea	`32*9*0`($b_ptr), %rsi
	lea	`32*9*2`(%rsp), %rdx
	lea	`32*9*0`(%rsp), %rdi
	call	avx2_mul_x4
	#call	avx2_normalize
	`&STORE`

	# S2 = Z1*Z1^2 = Z1^3
	lea	`32*9*2`($a_ptr), %rsi
	lea	`32*9*2`(%rsp), %rdx
	lea	`32*9*1`(%rsp), %rdi
	call	avx2_mul_x4
	call	avx2_normalize_n_store

	# S2 = S2*Y2 = Y2*Z1^3
	lea	`32*9*1`($b_ptr), %rsi
	lea	`32*9*1`(%rsp), %rdx
	lea	`32*9*1`(%rsp), %rdi
	call	avx2_mul_x4
	call	avx2_normalize_n_store

	# H = U2 - U1 = U2 - X1
	lea	`32*9*0`(%rsp), %rsi
	lea	`32*9*0`($a_ptr), %rdx
	lea	`32*9*3`(%rsp), %rdi
	call	avx2_sub_x4
	call	avx2_normalize_n_store

	# R = S2 - S1 = S2 - Y1
	lea	`32*9*1`(%rsp), %rsi
	lea	`32*9*1`($a_ptr), %rdx
	lea	`32*9*4`(%rsp), %rdi
	call	avx2_sub_x4
	call	avx2_normalize_n_store

	# Z3 = H*Z1*Z2
	lea	`32*9*3`(%rsp), %rsi
	lea	`32*9*2`($a_ptr), %rdx
	lea	`32*9*2`($r_ptr), %rdi
	call	avx2_mul_x4
	call	avx2_normalize

	lea	.LONE(%rip), %rsi
	lea	`32*9*2`($a_ptr), %rdx
	call	avx2_select_n_store

	# R^2 = R^2
	lea	`32*9*4`(%rsp), %rsi
	lea	`32*9*6`(%rsp), %rdi
	lea	`32*9*8+32*2`(%rsp), %rcx	# temporary vector
	call	avx2_sqr_x4
	call	avx2_normalize_n_store

	# H^2 = H^2
	lea	`32*9*3`(%rsp), %rsi
	lea	`32*9*5`(%rsp), %rdi
	call	avx2_sqr_x4
	call	avx2_normalize_n_store

	# H^3 = H^2*H
	lea	`32*9*3`(%rsp), %rsi
	lea	`32*9*5`(%rsp), %rdx
	lea	`32*9*7`(%rsp), %rdi
	call	avx2_mul_x4
	call	avx2_normalize_n_store

	# U2 = U1*H^2
	lea	`32*9*0`($a_ptr), %rsi
	lea	`32*9*5`(%rsp), %rdx
	lea	`32*9*0`(%rsp), %rdi
	call	avx2_mul_x4
	#call	avx2_normalize
	`&STORE`

	# Hsqr = U2*2
	#lea	32*9*0(%rsp), %rsi
	#lea	32*9*5(%rsp), %rdi
	#call	avx2_mul_by2_x4

	vpaddq	$ACC0, $ACC0, $ACC0	# inlined avx2_mul_by2_x4
	lea	`32*9*5`(%rsp), %rdi
	vpaddq	$ACC1, $ACC1, $ACC1
	vpaddq	$ACC2, $ACC2, $ACC2
	vpaddq	$ACC3, $ACC3, $ACC3
	vpaddq	$ACC4, $ACC4, $ACC4
	vpaddq	$ACC5, $ACC5, $ACC5
	vpaddq	$ACC6, $ACC6, $ACC6
	vpaddq	$ACC7, $ACC7, $ACC7
	vpaddq	$ACC8, $ACC8, $ACC8
	call	avx2_normalize_n_store

	# X3 = R^2 - H^3
	#lea	32*9*6(%rsp), %rsi
	#lea	32*9*7(%rsp), %rdx
	#lea	32*9*5(%rsp), %rcx
	#lea	32*9*0($r_ptr), %rdi
	#call	avx2_sub_x4
	#NORMALIZE
	#STORE

	# X3 = X3 - U2*2
	#lea	32*9*0($r_ptr), %rsi
	#lea	32*9*0($r_ptr), %rdi
	#call	avx2_sub_x4
	#NORMALIZE
	#STORE

	lea	`32*9*6+128`(%rsp), %rsi
	lea	.LAVX2_POLY_x2+128(%rip), %rax
	lea	`32*9*7+128`(%rsp), %rdx
	lea	`32*9*5+128`(%rsp), %rcx
	lea	`32*9*0`($r_ptr), %rdi

	vmovdqa	32*0-128(%rsi), $ACC0
	vmovdqa	32*1-128(%rsi), $ACC1
	vmovdqa	32*2-128(%rsi), $ACC2
	vmovdqa	32*3-128(%rsi), $ACC3
	vmovdqa	32*4-128(%rsi), $ACC4
	vmovdqa	32*5-128(%rsi), $ACC5
	vmovdqa	32*6-128(%rsi), $ACC6
	vmovdqa	32*7-128(%rsi), $ACC7
	vmovdqa	32*8-128(%rsi), $ACC8

	vpaddq	32*0-128(%rax), $ACC0, $ACC0
	vpaddq	32*1-128(%rax), $ACC1, $ACC1
	vpaddq	32*2-128(%rax), $ACC2, $ACC2
	vpaddq	32*3-128(%rax), $ACC3, $ACC3
	vpaddq	32*4-128(%rax), $ACC4, $ACC4
	vpaddq	32*5-128(%rax), $ACC5, $ACC5
	vpaddq	32*6-128(%rax), $ACC6, $ACC6
	vpaddq	32*7-128(%rax), $ACC7, $ACC7
	vpaddq	32*8-128(%rax), $ACC8, $ACC8

	vpsubq	32*0-128(%rdx), $ACC0, $ACC0
	vpsubq	32*1-128(%rdx), $ACC1, $ACC1
	vpsubq	32*2-128(%rdx), $ACC2, $ACC2
	vpsubq	32*3-128(%rdx), $ACC3, $ACC3
	vpsubq	32*4-128(%rdx), $ACC4, $ACC4
	vpsubq	32*5-128(%rdx), $ACC5, $ACC5
	vpsubq	32*6-128(%rdx), $ACC6, $ACC6
	vpsubq	32*7-128(%rdx), $ACC7, $ACC7
	vpsubq	32*8-128(%rdx), $ACC8, $ACC8

	vpsubq	32*0-128(%rcx), $ACC0, $ACC0
	vpsubq	32*1-128(%rcx), $ACC1, $ACC1
	vpsubq	32*2-128(%rcx), $ACC2, $ACC2
	vpsubq	32*3-128(%rcx), $ACC3, $ACC3
	vpsubq	32*4-128(%rcx), $ACC4, $ACC4
	vpsubq	32*5-128(%rcx), $ACC5, $ACC5
	vpsubq	32*6-128(%rcx), $ACC6, $ACC6
	vpsubq	32*7-128(%rcx), $ACC7, $ACC7
	vpsubq	32*8-128(%rcx), $ACC8, $ACC8
	call	avx2_normalize

	lea	32*0($b_ptr), %rsi
	lea	32*0($a_ptr), %rdx
	call	avx2_select_n_store

	# H = U2 - X3
	lea	`32*9*0`(%rsp), %rsi
	lea	`32*9*0`($r_ptr), %rdx
	lea	`32*9*3`(%rsp), %rdi
	call	avx2_sub_x4
	call	avx2_normalize_n_store

	# H = H*R
	lea	`32*9*3`(%rsp), %rsi
	lea	`32*9*4`(%rsp), %rdx
	lea	`32*9*3`(%rsp), %rdi
	call	avx2_mul_x4
	call	avx2_normalize_n_store

	# S2 = S1*H^3
	lea	`32*9*7`(%rsp), %rsi
	lea	`32*9*1`($a_ptr), %rdx
	lea	`32*9*1`(%rsp), %rdi
	call	avx2_mul_x4
	call	avx2_normalize_n_store

	# Y3 = R*(U2 - X3) - S1*H^3
	lea	`32*9*3`(%rsp), %rsi
	lea	`32*9*1`(%rsp), %rdx
	lea	`32*9*1`($r_ptr), %rdi
	call	avx2_sub_x4
	call	avx2_normalize

	lea	32*9($b_ptr), %rsi
	lea	32*9($a_ptr), %rdx
	call	avx2_select_n_store

	#lea	32*9*0($r_ptr), %rsi
	#lea	32*9*0($r_ptr), %rdi
	#call	avx2_mul_by1_x4
	#NORMALIZE
	#STORE

	lea	`32*9*1`($r_ptr), %rsi
	lea	`32*9*1`($r_ptr), %rdi
	call	avx2_mul_by1_x4
	call	avx2_normalize_n_store

	vzeroupper
___
$code.=<<___	if ($win64);
	movaps	-16*10(%rbp), %xmm6	# restore the xmm registers saved by the prologue
	movaps	-16*9(%rbp), %xmm7
	movaps	-16*8(%rbp), %xmm8
	movaps	-16*7(%rbp), %xmm9
	movaps	-16*6(%rbp), %xmm10
	movaps	-16*5(%rbp), %xmm11
	movaps	-16*4(%rbp), %xmm12
	movaps	-16*3(%rbp), %xmm13
	movaps	-16*2(%rbp), %xmm14
	movaps	-16*1(%rbp), %xmm15
___
$code.=<<___;
	mov	%rbp, %rsp
	pop	%rbp
	ret
.size	ecp_nistz256_avx2_point_add_affine_x4,.-ecp_nistz256_avx2_point_add_affine_x4

################################################################################
# void ecp_nistz256_avx2_point_add_affines_x4(void* RESULTx4, void *Ax4, void *Bx4);
.globl	ecp_nistz256_avx2_point_add_affines_x4
.type	ecp_nistz256_avx2_point_add_affines_x4,\@function,3
.align	32
ecp_nistz256_avx2_point_add_affines_x4:
	mov	%rsp, %rax
	push	%rbp
	vzeroupper
___
$code.=<<___	if ($win64);
	lea	-16*10(%rsp), %rsp
	vmovaps	%xmm6, -8-16*10(%rax)
	vmovaps	%xmm7, -8-16*9(%rax)
	vmovaps	%xmm8, -8-16*8(%rax)
	vmovaps	%xmm9, -8-16*7(%rax)
	vmovaps	%xmm10, -8-16*6(%rax)
	vmovaps	%xmm11, -8-16*5(%rax)
	vmovaps	%xmm12, -8-16*4(%rax)
	vmovaps	%xmm13, -8-16*3(%rax)
	vmovaps	%xmm14, -8-16*2(%rax)
	vmovaps	%xmm15, -8-16*1(%rax)
___
$code.=<<___;
	lea	-8(%rax), %rbp

# Result + 32*0 = Result.X
# Result + 32*9 = Result.Y
# Result + 32*18 = Result.Z

# A + 32*0 = A.X
# A + 32*9 = A.Y

# B + 32*0 = B.X
# B + 32*9 = B.Y

	sub	\$`32*9*8+32*2+32*8`, %rsp
	and	\$-64, %rsp

	mov	$r_ptr_in, $r_ptr
	mov	$a_ptr_in, $a_ptr
	mov	$b_ptr_in, $b_ptr

	vmovdqa	32*0($a_ptr_in), %ymm0
	vmovdqa	.LAVX2_AND_MASK(%rip), $AND_MASK
	vpxor	%ymm1, %ymm1, %ymm1
	lea	256($a_ptr_in), %rax	# size optimization
	vpor	32*1($a_ptr_in), %ymm0, %ymm0
	vpor	32*2($a_ptr_in), %ymm0, %ymm0
	vpor	32*3($a_ptr_in), %ymm0, %ymm0
	vpor	32*4-256(%rax), %ymm0, %ymm0
	lea	256(%rax), %rcx		# size optimization
	vpor	32*5-256(%rax), %ymm0, %ymm0
	vpor	32*6-256(%rax), %ymm0, %ymm0
	vpor	32*7-256(%rax), %ymm0, %ymm0
	vpor	32*8-256(%rax), %ymm0, %ymm0
	vpor	32*9-256(%rax), %ymm0, %ymm0
	vpor	32*10-256(%rax), %ymm0, %ymm0
	vpor	32*11-256(%rax), %ymm0, %ymm0
	vpor	32*12-512(%rcx), %ymm0, %ymm0
	vpor	32*13-512(%rcx), %ymm0, %ymm0
	vpor	32*14-512(%rcx), %ymm0, %ymm0
	vpor	32*15-512(%rcx), %ymm0, %ymm0
	vpor	32*16-512(%rcx), %ymm0, %ymm0
	vpor	32*17-512(%rcx), %ymm0, %ymm0
	vpcmpeqq	%ymm1, %ymm0, %ymm0
	vmovdqa	%ymm0, `32*9*8`(%rsp)

	vpxor	%ymm1, %ymm1, %ymm1
	vmovdqa	32*0($b_ptr), %ymm0
	lea	256($b_ptr), %rax	# size optimization
	vpor	32*1($b_ptr), %ymm0, %ymm0
	vpor	32*2($b_ptr), %ymm0, %ymm0
	vpor	32*3($b_ptr), %ymm0, %ymm0
	vpor	32*4-256(%rax), %ymm0, %ymm0
	lea	256(%rax), %rcx		# size optimization
	vpor	32*5-256(%rax), %ymm0, %ymm0
	vpor	32*6-256(%rax), %ymm0, %ymm0
	vpor	32*7-256(%rax), %ymm0, %ymm0
	vpor	32*8-256(%rax), %ymm0, %ymm0
	vpor	32*9-256(%rax), %ymm0, %ymm0
	vpor	32*10-256(%rax), %ymm0, %ymm0
	vpor	32*11-256(%rax), %ymm0, %ymm0
	vpor	32*12-512(%rcx), %ymm0, %ymm0
	vpor	32*13-512(%rcx), %ymm0, %ymm0
	vpor	32*14-512(%rcx), %ymm0, %ymm0
	vpor	32*15-512(%rcx), %ymm0, %ymm0
	vpor	32*16-512(%rcx), %ymm0, %ymm0
	vpor	32*17-512(%rcx), %ymm0, %ymm0
	vpcmpeqq	%ymm1, %ymm0, %ymm0
	vmovdqa	%ymm0, `32*9*8+32`(%rsp)

	# H = U2 - U1 = X2 - X1
	lea	`32*9*0`($b_ptr), %rsi
	lea	`32*9*0`($a_ptr), %rdx
	lea	`32*9*3`(%rsp), %rdi
	call	avx2_sub_x4
	call	avx2_normalize_n_store

	# R = S2 - S1 = Y2 - Y1
	lea	`32*9*1`($b_ptr), %rsi
	lea	`32*9*1`($a_ptr), %rdx
	lea	`32*9*4`(%rsp), %rdi
	call	avx2_sub_x4
	call	avx2_normalize_n_store

	# Z3 = H*Z1*Z2 = H
	lea	`32*9*3`(%rsp), %rsi
	lea	`32*9*2`($r_ptr), %rdi
	call	avx2_mul_by1_x4
	call	avx2_normalize

	vmovdqa	`32*9*8`(%rsp), $B
	vpor	`32*9*8+32`(%rsp), $B, $B

	vpandn	$ACC0, $B, $ACC0
	lea	.LONE+128(%rip), %rax
	vpandn	$ACC1, $B, $ACC1
	vpandn	$ACC2, $B, $ACC2
	vpandn	$ACC3, $B, $ACC3
	vpandn	$ACC4, $B, $ACC4
	vpandn	$ACC5, $B, $ACC5
	vpandn	$ACC6, $B, $ACC6
	vpandn	$ACC7, $B, $ACC7

	vpand	32*0-128(%rax), $B, $T0
	vpandn	$ACC8, $B, $ACC8
	vpand	32*1-128(%rax), $B, $Y
	vpxor	$T0, $ACC0, $ACC0
	vpand	32*2-128(%rax), $B, $T0
	vpxor	$Y, $ACC1, $ACC1
	vpand	32*3-128(%rax), $B, $Y
	vpxor	$T0, $ACC2, $ACC2
	vpand	32*4-128(%rax), $B, $T0
	vpxor	$Y, $ACC3, $ACC3
	vpand	32*5-128(%rax), $B, $Y
	vpxor	$T0, $ACC4, $ACC4
	vpand	32*6-128(%rax), $B, $T0
	vpxor	$Y, $ACC5, $ACC5
	vpand	32*7-128(%rax), $B, $Y
	vpxor	$T0, $ACC6, $ACC6
	vpand	32*8-128(%rax), $B, $T0
	vpxor	$Y, $ACC7, $ACC7
	vpxor	$T0, $ACC8, $ACC8
	`&STORE`

	# R^2 = R^2
	lea	`32*9*4`(%rsp), %rsi
	lea	`32*9*6`(%rsp), %rdi
	lea	`32*9*8+32*2`(%rsp), %rcx	# temporary vector
	call	avx2_sqr_x4
	call	avx2_normalize_n_store

	# H^2 = H^2
	lea	`32*9*3`(%rsp), %rsi
	lea	`32*9*5`(%rsp), %rdi
	call	avx2_sqr_x4
	call	avx2_normalize_n_store

	# H^3 = H^2*H
	lea	`32*9*3`(%rsp), %rsi
	lea	`32*9*5`(%rsp), %rdx
	lea	`32*9*7`(%rsp), %rdi
	call	avx2_mul_x4
	call	avx2_normalize_n_store
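
	# (The H^2 squaring above reuses the scratch area: avx2_sqr_x4 parks
	# the doubled input digits at 32*0..32*7(%rcx), and %rcx still holds
	# the temporary-vector pointer that was set up for the R^2 squaring.)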

	# U2 = U1*H^2
	lea	`32*9*0`($a_ptr), %rsi
	lea	`32*9*5`(%rsp), %rdx
	lea	`32*9*0`(%rsp), %rdi
	call	avx2_mul_x4
	#call	avx2_normalize
	`&STORE`

	# Hsqr = U2*2
	#lea	32*9*0(%rsp), %rsi
	#lea	32*9*5(%rsp), %rdi
	#call	avx2_mul_by2_x4

	vpaddq	$ACC0, $ACC0, $ACC0	# inlined avx2_mul_by2_x4
	lea	`32*9*5`(%rsp), %rdi
	vpaddq	$ACC1, $ACC1, $ACC1
	vpaddq	$ACC2, $ACC2, $ACC2
	vpaddq	$ACC3, $ACC3, $ACC3
	vpaddq	$ACC4, $ACC4, $ACC4
	vpaddq	$ACC5, $ACC5, $ACC5
	vpaddq	$ACC6, $ACC6, $ACC6
	vpaddq	$ACC7, $ACC7, $ACC7
	vpaddq	$ACC8, $ACC8, $ACC8
	call	avx2_normalize_n_store

	# X3 = R^2 - H^3
	#lea	32*9*6(%rsp), %rsi
	#lea	32*9*7(%rsp), %rdx
	#lea	32*9*5(%rsp), %rcx
	#lea	32*9*0($r_ptr), %rdi
	#call	avx2_sub_x4
	#NORMALIZE
	#STORE

	# X3 = X3 - U2*2
	#lea	32*9*0($r_ptr), %rsi
	#lea	32*9*0($r_ptr), %rdi
	#call	avx2_sub_x4
	#NORMALIZE
	#STORE

	lea	`32*9*6+128`(%rsp), %rsi
	lea	.LAVX2_POLY_x2+128(%rip), %rax
	lea	`32*9*7+128`(%rsp), %rdx
	lea	`32*9*5+128`(%rsp), %rcx
	lea	`32*9*0`($r_ptr), %rdi

	vmovdqa	32*0-128(%rsi), $ACC0
	vmovdqa	32*1-128(%rsi), $ACC1
	vmovdqa	32*2-128(%rsi), $ACC2
	vmovdqa	32*3-128(%rsi), $ACC3
	vmovdqa	32*4-128(%rsi), $ACC4
	vmovdqa	32*5-128(%rsi), $ACC5
	vmovdqa	32*6-128(%rsi), $ACC6
	vmovdqa	32*7-128(%rsi), $ACC7
	vmovdqa	32*8-128(%rsi), $ACC8

	vpaddq	32*0-128(%rax), $ACC0, $ACC0
	vpaddq	32*1-128(%rax), $ACC1, $ACC1
	vpaddq	32*2-128(%rax), $ACC2, $ACC2
	vpaddq	32*3-128(%rax), $ACC3, $ACC3
	vpaddq	32*4-128(%rax), $ACC4, $ACC4
	vpaddq	32*5-128(%rax), $ACC5, $ACC5
	vpaddq	32*6-128(%rax), $ACC6, $ACC6
	vpaddq	32*7-128(%rax), $ACC7, $ACC7
	vpaddq	32*8-128(%rax), $ACC8, $ACC8

	vpsubq	32*0-128(%rdx), $ACC0, $ACC0
	vpsubq	32*1-128(%rdx), $ACC1, $ACC1
	vpsubq	32*2-128(%rdx), $ACC2, $ACC2
	vpsubq	32*3-128(%rdx), $ACC3, $ACC3
	vpsubq	32*4-128(%rdx), $ACC4, $ACC4
	vpsubq	32*5-128(%rdx), $ACC5, $ACC5
	vpsubq	32*6-128(%rdx), $ACC6, $ACC6
	vpsubq	32*7-128(%rdx), $ACC7, $ACC7
	vpsubq	32*8-128(%rdx), $ACC8, $ACC8

	vpsubq	32*0-128(%rcx), $ACC0, $ACC0
	vpsubq	32*1-128(%rcx), $ACC1, $ACC1
	vpsubq	32*2-128(%rcx), $ACC2, $ACC2
	vpsubq	32*3-128(%rcx), $ACC3, $ACC3
	vpsubq	32*4-128(%rcx), $ACC4, $ACC4
	vpsubq	32*5-128(%rcx), $ACC5, $ACC5
	vpsubq	32*6-128(%rcx), $ACC6, $ACC6
	vpsubq	32*7-128(%rcx), $ACC7, $ACC7
	vpsubq	32*8-128(%rcx), $ACC8, $ACC8
	call	avx2_normalize

	lea	32*0($b_ptr), %rsi
	lea	32*0($a_ptr), %rdx
	call	avx2_select_n_store

	# H = U2 - X3
	lea	`32*9*0`(%rsp), %rsi
	lea	`32*9*0`($r_ptr), %rdx
	lea	`32*9*3`(%rsp), %rdi
	call	avx2_sub_x4
	call	avx2_normalize_n_store

	# H = H*R
	lea	`32*9*3`(%rsp), %rsi
	lea	`32*9*4`(%rsp), %rdx
	lea	`32*9*3`(%rsp), %rdi
	call	avx2_mul_x4
	call	avx2_normalize_n_store

	# S2 = S1 * H^3
	lea	`32*9*7`(%rsp), %rsi
	lea	`32*9*1`($a_ptr), %rdx
	lea	`32*9*1`(%rsp), %rdi
	call	avx2_mul_x4
	call	avx2_normalize_n_store

	# Y3 = R*(U2 - X3) - S1*H^3
	lea	`32*9*3`(%rsp), %rsi
	lea	`32*9*1`(%rsp), %rdx
	lea	`32*9*1`($r_ptr), %rdi
	call	avx2_sub_x4
	call	avx2_normalize

	lea	32*9($b_ptr), %rsi
	lea	32*9($a_ptr), %rdx
	call	avx2_select_n_store

	#lea	32*9*0($r_ptr), %rsi
	#lea	32*9*0($r_ptr), %rdi
	#call	avx2_mul_by1_x4
	#NORMALIZE
	#STORE

	lea	`32*9*1`($r_ptr), %rsi
	lea	`32*9*1`($r_ptr), %rdi
	call	avx2_mul_by1_x4
	call	avx2_normalize_n_store

	vzeroupper
___
$code.=<<___	if ($win64);
	movaps	-16*10(%rbp), %xmm6	# restore the xmm registers saved by the prologue
	movaps	-16*9(%rbp), %xmm7
	movaps	-16*8(%rbp), %xmm8
	movaps	-16*7(%rbp), %xmm9
	movaps	-16*6(%rbp), %xmm10
	movaps	-16*5(%rbp), %xmm11
	movaps	-16*4(%rbp), %xmm12
	movaps	-16*3(%rbp), %xmm13
	movaps	-16*2(%rbp), %xmm14
	movaps	-16*1(%rbp), %xmm15
___
$code.=<<___;
	mov	%rbp, %rsp
	pop	%rbp
	ret
.size	ecp_nistz256_avx2_point_add_affines_x4,.-ecp_nistz256_avx2_point_add_affines_x4

################################################################################
# void ecp_nistz256_avx2_to_mont(void* RESULTx4, void *Ax4);
.globl	ecp_nistz256_avx2_to_mont
.type	ecp_nistz256_avx2_to_mont,\@function,2
.align	32
ecp_nistz256_avx2_to_mont:
	vzeroupper
___
$code.=<<___	if ($win64);
	mov	%rsp, %rax		# spill slots are addressed off the entry %rsp
	lea	-8-16*10(%rsp), %rsp
	vmovaps	%xmm6, -8-16*10(%rax)
	vmovaps	%xmm7, -8-16*9(%rax)
	vmovaps	%xmm8, -8-16*8(%rax)
	vmovaps	%xmm9, -8-16*7(%rax)
	vmovaps	%xmm10, -8-16*6(%rax)
	vmovaps	%xmm11, -8-16*5(%rax)
	vmovaps	%xmm12, -8-16*4(%rax)
	vmovaps	%xmm13, -8-16*3(%rax)
	vmovaps	%xmm14, -8-16*2(%rax)
	vmovaps	%xmm15, -8-16*1(%rax)
___
$code.=<<___;
	vmovdqa	.LAVX2_AND_MASK(%rip), $AND_MASK
	lea	.LTO_MONT_AVX2(%rip), %rdx
	call	avx2_mul_x4
	call	avx2_normalize_n_store

	vzeroupper
___
$code.=<<___	if ($win64);
	movaps	16*0(%rsp), %xmm6
	movaps	16*1(%rsp), %xmm7
	movaps	16*2(%rsp), %xmm8
	movaps	16*3(%rsp), %xmm9
	movaps	16*4(%rsp), %xmm10
	movaps	16*5(%rsp), %xmm11
	movaps	16*6(%rsp), %xmm12
	movaps	16*7(%rsp), %xmm13
	movaps	16*8(%rsp), %xmm14
	movaps	16*9(%rsp), %xmm15
	lea	8+16*10(%rsp), %rsp
___
$code.=<<___;
	ret
.size	ecp_nistz256_avx2_to_mont,.-ecp_nistz256_avx2_to_mont

################################################################################
# void ecp_nistz256_avx2_from_mont(void* RESULTx4, void *Ax4);
.globl	ecp_nistz256_avx2_from_mont
.type	ecp_nistz256_avx2_from_mont,\@function,2
.align	32
ecp_nistz256_avx2_from_mont:
	vzeroupper
___
$code.=<<___	if ($win64);
	mov	%rsp, %rax		# spill slots are addressed off the entry %rsp
	lea	-8-16*10(%rsp), %rsp
	vmovaps	%xmm6, -8-16*10(%rax)
	vmovaps	%xmm7, -8-16*9(%rax)
	vmovaps	%xmm8, -8-16*8(%rax)
	vmovaps	%xmm9, -8-16*7(%rax)
	vmovaps	%xmm10, -8-16*6(%rax)
	vmovaps	%xmm11, -8-16*5(%rax)
	vmovaps	%xmm12, -8-16*4(%rax)
	vmovaps	%xmm13, -8-16*3(%rax)
	vmovaps	%xmm14, -8-16*2(%rax)
	vmovaps	%xmm15, -8-16*1(%rax)
___
$code.=<<___;
	vmovdqa	.LAVX2_AND_MASK(%rip), $AND_MASK
	lea	.LFROM_MONT_AVX2(%rip), %rdx
	call	avx2_mul_x4
	call	avx2_normalize_n_store

	vzeroupper
___
$code.=<<___	if ($win64);
	movaps	16*0(%rsp), %xmm6
	movaps	16*1(%rsp), %xmm7
	movaps	16*2(%rsp), %xmm8
	movaps	16*3(%rsp), %xmm9
	movaps	16*4(%rsp), %xmm10
	movaps	16*5(%rsp), %xmm11
	movaps	16*6(%rsp), %xmm12
	movaps	16*7(%rsp), %xmm13
	movaps	16*8(%rsp), %xmm14
	movaps	16*9(%rsp), %xmm15
	lea	8+16*10(%rsp), %rsp
___
$code.=<<___;
	ret
.size	ecp_nistz256_avx2_from_mont,.-ecp_nistz256_avx2_from_mont
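
# ecp_nistz256_avx2_to_mont and ecp_nistz256_avx2_from_mont above are plain
# avx2_mul_x4 calls with a fixed multiplicand.  Each avx2_mul_x4 pass folds
# nine 29-bit digits, i.e. divides by 2^261 modulo p, so multiplying by
# .LTO_MONT_AVX2 = 2^266 mod p takes OpenSSL's native Montgomery form
# (*2^256) to this file's form (*2^261); .LFROM_MONT_AVX2 is the constant
# that maps it back.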

################################################################################
# void ecp_nistz256_avx2_set1(void* RESULTx4);
.globl	ecp_nistz256_avx2_set1
.type	ecp_nistz256_avx2_set1,\@function,1
.align	32
ecp_nistz256_avx2_set1:
	lea	.LONE+128(%rip), %rax
	lea	128(%rdi), %rdi
	vzeroupper
	vmovdqa	32*0-128(%rax), %ymm0
	vmovdqa	32*1-128(%rax), %ymm1
	vmovdqa	32*2-128(%rax), %ymm2
	vmovdqa	32*3-128(%rax), %ymm3
	vmovdqa	32*4-128(%rax), %ymm4
	vmovdqa	32*5-128(%rax), %ymm5
	vmovdqa	%ymm0, 32*0-128(%rdi)
	vmovdqa	32*6-128(%rax), %ymm0
	vmovdqa	%ymm1, 32*1-128(%rdi)
	vmovdqa	32*7-128(%rax), %ymm1
	vmovdqa	%ymm2, 32*2-128(%rdi)
	vmovdqa	32*8-128(%rax), %ymm2
	vmovdqa	%ymm3, 32*3-128(%rdi)
	vmovdqa	%ymm4, 32*4-128(%rdi)
	vmovdqa	%ymm5, 32*5-128(%rdi)
	vmovdqa	%ymm0, 32*6-128(%rdi)
	vmovdqa	%ymm1, 32*7-128(%rdi)
	vmovdqa	%ymm2, 32*8-128(%rdi)

	vzeroupper
	ret
.size	ecp_nistz256_avx2_set1,.-ecp_nistz256_avx2_set1
___
}
{
################################################################################
# void ecp_nistz256_avx2_multi_select_w7(void* RESULT, void *in,
#				int index0, int index1, int index2, int index3);
################################################################################

my ($val,$in_t,$index0,$index1,$index2,$index3)=("%rdi","%rsi","%edx","%ecx","%r8d","%r9d");
my ($INDEX0,$INDEX1,$INDEX2,$INDEX3)=map("%ymm$_",(0..3));
my ($R0a,$R0b,$R1a,$R1b,$R2a,$R2b,$R3a,$R3b)=map("%ymm$_",(4..11));
my ($M0,$T0,$T1,$TMP0)=map("%ymm$_",(12..15));

$code.=<<___;
.globl	ecp_nistz256_avx2_multi_select_w7
.type	ecp_nistz256_avx2_multi_select_w7,\@function,6
.align	32
ecp_nistz256_avx2_multi_select_w7:
	vzeroupper
___
$code.=<<___	if ($win64);
	mov	%rsp, %rax		# spill slots are addressed off the entry %rsp
	lea	-8-16*10(%rsp), %rsp
	vmovaps	%xmm6, -8-16*10(%rax)
	vmovaps	%xmm7, -8-16*9(%rax)
	vmovaps	%xmm8, -8-16*8(%rax)
	vmovaps	%xmm9, -8-16*7(%rax)
	vmovaps	%xmm10, -8-16*6(%rax)
	vmovaps	%xmm11, -8-16*5(%rax)
	vmovaps	%xmm12, -8-16*4(%rax)
	vmovaps	%xmm13, -8-16*3(%rax)
	vmovaps	%xmm14, -8-16*2(%rax)
	vmovaps	%xmm15, -8-16*1(%rax)
___
$code.=<<___;
	lea	.LIntOne(%rip), %rax

	vmovd	$index0, %xmm0
	vmovd	$index1, %xmm1
	vmovd	$index2, %xmm2
	vmovd	$index3, %xmm3

	vpxor	$R0a, $R0a, $R0a
	vpxor	$R0b, $R0b, $R0b
	vpxor	$R1a, $R1a, $R1a
	vpxor	$R1b, $R1b, $R1b
	vpxor	$R2a, $R2a, $R2a
	vpxor	$R2b, $R2b, $R2b
	vpxor	$R3a, $R3a, $R3a
	vpxor	$R3b, $R3b, $R3b
	vmovdqa	(%rax), $M0

	vpermd	$INDEX0, $R0a, $INDEX0
	vpermd	$INDEX1, $R0a, $INDEX1
	vpermd	$INDEX2, $R0a, $INDEX2
	vpermd	$INDEX3, $R0a, $INDEX3

	mov	\$64, %ecx
	lea	112($val), $val		# size optimization
	jmp	.Lmulti_select_loop_avx2

# INDEX=0, corresponds to the point at infinity (0,0)
.align	32
.Lmulti_select_loop_avx2:
	vpcmpeqd	$INDEX0, $M0, $TMP0

	vmovdqa	`32*0+32*64*2*0`($in_t), $T0
	vmovdqa	`32*1+32*64*2*0`($in_t), $T1
	vpand	$TMP0, $T0, $T0
	vpand	$TMP0, $T1, $T1
	vpxor	$T0, $R0a, $R0a
	vpxor	$T1, $R0b, $R0b

	vpcmpeqd	$INDEX1, $M0, $TMP0

	vmovdqa	`32*0+32*64*2*1`($in_t), $T0
	vmovdqa	`32*1+32*64*2*1`($in_t), $T1
	vpand	$TMP0, $T0, $T0
	vpand	$TMP0, $T1, $T1
	vpxor	$T0, $R1a, $R1a
	vpxor	$T1, $R1b, $R1b

	vpcmpeqd	$INDEX2, $M0, $TMP0

	vmovdqa	`32*0+32*64*2*2`($in_t), $T0
	vmovdqa	`32*1+32*64*2*2`($in_t), $T1
	vpand	$TMP0, $T0, $T0
	vpand	$TMP0, $T1, $T1
	vpxor	$T0, $R2a, $R2a
	vpxor	$T1, $R2b, $R2b

	vpcmpeqd	$INDEX3, $M0, $TMP0

	vmovdqa	`32*0+32*64*2*3`($in_t), $T0
	vmovdqa	`32*1+32*64*2*3`($in_t), $T1
	vpand	$TMP0, $T0, $T0
	vpand	$TMP0, $T1, $T1
	vpxor	$T0, $R3a, $R3a
	vpxor	$T1, $R3b, $R3b

	vpaddd	(%rax), $M0, $M0	# increment
	lea	32*2($in_t), $in_t

	dec	%ecx
	jnz	.Lmulti_select_loop_avx2

	vmovdqu	$R0a, 32*0-112($val)
	vmovdqu	$R0b, 32*1-112($val)
	vmovdqu	$R1a, 32*2-112($val)
	vmovdqu	$R1b, 32*3-112($val)
	vmovdqu	$R2a, 32*4-112($val)
	vmovdqu	$R2b, 32*5-112($val)
	vmovdqu	$R3a, 32*6-112($val)
	vmovdqu	$R3b, 32*7-112($val)

	vzeroupper
___
$code.=<<___	if ($win64);
	movaps	16*0(%rsp), %xmm6
	movaps	16*1(%rsp), %xmm7
	movaps	16*2(%rsp), %xmm8
	movaps	16*3(%rsp), %xmm9
	movaps	16*4(%rsp), %xmm10
	movaps	16*5(%rsp), %xmm11
	movaps	16*6(%rsp), %xmm12
	movaps	16*7(%rsp), %xmm13
	movaps	16*8(%rsp), %xmm14
	movaps	16*9(%rsp), %xmm15
	lea	8+16*10(%rsp), %rsp
___
$code.=<<___;
	ret
.size	ecp_nistz256_avx2_multi_select_w7,.-ecp_nistz256_avx2_multi_select_w7

.extern	OPENSSL_ia32cap_P
.globl	ecp_nistz_avx2_eligible
.type	ecp_nistz_avx2_eligible,\@abi-omnipotent
.align	32
ecp_nistz_avx2_eligible:
	mov	OPENSSL_ia32cap_P+8(%rip),%eax
	shr	\$5,%eax
	and	\$1,%eax
	ret
.size	ecp_nistz_avx2_eligible,.-ecp_nistz_avx2_eligible
___
}
}} else {{	# assembler is too old
$code.=<<___;
.text

.globl	ecp_nistz256_avx2_transpose_convert
.globl	ecp_nistz256_avx2_convert_transpose_back
.globl	ecp_nistz256_avx2_point_add_affine_x4
.globl	ecp_nistz256_avx2_point_add_affines_x4
.globl	ecp_nistz256_avx2_to_mont
.globl	ecp_nistz256_avx2_from_mont
.globl	ecp_nistz256_avx2_set1
.globl	ecp_nistz256_avx2_multi_select_w7
.type	ecp_nistz256_avx2_multi_select_w7,\@abi-omnipotent
ecp_nistz256_avx2_transpose_convert:
ecp_nistz256_avx2_convert_transpose_back:
ecp_nistz256_avx2_point_add_affine_x4:
ecp_nistz256_avx2_point_add_affines_x4:
ecp_nistz256_avx2_to_mont:
ecp_nistz256_avx2_from_mont:
ecp_nistz256_avx2_set1:
ecp_nistz256_avx2_multi_select_w7:
	.byte	0x0f,0x0b	# ud2
	ret
.size	ecp_nistz256_avx2_multi_select_w7,.-ecp_nistz256_avx2_multi_select_w7

.globl	ecp_nistz_avx2_eligible
.type	ecp_nistz_avx2_eligible,\@abi-omnipotent
ecp_nistz_avx2_eligible:
	xor	%eax,%eax
	ret
.size	ecp_nistz_avx2_eligible,.-ecp_nistz_avx2_eligible
___
}}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	print $_,"\n";
}

close STDOUT;
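
__END__

Editor's note: the digit split performed by ecp_nistz256_avx2_transpose_convert
(and undone by ecp_nistz256_avx2_convert_transpose_back) is the vectorized
form of the scalar conversion sketched below.  This is a reference model
matching the out[]/in[] comments in the code above, not part of the generated
assembler.  It assumes a 64-bit perl (left shifts may wrap, which the masking
makes harmless) and, on the way back, fully carry-propagated digits, so the
additions the assembly uses degenerate to the bitwise ORs shown here.

    # Split four little-endian 64-bit limbs into nine 29-bit digits.
    sub to_redundant29 {
	my @in = @_;			# in[0..3], least significant first
	my $mask = 0x1fffffff;		# 2^29 - 1
	my @out;
	$out[0] =   $in[0]				& $mask;
	$out[1] =  ($in[0] >> 29)			& $mask;
	$out[2] = (($in[0] >> 58) | ($in[1] <<  6))	& $mask;
	$out[3] =  ($in[1] >> 23)			& $mask;
	$out[4] = (($in[1] >> 52) | ($in[2] << 12))	& $mask;
	$out[5] =  ($in[2] >> 17)			& $mask;
	$out[6] = (($in[2] >> 46) | ($in[3] << 18))	& $mask;
	$out[7] =  ($in[3] >> 11)			& $mask;
	$out[8] =   $in[3] >> 40;	# top digit, 24 bits
	return @out;
    }

    # Inverse: reassemble four 64-bit limbs from nine 29-bit digits.
    sub from_redundant29 {
	my @d = @_;
	return ( $d[0] | ($d[1] << 29) | ($d[2] << 58),
		($d[2] >>  6) | ($d[3] << 23) | ($d[4] << 52),
		($d[4] >> 12) | ($d[5] << 17) | ($d[6] << 46),
		($d[6] >> 18) | ($d[7] << 11) | ($d[8] << 40) );
    }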