aesni-sha256-x86_64.pl revision 325335
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# January 2013
#
# This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
# that since AESNI-CBC encrypt exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm allows the processor
# resources to be utilized better, which yields better performance.
# The SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
# the AESNI code is woven into them. As SHA256 dominates execution time,
# stitch performance does not depend on AES key length. Below are
# performance numbers in cycles per processed byte, less is better,
# for standalone AESNI-CBC encrypt, standalone SHA256, and the stitched
# subroutine:
#
#		 AES-128/-192/-256+SHA256	this(**)	gain
# Sandy Bridge	 5.05/6.05/7.05+11.6		13.0		+28%/36%/43%
# Ivy Bridge	 5.05/6.05/7.05+10.3		11.6		+32%/41%/50%
# Haswell	 4.43/5.29/6.19+7.80		8.79		+39%/49%/59%
# Bulldozer	 5.77/6.89/8.00+13.7		13.7		+42%/50%/58%
#
# (*)	there are XOP, AVX1 and AVX2 code paths; Westmere is omitted
#	from the loop because the gain was not estimated to be high
#	enough to justify the effort;
# (**)	these are EVP-free results; results obtained with 'speed
#	-evp aes-256-cbc-hmac-sha256' will vary by a percent or two;

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=12);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

$shaext=$avx;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

$func="aesni_cbc_sha256_enc";
$TABLE="K256";
$SZ=4;
@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
				"%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;

########################################################################
# void aesni_cbc_sha256_enc(const void *inp,
#			void *out,
#			size_t length,
#			const AES_KEY *key,
#			unsigned char *iv,
#			SHA256_CTX *ctx,
#			const void *in0);
($inp, $out, $len, $key, $ivp, $ctx, $in0) =
93("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); 94 95$Tbl="%rbp"; 96 97$_inp="16*$SZ+0*8(%rsp)"; 98$_out="16*$SZ+1*8(%rsp)"; 99$_end="16*$SZ+2*8(%rsp)"; 100$_key="16*$SZ+3*8(%rsp)"; 101$_ivp="16*$SZ+4*8(%rsp)"; 102$_ctx="16*$SZ+5*8(%rsp)"; 103$_in0="16*$SZ+6*8(%rsp)"; 104$_rsp="16*$SZ+7*8(%rsp)"; 105$framesz=16*$SZ+8*8; 106 107$code=<<___; 108.text 109 110.extern OPENSSL_ia32cap_P 111.globl $func 112.type $func,\@abi-omnipotent 113.align 16 114$func: 115___ 116 if ($avx) { 117$code.=<<___; 118 lea OPENSSL_ia32cap_P(%rip),%r11 119 mov \$1,%eax 120 cmp \$0,`$win64?"%rcx":"%rdi"` 121 je .Lprobe 122 mov 0(%r11),%eax 123 mov 4(%r11),%r10 124___ 125$code.=<<___ if ($shaext); 126 bt \$61,%r10 # check for SHA 127 jc ${func}_shaext 128___ 129$code.=<<___; 130 mov %r10,%r11 131 shr \$32,%r11 132 133 test \$`1<<11`,%r10d # check for XOP 134 jnz ${func}_xop 135___ 136$code.=<<___ if ($avx>1); 137 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1 138 cmp \$`1<<8|1<<5|1<<3`,%r11d 139 je ${func}_avx2 140___ 141$code.=<<___; 142 and \$`1<<28`,%r10d # check for AVX 143 jnz ${func}_avx 144 ud2 145___ 146 } 147$code.=<<___; 148 xor %eax,%eax 149 cmp \$0,`$win64?"%rcx":"%rdi"` 150 je .Lprobe 151 ud2 152.Lprobe: 153 ret 154.size $func,.-$func 155 156.align 64 157.type $TABLE,\@object 158$TABLE: 159 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 160 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 161 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 162 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 163 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 164 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 165 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 166 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 167 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 168 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 169 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 170 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 171 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 172 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 173 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 174 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 175 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 176 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 177 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 178 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 179 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 180 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 181 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 182 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 183 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 184 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 185 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 186 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 187 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 188 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 189 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 190 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 191 192 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 193 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 194 .long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1 195 .long 0,0,0,0, 0,0,0,0 196 .asciz "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 197.align 64 198___ 199 200###################################################################### 201# SIMD code paths 202# 203{{{ 204($iv,$inout,$roundkey,$temp, 205 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15)); 206 207$aesni_cbc_idx=0; 
@aesni_cbc_block = (
##	&vmovdqu	($roundkey,"0x00-0x80($inp)");'
##	&vmovdqu	($inout,($inp));
##	&mov		($_inp,$inp);

	'&vpxor		($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x10-0x80($inp)");',

	'&vpxor		($inout,$inout,$iv);',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x20-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x30-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x40-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x50-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x60-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x70-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x80-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x90-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xa0-0x80($inp)");',

	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xb0-0x80($inp)");',

	'&vpand		($iv,$temp,$mask10);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xc0-0x80($inp)");',

	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xd0-0x80($inp)");',

	'&vpand		($temp,$temp,$mask12);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	'&vmovdqu	($roundkey,"0xe0-0x80($inp)");',

	'&vpor		($iv,$iv,$temp);'.
	' &vaesenclast	($temp,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x00-0x80($inp)");'

##	&mov		($inp,$_inp);
##	&mov		($out,$_out);
##	&vpand		($temp,$temp,$mask14);
##	&vpor		($iv,$iv,$temp);
##	&vmovdqu	($iv,($out,$inp);
##	&lea		(inp,16($inp));
);

my $a4=$T1;
my ($a,$b,$c,$d,$e,$f,$g,$h);

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
  $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
	'&mov	($a,$a1)',
	'&mov	($a4,$f)',

	'&xor	($a0,$e)',
	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
	'&xor	($a4,$g)',			# f^g

	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
	'&xor	($a1,$a)',
	'&and	($a4,$e)',			# (f^g)&e

	@aesni_cbc_block[$aesni_cbc_idx++].
	'&xor	($a0,$e)',
	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
	'&mov	($a2,$a)',

	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
	'&xor	($a2,$b)',			# a^b, b^c in next round

	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
	'&add	($h,$a4)',			# h+=Ch(e,f,g)
	'&and	($a3,$a2)',			# (b^c)&(a^b)

	'&xor	($a1,$a)',
	'&add	($h,$a0)',			# h+=Sigma1(e)
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)

	'&add	($d,$h)',			# d+=h
	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
	'&add	($h,$a3)',			# h+=Maj(a,b,c)

	'&mov	($a0,$d)',
	'&add	($a1,$h);'.
# h+=Sigma0(a) 322 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' 323 ); 324} 325 326if ($avx) {{ 327###################################################################### 328# XOP code path 329# 330$code.=<<___; 331.type ${func}_xop,\@function,6 332.align 64 333${func}_xop: 334.Lxop_shortcut: 335 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter 336 push %rbx 337 push %rbp 338 push %r12 339 push %r13 340 push %r14 341 push %r15 342 mov %rsp,%r11 # copy %rsp 343 sub \$`$framesz+$win64*16*10`,%rsp 344 and \$-64,%rsp # align stack frame 345 346 shl \$6,$len 347 sub $inp,$out # re-bias 348 sub $inp,$in0 349 add $inp,$len # end of input 350 351 #mov $inp,$_inp # saved later 352 mov $out,$_out 353 mov $len,$_end 354 #mov $key,$_key # remains resident in $inp register 355 mov $ivp,$_ivp 356 mov $ctx,$_ctx 357 mov $in0,$_in0 358 mov %r11,$_rsp 359___ 360$code.=<<___ if ($win64); 361 movaps %xmm6,`$framesz+16*0`(%rsp) 362 movaps %xmm7,`$framesz+16*1`(%rsp) 363 movaps %xmm8,`$framesz+16*2`(%rsp) 364 movaps %xmm9,`$framesz+16*3`(%rsp) 365 movaps %xmm10,`$framesz+16*4`(%rsp) 366 movaps %xmm11,`$framesz+16*5`(%rsp) 367 movaps %xmm12,`$framesz+16*6`(%rsp) 368 movaps %xmm13,`$framesz+16*7`(%rsp) 369 movaps %xmm14,`$framesz+16*8`(%rsp) 370 movaps %xmm15,`$framesz+16*9`(%rsp) 371___ 372$code.=<<___; 373.Lprologue_xop: 374 vzeroall 375 376 mov $inp,%r12 # borrow $a4 377 lea 0x80($key),$inp # size optimization, reassign 378 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0 379 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1 380 mov $ctx,%r15 # borrow $a2 381 mov $in0,%rsi # borrow $a3 382 vmovdqu ($ivp),$iv # load IV 383 sub \$9,%r14 384 385 mov $SZ*0(%r15),$A 386 mov $SZ*1(%r15),$B 387 mov $SZ*2(%r15),$C 388 mov $SZ*3(%r15),$D 389 mov $SZ*4(%r15),$E 390 mov $SZ*5(%r15),$F 391 mov $SZ*6(%r15),$G 392 mov $SZ*7(%r15),$H 393 394 vmovdqa 0x00(%r13,%r14,8),$mask14 395 vmovdqa 0x10(%r13,%r14,8),$mask12 396 vmovdqa 0x20(%r13,%r14,8),$mask10 397 vmovdqu 0x00-0x80($inp),$roundkey 398 jmp .Lloop_xop 399___ 400 if ($SZ==4) { # SHA256 401 my @X = map("%xmm$_",(0..3)); 402 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); 403 404$code.=<<___; 405.align 16 406.Lloop_xop: 407 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 408 vmovdqu 0x00(%rsi,%r12),@X[0] 409 vmovdqu 0x10(%rsi,%r12),@X[1] 410 vmovdqu 0x20(%rsi,%r12),@X[2] 411 vmovdqu 0x30(%rsi,%r12),@X[3] 412 vpshufb $t3,@X[0],@X[0] 413 lea $TABLE(%rip),$Tbl 414 vpshufb $t3,@X[1],@X[1] 415 vpshufb $t3,@X[2],@X[2] 416 vpaddd 0x00($Tbl),@X[0],$t0 417 vpshufb $t3,@X[3],@X[3] 418 vpaddd 0x20($Tbl),@X[1],$t1 419 vpaddd 0x40($Tbl),@X[2],$t2 420 vpaddd 0x60($Tbl),@X[3],$t3 421 vmovdqa $t0,0x00(%rsp) 422 mov $A,$a1 423 vmovdqa $t1,0x10(%rsp) 424 mov $B,$a3 425 vmovdqa $t2,0x20(%rsp) 426 xor $C,$a3 # magic 427 vmovdqa $t3,0x30(%rsp) 428 mov $E,$a0 429 jmp .Lxop_00_47 430 431.align 16 432.Lxop_00_47: 433 sub \$-16*2*$SZ,$Tbl # size optimization 434 vmovdqu (%r12),$inout # $a4 435 mov %r12,$_inp # $a4 436___ 437sub XOP_256_00_47 () { 438my $j = shift; 439my $body = shift; 440my @X = @_; 441my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 442 443 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4] 444 eval(shift(@insns)); 445 eval(shift(@insns)); 446 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12] 447 eval(shift(@insns)); 448 eval(shift(@insns)); 449 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]); 450 eval(shift(@insns)); 451 eval(shift(@insns)); 452 &vpsrld ($t0,$t0,$sigma0[2]); 453 eval(shift(@insns)); 454 eval(shift(@insns)); 455 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12] 456 
eval(shift(@insns)); 457 eval(shift(@insns)); 458 eval(shift(@insns)); 459 eval(shift(@insns)); 460 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]); 461 eval(shift(@insns)); 462 eval(shift(@insns)); 463 &vpxor ($t0,$t0,$t1); 464 eval(shift(@insns)); 465 eval(shift(@insns)); 466 eval(shift(@insns)); 467 eval(shift(@insns)); 468 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]); 469 eval(shift(@insns)); 470 eval(shift(@insns)); 471 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4]) 472 eval(shift(@insns)); 473 eval(shift(@insns)); 474 &vpsrld ($t2,@X[3],$sigma1[2]); 475 eval(shift(@insns)); 476 eval(shift(@insns)); 477 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4]) 478 eval(shift(@insns)); 479 eval(shift(@insns)); 480 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); 481 eval(shift(@insns)); 482 eval(shift(@insns)); 483 &vpxor ($t3,$t3,$t2); 484 eval(shift(@insns)); 485 eval(shift(@insns)); 486 eval(shift(@insns)); 487 eval(shift(@insns)); 488 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) 489 eval(shift(@insns)); 490 eval(shift(@insns)); 491 eval(shift(@insns)); 492 eval(shift(@insns)); 493 &vpsrldq ($t3,$t3,8); 494 eval(shift(@insns)); 495 eval(shift(@insns)); 496 eval(shift(@insns)); 497 eval(shift(@insns)); 498 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 499 eval(shift(@insns)); 500 eval(shift(@insns)); 501 eval(shift(@insns)); 502 eval(shift(@insns)); 503 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]); 504 eval(shift(@insns)); 505 eval(shift(@insns)); 506 &vpsrld ($t2,@X[0],$sigma1[2]); 507 eval(shift(@insns)); 508 eval(shift(@insns)); 509 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); 510 eval(shift(@insns)); 511 eval(shift(@insns)); 512 &vpxor ($t3,$t3,$t2); 513 eval(shift(@insns)); 514 eval(shift(@insns)); 515 eval(shift(@insns)); 516 eval(shift(@insns)); 517 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17]) 518 eval(shift(@insns)); 519 eval(shift(@insns)); 520 eval(shift(@insns)); 521 eval(shift(@insns)); 522 &vpslldq ($t3,$t3,8); # 22 instructions 523 eval(shift(@insns)); 524 eval(shift(@insns)); 525 eval(shift(@insns)); 526 eval(shift(@insns)); 527 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17]) 528 eval(shift(@insns)); 529 eval(shift(@insns)); 530 eval(shift(@insns)); 531 eval(shift(@insns)); 532 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 533 foreach (@insns) { eval; } # remaining instructions 534 &vmovdqa (16*$j."(%rsp)",$t2); 535} 536 537 $aesni_cbc_idx=0; 538 for ($i=0,$j=0; $j<4; $j++) { 539 &XOP_256_00_47($j,\&body_00_15,@X); 540 push(@X,shift(@X)); # rotate(@X) 541 } 542 &mov ("%r12",$_inp); # borrow $a4 543 &vpand ($temp,$temp,$mask14); 544 &mov ("%r15",$_out); # borrow $a2 545 &vpor ($iv,$iv,$temp); 546 &vmovdqu ("(%r15,%r12)",$iv); # write output 547 &lea ("%r12","16(%r12)"); # inp++ 548 549 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 550 &jne (".Lxop_00_47"); 551 552 &vmovdqu ($inout,"(%r12)"); 553 &mov ($_inp,"%r12"); 554 555 $aesni_cbc_idx=0; 556 for ($i=0; $i<16; ) { 557 foreach(body_00_15()) { eval; } 558 } 559 } 560$code.=<<___; 561 mov $_inp,%r12 # borrow $a4 562 mov $_out,%r13 # borrow $a0 563 mov $_ctx,%r15 # borrow $a2 564 mov $_in0,%rsi # borrow $a3 565 566 vpand $mask14,$temp,$temp 567 mov $a1,$A 568 vpor $temp,$iv,$iv 569 vmovdqu $iv,(%r13,%r12) # write output 570 lea 16(%r12),%r12 # inp++ 571 572 add $SZ*0(%r15),$A 573 add $SZ*1(%r15),$B 574 add $SZ*2(%r15),$C 575 add $SZ*3(%r15),$D 576 add $SZ*4(%r15),$E 577 add $SZ*5(%r15),$F 578 add $SZ*6(%r15),$G 579 add $SZ*7(%r15),$H 580 581 cmp $_end,%r12 582 583 mov $A,$SZ*0(%r15) 584 mov $B,$SZ*1(%r15) 585 mov $C,$SZ*2(%r15) 586 mov $D,$SZ*3(%r15) 587 mov 
$E,$SZ*4(%r15) 588 mov $F,$SZ*5(%r15) 589 mov $G,$SZ*6(%r15) 590 mov $H,$SZ*7(%r15) 591 592 jb .Lloop_xop 593 594 mov $_ivp,$ivp 595 mov $_rsp,%rsi 596 vmovdqu $iv,($ivp) # output IV 597 vzeroall 598___ 599$code.=<<___ if ($win64); 600 movaps `$framesz+16*0`(%rsp),%xmm6 601 movaps `$framesz+16*1`(%rsp),%xmm7 602 movaps `$framesz+16*2`(%rsp),%xmm8 603 movaps `$framesz+16*3`(%rsp),%xmm9 604 movaps `$framesz+16*4`(%rsp),%xmm10 605 movaps `$framesz+16*5`(%rsp),%xmm11 606 movaps `$framesz+16*6`(%rsp),%xmm12 607 movaps `$framesz+16*7`(%rsp),%xmm13 608 movaps `$framesz+16*8`(%rsp),%xmm14 609 movaps `$framesz+16*9`(%rsp),%xmm15 610___ 611$code.=<<___; 612 mov (%rsi),%r15 613 mov 8(%rsi),%r14 614 mov 16(%rsi),%r13 615 mov 24(%rsi),%r12 616 mov 32(%rsi),%rbp 617 mov 40(%rsi),%rbx 618 lea 48(%rsi),%rsp 619.Lepilogue_xop: 620 ret 621.size ${func}_xop,.-${func}_xop 622___ 623###################################################################### 624# AVX+shrd code path 625# 626local *ror = sub { &shrd(@_[0],@_) }; 627 628$code.=<<___; 629.type ${func}_avx,\@function,6 630.align 64 631${func}_avx: 632.Lavx_shortcut: 633 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter 634 push %rbx 635 push %rbp 636 push %r12 637 push %r13 638 push %r14 639 push %r15 640 mov %rsp,%r11 # copy %rsp 641 sub \$`$framesz+$win64*16*10`,%rsp 642 and \$-64,%rsp # align stack frame 643 644 shl \$6,$len 645 sub $inp,$out # re-bias 646 sub $inp,$in0 647 add $inp,$len # end of input 648 649 #mov $inp,$_inp # saved later 650 mov $out,$_out 651 mov $len,$_end 652 #mov $key,$_key # remains resident in $inp register 653 mov $ivp,$_ivp 654 mov $ctx,$_ctx 655 mov $in0,$_in0 656 mov %r11,$_rsp 657___ 658$code.=<<___ if ($win64); 659 movaps %xmm6,`$framesz+16*0`(%rsp) 660 movaps %xmm7,`$framesz+16*1`(%rsp) 661 movaps %xmm8,`$framesz+16*2`(%rsp) 662 movaps %xmm9,`$framesz+16*3`(%rsp) 663 movaps %xmm10,`$framesz+16*4`(%rsp) 664 movaps %xmm11,`$framesz+16*5`(%rsp) 665 movaps %xmm12,`$framesz+16*6`(%rsp) 666 movaps %xmm13,`$framesz+16*7`(%rsp) 667 movaps %xmm14,`$framesz+16*8`(%rsp) 668 movaps %xmm15,`$framesz+16*9`(%rsp) 669___ 670$code.=<<___; 671.Lprologue_avx: 672 vzeroall 673 674 mov $inp,%r12 # borrow $a4 675 lea 0x80($key),$inp # size optimization, reassign 676 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0 677 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1 678 mov $ctx,%r15 # borrow $a2 679 mov $in0,%rsi # borrow $a3 680 vmovdqu ($ivp),$iv # load IV 681 sub \$9,%r14 682 683 mov $SZ*0(%r15),$A 684 mov $SZ*1(%r15),$B 685 mov $SZ*2(%r15),$C 686 mov $SZ*3(%r15),$D 687 mov $SZ*4(%r15),$E 688 mov $SZ*5(%r15),$F 689 mov $SZ*6(%r15),$G 690 mov $SZ*7(%r15),$H 691 692 vmovdqa 0x00(%r13,%r14,8),$mask14 693 vmovdqa 0x10(%r13,%r14,8),$mask12 694 vmovdqa 0x20(%r13,%r14,8),$mask10 695 vmovdqu 0x00-0x80($inp),$roundkey 696___ 697 if ($SZ==4) { # SHA256 698 my @X = map("%xmm$_",(0..3)); 699 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); 700 701$code.=<<___; 702 jmp .Lloop_avx 703.align 16 704.Lloop_avx: 705 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 706 vmovdqu 0x00(%rsi,%r12),@X[0] 707 vmovdqu 0x10(%rsi,%r12),@X[1] 708 vmovdqu 0x20(%rsi,%r12),@X[2] 709 vmovdqu 0x30(%rsi,%r12),@X[3] 710 vpshufb $t3,@X[0],@X[0] 711 lea $TABLE(%rip),$Tbl 712 vpshufb $t3,@X[1],@X[1] 713 vpshufb $t3,@X[2],@X[2] 714 vpaddd 0x00($Tbl),@X[0],$t0 715 vpshufb $t3,@X[3],@X[3] 716 vpaddd 0x20($Tbl),@X[1],$t1 717 vpaddd 0x40($Tbl),@X[2],$t2 718 vpaddd 0x60($Tbl),@X[3],$t3 719 vmovdqa $t0,0x00(%rsp) 720 mov $A,$a1 721 vmovdqa $t1,0x10(%rsp) 722 mov $B,$a3 723 vmovdqa 
$t2,0x20(%rsp) 724 xor $C,$a3 # magic 725 vmovdqa $t3,0x30(%rsp) 726 mov $E,$a0 727 jmp .Lavx_00_47 728 729.align 16 730.Lavx_00_47: 731 sub \$-16*2*$SZ,$Tbl # size optimization 732 vmovdqu (%r12),$inout # $a4 733 mov %r12,$_inp # $a4 734___ 735sub Xupdate_256_AVX () { 736 ( 737 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4] 738 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12] 739 '&vpsrld ($t2,$t0,$sigma0[0]);', 740 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12] 741 '&vpsrld ($t3,$t0,$sigma0[2])', 742 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);', 743 '&vpxor ($t0,$t3,$t2)', 744 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] 745 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);', 746 '&vpxor ($t0,$t0,$t1)', 747 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);', 748 '&vpxor ($t0,$t0,$t2)', 749 '&vpsrld ($t2,$t3,$sigma1[2]);', 750 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4]) 751 '&vpsrlq ($t3,$t3,$sigma1[0]);', 752 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4]) 753 '&vpxor ($t2,$t2,$t3);', 754 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 755 '&vpxor ($t2,$t2,$t3)', # sigma1(X[14..15]) 756 '&vpshufd ($t2,$t2,0b10000100)', 757 '&vpsrldq ($t2,$t2,8)', 758 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15]) 759 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] 760 '&vpsrld ($t2,$t3,$sigma1[2])', 761 '&vpsrlq ($t3,$t3,$sigma1[0])', 762 '&vpxor ($t2,$t2,$t3);', 763 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 764 '&vpxor ($t2,$t2,$t3)', 765 '&vpshufd ($t2,$t2,0b11101000)', 766 '&vpslldq ($t2,$t2,8)', 767 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17]) 768 ); 769} 770 771sub AVX_256_00_47 () { 772my $j = shift; 773my $body = shift; 774my @X = @_; 775my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 776 777 foreach (Xupdate_256_AVX()) { # 29 instructions 778 eval; 779 eval(shift(@insns)); 780 eval(shift(@insns)); 781 eval(shift(@insns)); 782 } 783 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 784 foreach (@insns) { eval; } # remaining instructions 785 &vmovdqa (16*$j."(%rsp)",$t2); 786} 787 788 $aesni_cbc_idx=0; 789 for ($i=0,$j=0; $j<4; $j++) { 790 &AVX_256_00_47($j,\&body_00_15,@X); 791 push(@X,shift(@X)); # rotate(@X) 792 } 793 &mov ("%r12",$_inp); # borrow $a4 794 &vpand ($temp,$temp,$mask14); 795 &mov ("%r15",$_out); # borrow $a2 796 &vpor ($iv,$iv,$temp); 797 &vmovdqu ("(%r15,%r12)",$iv); # write output 798 &lea ("%r12","16(%r12)"); # inp++ 799 800 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 801 &jne (".Lavx_00_47"); 802 803 &vmovdqu ($inout,"(%r12)"); 804 &mov ($_inp,"%r12"); 805 806 $aesni_cbc_idx=0; 807 for ($i=0; $i<16; ) { 808 foreach(body_00_15()) { eval; } 809 } 810 811 } 812$code.=<<___; 813 mov $_inp,%r12 # borrow $a4 814 mov $_out,%r13 # borrow $a0 815 mov $_ctx,%r15 # borrow $a2 816 mov $_in0,%rsi # borrow $a3 817 818 vpand $mask14,$temp,$temp 819 mov $a1,$A 820 vpor $temp,$iv,$iv 821 vmovdqu $iv,(%r13,%r12) # write output 822 lea 16(%r12),%r12 # inp++ 823 824 add $SZ*0(%r15),$A 825 add $SZ*1(%r15),$B 826 add $SZ*2(%r15),$C 827 add $SZ*3(%r15),$D 828 add $SZ*4(%r15),$E 829 add $SZ*5(%r15),$F 830 add $SZ*6(%r15),$G 831 add $SZ*7(%r15),$H 832 833 cmp $_end,%r12 834 835 mov $A,$SZ*0(%r15) 836 mov $B,$SZ*1(%r15) 837 mov $C,$SZ*2(%r15) 838 mov $D,$SZ*3(%r15) 839 mov $E,$SZ*4(%r15) 840 mov $F,$SZ*5(%r15) 841 mov $G,$SZ*6(%r15) 842 mov $H,$SZ*7(%r15) 843 jb .Lloop_avx 844 845 mov $_ivp,$ivp 846 mov $_rsp,%rsi 847 vmovdqu $iv,($ivp) # output IV 848 vzeroall 849___ 850$code.=<<___ if ($win64); 851 movaps `$framesz+16*0`(%rsp),%xmm6 852 movaps `$framesz+16*1`(%rsp),%xmm7 853 movaps 
`$framesz+16*2`(%rsp),%xmm8 854 movaps `$framesz+16*3`(%rsp),%xmm9 855 movaps `$framesz+16*4`(%rsp),%xmm10 856 movaps `$framesz+16*5`(%rsp),%xmm11 857 movaps `$framesz+16*6`(%rsp),%xmm12 858 movaps `$framesz+16*7`(%rsp),%xmm13 859 movaps `$framesz+16*8`(%rsp),%xmm14 860 movaps `$framesz+16*9`(%rsp),%xmm15 861___ 862$code.=<<___; 863 mov (%rsi),%r15 864 mov 8(%rsi),%r14 865 mov 16(%rsi),%r13 866 mov 24(%rsi),%r12 867 mov 32(%rsi),%rbp 868 mov 40(%rsi),%rbx 869 lea 48(%rsi),%rsp 870.Lepilogue_avx: 871 ret 872.size ${func}_avx,.-${func}_avx 873___ 874 875if ($avx>1) {{ 876###################################################################### 877# AVX2+BMI code path 878# 879my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp 880my $PUSH8=8*2*$SZ; 881use integer; 882 883sub bodyx_00_15 () { 884 # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f 885 ( 886 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 887 888 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i] 889 '&and ($a4,$e)', # f&e 890 '&rorx ($a0,$e,$Sigma1[2])', 891 '&rorx ($a2,$e,$Sigma1[1])', 892 893 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past 894 '&lea ($h,"($h,$a4)")', 895 '&andn ($a4,$e,$g)', # ~e&g 896 '&xor ($a0,$a2)', 897 898 '&rorx ($a1,$e,$Sigma1[0])', 899 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g) 900 '&xor ($a0,$a1)', # Sigma1(e) 901 '&mov ($a2,$a)', 902 903 '&rorx ($a4,$a,$Sigma0[2])', 904 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e) 905 '&xor ($a2,$b)', # a^b, b^c in next round 906 '&rorx ($a1,$a,$Sigma0[1])', 907 908 '&rorx ($a0,$a,$Sigma0[0])', 909 '&lea ($d,"($d,$h)")', # d+=h 910 '&and ($a3,$a2)', # (b^c)&(a^b) 911 @aesni_cbc_block[$aesni_cbc_idx++]. 912 '&xor ($a1,$a4)', 913 914 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) 915 '&xor ($a1,$a0)', # Sigma0(a) 916 '&lea ($h,"($h,$a3)");'. 
# h+=Maj(a,b,c) 917 '&mov ($a4,$e)', # copy of f in future 918 919 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' 920 ); 921 # and at the finish one has to $a+=$a1 922} 923 924$code.=<<___; 925.type ${func}_avx2,\@function,6 926.align 64 927${func}_avx2: 928.Lavx2_shortcut: 929 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter 930 push %rbx 931 push %rbp 932 push %r12 933 push %r13 934 push %r14 935 push %r15 936 mov %rsp,%r11 # copy %rsp 937 sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp 938 and \$-256*$SZ,%rsp # align stack frame 939 add \$`2*$SZ*($rounds-8)`,%rsp 940 941 shl \$6,$len 942 sub $inp,$out # re-bias 943 sub $inp,$in0 944 add $inp,$len # end of input 945 946 #mov $inp,$_inp # saved later 947 #mov $out,$_out # kept in $offload 948 mov $len,$_end 949 #mov $key,$_key # remains resident in $inp register 950 mov $ivp,$_ivp 951 mov $ctx,$_ctx 952 mov $in0,$_in0 953 mov %r11,$_rsp 954___ 955$code.=<<___ if ($win64); 956 movaps %xmm6,`$framesz+16*0`(%rsp) 957 movaps %xmm7,`$framesz+16*1`(%rsp) 958 movaps %xmm8,`$framesz+16*2`(%rsp) 959 movaps %xmm9,`$framesz+16*3`(%rsp) 960 movaps %xmm10,`$framesz+16*4`(%rsp) 961 movaps %xmm11,`$framesz+16*5`(%rsp) 962 movaps %xmm12,`$framesz+16*6`(%rsp) 963 movaps %xmm13,`$framesz+16*7`(%rsp) 964 movaps %xmm14,`$framesz+16*8`(%rsp) 965 movaps %xmm15,`$framesz+16*9`(%rsp) 966___ 967$code.=<<___; 968.Lprologue_avx2: 969 vzeroall 970 971 mov $inp,%r13 # borrow $a0 972 vpinsrq \$1,$out,$offload,$offload 973 lea 0x80($key),$inp # size optimization, reassign 974 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r12 # borrow $a4 975 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1 976 mov $ctx,%r15 # borrow $a2 977 mov $in0,%rsi # borrow $a3 978 vmovdqu ($ivp),$iv # load IV 979 lea -9(%r14),%r14 980 981 vmovdqa 0x00(%r12,%r14,8),$mask14 982 vmovdqa 0x10(%r12,%r14,8),$mask12 983 vmovdqa 0x20(%r12,%r14,8),$mask10 984 985 sub \$-16*$SZ,%r13 # inp++, size optimization 986 mov $SZ*0(%r15),$A 987 lea (%rsi,%r13),%r12 # borrow $a0 988 mov $SZ*1(%r15),$B 989 cmp $len,%r13 # $_end 990 mov $SZ*2(%r15),$C 991 cmove %rsp,%r12 # next block or random data 992 mov $SZ*3(%r15),$D 993 mov $SZ*4(%r15),$E 994 mov $SZ*5(%r15),$F 995 mov $SZ*6(%r15),$G 996 mov $SZ*7(%r15),$H 997 vmovdqu 0x00-0x80($inp),$roundkey 998___ 999 if ($SZ==4) { # SHA256 1000 my @X = map("%ymm$_",(0..3)); 1001 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7)); 1002 1003$code.=<<___; 1004 jmp .Loop_avx2 1005.align 16 1006.Loop_avx2: 1007 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1008 vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0 1009 vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1 1010 vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2 1011 vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3 1012 1013 vinserti128 \$1,(%r12),@X[0],@X[0] 1014 vinserti128 \$1,16(%r12),@X[1],@X[1] 1015 vpshufb $t3,@X[0],@X[0] 1016 vinserti128 \$1,32(%r12),@X[2],@X[2] 1017 vpshufb $t3,@X[1],@X[1] 1018 vinserti128 \$1,48(%r12),@X[3],@X[3] 1019 1020 lea $TABLE(%rip),$Tbl 1021 vpshufb $t3,@X[2],@X[2] 1022 lea -16*$SZ(%r13),%r13 1023 vpaddd 0x00($Tbl),@X[0],$t0 1024 vpshufb $t3,@X[3],@X[3] 1025 vpaddd 0x20($Tbl),@X[1],$t1 1026 vpaddd 0x40($Tbl),@X[2],$t2 1027 vpaddd 0x60($Tbl),@X[3],$t3 1028 vmovdqa $t0,0x00(%rsp) 1029 xor $a1,$a1 1030 vmovdqa $t1,0x20(%rsp) 1031 lea -$PUSH8(%rsp),%rsp 1032 mov $B,$a3 1033 vmovdqa $t2,0x00(%rsp) 1034 xor $C,$a3 # magic 1035 vmovdqa $t3,0x20(%rsp) 1036 mov $F,$a4 1037 sub \$-16*2*$SZ,$Tbl # size optimization 1038 jmp .Lavx2_00_47 1039 1040.align 16 1041.Lavx2_00_47: 1042 vmovdqu (%r13),$inout 1043 vpinsrq \$0,%r13,$offload,$offload 1044___ 1045 1046sub 
AVX2_256_00_47 () { 1047my $j = shift; 1048my $body = shift; 1049my @X = @_; 1050my @insns = (&$body,&$body,&$body,&$body); # 96 instructions 1051my $base = "+2*$PUSH8(%rsp)"; 1052 1053 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0); 1054 foreach (Xupdate_256_AVX()) { # 29 instructions 1055 eval; 1056 eval(shift(@insns)); 1057 eval(shift(@insns)); 1058 eval(shift(@insns)); 1059 } 1060 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1061 foreach (@insns) { eval; } # remaining instructions 1062 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); 1063} 1064 $aesni_cbc_idx=0; 1065 for ($i=0,$j=0; $j<4; $j++) { 1066 &AVX2_256_00_47($j,\&bodyx_00_15,@X); 1067 push(@X,shift(@X)); # rotate(@X) 1068 } 1069 &vmovq ("%r13",$offload); # borrow $a0 1070 &vpextrq ("%r15",$offload,1); # borrow $a2 1071 &vpand ($temp,$temp,$mask14); 1072 &vpor ($iv,$iv,$temp); 1073 &vmovdqu ("(%r15,%r13)",$iv); # write output 1074 &lea ("%r13","16(%r13)"); # inp++ 1075 1076 &lea ($Tbl,16*2*$SZ."($Tbl)"); 1077 &cmpb (($SZ-1)."($Tbl)",0); 1078 &jne (".Lavx2_00_47"); 1079 1080 &vmovdqu ($inout,"(%r13)"); 1081 &vpinsrq ($offload,$offload,"%r13",0); 1082 1083 $aesni_cbc_idx=0; 1084 for ($i=0; $i<16; ) { 1085 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; 1086 foreach(bodyx_00_15()) { eval; } 1087 } 1088 } 1089$code.=<<___; 1090 vpextrq \$1,$offload,%r12 # $_out, borrow $a4 1091 vmovq $offload,%r13 # $_inp, borrow $a0 1092 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2 1093 add $a1,$A 1094 lea `2*$SZ*($rounds-8)`(%rsp),$Tbl 1095 1096 vpand $mask14,$temp,$temp 1097 vpor $temp,$iv,$iv 1098 vmovdqu $iv,(%r12,%r13) # write output 1099 lea 16(%r13),%r13 1100 1101 add $SZ*0(%r15),$A 1102 add $SZ*1(%r15),$B 1103 add $SZ*2(%r15),$C 1104 add $SZ*3(%r15),$D 1105 add $SZ*4(%r15),$E 1106 add $SZ*5(%r15),$F 1107 add $SZ*6(%r15),$G 1108 add $SZ*7(%r15),$H 1109 1110 mov $A,$SZ*0(%r15) 1111 mov $B,$SZ*1(%r15) 1112 mov $C,$SZ*2(%r15) 1113 mov $D,$SZ*3(%r15) 1114 mov $E,$SZ*4(%r15) 1115 mov $F,$SZ*5(%r15) 1116 mov $G,$SZ*6(%r15) 1117 mov $H,$SZ*7(%r15) 1118 1119 cmp `$PUSH8+2*8`($Tbl),%r13 # $_end 1120 je .Ldone_avx2 1121 1122 xor $a1,$a1 1123 mov $B,$a3 1124 mov $F,$a4 1125 xor $C,$a3 # magic 1126 jmp .Lower_avx2 1127.align 16 1128.Lower_avx2: 1129 vmovdqu (%r13),$inout 1130 vpinsrq \$0,%r13,$offload,$offload 1131___ 1132 $aesni_cbc_idx=0; 1133 for ($i=0; $i<16; ) { 1134 my $base="+16($Tbl)"; 1135 foreach(bodyx_00_15()) { eval; } 1136 &lea ($Tbl,"-$PUSH8($Tbl)") if ($i==8); 1137 } 1138$code.=<<___; 1139 vmovq $offload,%r13 # borrow $a0 1140 vpextrq \$1,$offload,%r15 # borrow $a2 1141 vpand $mask14,$temp,$temp 1142 vpor $temp,$iv,$iv 1143 lea -$PUSH8($Tbl),$Tbl 1144 vmovdqu $iv,(%r15,%r13) # write output 1145 lea 16(%r13),%r13 # inp++ 1146 cmp %rsp,$Tbl 1147 jae .Lower_avx2 1148 1149 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2 1150 lea 16*$SZ(%r13),%r13 1151 mov `2*$SZ*$rounds+6*8`(%rsp),%rsi # $_in0, borrow $a3 1152 add $a1,$A 1153 lea `2*$SZ*($rounds-8)`(%rsp),%rsp 1154 1155 add $SZ*0(%r15),$A 1156 add $SZ*1(%r15),$B 1157 add $SZ*2(%r15),$C 1158 add $SZ*3(%r15),$D 1159 add $SZ*4(%r15),$E 1160 add $SZ*5(%r15),$F 1161 add $SZ*6(%r15),$G 1162 lea (%rsi,%r13),%r12 1163 add $SZ*7(%r15),$H 1164 1165 cmp $_end,%r13 1166 1167 mov $A,$SZ*0(%r15) 1168 cmove %rsp,%r12 # next block or stale data 1169 mov $B,$SZ*1(%r15) 1170 mov $C,$SZ*2(%r15) 1171 mov $D,$SZ*3(%r15) 1172 mov $E,$SZ*4(%r15) 1173 mov $F,$SZ*5(%r15) 1174 mov $G,$SZ*6(%r15) 1175 mov $H,$SZ*7(%r15) 1176 1177 jbe .Loop_avx2 1178 lea (%rsp),$Tbl 1179 1180.Ldone_avx2: 1181 lea ($Tbl),%rsp 1182 mov $_ivp,$ivp 
1183 mov $_rsp,%rsi 1184 vmovdqu $iv,($ivp) # output IV 1185 vzeroall 1186___ 1187$code.=<<___ if ($win64); 1188 movaps `$framesz+16*0`(%rsp),%xmm6 1189 movaps `$framesz+16*1`(%rsp),%xmm7 1190 movaps `$framesz+16*2`(%rsp),%xmm8 1191 movaps `$framesz+16*3`(%rsp),%xmm9 1192 movaps `$framesz+16*4`(%rsp),%xmm10 1193 movaps `$framesz+16*5`(%rsp),%xmm11 1194 movaps `$framesz+16*6`(%rsp),%xmm12 1195 movaps `$framesz+16*7`(%rsp),%xmm13 1196 movaps `$framesz+16*8`(%rsp),%xmm14 1197 movaps `$framesz+16*9`(%rsp),%xmm15 1198___ 1199$code.=<<___; 1200 mov (%rsi),%r15 1201 mov 8(%rsi),%r14 1202 mov 16(%rsi),%r13 1203 mov 24(%rsi),%r12 1204 mov 32(%rsi),%rbp 1205 mov 40(%rsi),%rbx 1206 lea 48(%rsi),%rsp 1207.Lepilogue_avx2: 1208 ret 1209.size ${func}_avx2,.-${func}_avx2 1210___ 1211}} 1212}} 1213{{ 1214my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); 1215 1216my ($rounds,$Tbl)=("%r11d","%rbx"); 1217 1218my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15)); 1219my @rndkey=("%xmm4","%xmm5"); 1220my $r=0; 1221my $sn=0; 1222 1223my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9)); 1224my @MSG=map("%xmm$_",(10..13)); 1225 1226my $aesenc=sub { 1227 use integer; 1228 my ($n,$k)=($r/10,$r%10); 1229 if ($k==0) { 1230 $code.=<<___; 1231 movups `16*$n`($in0),$in # load input 1232 xorps $rndkey0,$in 1233___ 1234 $code.=<<___ if ($n); 1235 movups $iv,`16*($n-1)`($out,$in0) # write output 1236___ 1237 $code.=<<___; 1238 xorps $in,$iv 1239 movups `32+16*$k-112`($key),$rndkey[1] 1240 aesenc $rndkey[0],$iv 1241___ 1242 } elsif ($k==9) { 1243 $sn++; 1244 $code.=<<___; 1245 cmp \$11,$rounds 1246 jb .Laesenclast$sn 1247 movups `32+16*($k+0)-112`($key),$rndkey[1] 1248 aesenc $rndkey[0],$iv 1249 movups `32+16*($k+1)-112`($key),$rndkey[0] 1250 aesenc $rndkey[1],$iv 1251 je .Laesenclast$sn 1252 movups `32+16*($k+2)-112`($key),$rndkey[1] 1253 aesenc $rndkey[0],$iv 1254 movups `32+16*($k+3)-112`($key),$rndkey[0] 1255 aesenc $rndkey[1],$iv 1256.Laesenclast$sn: 1257 aesenclast $rndkey[0],$iv 1258 movups 16-112($key),$rndkey[1] # forward reference 1259 nop 1260___ 1261 } else { 1262 $code.=<<___; 1263 movups `32+16*$k-112`($key),$rndkey[1] 1264 aesenc $rndkey[0],$iv 1265___ 1266 } 1267 $r++; unshift(@rndkey,pop(@rndkey)); 1268}; 1269 1270if ($shaext) { 1271my $Tbl="%rax"; 1272 1273$code.=<<___; 1274.type ${func}_shaext,\@function,6 1275.align 32 1276${func}_shaext: 1277 mov `($win64?56:8)`(%rsp),$inp # load 7th argument 1278___ 1279$code.=<<___ if ($win64); 1280 lea `-8-10*16`(%rsp),%rsp 1281 movaps %xmm6,-8-10*16(%rax) 1282 movaps %xmm7,-8-9*16(%rax) 1283 movaps %xmm8,-8-8*16(%rax) 1284 movaps %xmm9,-8-7*16(%rax) 1285 movaps %xmm10,-8-6*16(%rax) 1286 movaps %xmm11,-8-5*16(%rax) 1287 movaps %xmm12,-8-4*16(%rax) 1288 movaps %xmm13,-8-3*16(%rax) 1289 movaps %xmm14,-8-2*16(%rax) 1290 movaps %xmm15,-8-1*16(%rax) 1291.Lprologue_shaext: 1292___ 1293$code.=<<___; 1294 lea K256+0x80(%rip),$Tbl 1295 movdqu ($ctx),$ABEF # DCBA 1296 movdqu 16($ctx),$CDGH # HGFE 1297 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask 1298 1299 mov 240($key),$rounds 1300 sub $in0,$out 1301 movups ($key),$rndkey0 # $key[0] 1302 movups ($ivp),$iv # load IV 1303 movups 16($key),$rndkey[0] # forward reference 1304 lea 112($key),$key # size optimization 1305 1306 pshufd \$0x1b,$ABEF,$Wi # ABCD 1307 pshufd \$0xb1,$ABEF,$ABEF # CDAB 1308 pshufd \$0x1b,$CDGH,$CDGH # EFGH 1309 movdqa $TMP,$BSWAP # offload 1310 palignr \$8,$CDGH,$ABEF # ABEF 1311 punpcklqdq $Wi,$CDGH # CDGH 1312 1313 jmp .Loop_shaext 1314 
1315.align 16 1316.Loop_shaext: 1317 movdqu ($inp),@MSG[0] 1318 movdqu 0x10($inp),@MSG[1] 1319 movdqu 0x20($inp),@MSG[2] 1320 pshufb $TMP,@MSG[0] 1321 movdqu 0x30($inp),@MSG[3] 1322 1323 movdqa 0*32-0x80($Tbl),$Wi 1324 paddd @MSG[0],$Wi 1325 pshufb $TMP,@MSG[1] 1326 movdqa $CDGH,$CDGH_SAVE # offload 1327 movdqa $ABEF,$ABEF_SAVE # offload 1328___ 1329 &$aesenc(); 1330$code.=<<___; 1331 sha256rnds2 $ABEF,$CDGH # 0-3 1332 pshufd \$0x0e,$Wi,$Wi 1333___ 1334 &$aesenc(); 1335$code.=<<___; 1336 sha256rnds2 $CDGH,$ABEF 1337 1338 movdqa 1*32-0x80($Tbl),$Wi 1339 paddd @MSG[1],$Wi 1340 pshufb $TMP,@MSG[2] 1341 lea 0x40($inp),$inp 1342___ 1343 &$aesenc(); 1344$code.=<<___; 1345 sha256rnds2 $ABEF,$CDGH # 4-7 1346 pshufd \$0x0e,$Wi,$Wi 1347___ 1348 &$aesenc(); 1349$code.=<<___; 1350 sha256rnds2 $CDGH,$ABEF 1351 1352 movdqa 2*32-0x80($Tbl),$Wi 1353 paddd @MSG[2],$Wi 1354 pshufb $TMP,@MSG[3] 1355 sha256msg1 @MSG[1],@MSG[0] 1356___ 1357 &$aesenc(); 1358$code.=<<___; 1359 sha256rnds2 $ABEF,$CDGH # 8-11 1360 pshufd \$0x0e,$Wi,$Wi 1361 movdqa @MSG[3],$TMP 1362 palignr \$4,@MSG[2],$TMP 1363 paddd $TMP,@MSG[0] 1364___ 1365 &$aesenc(); 1366$code.=<<___; 1367 sha256rnds2 $CDGH,$ABEF 1368 1369 movdqa 3*32-0x80($Tbl),$Wi 1370 paddd @MSG[3],$Wi 1371 sha256msg2 @MSG[3],@MSG[0] 1372 sha256msg1 @MSG[2],@MSG[1] 1373___ 1374 &$aesenc(); 1375$code.=<<___; 1376 sha256rnds2 $ABEF,$CDGH # 12-15 1377 pshufd \$0x0e,$Wi,$Wi 1378___ 1379 &$aesenc(); 1380$code.=<<___; 1381 movdqa @MSG[0],$TMP 1382 palignr \$4,@MSG[3],$TMP 1383 paddd $TMP,@MSG[1] 1384 sha256rnds2 $CDGH,$ABEF 1385___ 1386for($i=4;$i<16-3;$i++) { 1387 &$aesenc() if (($r%10)==0); 1388$code.=<<___; 1389 movdqa $i*32-0x80($Tbl),$Wi 1390 paddd @MSG[0],$Wi 1391 sha256msg2 @MSG[0],@MSG[1] 1392 sha256msg1 @MSG[3],@MSG[2] 1393___ 1394 &$aesenc(); 1395$code.=<<___; 1396 sha256rnds2 $ABEF,$CDGH # 16-19... 
1397 pshufd \$0x0e,$Wi,$Wi 1398 movdqa @MSG[1],$TMP 1399 palignr \$4,@MSG[0],$TMP 1400 paddd $TMP,@MSG[2] 1401___ 1402 &$aesenc(); 1403 &$aesenc() if ($r==19); 1404$code.=<<___; 1405 sha256rnds2 $CDGH,$ABEF 1406___ 1407 push(@MSG,shift(@MSG)); 1408} 1409$code.=<<___; 1410 movdqa 13*32-0x80($Tbl),$Wi 1411 paddd @MSG[0],$Wi 1412 sha256msg2 @MSG[0],@MSG[1] 1413 sha256msg1 @MSG[3],@MSG[2] 1414___ 1415 &$aesenc(); 1416$code.=<<___; 1417 sha256rnds2 $ABEF,$CDGH # 52-55 1418 pshufd \$0x0e,$Wi,$Wi 1419 movdqa @MSG[1],$TMP 1420 palignr \$4,@MSG[0],$TMP 1421 paddd $TMP,@MSG[2] 1422___ 1423 &$aesenc(); 1424 &$aesenc(); 1425$code.=<<___; 1426 sha256rnds2 $CDGH,$ABEF 1427 1428 movdqa 14*32-0x80($Tbl),$Wi 1429 paddd @MSG[1],$Wi 1430 sha256msg2 @MSG[1],@MSG[2] 1431 movdqa $BSWAP,$TMP 1432___ 1433 &$aesenc(); 1434$code.=<<___; 1435 sha256rnds2 $ABEF,$CDGH # 56-59 1436 pshufd \$0x0e,$Wi,$Wi 1437___ 1438 &$aesenc(); 1439$code.=<<___; 1440 sha256rnds2 $CDGH,$ABEF 1441 1442 movdqa 15*32-0x80($Tbl),$Wi 1443 paddd @MSG[2],$Wi 1444___ 1445 &$aesenc(); 1446 &$aesenc(); 1447$code.=<<___; 1448 sha256rnds2 $ABEF,$CDGH # 60-63 1449 pshufd \$0x0e,$Wi,$Wi 1450___ 1451 &$aesenc(); 1452$code.=<<___; 1453 sha256rnds2 $CDGH,$ABEF 1454 #pxor $CDGH,$rndkey0 # black magic 1455___ 1456 while ($r<40) { &$aesenc(); } # remaining aesenc's 1457$code.=<<___; 1458 #xorps $CDGH,$rndkey0 # black magic 1459 paddd $CDGH_SAVE,$CDGH 1460 paddd $ABEF_SAVE,$ABEF 1461 1462 dec $len 1463 movups $iv,48($out,$in0) # write output 1464 lea 64($in0),$in0 1465 jnz .Loop_shaext 1466 1467 pshufd \$0xb1,$CDGH,$CDGH # DCHG 1468 pshufd \$0x1b,$ABEF,$TMP # FEBA 1469 pshufd \$0xb1,$ABEF,$ABEF # BAFE 1470 punpckhqdq $CDGH,$ABEF # DCBA 1471 palignr \$8,$TMP,$CDGH # HGFE 1472 1473 movups $iv,($ivp) # write IV 1474 movdqu $ABEF,($ctx) 1475 movdqu $CDGH,16($ctx) 1476___ 1477$code.=<<___ if ($win64); 1478 movaps 0*16(%rsp),%xmm6 1479 movaps 1*16(%rsp),%xmm7 1480 movaps 2*16(%rsp),%xmm8 1481 movaps 3*16(%rsp),%xmm9 1482 movaps 4*16(%rsp),%xmm10 1483 movaps 5*16(%rsp),%xmm11 1484 movaps 6*16(%rsp),%xmm12 1485 movaps 7*16(%rsp),%xmm13 1486 movaps 8*16(%rsp),%xmm14 1487 movaps 9*16(%rsp),%xmm15 1488 lea 8+10*16(%rsp),%rsp 1489.Lepilogue_shaext: 1490___ 1491$code.=<<___; 1492 ret 1493.size ${func}_shaext,.-${func}_shaext 1494___ 1495} 1496}}}}} 1497 1498# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 1499# CONTEXT *context,DISPATCHER_CONTEXT *disp) 1500if ($win64 && $avx) { 1501$rec="%rcx"; 1502$frame="%rdx"; 1503$context="%r8"; 1504$disp="%r9"; 1505 1506$code.=<<___; 1507.extern __imp_RtlVirtualUnwind 1508.type se_handler,\@abi-omnipotent 1509.align 16 1510se_handler: 1511 push %rsi 1512 push %rdi 1513 push %rbx 1514 push %rbp 1515 push %r12 1516 push %r13 1517 push %r14 1518 push %r15 1519 pushfq 1520 sub \$64,%rsp 1521 1522 mov 120($context),%rax # pull context->Rax 1523 mov 248($context),%rbx # pull context->Rip 1524 1525 mov 8($disp),%rsi # disp->ImageBase 1526 mov 56($disp),%r11 # disp->HanderlData 1527 1528 mov 0(%r11),%r10d # HandlerData[0] 1529 lea (%rsi,%r10),%r10 # prologue label 1530 cmp %r10,%rbx # context->Rip<prologue label 1531 jb .Lin_prologue 1532 1533 mov 152($context),%rax # pull context->Rsp 1534 1535 mov 4(%r11),%r10d # HandlerData[1] 1536 lea (%rsi,%r10),%r10 # epilogue label 1537 cmp %r10,%rbx # context->Rip>=epilogue label 1538 jae .Lin_prologue 1539___ 1540$code.=<<___ if ($shaext); 1541 lea aesni_cbc_sha256_enc_shaext(%rip),%r10 1542 cmp %r10,%rbx 1543 jb .Lnot_in_shaext 1544 1545 lea (%rax),%rsi 1546 lea 
512($context),%rdi # &context.Xmm6 1547 mov \$20,%ecx 1548 .long 0xa548f3fc # cld; rep movsq 1549 lea 168(%rax),%rax # adjust stack pointer 1550 jmp .Lin_prologue 1551.Lnot_in_shaext: 1552___ 1553$code.=<<___ if ($avx>1); 1554 lea .Lavx2_shortcut(%rip),%r10 1555 cmp %r10,%rbx # context->Rip<avx2_shortcut 1556 jb .Lnot_in_avx2 1557 1558 and \$-256*$SZ,%rax 1559 add \$`2*$SZ*($rounds-8)`,%rax 1560.Lnot_in_avx2: 1561___ 1562$code.=<<___; 1563 mov %rax,%rsi # put aside Rsp 1564 mov 16*$SZ+7*8(%rax),%rax # pull $_rsp 1565 lea 48(%rax),%rax 1566 1567 mov -8(%rax),%rbx 1568 mov -16(%rax),%rbp 1569 mov -24(%rax),%r12 1570 mov -32(%rax),%r13 1571 mov -40(%rax),%r14 1572 mov -48(%rax),%r15 1573 mov %rbx,144($context) # restore context->Rbx 1574 mov %rbp,160($context) # restore context->Rbp 1575 mov %r12,216($context) # restore context->R12 1576 mov %r13,224($context) # restore context->R13 1577 mov %r14,232($context) # restore context->R14 1578 mov %r15,240($context) # restore context->R15 1579 1580 lea 16*$SZ+8*8(%rsi),%rsi # Xmm6- save area 1581 lea 512($context),%rdi # &context.Xmm6 1582 mov \$20,%ecx 1583 .long 0xa548f3fc # cld; rep movsq 1584 1585.Lin_prologue: 1586 mov 8(%rax),%rdi 1587 mov 16(%rax),%rsi 1588 mov %rax,152($context) # restore context->Rsp 1589 mov %rsi,168($context) # restore context->Rsi 1590 mov %rdi,176($context) # restore context->Rdi 1591 1592 mov 40($disp),%rdi # disp->ContextRecord 1593 mov $context,%rsi # context 1594 mov \$154,%ecx # sizeof(CONTEXT) 1595 .long 0xa548f3fc # cld; rep movsq 1596 1597 mov $disp,%rsi 1598 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 1599 mov 8(%rsi),%rdx # arg2, disp->ImageBase 1600 mov 0(%rsi),%r8 # arg3, disp->ControlPc 1601 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 1602 mov 40(%rsi),%r10 # disp->ContextRecord 1603 lea 56(%rsi),%r11 # &disp->HandlerData 1604 lea 24(%rsi),%r12 # &disp->EstablisherFrame 1605 mov %r10,32(%rsp) # arg5 1606 mov %r11,40(%rsp) # arg6 1607 mov %r12,48(%rsp) # arg7 1608 mov %rcx,56(%rsp) # arg8, (NULL) 1609 call *__imp_RtlVirtualUnwind(%rip) 1610 1611 mov \$1,%eax # ExceptionContinueSearch 1612 add \$64,%rsp 1613 popfq 1614 pop %r15 1615 pop %r14 1616 pop %r13 1617 pop %r12 1618 pop %rbp 1619 pop %rbx 1620 pop %rdi 1621 pop %rsi 1622 ret 1623.size se_handler,.-se_handler 1624 1625.section .pdata 1626 .rva .LSEH_begin_${func}_xop 1627 .rva .LSEH_end_${func}_xop 1628 .rva .LSEH_info_${func}_xop 1629 1630 .rva .LSEH_begin_${func}_avx 1631 .rva .LSEH_end_${func}_avx 1632 .rva .LSEH_info_${func}_avx 1633___ 1634$code.=<<___ if ($avx>1); 1635 .rva .LSEH_begin_${func}_avx2 1636 .rva .LSEH_end_${func}_avx2 1637 .rva .LSEH_info_${func}_avx2 1638___ 1639$code.=<<___ if ($shaext); 1640 .rva .LSEH_begin_${func}_shaext 1641 .rva .LSEH_end_${func}_shaext 1642 .rva .LSEH_info_${func}_shaext 1643___ 1644$code.=<<___; 1645.section .xdata 1646.align 8 1647.LSEH_info_${func}_xop: 1648 .byte 9,0,0,0 1649 .rva se_handler 1650 .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[] 1651 1652.LSEH_info_${func}_avx: 1653 .byte 9,0,0,0 1654 .rva se_handler 1655 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] 1656___ 1657$code.=<<___ if ($avx>1); 1658.LSEH_info_${func}_avx2: 1659 .byte 9,0,0,0 1660 .rva se_handler 1661 .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[] 1662___ 1663$code.=<<___ if ($shaext); 1664.LSEH_info_${func}_shaext: 1665 .byte 9,0,0,0 1666 .rva se_handler 1667 .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[] 1668___ 1669} 1670 1671#################################################################### 1672sub rex { 
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    unshift @opcode,$rex|0x40	if($rex);
}

# sha256op38() encodes the SHA-NI instructions (sha256rnds2, sha256msg1,
# sha256msg2) as raw .byte sequences, so the output also assembles with
# toolchains that do not recognize these mnemonics.
{
  my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

  sub sha256op38 {
    my $instr = shift;

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
      push @opcode,$opcodelet{$instr};
      push @opcode,0xc0|($1&7)|(($2&7)<<3);	# ModR/M
      return ".byte\t".join(',',@opcode);
    } else {
      return $instr."\t".@_[0];
    }
  }
}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
print $code;
close STDOUT;
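# Typical standalone invocation (a hedged example; within the OpenSSL build
# the appropriate flavour and output path are supplied automatically):
#
#	perl aesni-sha256-x86_64.pl elf  aesni-sha256-x86_64.s		# Linux/ELF
#	perl aesni-sha256-x86_64.pl nasm aesni-sha256-x86_64.asm	# Win64/NASM
#
# The first argument selects the x86_64-xlate.pl output flavour ($flavour),
# the second names the generated assembly file ($output).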