1#!/usr/bin/env perl 2# 3# ==================================================================== 4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 5# project. The module is, however, dual licensed under OpenSSL and 6# CRYPTOGAMS licenses depending on where you obtain it. For further 7# details see http://www.openssl.org/~appro/cryptogams/. 8# ==================================================================== 9# 10# January 2013 11# 12# This is AESNI-CBC+SHA256 stitch implementation. The idea, as spelled 13# in http://download.intel.com/design/intarch/papers/323686.pdf, is 14# that since AESNI-CBC encrypt exhibit *very* low instruction-level 15# parallelism, interleaving it with another algorithm would allow to 16# utilize processor resources better and achieve better performance. 17# SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and 18# AESNI code is weaved into it. As SHA256 dominates execution time, 19# stitch performance does not depend on AES key length. Below are 20# performance numbers in cycles per processed byte, less is better, 21# for standalone AESNI-CBC encrypt, standalone SHA256, and stitched 22# subroutine: 23# 24# AES-128/-192/-256+SHA256 this(**)gain 25# Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43% 26# Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50% 27# Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59% 28# Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58% 29# 30# (*) there are XOP, AVX1 and AVX2 code pathes, meaning that 31# Westmere is omitted from loop, this is because gain was not 32# estimated high enough to justify the effort; 33# (**) these are EVP-free results, results obtained with 'speed 34# -evp aes-256-cbc-hmac-sha256' will vary by percent or two; 35 36$flavour = shift; 37$output = shift; 38if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 39 40$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 41 42$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 43( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 44( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 45die "can't locate x86_64-xlate.pl"; 46 47if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` 48 =~ /GNU assembler version ([2-9]\.[0-9]+)/) { 49 $avx = ($1>=2.19) + ($1>=2.22); 50} 51 52if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && 53 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { 54 $avx = ($1>=2.09) + ($1>=2.10); 55} 56 57if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && 58 `ml64 2>&1` =~ /Version ([0-9]+)\./) { 59 $avx = ($1>=10) + ($1>=12); 60} 61 62if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { 63 $avx = ($2>=3.0) + ($2>3.0); 64} 65 66$shaext=$avx; ### set to zero if compiling for 1.0.1 67$avx=1 if (!$shaext && $avx); 68 69open OUT,"| \"$^X\" $xlate $flavour $output"; 70*STDOUT=*OUT; 71 72$func="aesni_cbc_sha256_enc"; 73$TABLE="K256"; 74$SZ=4; 75@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", 76 "%r8d","%r9d","%r10d","%r11d"); 77($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi"); 78@Sigma0=( 2,13,22); 79@Sigma1=( 6,11,25); 80@sigma0=( 7,18, 3); 81@sigma1=(17,19,10); 82$rounds=64; 83 84######################################################################## 85# void aesni_cbc_sha256_enc(const void *inp, 86# void *out, 87# size_t length, 88# const AES_KEY *key, 89# unsigned char *iv, 90# SHA256_CTX *ctx, 91# const void *in0); 92($inp, $out, $len, $key, $ivp, $ctx, $in0) = 93("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); 94 95$Tbl="%rbp"; 96 97$_inp="16*$SZ+0*8(%rsp)"; 98$_out="16*$SZ+1*8(%rsp)"; 99$_end="16*$SZ+2*8(%rsp)"; 100$_key="16*$SZ+3*8(%rsp)"; 101$_ivp="16*$SZ+4*8(%rsp)"; 102$_ctx="16*$SZ+5*8(%rsp)"; 103$_in0="16*$SZ+6*8(%rsp)"; 104$_rsp="16*$SZ+7*8(%rsp)"; 105$framesz=16*$SZ+8*8; 106 107$code=<<___; 108.text 109 110.extern OPENSSL_ia32cap_P 111.globl $func 112.type $func,\@abi-omnipotent 113.align 16 114$func: 115___ 116 if ($avx) { 117$code.=<<___; 118 lea OPENSSL_ia32cap_P(%rip),%r11 119 mov \$1,%eax 120 cmp \$0,`$win64?"%rcx":"%rdi"` 121 je .Lprobe 122 mov 0(%r11),%eax 123 mov 4(%r11),%r10 124___ 125$code.=<<___ if ($shaext); 126 bt \$61,%r10 # check for SHA 127 jc ${func}_shaext 128___ 129$code.=<<___; 130 mov %r10,%r11 131 shr \$32,%r11 132 133 test \$`1<<11`,%r10d # check for XOP 134 jnz ${func}_xop 135___ 136$code.=<<___ if ($avx>1); 137 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1 138 cmp \$`1<<8|1<<5|1<<3`,%r11d 139 je ${func}_avx2 140___ 141$code.=<<___; 142 and \$`1<<28`,%r10d # check for AVX 143 jnz ${func}_avx 144 ud2 145___ 146 } 147$code.=<<___; 148 xor %eax,%eax 149 cmp \$0,`$win64?"%rcx":"%rdi"` 150 je .Lprobe 151 ud2 152.Lprobe: 153 ret 154.size $func,.-$func 155 156.align 64 157.type $TABLE,\@object 158$TABLE: 159 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 160 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 161 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 162 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 163 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 164 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 165 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 166 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 167 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 168 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 169 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 170 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 171 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 172 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 173 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 174 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 175 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 176 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 177 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 178 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 179 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 180 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 181 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 182 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 183 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 184 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 185 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 186 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 187 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 188 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 189 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 190 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 191 192 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 193 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 194 .long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1 195 .long 0,0,0,0, 0,0,0,0 196 .asciz "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 197.align 64 198___ 199 200###################################################################### 201# SIMD code paths 202# 203{{{ 204($iv,$inout,$roundkey,$temp, 205 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15)); 206 207$aesni_cbc_idx=0; 208@aesni_cbc_block = ( 209## &vmovdqu ($roundkey,"0x00-0x80($inp)");' 210## &vmovdqu ($inout,($inp)); 211## &mov ($_inp,$inp); 212 213 '&vpxor ($inout,$inout,$roundkey);'. 214 ' &vmovdqu ($roundkey,"0x10-0x80($inp)");', 215 216 '&vpxor ($inout,$inout,$iv);', 217 218 '&vaesenc ($inout,$inout,$roundkey);'. 219 ' &vmovdqu ($roundkey,"0x20-0x80($inp)");', 220 221 '&vaesenc ($inout,$inout,$roundkey);'. 222 ' &vmovdqu ($roundkey,"0x30-0x80($inp)");', 223 224 '&vaesenc ($inout,$inout,$roundkey);'. 225 ' &vmovdqu ($roundkey,"0x40-0x80($inp)");', 226 227 '&vaesenc ($inout,$inout,$roundkey);'. 228 ' &vmovdqu ($roundkey,"0x50-0x80($inp)");', 229 230 '&vaesenc ($inout,$inout,$roundkey);'. 231 ' &vmovdqu ($roundkey,"0x60-0x80($inp)");', 232 233 '&vaesenc ($inout,$inout,$roundkey);'. 234 ' &vmovdqu ($roundkey,"0x70-0x80($inp)");', 235 236 '&vaesenc ($inout,$inout,$roundkey);'. 237 ' &vmovdqu ($roundkey,"0x80-0x80($inp)");', 238 239 '&vaesenc ($inout,$inout,$roundkey);'. 240 ' &vmovdqu ($roundkey,"0x90-0x80($inp)");', 241 242 '&vaesenc ($inout,$inout,$roundkey);'. 243 ' &vmovdqu ($roundkey,"0xa0-0x80($inp)");', 244 245 '&vaesenclast ($temp,$inout,$roundkey);'. 246 ' &vaesenc ($inout,$inout,$roundkey);'. 247 ' &vmovdqu ($roundkey,"0xb0-0x80($inp)");', 248 249 '&vpand ($iv,$temp,$mask10);'. 250 ' &vaesenc ($inout,$inout,$roundkey);'. 251 ' &vmovdqu ($roundkey,"0xc0-0x80($inp)");', 252 253 '&vaesenclast ($temp,$inout,$roundkey);'. 254 ' &vaesenc ($inout,$inout,$roundkey);'. 255 ' &vmovdqu ($roundkey,"0xd0-0x80($inp)");', 256 257 '&vpand ($temp,$temp,$mask12);'. 258 ' &vaesenc ($inout,$inout,$roundkey);'. 259 '&vmovdqu ($roundkey,"0xe0-0x80($inp)");', 260 261 '&vpor ($iv,$iv,$temp);'. 262 ' &vaesenclast ($temp,$inout,$roundkey);'. 263 ' &vmovdqu ($roundkey,"0x00-0x80($inp)");' 264 265## &mov ($inp,$_inp); 266## &mov ($out,$_out); 267## &vpand ($temp,$temp,$mask14); 268## &vpor ($iv,$iv,$temp); 269## &vmovdqu ($iv,($out,$inp); 270## &lea (inp,16($inp)); 271); 272 273my $a4=$T1; 274my ($a,$b,$c,$d,$e,$f,$g,$h); 275 276sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm 277{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; 278 my $arg = pop; 279 $arg = "\$$arg" if ($arg*1 eq $arg); 280 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; 281} 282 283sub body_00_15 () { 284 ( 285 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 286 287 '&ror ($a0,$Sigma1[2]-$Sigma1[1])', 288 '&mov ($a,$a1)', 289 '&mov ($a4,$f)', 290 291 '&xor ($a0,$e)', 292 '&ror ($a1,$Sigma0[2]-$Sigma0[1])', 293 '&xor ($a4,$g)', # f^g 294 295 '&ror ($a0,$Sigma1[1]-$Sigma1[0])', 296 '&xor ($a1,$a)', 297 '&and ($a4,$e)', # (f^g)&e 298 299 @aesni_cbc_block[$aesni_cbc_idx++]. 300 '&xor ($a0,$e)', 301 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] 302 '&mov ($a2,$a)', 303 304 '&ror ($a1,$Sigma0[1]-$Sigma0[0])', 305 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g 306 '&xor ($a2,$b)', # a^b, b^c in next round 307 308 '&ror ($a0,$Sigma1[0])', # Sigma1(e) 309 '&add ($h,$a4)', # h+=Ch(e,f,g) 310 '&and ($a3,$a2)', # (b^c)&(a^b) 311 312 '&xor ($a1,$a)', 313 '&add ($h,$a0)', # h+=Sigma1(e) 314 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) 315 316 '&add ($d,$h)', # d+=h 317 '&ror ($a1,$Sigma0[0])', # Sigma0(a) 318 '&add ($h,$a3)', # h+=Maj(a,b,c) 319 320 '&mov ($a0,$d)', 321 '&add ($a1,$h);'. # h+=Sigma0(a) 322 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' 323 ); 324} 325 326if ($avx) {{ 327###################################################################### 328# XOP code path 329# 330$code.=<<___; 331.type ${func}_xop,\@function,6 332.align 64 333${func}_xop: 334.Lxop_shortcut: 335 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter 336 push %rbx 337 push %rbp 338 push %r12 339 push %r13 340 push %r14 341 push %r15 342 mov %rsp,%r11 # copy %rsp 343 sub \$`$framesz+$win64*16*10`,%rsp 344 and \$-64,%rsp # align stack frame 345 346 shl \$6,$len 347 sub $inp,$out # re-bias 348 sub $inp,$in0 349 add $inp,$len # end of input 350 351 #mov $inp,$_inp # saved later 352 mov $out,$_out 353 mov $len,$_end 354 #mov $key,$_key # remains resident in $inp register 355 mov $ivp,$_ivp 356 mov $ctx,$_ctx 357 mov $in0,$_in0 358 mov %r11,$_rsp 359___ 360$code.=<<___ if ($win64); 361 movaps %xmm6,`$framesz+16*0`(%rsp) 362 movaps %xmm7,`$framesz+16*1`(%rsp) 363 movaps %xmm8,`$framesz+16*2`(%rsp) 364 movaps %xmm9,`$framesz+16*3`(%rsp) 365 movaps %xmm10,`$framesz+16*4`(%rsp) 366 movaps %xmm11,`$framesz+16*5`(%rsp) 367 movaps %xmm12,`$framesz+16*6`(%rsp) 368 movaps %xmm13,`$framesz+16*7`(%rsp) 369 movaps %xmm14,`$framesz+16*8`(%rsp) 370 movaps %xmm15,`$framesz+16*9`(%rsp) 371___ 372$code.=<<___; 373.Lprologue_xop: 374 vzeroall 375 376 mov $inp,%r12 # borrow $a4 377 lea 0x80($key),$inp # size optimization, reassign 378 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0 379 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1 380 mov $ctx,%r15 # borrow $a2 381 mov $in0,%rsi # borrow $a3 382 vmovdqu ($ivp),$iv # load IV 383 sub \$9,%r14 384 385 mov $SZ*0(%r15),$A 386 mov $SZ*1(%r15),$B 387 mov $SZ*2(%r15),$C 388 mov $SZ*3(%r15),$D 389 mov $SZ*4(%r15),$E 390 mov $SZ*5(%r15),$F 391 mov $SZ*6(%r15),$G 392 mov $SZ*7(%r15),$H 393 394 vmovdqa 0x00(%r13,%r14,8),$mask14 395 vmovdqa 0x10(%r13,%r14,8),$mask12 396 vmovdqa 0x20(%r13,%r14,8),$mask10 397 vmovdqu 0x00-0x80($inp),$roundkey 398 jmp .Lloop_xop 399___ 400 if ($SZ==4) { # SHA256 401 my @X = map("%xmm$_",(0..3)); 402 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); 403 404$code.=<<___; 405.align 16 406.Lloop_xop: 407 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 408 vmovdqu 0x00(%rsi,%r12),@X[0] 409 vmovdqu 0x10(%rsi,%r12),@X[1] 410 vmovdqu 0x20(%rsi,%r12),@X[2] 411 vmovdqu 0x30(%rsi,%r12),@X[3] 412 vpshufb $t3,@X[0],@X[0] 413 lea $TABLE(%rip),$Tbl 414 vpshufb $t3,@X[1],@X[1] 415 vpshufb $t3,@X[2],@X[2] 416 vpaddd 0x00($Tbl),@X[0],$t0 417 vpshufb $t3,@X[3],@X[3] 418 vpaddd 0x20($Tbl),@X[1],$t1 419 vpaddd 0x40($Tbl),@X[2],$t2 420 vpaddd 0x60($Tbl),@X[3],$t3 421 vmovdqa $t0,0x00(%rsp) 422 mov $A,$a1 423 vmovdqa $t1,0x10(%rsp) 424 mov $B,$a3 425 vmovdqa $t2,0x20(%rsp) 426 xor $C,$a3 # magic 427 vmovdqa $t3,0x30(%rsp) 428 mov $E,$a0 429 jmp .Lxop_00_47 430 431.align 16 432.Lxop_00_47: 433 sub \$-16*2*$SZ,$Tbl # size optimization 434 vmovdqu (%r12),$inout # $a4 435 mov %r12,$_inp # $a4 436___ 437sub XOP_256_00_47 () { 438my $j = shift; 439my $body = shift; 440my @X = @_; 441my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 442 443 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4] 444 eval(shift(@insns)); 445 eval(shift(@insns)); 446 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12] 447 eval(shift(@insns)); 448 eval(shift(@insns)); 449 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]); 450 eval(shift(@insns)); 451 eval(shift(@insns)); 452 &vpsrld ($t0,$t0,$sigma0[2]); 453 eval(shift(@insns)); 454 eval(shift(@insns)); 455 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12] 456 eval(shift(@insns)); 457 eval(shift(@insns)); 458 eval(shift(@insns)); 459 eval(shift(@insns)); 460 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]); 461 eval(shift(@insns)); 462 eval(shift(@insns)); 463 &vpxor ($t0,$t0,$t1); 464 eval(shift(@insns)); 465 eval(shift(@insns)); 466 eval(shift(@insns)); 467 eval(shift(@insns)); 468 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]); 469 eval(shift(@insns)); 470 eval(shift(@insns)); 471 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4]) 472 eval(shift(@insns)); 473 eval(shift(@insns)); 474 &vpsrld ($t2,@X[3],$sigma1[2]); 475 eval(shift(@insns)); 476 eval(shift(@insns)); 477 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4]) 478 eval(shift(@insns)); 479 eval(shift(@insns)); 480 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); 481 eval(shift(@insns)); 482 eval(shift(@insns)); 483 &vpxor ($t3,$t3,$t2); 484 eval(shift(@insns)); 485 eval(shift(@insns)); 486 eval(shift(@insns)); 487 eval(shift(@insns)); 488 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) 489 eval(shift(@insns)); 490 eval(shift(@insns)); 491 eval(shift(@insns)); 492 eval(shift(@insns)); 493 &vpsrldq ($t3,$t3,8); 494 eval(shift(@insns)); 495 eval(shift(@insns)); 496 eval(shift(@insns)); 497 eval(shift(@insns)); 498 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 499 eval(shift(@insns)); 500 eval(shift(@insns)); 501 eval(shift(@insns)); 502 eval(shift(@insns)); 503 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]); 504 eval(shift(@insns)); 505 eval(shift(@insns)); 506 &vpsrld ($t2,@X[0],$sigma1[2]); 507 eval(shift(@insns)); 508 eval(shift(@insns)); 509 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); 510 eval(shift(@insns)); 511 eval(shift(@insns)); 512 &vpxor ($t3,$t3,$t2); 513 eval(shift(@insns)); 514 eval(shift(@insns)); 515 eval(shift(@insns)); 516 eval(shift(@insns)); 517 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17]) 518 eval(shift(@insns)); 519 eval(shift(@insns)); 520 eval(shift(@insns)); 521 eval(shift(@insns)); 522 &vpslldq ($t3,$t3,8); # 22 instructions 523 eval(shift(@insns)); 524 eval(shift(@insns)); 525 eval(shift(@insns)); 526 eval(shift(@insns)); 527 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17]) 528 eval(shift(@insns)); 529 eval(shift(@insns)); 530 eval(shift(@insns)); 531 eval(shift(@insns)); 532 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 533 foreach (@insns) { eval; } # remaining instructions 534 &vmovdqa (16*$j."(%rsp)",$t2); 535} 536 537 $aesni_cbc_idx=0; 538 for ($i=0,$j=0; $j<4; $j++) { 539 &XOP_256_00_47($j,\&body_00_15,@X); 540 push(@X,shift(@X)); # rotate(@X) 541 } 542 &mov ("%r12",$_inp); # borrow $a4 543 &vpand ($temp,$temp,$mask14); 544 &mov ("%r15",$_out); # borrow $a2 545 &vpor ($iv,$iv,$temp); 546 &vmovdqu ("(%r15,%r12)",$iv); # write output 547 &lea ("%r12","16(%r12)"); # inp++ 548 549 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 550 &jne (".Lxop_00_47"); 551 552 &vmovdqu ($inout,"(%r12)"); 553 &mov ($_inp,"%r12"); 554 555 $aesni_cbc_idx=0; 556 for ($i=0; $i<16; ) { 557 foreach(body_00_15()) { eval; } 558 } 559 } 560$code.=<<___; 561 mov $_inp,%r12 # borrow $a4 562 mov $_out,%r13 # borrow $a0 563 mov $_ctx,%r15 # borrow $a2 564 mov $_in0,%rsi # borrow $a3 565 566 vpand $mask14,$temp,$temp 567 mov $a1,$A 568 vpor $temp,$iv,$iv 569 vmovdqu $iv,(%r13,%r12) # write output 570 lea 16(%r12),%r12 # inp++ 571 572 add $SZ*0(%r15),$A 573 add $SZ*1(%r15),$B 574 add $SZ*2(%r15),$C 575 add $SZ*3(%r15),$D 576 add $SZ*4(%r15),$E 577 add $SZ*5(%r15),$F 578 add $SZ*6(%r15),$G 579 add $SZ*7(%r15),$H 580 581 cmp $_end,%r12 582 583 mov $A,$SZ*0(%r15) 584 mov $B,$SZ*1(%r15) 585 mov $C,$SZ*2(%r15) 586 mov $D,$SZ*3(%r15) 587 mov $E,$SZ*4(%r15) 588 mov $F,$SZ*5(%r15) 589 mov $G,$SZ*6(%r15) 590 mov $H,$SZ*7(%r15) 591 592 jb .Lloop_xop 593 594 mov $_ivp,$ivp 595 mov $_rsp,%rsi 596 vmovdqu $iv,($ivp) # output IV 597 vzeroall 598___ 599$code.=<<___ if ($win64); 600 movaps `$framesz+16*0`(%rsp),%xmm6 601 movaps `$framesz+16*1`(%rsp),%xmm7 602 movaps `$framesz+16*2`(%rsp),%xmm8 603 movaps `$framesz+16*3`(%rsp),%xmm9 604 movaps `$framesz+16*4`(%rsp),%xmm10 605 movaps `$framesz+16*5`(%rsp),%xmm11 606 movaps `$framesz+16*6`(%rsp),%xmm12 607 movaps `$framesz+16*7`(%rsp),%xmm13 608 movaps `$framesz+16*8`(%rsp),%xmm14 609 movaps `$framesz+16*9`(%rsp),%xmm15 610___ 611$code.=<<___; 612 mov (%rsi),%r15 613 mov 8(%rsi),%r14 614 mov 16(%rsi),%r13 615 mov 24(%rsi),%r12 616 mov 32(%rsi),%rbp 617 mov 40(%rsi),%rbx 618 lea 48(%rsi),%rsp 619.Lepilogue_xop: 620 ret 621.size ${func}_xop,.-${func}_xop 622___ 623###################################################################### 624# AVX+shrd code path 625# 626local *ror = sub { &shrd(@_[0],@_) }; 627 628$code.=<<___; 629.type ${func}_avx,\@function,6 630.align 64 631${func}_avx: 632.Lavx_shortcut: 633 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter 634 push %rbx 635 push %rbp 636 push %r12 637 push %r13 638 push %r14 639 push %r15 640 mov %rsp,%r11 # copy %rsp 641 sub \$`$framesz+$win64*16*10`,%rsp 642 and \$-64,%rsp # align stack frame 643 644 shl \$6,$len 645 sub $inp,$out # re-bias 646 sub $inp,$in0 647 add $inp,$len # end of input 648 649 #mov $inp,$_inp # saved later 650 mov $out,$_out 651 mov $len,$_end 652 #mov $key,$_key # remains resident in $inp register 653 mov $ivp,$_ivp 654 mov $ctx,$_ctx 655 mov $in0,$_in0 656 mov %r11,$_rsp 657___ 658$code.=<<___ if ($win64); 659 movaps %xmm6,`$framesz+16*0`(%rsp) 660 movaps %xmm7,`$framesz+16*1`(%rsp) 661 movaps %xmm8,`$framesz+16*2`(%rsp) 662 movaps %xmm9,`$framesz+16*3`(%rsp) 663 movaps %xmm10,`$framesz+16*4`(%rsp) 664 movaps %xmm11,`$framesz+16*5`(%rsp) 665 movaps %xmm12,`$framesz+16*6`(%rsp) 666 movaps %xmm13,`$framesz+16*7`(%rsp) 667 movaps %xmm14,`$framesz+16*8`(%rsp) 668 movaps %xmm15,`$framesz+16*9`(%rsp) 669___ 670$code.=<<___; 671.Lprologue_avx: 672 vzeroall 673 674 mov $inp,%r12 # borrow $a4 675 lea 0x80($key),$inp # size optimization, reassign 676 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0 677 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1 678 mov $ctx,%r15 # borrow $a2 679 mov $in0,%rsi # borrow $a3 680 vmovdqu ($ivp),$iv # load IV 681 sub \$9,%r14 682 683 mov $SZ*0(%r15),$A 684 mov $SZ*1(%r15),$B 685 mov $SZ*2(%r15),$C 686 mov $SZ*3(%r15),$D 687 mov $SZ*4(%r15),$E 688 mov $SZ*5(%r15),$F 689 mov $SZ*6(%r15),$G 690 mov $SZ*7(%r15),$H 691 692 vmovdqa 0x00(%r13,%r14,8),$mask14 693 vmovdqa 0x10(%r13,%r14,8),$mask12 694 vmovdqa 0x20(%r13,%r14,8),$mask10 695 vmovdqu 0x00-0x80($inp),$roundkey 696___ 697 if ($SZ==4) { # SHA256 698 my @X = map("%xmm$_",(0..3)); 699 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); 700 701$code.=<<___; 702 jmp .Lloop_avx 703.align 16 704.Lloop_avx: 705 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 706 vmovdqu 0x00(%rsi,%r12),@X[0] 707 vmovdqu 0x10(%rsi,%r12),@X[1] 708 vmovdqu 0x20(%rsi,%r12),@X[2] 709 vmovdqu 0x30(%rsi,%r12),@X[3] 710 vpshufb $t3,@X[0],@X[0] 711 lea $TABLE(%rip),$Tbl 712 vpshufb $t3,@X[1],@X[1] 713 vpshufb $t3,@X[2],@X[2] 714 vpaddd 0x00($Tbl),@X[0],$t0 715 vpshufb $t3,@X[3],@X[3] 716 vpaddd 0x20($Tbl),@X[1],$t1 717 vpaddd 0x40($Tbl),@X[2],$t2 718 vpaddd 0x60($Tbl),@X[3],$t3 719 vmovdqa $t0,0x00(%rsp) 720 mov $A,$a1 721 vmovdqa $t1,0x10(%rsp) 722 mov $B,$a3 723 vmovdqa $t2,0x20(%rsp) 724 xor $C,$a3 # magic 725 vmovdqa $t3,0x30(%rsp) 726 mov $E,$a0 727 jmp .Lavx_00_47 728 729.align 16 730.Lavx_00_47: 731 sub \$-16*2*$SZ,$Tbl # size optimization 732 vmovdqu (%r12),$inout # $a4 733 mov %r12,$_inp # $a4 734___ 735sub Xupdate_256_AVX () { 736 ( 737 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4] 738 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12] 739 '&vpsrld ($t2,$t0,$sigma0[0]);', 740 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12] 741 '&vpsrld ($t3,$t0,$sigma0[2])', 742 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);', 743 '&vpxor ($t0,$t3,$t2)', 744 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] 745 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);', 746 '&vpxor ($t0,$t0,$t1)', 747 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);', 748 '&vpxor ($t0,$t0,$t2)', 749 '&vpsrld ($t2,$t3,$sigma1[2]);', 750 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4]) 751 '&vpsrlq ($t3,$t3,$sigma1[0]);', 752 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4]) 753 '&vpxor ($t2,$t2,$t3);', 754 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 755 '&vpxor ($t2,$t2,$t3)', # sigma1(X[14..15]) 756 '&vpshufd ($t2,$t2,0b10000100)', 757 '&vpsrldq ($t2,$t2,8)', 758 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15]) 759 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] 760 '&vpsrld ($t2,$t3,$sigma1[2])', 761 '&vpsrlq ($t3,$t3,$sigma1[0])', 762 '&vpxor ($t2,$t2,$t3);', 763 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 764 '&vpxor ($t2,$t2,$t3)', 765 '&vpshufd ($t2,$t2,0b11101000)', 766 '&vpslldq ($t2,$t2,8)', 767 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17]) 768 ); 769} 770 771sub AVX_256_00_47 () { 772my $j = shift; 773my $body = shift; 774my @X = @_; 775my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 776 777 foreach (Xupdate_256_AVX()) { # 29 instructions 778 eval; 779 eval(shift(@insns)); 780 eval(shift(@insns)); 781 eval(shift(@insns)); 782 } 783 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 784 foreach (@insns) { eval; } # remaining instructions 785 &vmovdqa (16*$j."(%rsp)",$t2); 786} 787 788 $aesni_cbc_idx=0; 789 for ($i=0,$j=0; $j<4; $j++) { 790 &AVX_256_00_47($j,\&body_00_15,@X); 791 push(@X,shift(@X)); # rotate(@X) 792 } 793 &mov ("%r12",$_inp); # borrow $a4 794 &vpand ($temp,$temp,$mask14); 795 &mov ("%r15",$_out); # borrow $a2 796 &vpor ($iv,$iv,$temp); 797 &vmovdqu ("(%r15,%r12)",$iv); # write output 798 &lea ("%r12","16(%r12)"); # inp++ 799 800 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 801 &jne (".Lavx_00_47"); 802 803 &vmovdqu ($inout,"(%r12)"); 804 &mov ($_inp,"%r12"); 805 806 $aesni_cbc_idx=0; 807 for ($i=0; $i<16; ) { 808 foreach(body_00_15()) { eval; } 809 } 810 811 } 812$code.=<<___; 813 mov $_inp,%r12 # borrow $a4 814 mov $_out,%r13 # borrow $a0 815 mov $_ctx,%r15 # borrow $a2 816 mov $_in0,%rsi # borrow $a3 817 818 vpand $mask14,$temp,$temp 819 mov $a1,$A 820 vpor $temp,$iv,$iv 821 vmovdqu $iv,(%r13,%r12) # write output 822 lea 16(%r12),%r12 # inp++ 823 824 add $SZ*0(%r15),$A 825 add $SZ*1(%r15),$B 826 add $SZ*2(%r15),$C 827 add $SZ*3(%r15),$D 828 add $SZ*4(%r15),$E 829 add $SZ*5(%r15),$F 830 add $SZ*6(%r15),$G 831 add $SZ*7(%r15),$H 832 833 cmp $_end,%r12 834 835 mov $A,$SZ*0(%r15) 836 mov $B,$SZ*1(%r15) 837 mov $C,$SZ*2(%r15) 838 mov $D,$SZ*3(%r15) 839 mov $E,$SZ*4(%r15) 840 mov $F,$SZ*5(%r15) 841 mov $G,$SZ*6(%r15) 842 mov $H,$SZ*7(%r15) 843 jb .Lloop_avx 844 845 mov $_ivp,$ivp 846 mov $_rsp,%rsi 847 vmovdqu $iv,($ivp) # output IV 848 vzeroall 849___ 850$code.=<<___ if ($win64); 851 movaps `$framesz+16*0`(%rsp),%xmm6 852 movaps `$framesz+16*1`(%rsp),%xmm7 853 movaps `$framesz+16*2`(%rsp),%xmm8 854 movaps `$framesz+16*3`(%rsp),%xmm9 855 movaps `$framesz+16*4`(%rsp),%xmm10 856 movaps `$framesz+16*5`(%rsp),%xmm11 857 movaps `$framesz+16*6`(%rsp),%xmm12 858 movaps `$framesz+16*7`(%rsp),%xmm13 859 movaps `$framesz+16*8`(%rsp),%xmm14 860 movaps `$framesz+16*9`(%rsp),%xmm15 861___ 862$code.=<<___; 863 mov (%rsi),%r15 864 mov 8(%rsi),%r14 865 mov 16(%rsi),%r13 866 mov 24(%rsi),%r12 867 mov 32(%rsi),%rbp 868 mov 40(%rsi),%rbx 869 lea 48(%rsi),%rsp 870.Lepilogue_avx: 871 ret 872.size ${func}_avx,.-${func}_avx 873___ 874 875if ($avx>1) {{ 876###################################################################### 877# AVX2+BMI code path 878# 879my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp 880my $PUSH8=8*2*$SZ; 881use integer; 882 883sub bodyx_00_15 () { 884 # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f 885 ( 886 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 887 888 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i] 889 '&and ($a4,$e)', # f&e 890 '&rorx ($a0,$e,$Sigma1[2])', 891 '&rorx ($a2,$e,$Sigma1[1])', 892 893 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past 894 '&lea ($h,"($h,$a4)")', 895 '&andn ($a4,$e,$g)', # ~e&g 896 '&xor ($a0,$a2)', 897 898 '&rorx ($a1,$e,$Sigma1[0])', 899 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g) 900 '&xor ($a0,$a1)', # Sigma1(e) 901 '&mov ($a2,$a)', 902 903 '&rorx ($a4,$a,$Sigma0[2])', 904 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e) 905 '&xor ($a2,$b)', # a^b, b^c in next round 906 '&rorx ($a1,$a,$Sigma0[1])', 907 908 '&rorx ($a0,$a,$Sigma0[0])', 909 '&lea ($d,"($d,$h)")', # d+=h 910 '&and ($a3,$a2)', # (b^c)&(a^b) 911 @aesni_cbc_block[$aesni_cbc_idx++]. 912 '&xor ($a1,$a4)', 913 914 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) 915 '&xor ($a1,$a0)', # Sigma0(a) 916 '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c) 917 '&mov ($a4,$e)', # copy of f in future 918 919 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' 920 ); 921 # and at the finish one has to $a+=$a1 922} 923 924$code.=<<___; 925.type ${func}_avx2,\@function,6 926.align 64 927${func}_avx2: 928.Lavx2_shortcut: 929 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter 930 push %rbx 931 push %rbp 932 push %r12 933 push %r13 934 push %r14 935 push %r15 936 mov %rsp,%r11 # copy %rsp 937 sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp 938 and \$-256*$SZ,%rsp # align stack frame 939 add \$`2*$SZ*($rounds-8)`,%rsp 940 941 shl \$6,$len 942 sub $inp,$out # re-bias 943 sub $inp,$in0 944 add $inp,$len # end of input 945 946 #mov $inp,$_inp # saved later 947 #mov $out,$_out # kept in $offload 948 mov $len,$_end 949 #mov $key,$_key # remains resident in $inp register 950 mov $ivp,$_ivp 951 mov $ctx,$_ctx 952 mov $in0,$_in0 953 mov %r11,$_rsp 954___ 955$code.=<<___ if ($win64); 956 movaps %xmm6,`$framesz+16*0`(%rsp) 957 movaps %xmm7,`$framesz+16*1`(%rsp) 958 movaps %xmm8,`$framesz+16*2`(%rsp) 959 movaps %xmm9,`$framesz+16*3`(%rsp) 960 movaps %xmm10,`$framesz+16*4`(%rsp) 961 movaps %xmm11,`$framesz+16*5`(%rsp) 962 movaps %xmm12,`$framesz+16*6`(%rsp) 963 movaps %xmm13,`$framesz+16*7`(%rsp) 964 movaps %xmm14,`$framesz+16*8`(%rsp) 965 movaps %xmm15,`$framesz+16*9`(%rsp) 966___ 967$code.=<<___; 968.Lprologue_avx2: 969 vzeroall 970 971 mov $inp,%r13 # borrow $a0 972 vpinsrq \$1,$out,$offload,$offload 973 lea 0x80($key),$inp # size optimization, reassign 974 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r12 # borrow $a4 975 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1 976 mov $ctx,%r15 # borrow $a2 977 mov $in0,%rsi # borrow $a3 978 vmovdqu ($ivp),$iv # load IV 979 lea -9(%r14),%r14 980 981 vmovdqa 0x00(%r12,%r14,8),$mask14 982 vmovdqa 0x10(%r12,%r14,8),$mask12 983 vmovdqa 0x20(%r12,%r14,8),$mask10 984 985 sub \$-16*$SZ,%r13 # inp++, size optimization 986 mov $SZ*0(%r15),$A 987 lea (%rsi,%r13),%r12 # borrow $a0 988 mov $SZ*1(%r15),$B 989 cmp $len,%r13 # $_end 990 mov $SZ*2(%r15),$C 991 cmove %rsp,%r12 # next block or random data 992 mov $SZ*3(%r15),$D 993 mov $SZ*4(%r15),$E 994 mov $SZ*5(%r15),$F 995 mov $SZ*6(%r15),$G 996 mov $SZ*7(%r15),$H 997 vmovdqu 0x00-0x80($inp),$roundkey 998___ 999 if ($SZ==4) { # SHA256 1000 my @X = map("%ymm$_",(0..3)); 1001 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7)); 1002 1003$code.=<<___; 1004 jmp .Loop_avx2 1005.align 16 1006.Loop_avx2: 1007 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1008 vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0 1009 vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1 1010 vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2 1011 vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3 1012 1013 vinserti128 \$1,(%r12),@X[0],@X[0] 1014 vinserti128 \$1,16(%r12),@X[1],@X[1] 1015 vpshufb $t3,@X[0],@X[0] 1016 vinserti128 \$1,32(%r12),@X[2],@X[2] 1017 vpshufb $t3,@X[1],@X[1] 1018 vinserti128 \$1,48(%r12),@X[3],@X[3] 1019 1020 lea $TABLE(%rip),$Tbl 1021 vpshufb $t3,@X[2],@X[2] 1022 lea -16*$SZ(%r13),%r13 1023 vpaddd 0x00($Tbl),@X[0],$t0 1024 vpshufb $t3,@X[3],@X[3] 1025 vpaddd 0x20($Tbl),@X[1],$t1 1026 vpaddd 0x40($Tbl),@X[2],$t2 1027 vpaddd 0x60($Tbl),@X[3],$t3 1028 vmovdqa $t0,0x00(%rsp) 1029 xor $a1,$a1 1030 vmovdqa $t1,0x20(%rsp) 1031 lea -$PUSH8(%rsp),%rsp 1032 mov $B,$a3 1033 vmovdqa $t2,0x00(%rsp) 1034 xor $C,$a3 # magic 1035 vmovdqa $t3,0x20(%rsp) 1036 mov $F,$a4 1037 sub \$-16*2*$SZ,$Tbl # size optimization 1038 jmp .Lavx2_00_47 1039 1040.align 16 1041.Lavx2_00_47: 1042 vmovdqu (%r13),$inout 1043 vpinsrq \$0,%r13,$offload,$offload 1044___ 1045 1046sub AVX2_256_00_47 () { 1047my $j = shift; 1048my $body = shift; 1049my @X = @_; 1050my @insns = (&$body,&$body,&$body,&$body); # 96 instructions 1051my $base = "+2*$PUSH8(%rsp)"; 1052 1053 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0); 1054 foreach (Xupdate_256_AVX()) { # 29 instructions 1055 eval; 1056 eval(shift(@insns)); 1057 eval(shift(@insns)); 1058 eval(shift(@insns)); 1059 } 1060 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1061 foreach (@insns) { eval; } # remaining instructions 1062 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); 1063} 1064 $aesni_cbc_idx=0; 1065 for ($i=0,$j=0; $j<4; $j++) { 1066 &AVX2_256_00_47($j,\&bodyx_00_15,@X); 1067 push(@X,shift(@X)); # rotate(@X) 1068 } 1069 &vmovq ("%r13",$offload); # borrow $a0 1070 &vpextrq ("%r15",$offload,1); # borrow $a2 1071 &vpand ($temp,$temp,$mask14); 1072 &vpor ($iv,$iv,$temp); 1073 &vmovdqu ("(%r15,%r13)",$iv); # write output 1074 &lea ("%r13","16(%r13)"); # inp++ 1075 1076 &lea ($Tbl,16*2*$SZ."($Tbl)"); 1077 &cmpb (($SZ-1)."($Tbl)",0); 1078 &jne (".Lavx2_00_47"); 1079 1080 &vmovdqu ($inout,"(%r13)"); 1081 &vpinsrq ($offload,$offload,"%r13",0); 1082 1083 $aesni_cbc_idx=0; 1084 for ($i=0; $i<16; ) { 1085 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; 1086 foreach(bodyx_00_15()) { eval; } 1087 } 1088 } 1089$code.=<<___; 1090 vpextrq \$1,$offload,%r12 # $_out, borrow $a4 1091 vmovq $offload,%r13 # $_inp, borrow $a0 1092 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2 1093 add $a1,$A 1094 lea `2*$SZ*($rounds-8)`(%rsp),$Tbl 1095 1096 vpand $mask14,$temp,$temp 1097 vpor $temp,$iv,$iv 1098 vmovdqu $iv,(%r12,%r13) # write output 1099 lea 16(%r13),%r13 1100 1101 add $SZ*0(%r15),$A 1102 add $SZ*1(%r15),$B 1103 add $SZ*2(%r15),$C 1104 add $SZ*3(%r15),$D 1105 add $SZ*4(%r15),$E 1106 add $SZ*5(%r15),$F 1107 add $SZ*6(%r15),$G 1108 add $SZ*7(%r15),$H 1109 1110 mov $A,$SZ*0(%r15) 1111 mov $B,$SZ*1(%r15) 1112 mov $C,$SZ*2(%r15) 1113 mov $D,$SZ*3(%r15) 1114 mov $E,$SZ*4(%r15) 1115 mov $F,$SZ*5(%r15) 1116 mov $G,$SZ*6(%r15) 1117 mov $H,$SZ*7(%r15) 1118 1119 cmp `$PUSH8+2*8`($Tbl),%r13 # $_end 1120 je .Ldone_avx2 1121 1122 xor $a1,$a1 1123 mov $B,$a3 1124 mov $F,$a4 1125 xor $C,$a3 # magic 1126 jmp .Lower_avx2 1127.align 16 1128.Lower_avx2: 1129 vmovdqu (%r13),$inout 1130 vpinsrq \$0,%r13,$offload,$offload 1131___ 1132 $aesni_cbc_idx=0; 1133 for ($i=0; $i<16; ) { 1134 my $base="+16($Tbl)"; 1135 foreach(bodyx_00_15()) { eval; } 1136 &lea ($Tbl,"-$PUSH8($Tbl)") if ($i==8); 1137 } 1138$code.=<<___; 1139 vmovq $offload,%r13 # borrow $a0 1140 vpextrq \$1,$offload,%r15 # borrow $a2 1141 vpand $mask14,$temp,$temp 1142 vpor $temp,$iv,$iv 1143 lea -$PUSH8($Tbl),$Tbl 1144 vmovdqu $iv,(%r15,%r13) # write output 1145 lea 16(%r13),%r13 # inp++ 1146 cmp %rsp,$Tbl 1147 jae .Lower_avx2 1148 1149 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2 1150 lea 16*$SZ(%r13),%r13 1151 mov `2*$SZ*$rounds+6*8`(%rsp),%rsi # $_in0, borrow $a3 1152 add $a1,$A 1153 lea `2*$SZ*($rounds-8)`(%rsp),%rsp 1154 1155 add $SZ*0(%r15),$A 1156 add $SZ*1(%r15),$B 1157 add $SZ*2(%r15),$C 1158 add $SZ*3(%r15),$D 1159 add $SZ*4(%r15),$E 1160 add $SZ*5(%r15),$F 1161 add $SZ*6(%r15),$G 1162 lea (%rsi,%r13),%r12 1163 add $SZ*7(%r15),$H 1164 1165 cmp $_end,%r13 1166 1167 mov $A,$SZ*0(%r15) 1168 cmove %rsp,%r12 # next block or stale data 1169 mov $B,$SZ*1(%r15) 1170 mov $C,$SZ*2(%r15) 1171 mov $D,$SZ*3(%r15) 1172 mov $E,$SZ*4(%r15) 1173 mov $F,$SZ*5(%r15) 1174 mov $G,$SZ*6(%r15) 1175 mov $H,$SZ*7(%r15) 1176 1177 jbe .Loop_avx2 1178 lea (%rsp),$Tbl 1179 1180.Ldone_avx2: 1181 lea ($Tbl),%rsp 1182 mov $_ivp,$ivp 1183 mov $_rsp,%rsi 1184 vmovdqu $iv,($ivp) # output IV 1185 vzeroall 1186___ 1187$code.=<<___ if ($win64); 1188 movaps `$framesz+16*0`(%rsp),%xmm6 1189 movaps `$framesz+16*1`(%rsp),%xmm7 1190 movaps `$framesz+16*2`(%rsp),%xmm8 1191 movaps `$framesz+16*3`(%rsp),%xmm9 1192 movaps `$framesz+16*4`(%rsp),%xmm10 1193 movaps `$framesz+16*5`(%rsp),%xmm11 1194 movaps `$framesz+16*6`(%rsp),%xmm12 1195 movaps `$framesz+16*7`(%rsp),%xmm13 1196 movaps `$framesz+16*8`(%rsp),%xmm14 1197 movaps `$framesz+16*9`(%rsp),%xmm15 1198___ 1199$code.=<<___; 1200 mov (%rsi),%r15 1201 mov 8(%rsi),%r14 1202 mov 16(%rsi),%r13 1203 mov 24(%rsi),%r12 1204 mov 32(%rsi),%rbp 1205 mov 40(%rsi),%rbx 1206 lea 48(%rsi),%rsp 1207.Lepilogue_avx2: 1208 ret 1209.size ${func}_avx2,.-${func}_avx2 1210___ 1211}} 1212}} 1213{{ 1214my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); 1215 1216my ($rounds,$Tbl)=("%r11d","%rbx"); 1217 1218my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15)); 1219my @rndkey=("%xmm4","%xmm5"); 1220my $r=0; 1221my $sn=0; 1222 1223my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9)); 1224my @MSG=map("%xmm$_",(10..13)); 1225 1226my $aesenc=sub { 1227 use integer; 1228 my ($n,$k)=($r/10,$r%10); 1229 if ($k==0) { 1230 $code.=<<___; 1231 movups `16*$n`($in0),$in # load input 1232 xorps $rndkey0,$in 1233___ 1234 $code.=<<___ if ($n); 1235 movups $iv,`16*($n-1)`($out,$in0) # write output 1236___ 1237 $code.=<<___; 1238 xorps $in,$iv 1239 movups `32+16*$k-112`($key),$rndkey[1] 1240 aesenc $rndkey[0],$iv 1241___ 1242 } elsif ($k==9) { 1243 $sn++; 1244 $code.=<<___; 1245 cmp \$11,$rounds 1246 jb .Laesenclast$sn 1247 movups `32+16*($k+0)-112`($key),$rndkey[1] 1248 aesenc $rndkey[0],$iv 1249 movups `32+16*($k+1)-112`($key),$rndkey[0] 1250 aesenc $rndkey[1],$iv 1251 je .Laesenclast$sn 1252 movups `32+16*($k+2)-112`($key),$rndkey[1] 1253 aesenc $rndkey[0],$iv 1254 movups `32+16*($k+3)-112`($key),$rndkey[0] 1255 aesenc $rndkey[1],$iv 1256.Laesenclast$sn: 1257 aesenclast $rndkey[0],$iv 1258 movups 16-112($key),$rndkey[1] # forward reference 1259 nop 1260___ 1261 } else { 1262 $code.=<<___; 1263 movups `32+16*$k-112`($key),$rndkey[1] 1264 aesenc $rndkey[0],$iv 1265___ 1266 } 1267 $r++; unshift(@rndkey,pop(@rndkey)); 1268}; 1269 1270if ($shaext) { 1271my $Tbl="%rax"; 1272 1273$code.=<<___; 1274.type ${func}_shaext,\@function,6 1275.align 32 1276${func}_shaext: 1277 mov `($win64?56:8)`(%rsp),$inp # load 7th argument 1278___ 1279$code.=<<___ if ($win64); 1280 lea `-8-10*16`(%rsp),%rsp 1281 movaps %xmm6,-8-10*16(%rax) 1282 movaps %xmm7,-8-9*16(%rax) 1283 movaps %xmm8,-8-8*16(%rax) 1284 movaps %xmm9,-8-7*16(%rax) 1285 movaps %xmm10,-8-6*16(%rax) 1286 movaps %xmm11,-8-5*16(%rax) 1287 movaps %xmm12,-8-4*16(%rax) 1288 movaps %xmm13,-8-3*16(%rax) 1289 movaps %xmm14,-8-2*16(%rax) 1290 movaps %xmm15,-8-1*16(%rax) 1291.Lprologue_shaext: 1292___ 1293$code.=<<___; 1294 lea K256+0x80(%rip),$Tbl 1295 movdqu ($ctx),$ABEF # DCBA 1296 movdqu 16($ctx),$CDGH # HGFE 1297 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask 1298 1299 mov 240($key),$rounds 1300 sub $in0,$out 1301 movups ($key),$rndkey0 # $key[0] 1302 movups 16($key),$rndkey[0] # forward reference 1303 lea 112($key),$key # size optimization 1304 1305 pshufd \$0x1b,$ABEF,$Wi # ABCD 1306 pshufd \$0xb1,$ABEF,$ABEF # CDAB 1307 pshufd \$0x1b,$CDGH,$CDGH # EFGH 1308 movdqa $TMP,$BSWAP # offload 1309 palignr \$8,$CDGH,$ABEF # ABEF 1310 punpcklqdq $Wi,$CDGH # CDGH 1311 1312 jmp .Loop_shaext 1313 1314.align 16 1315.Loop_shaext: 1316 movdqu ($inp),@MSG[0] 1317 movdqu 0x10($inp),@MSG[1] 1318 movdqu 0x20($inp),@MSG[2] 1319 pshufb $TMP,@MSG[0] 1320 movdqu 0x30($inp),@MSG[3] 1321 1322 movdqa 0*32-0x80($Tbl),$Wi 1323 paddd @MSG[0],$Wi 1324 pshufb $TMP,@MSG[1] 1325 movdqa $CDGH,$CDGH_SAVE # offload 1326 movdqa $ABEF,$ABEF_SAVE # offload 1327___ 1328 &$aesenc(); 1329$code.=<<___; 1330 sha256rnds2 $ABEF,$CDGH # 0-3 1331 pshufd \$0x0e,$Wi,$Wi 1332___ 1333 &$aesenc(); 1334$code.=<<___; 1335 sha256rnds2 $CDGH,$ABEF 1336 1337 movdqa 1*32-0x80($Tbl),$Wi 1338 paddd @MSG[1],$Wi 1339 pshufb $TMP,@MSG[2] 1340 lea 0x40($inp),$inp 1341___ 1342 &$aesenc(); 1343$code.=<<___; 1344 sha256rnds2 $ABEF,$CDGH # 4-7 1345 pshufd \$0x0e,$Wi,$Wi 1346___ 1347 &$aesenc(); 1348$code.=<<___; 1349 sha256rnds2 $CDGH,$ABEF 1350 1351 movdqa 2*32-0x80($Tbl),$Wi 1352 paddd @MSG[2],$Wi 1353 pshufb $TMP,@MSG[3] 1354 sha256msg1 @MSG[1],@MSG[0] 1355___ 1356 &$aesenc(); 1357$code.=<<___; 1358 sha256rnds2 $ABEF,$CDGH # 8-11 1359 pshufd \$0x0e,$Wi,$Wi 1360 movdqa @MSG[3],$TMP 1361 palignr \$4,@MSG[2],$TMP 1362 paddd $TMP,@MSG[0] 1363___ 1364 &$aesenc(); 1365$code.=<<___; 1366 sha256rnds2 $CDGH,$ABEF 1367 1368 movdqa 3*32-0x80($Tbl),$Wi 1369 paddd @MSG[3],$Wi 1370 sha256msg2 @MSG[3],@MSG[0] 1371 sha256msg1 @MSG[2],@MSG[1] 1372___ 1373 &$aesenc(); 1374$code.=<<___; 1375 sha256rnds2 $ABEF,$CDGH # 12-15 1376 pshufd \$0x0e,$Wi,$Wi 1377___ 1378 &$aesenc(); 1379$code.=<<___; 1380 movdqa @MSG[0],$TMP 1381 palignr \$4,@MSG[3],$TMP 1382 paddd $TMP,@MSG[1] 1383 sha256rnds2 $CDGH,$ABEF 1384___ 1385for($i=4;$i<16-3;$i++) { 1386 &$aesenc() if (($r%10)==0); 1387$code.=<<___; 1388 movdqa $i*32-0x80($Tbl),$Wi 1389 paddd @MSG[0],$Wi 1390 sha256msg2 @MSG[0],@MSG[1] 1391 sha256msg1 @MSG[3],@MSG[2] 1392___ 1393 &$aesenc(); 1394$code.=<<___; 1395 sha256rnds2 $ABEF,$CDGH # 16-19... 1396 pshufd \$0x0e,$Wi,$Wi 1397 movdqa @MSG[1],$TMP 1398 palignr \$4,@MSG[0],$TMP 1399 paddd $TMP,@MSG[2] 1400___ 1401 &$aesenc(); 1402 &$aesenc() if ($r==19); 1403$code.=<<___; 1404 sha256rnds2 $CDGH,$ABEF 1405___ 1406 push(@MSG,shift(@MSG)); 1407} 1408$code.=<<___; 1409 movdqa 13*32-0x80($Tbl),$Wi 1410 paddd @MSG[0],$Wi 1411 sha256msg2 @MSG[0],@MSG[1] 1412 sha256msg1 @MSG[3],@MSG[2] 1413___ 1414 &$aesenc(); 1415$code.=<<___; 1416 sha256rnds2 $ABEF,$CDGH # 52-55 1417 pshufd \$0x0e,$Wi,$Wi 1418 movdqa @MSG[1],$TMP 1419 palignr \$4,@MSG[0],$TMP 1420 paddd $TMP,@MSG[2] 1421___ 1422 &$aesenc(); 1423 &$aesenc(); 1424$code.=<<___; 1425 sha256rnds2 $CDGH,$ABEF 1426 1427 movdqa 14*32-0x80($Tbl),$Wi 1428 paddd @MSG[1],$Wi 1429 sha256msg2 @MSG[1],@MSG[2] 1430 movdqa $BSWAP,$TMP 1431___ 1432 &$aesenc(); 1433$code.=<<___; 1434 sha256rnds2 $ABEF,$CDGH # 56-59 1435 pshufd \$0x0e,$Wi,$Wi 1436___ 1437 &$aesenc(); 1438$code.=<<___; 1439 sha256rnds2 $CDGH,$ABEF 1440 1441 movdqa 15*32-0x80($Tbl),$Wi 1442 paddd @MSG[2],$Wi 1443___ 1444 &$aesenc(); 1445 &$aesenc(); 1446$code.=<<___; 1447 sha256rnds2 $ABEF,$CDGH # 60-63 1448 pshufd \$0x0e,$Wi,$Wi 1449___ 1450 &$aesenc(); 1451$code.=<<___; 1452 sha256rnds2 $CDGH,$ABEF 1453 #pxor $CDGH,$rndkey0 # black magic 1454___ 1455 while ($r<40) { &$aesenc(); } # remaining aesenc's 1456$code.=<<___; 1457 #xorps $CDGH,$rndkey0 # black magic 1458 paddd $CDGH_SAVE,$CDGH 1459 paddd $ABEF_SAVE,$ABEF 1460 1461 dec $len 1462 movups $iv,48($out,$in0) # write output 1463 lea 64($in0),$in0 1464 jnz .Loop_shaext 1465 1466 pshufd \$0xb1,$CDGH,$CDGH # DCHG 1467 pshufd \$0x1b,$ABEF,$TMP # FEBA 1468 pshufd \$0xb1,$ABEF,$ABEF # BAFE 1469 punpckhqdq $CDGH,$ABEF # DCBA 1470 palignr \$8,$TMP,$CDGH # HGFE 1471 1472 movups $iv,($ivp) # write IV 1473 movdqu $ABEF,($ctx) 1474 movdqu $CDGH,16($ctx) 1475___ 1476$code.=<<___ if ($win64); 1477 movaps 0*16(%rsp),%xmm6 1478 movaps 1*16(%rsp),%xmm7 1479 movaps 2*16(%rsp),%xmm8 1480 movaps 3*16(%rsp),%xmm9 1481 movaps 4*16(%rsp),%xmm10 1482 movaps 5*16(%rsp),%xmm11 1483 movaps 6*16(%rsp),%xmm12 1484 movaps 7*16(%rsp),%xmm13 1485 movaps 8*16(%rsp),%xmm14 1486 movaps 9*16(%rsp),%xmm15 1487 lea 8+10*16(%rsp),%rsp 1488.Lepilogue_shaext: 1489___ 1490$code.=<<___; 1491 ret 1492.size ${func}_shaext,.-${func}_shaext 1493___ 1494} 1495}}}}} 1496 1497# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 1498# CONTEXT *context,DISPATCHER_CONTEXT *disp) 1499if ($win64 && $avx) { 1500$rec="%rcx"; 1501$frame="%rdx"; 1502$context="%r8"; 1503$disp="%r9"; 1504 1505$code.=<<___; 1506.extern __imp_RtlVirtualUnwind 1507.type se_handler,\@abi-omnipotent 1508.align 16 1509se_handler: 1510 push %rsi 1511 push %rdi 1512 push %rbx 1513 push %rbp 1514 push %r12 1515 push %r13 1516 push %r14 1517 push %r15 1518 pushfq 1519 sub \$64,%rsp 1520 1521 mov 120($context),%rax # pull context->Rax 1522 mov 248($context),%rbx # pull context->Rip 1523 1524 mov 8($disp),%rsi # disp->ImageBase 1525 mov 56($disp),%r11 # disp->HanderlData 1526 1527 mov 0(%r11),%r10d # HandlerData[0] 1528 lea (%rsi,%r10),%r10 # prologue label 1529 cmp %r10,%rbx # context->Rip<prologue label 1530 jb .Lin_prologue 1531 1532 mov 152($context),%rax # pull context->Rsp 1533 1534 mov 4(%r11),%r10d # HandlerData[1] 1535 lea (%rsi,%r10),%r10 # epilogue label 1536 cmp %r10,%rbx # context->Rip>=epilogue label 1537 jae .Lin_prologue 1538___ 1539$code.=<<___ if ($shaext); 1540 lea aesni_cbc_sha256_enc_shaext(%rip),%r10 1541 cmp %r10,%rbx 1542 jb .Lnot_in_shaext 1543 1544 lea (%rax),%rsi 1545 lea 512($context),%rdi # &context.Xmm6 1546 mov \$20,%ecx 1547 .long 0xa548f3fc # cld; rep movsq 1548 lea 168(%rax),%rax # adjust stack pointer 1549 jmp .Lin_prologue 1550.Lnot_in_shaext: 1551___ 1552$code.=<<___ if ($avx>1); 1553 lea .Lavx2_shortcut(%rip),%r10 1554 cmp %r10,%rbx # context->Rip<avx2_shortcut 1555 jb .Lnot_in_avx2 1556 1557 and \$-256*$SZ,%rax 1558 add \$`2*$SZ*($rounds-8)`,%rax 1559.Lnot_in_avx2: 1560___ 1561$code.=<<___; 1562 mov %rax,%rsi # put aside Rsp 1563 mov 16*$SZ+7*8(%rax),%rax # pull $_rsp 1564 lea 48(%rax),%rax 1565 1566 mov -8(%rax),%rbx 1567 mov -16(%rax),%rbp 1568 mov -24(%rax),%r12 1569 mov -32(%rax),%r13 1570 mov -40(%rax),%r14 1571 mov -48(%rax),%r15 1572 mov %rbx,144($context) # restore context->Rbx 1573 mov %rbp,160($context) # restore context->Rbp 1574 mov %r12,216($context) # restore context->R12 1575 mov %r13,224($context) # restore context->R13 1576 mov %r14,232($context) # restore context->R14 1577 mov %r15,240($context) # restore context->R15 1578 1579 lea 16*$SZ+8*8(%rsi),%rsi # Xmm6- save area 1580 lea 512($context),%rdi # &context.Xmm6 1581 mov \$20,%ecx 1582 .long 0xa548f3fc # cld; rep movsq 1583 1584.Lin_prologue: 1585 mov 8(%rax),%rdi 1586 mov 16(%rax),%rsi 1587 mov %rax,152($context) # restore context->Rsp 1588 mov %rsi,168($context) # restore context->Rsi 1589 mov %rdi,176($context) # restore context->Rdi 1590 1591 mov 40($disp),%rdi # disp->ContextRecord 1592 mov $context,%rsi # context 1593 mov \$154,%ecx # sizeof(CONTEXT) 1594 .long 0xa548f3fc # cld; rep movsq 1595 1596 mov $disp,%rsi 1597 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 1598 mov 8(%rsi),%rdx # arg2, disp->ImageBase 1599 mov 0(%rsi),%r8 # arg3, disp->ControlPc 1600 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 1601 mov 40(%rsi),%r10 # disp->ContextRecord 1602 lea 56(%rsi),%r11 # &disp->HandlerData 1603 lea 24(%rsi),%r12 # &disp->EstablisherFrame 1604 mov %r10,32(%rsp) # arg5 1605 mov %r11,40(%rsp) # arg6 1606 mov %r12,48(%rsp) # arg7 1607 mov %rcx,56(%rsp) # arg8, (NULL) 1608 call *__imp_RtlVirtualUnwind(%rip) 1609 1610 mov \$1,%eax # ExceptionContinueSearch 1611 add \$64,%rsp 1612 popfq 1613 pop %r15 1614 pop %r14 1615 pop %r13 1616 pop %r12 1617 pop %rbp 1618 pop %rbx 1619 pop %rdi 1620 pop %rsi 1621 ret 1622.size se_handler,.-se_handler 1623 1624.section .pdata 1625 .rva .LSEH_begin_${func}_xop 1626 .rva .LSEH_end_${func}_xop 1627 .rva .LSEH_info_${func}_xop 1628 1629 .rva .LSEH_begin_${func}_avx 1630 .rva .LSEH_end_${func}_avx 1631 .rva .LSEH_info_${func}_avx 1632___ 1633$code.=<<___ if ($avx>1); 1634 .rva .LSEH_begin_${func}_avx2 1635 .rva .LSEH_end_${func}_avx2 1636 .rva .LSEH_info_${func}_avx2 1637___ 1638$code.=<<___ if ($shaext); 1639 .rva .LSEH_begin_${func}_shaext 1640 .rva .LSEH_end_${func}_shaext 1641 .rva .LSEH_info_${func}_shaext 1642___ 1643$code.=<<___; 1644.section .xdata 1645.align 8 1646.LSEH_info_${func}_xop: 1647 .byte 9,0,0,0 1648 .rva se_handler 1649 .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[] 1650 1651.LSEH_info_${func}_avx: 1652 .byte 9,0,0,0 1653 .rva se_handler 1654 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] 1655___ 1656$code.=<<___ if ($avx>1); 1657.LSEH_info_${func}_avx2: 1658 .byte 9,0,0,0 1659 .rva se_handler 1660 .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[] 1661___ 1662$code.=<<___ if ($shaext); 1663.LSEH_info_${func}_shaext: 1664 .byte 9,0,0,0 1665 .rva se_handler 1666 .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[] 1667___ 1668} 1669 1670#################################################################### 1671sub rex { 1672 local *opcode=shift; 1673 my ($dst,$src)=@_; 1674 my $rex=0; 1675 1676 $rex|=0x04 if($dst>=8); 1677 $rex|=0x01 if($src>=8); 1678 unshift @opcode,$rex|0x40 if($rex); 1679} 1680 1681{ 1682 my %opcodelet = ( 1683 "sha256rnds2" => 0xcb, 1684 "sha256msg1" => 0xcc, 1685 "sha256msg2" => 0xcd ); 1686 1687 sub sha256op38 { 1688 my $instr = shift; 1689 1690 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { 1691 my @opcode=(0x0f,0x38); 1692 rex(\@opcode,$2,$1); 1693 push @opcode,$opcodelet{$instr}; 1694 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M 1695 return ".byte\t".join(',',@opcode); 1696 } else { 1697 return $instr."\t".@_[0]; 1698 } 1699 } 1700} 1701 1702$code =~ s/\`([^\`]*)\`/eval $1/gem; 1703$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem; 1704print $code; 1705close STDOUT; 1706