1#!/usr/bin/env perl 2# 3# ==================================================================== 4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL 5# project. The module is, however, dual licensed under OpenSSL and 6# CRYPTOGAMS licenses depending on where you obtain it. For further 7# details see http://www.openssl.org/~appro/cryptogams/. 8# ==================================================================== 9# 10# Version 2.1. 11# 12# aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on 13# Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version 14# [you'll notice a lot of resemblance], such as compressed S-boxes 15# in little-endian byte order, prefetch of these tables in CBC mode, 16# as well as avoiding L1 cache aliasing between stack frame and key 17# schedule and already mentioned tables, compressed Td4... 18# 19# Performance in number of cycles per processed byte for 128-bit key: 20# 21# ECB encrypt ECB decrypt CBC large chunk 22# AMD64 33 41 13.0 23# EM64T 38 59 18.6(*) 24# Core 2 30 43 14.5(*) 25# 26# (*) with hyper-threading off 27 28$flavour = shift; 29$output = shift; 30if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 31 32$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 33 34$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 35( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 36( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 37die "can't locate x86_64-xlate.pl"; 38 39open OUT,"| \"$^X\" $xlate $flavour $output"; 40*STDOUT=*OUT; 41 42$verticalspin=1; # unlike 32-bit version $verticalspin performs 43 # ~15% better on both AMD and Intel cores 44$speed_limit=512; # see aes-586.pl for details 45 46$code=".text\n"; 47 48$s0="%eax"; 49$s1="%ebx"; 50$s2="%ecx"; 51$s3="%edx"; 52$acc0="%esi"; $mask80="%rsi"; 53$acc1="%edi"; $maskfe="%rdi"; 54$acc2="%ebp"; $mask1b="%rbp"; 55$inp="%r8"; 56$out="%r9"; 57$t0="%r10d"; 58$t1="%r11d"; 59$t2="%r12d"; 60$rnds="%r13d"; 61$sbox="%r14"; 62$key="%r15"; 63 64sub hi() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1h/; $r; } 65sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/; 66 $r =~ s/%[er]([sd]i)/%\1l/; 67 $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; } 68sub LO() { my $r=shift; $r =~ s/%r([a-z]+)/%e\1/; 69 $r =~ s/%r([0-9]+)/%r\1d/; $r; } 70sub _data_word() 71{ my $i; 72 while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; } 73} 74sub data_word() 75{ my $i; 76 my $last=pop(@_); 77 $code.=".long\t"; 78 while(defined($i=shift)) { $code.=sprintf"0x%08x,",$i; } 79 $code.=sprintf"0x%08x\n",$last; 80} 81 82sub data_byte() 83{ my $i; 84 my $last=pop(@_); 85 $code.=".byte\t"; 86 while(defined($i=shift)) { $code.=sprintf"0x%02x,",$i&0xff; } 87 $code.=sprintf"0x%02x\n",$last&0xff; 88} 89 90sub encvert() 91{ my $t3="%r8d"; # zaps $inp! 92 93$code.=<<___; 94 # favor 3-way issue Opteron pipeline... 95 movzb `&lo("$s0")`,$acc0 96 movzb `&lo("$s1")`,$acc1 97 movzb `&lo("$s2")`,$acc2 98 mov 0($sbox,$acc0,8),$t0 99 mov 0($sbox,$acc1,8),$t1 100 mov 0($sbox,$acc2,8),$t2 101 102 movzb `&hi("$s1")`,$acc0 103 movzb `&hi("$s2")`,$acc1 104 movzb `&lo("$s3")`,$acc2 105 xor 3($sbox,$acc0,8),$t0 106 xor 3($sbox,$acc1,8),$t1 107 mov 0($sbox,$acc2,8),$t3 108 109 movzb `&hi("$s3")`,$acc0 110 shr \$16,$s2 111 movzb `&hi("$s0")`,$acc2 112 xor 3($sbox,$acc0,8),$t2 113 shr \$16,$s3 114 xor 3($sbox,$acc2,8),$t3 115 116 shr \$16,$s1 117 lea 16($key),$key 118 shr \$16,$s0 119 120 movzb `&lo("$s2")`,$acc0 121 movzb `&lo("$s3")`,$acc1 122 movzb `&lo("$s0")`,$acc2 123 xor 2($sbox,$acc0,8),$t0 124 xor 2($sbox,$acc1,8),$t1 125 xor 2($sbox,$acc2,8),$t2 126 127 movzb `&hi("$s3")`,$acc0 128 movzb `&hi("$s0")`,$acc1 129 movzb `&lo("$s1")`,$acc2 130 xor 1($sbox,$acc0,8),$t0 131 xor 1($sbox,$acc1,8),$t1 132 xor 2($sbox,$acc2,8),$t3 133 134 mov 12($key),$s3 135 movzb `&hi("$s1")`,$acc1 136 movzb `&hi("$s2")`,$acc2 137 mov 0($key),$s0 138 xor 1($sbox,$acc1,8),$t2 139 xor 1($sbox,$acc2,8),$t3 140 141 mov 4($key),$s1 142 mov 8($key),$s2 143 xor $t0,$s0 144 xor $t1,$s1 145 xor $t2,$s2 146 xor $t3,$s3 147___ 148} 149 150sub enclastvert() 151{ my $t3="%r8d"; # zaps $inp! 152 153$code.=<<___; 154 movzb `&lo("$s0")`,$acc0 155 movzb `&lo("$s1")`,$acc1 156 movzb `&lo("$s2")`,$acc2 157 movzb 2($sbox,$acc0,8),$t0 158 movzb 2($sbox,$acc1,8),$t1 159 movzb 2($sbox,$acc2,8),$t2 160 161 movzb `&lo("$s3")`,$acc0 162 movzb `&hi("$s1")`,$acc1 163 movzb `&hi("$s2")`,$acc2 164 movzb 2($sbox,$acc0,8),$t3 165 mov 0($sbox,$acc1,8),$acc1 #$t0 166 mov 0($sbox,$acc2,8),$acc2 #$t1 167 168 and \$0x0000ff00,$acc1 169 and \$0x0000ff00,$acc2 170 171 xor $acc1,$t0 172 xor $acc2,$t1 173 shr \$16,$s2 174 175 movzb `&hi("$s3")`,$acc0 176 movzb `&hi("$s0")`,$acc1 177 shr \$16,$s3 178 mov 0($sbox,$acc0,8),$acc0 #$t2 179 mov 0($sbox,$acc1,8),$acc1 #$t3 180 181 and \$0x0000ff00,$acc0 182 and \$0x0000ff00,$acc1 183 shr \$16,$s1 184 xor $acc0,$t2 185 xor $acc1,$t3 186 shr \$16,$s0 187 188 movzb `&lo("$s2")`,$acc0 189 movzb `&lo("$s3")`,$acc1 190 movzb `&lo("$s0")`,$acc2 191 mov 0($sbox,$acc0,8),$acc0 #$t0 192 mov 0($sbox,$acc1,8),$acc1 #$t1 193 mov 0($sbox,$acc2,8),$acc2 #$t2 194 195 and \$0x00ff0000,$acc0 196 and \$0x00ff0000,$acc1 197 and \$0x00ff0000,$acc2 198 199 xor $acc0,$t0 200 xor $acc1,$t1 201 xor $acc2,$t2 202 203 movzb `&lo("$s1")`,$acc0 204 movzb `&hi("$s3")`,$acc1 205 movzb `&hi("$s0")`,$acc2 206 mov 0($sbox,$acc0,8),$acc0 #$t3 207 mov 2($sbox,$acc1,8),$acc1 #$t0 208 mov 2($sbox,$acc2,8),$acc2 #$t1 209 210 and \$0x00ff0000,$acc0 211 and \$0xff000000,$acc1 212 and \$0xff000000,$acc2 213 214 xor $acc0,$t3 215 xor $acc1,$t0 216 xor $acc2,$t1 217 218 movzb `&hi("$s1")`,$acc0 219 movzb `&hi("$s2")`,$acc1 220 mov 16+12($key),$s3 221 mov 2($sbox,$acc0,8),$acc0 #$t2 222 mov 2($sbox,$acc1,8),$acc1 #$t3 223 mov 16+0($key),$s0 224 225 and \$0xff000000,$acc0 226 and \$0xff000000,$acc1 227 228 xor $acc0,$t2 229 xor $acc1,$t3 230 231 mov 16+4($key),$s1 232 mov 16+8($key),$s2 233 xor $t0,$s0 234 xor $t1,$s1 235 xor $t2,$s2 236 xor $t3,$s3 237___ 238} 239 240sub encstep() 241{ my ($i,@s) = @_; 242 my $tmp0=$acc0; 243 my $tmp1=$acc1; 244 my $tmp2=$acc2; 245 my $out=($t0,$t1,$t2,$s[0])[$i]; 246 247 if ($i==3) { 248 $tmp0=$s[1]; 249 $tmp1=$s[2]; 250 $tmp2=$s[3]; 251 } 252 $code.=" movzb ".&lo($s[0]).",$out\n"; 253 $code.=" mov $s[2],$tmp1\n" if ($i!=3); 254 $code.=" lea 16($key),$key\n" if ($i==0); 255 256 $code.=" movzb ".&hi($s[1]).",$tmp0\n"; 257 $code.=" mov 0($sbox,$out,8),$out\n"; 258 259 $code.=" shr \$16,$tmp1\n"; 260 $code.=" mov $s[3],$tmp2\n" if ($i!=3); 261 $code.=" xor 3($sbox,$tmp0,8),$out\n"; 262 263 $code.=" movzb ".&lo($tmp1).",$tmp1\n"; 264 $code.=" shr \$24,$tmp2\n"; 265 $code.=" xor 4*$i($key),$out\n"; 266 267 $code.=" xor 2($sbox,$tmp1,8),$out\n"; 268 $code.=" xor 1($sbox,$tmp2,8),$out\n"; 269 270 $code.=" mov $t0,$s[1]\n" if ($i==3); 271 $code.=" mov $t1,$s[2]\n" if ($i==3); 272 $code.=" mov $t2,$s[3]\n" if ($i==3); 273 $code.="\n"; 274} 275 276sub enclast() 277{ my ($i,@s)=@_; 278 my $tmp0=$acc0; 279 my $tmp1=$acc1; 280 my $tmp2=$acc2; 281 my $out=($t0,$t1,$t2,$s[0])[$i]; 282 283 if ($i==3) { 284 $tmp0=$s[1]; 285 $tmp1=$s[2]; 286 $tmp2=$s[3]; 287 } 288 $code.=" movzb ".&lo($s[0]).",$out\n"; 289 $code.=" mov $s[2],$tmp1\n" if ($i!=3); 290 291 $code.=" mov 2($sbox,$out,8),$out\n"; 292 $code.=" shr \$16,$tmp1\n"; 293 $code.=" mov $s[3],$tmp2\n" if ($i!=3); 294 295 $code.=" and \$0x000000ff,$out\n"; 296 $code.=" movzb ".&hi($s[1]).",$tmp0\n"; 297 $code.=" movzb ".&lo($tmp1).",$tmp1\n"; 298 $code.=" shr \$24,$tmp2\n"; 299 300 $code.=" mov 0($sbox,$tmp0,8),$tmp0\n"; 301 $code.=" mov 0($sbox,$tmp1,8),$tmp1\n"; 302 $code.=" mov 2($sbox,$tmp2,8),$tmp2\n"; 303 304 $code.=" and \$0x0000ff00,$tmp0\n"; 305 $code.=" and \$0x00ff0000,$tmp1\n"; 306 $code.=" and \$0xff000000,$tmp2\n"; 307 308 $code.=" xor $tmp0,$out\n"; 309 $code.=" mov $t0,$s[1]\n" if ($i==3); 310 $code.=" xor $tmp1,$out\n"; 311 $code.=" mov $t1,$s[2]\n" if ($i==3); 312 $code.=" xor $tmp2,$out\n"; 313 $code.=" mov $t2,$s[3]\n" if ($i==3); 314 $code.="\n"; 315} 316 317$code.=<<___; 318.type _x86_64_AES_encrypt,\@abi-omnipotent 319.align 16 320_x86_64_AES_encrypt: 321 xor 0($key),$s0 # xor with key 322 xor 4($key),$s1 323 xor 8($key),$s2 324 xor 12($key),$s3 325 326 mov 240($key),$rnds # load key->rounds 327 sub \$1,$rnds 328 jmp .Lenc_loop 329.align 16 330.Lenc_loop: 331___ 332 if ($verticalspin) { &encvert(); } 333 else { &encstep(0,$s0,$s1,$s2,$s3); 334 &encstep(1,$s1,$s2,$s3,$s0); 335 &encstep(2,$s2,$s3,$s0,$s1); 336 &encstep(3,$s3,$s0,$s1,$s2); 337 } 338$code.=<<___; 339 sub \$1,$rnds 340 jnz .Lenc_loop 341___ 342 if ($verticalspin) { &enclastvert(); } 343 else { &enclast(0,$s0,$s1,$s2,$s3); 344 &enclast(1,$s1,$s2,$s3,$s0); 345 &enclast(2,$s2,$s3,$s0,$s1); 346 &enclast(3,$s3,$s0,$s1,$s2); 347 $code.=<<___; 348 xor 16+0($key),$s0 # xor with key 349 xor 16+4($key),$s1 350 xor 16+8($key),$s2 351 xor 16+12($key),$s3 352___ 353 } 354$code.=<<___; 355 .byte 0xf3,0xc3 # rep ret 356.size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt 357___ 358 359# it's possible to implement this by shifting tN by 8, filling least 360# significant byte with byte load and finally bswap-ing at the end, 361# but such partial register load kills Core 2... 362sub enccompactvert() 363{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d"); 364 365$code.=<<___; 366 movzb `&lo("$s0")`,$t0 367 movzb `&lo("$s1")`,$t1 368 movzb `&lo("$s2")`,$t2 369 movzb ($sbox,$t0,1),$t0 370 movzb ($sbox,$t1,1),$t1 371 movzb ($sbox,$t2,1),$t2 372 373 movzb `&lo("$s3")`,$t3 374 movzb `&hi("$s1")`,$acc0 375 movzb `&hi("$s2")`,$acc1 376 movzb ($sbox,$t3,1),$t3 377 movzb ($sbox,$acc0,1),$t4 #$t0 378 movzb ($sbox,$acc1,1),$t5 #$t1 379 380 movzb `&hi("$s3")`,$acc2 381 movzb `&hi("$s0")`,$acc0 382 shr \$16,$s2 383 movzb ($sbox,$acc2,1),$acc2 #$t2 384 movzb ($sbox,$acc0,1),$acc0 #$t3 385 shr \$16,$s3 386 387 movzb `&lo("$s2")`,$acc1 388 shl \$8,$t4 389 shl \$8,$t5 390 movzb ($sbox,$acc1,1),$acc1 #$t0 391 xor $t4,$t0 392 xor $t5,$t1 393 394 movzb `&lo("$s3")`,$t4 395 shr \$16,$s0 396 shr \$16,$s1 397 movzb `&lo("$s0")`,$t5 398 shl \$8,$acc2 399 shl \$8,$acc0 400 movzb ($sbox,$t4,1),$t4 #$t1 401 movzb ($sbox,$t5,1),$t5 #$t2 402 xor $acc2,$t2 403 xor $acc0,$t3 404 405 movzb `&lo("$s1")`,$acc2 406 movzb `&hi("$s3")`,$acc0 407 shl \$16,$acc1 408 movzb ($sbox,$acc2,1),$acc2 #$t3 409 movzb ($sbox,$acc0,1),$acc0 #$t0 410 xor $acc1,$t0 411 412 movzb `&hi("$s0")`,$acc1 413 shr \$8,$s2 414 shr \$8,$s1 415 movzb ($sbox,$acc1,1),$acc1 #$t1 416 movzb ($sbox,$s2,1),$s3 #$t3 417 movzb ($sbox,$s1,1),$s2 #$t2 418 shl \$16,$t4 419 shl \$16,$t5 420 shl \$16,$acc2 421 xor $t4,$t1 422 xor $t5,$t2 423 xor $acc2,$t3 424 425 shl \$24,$acc0 426 shl \$24,$acc1 427 shl \$24,$s3 428 xor $acc0,$t0 429 shl \$24,$s2 430 xor $acc1,$t1 431 mov $t0,$s0 432 mov $t1,$s1 433 xor $t2,$s2 434 xor $t3,$s3 435___ 436} 437 438sub enctransform_ref() 439{ my $sn = shift; 440 my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d"); 441 442$code.=<<___; 443 mov $sn,$acc 444 and \$0x80808080,$acc 445 mov $acc,$tmp 446 shr \$7,$tmp 447 lea ($sn,$sn),$r2 448 sub $tmp,$acc 449 and \$0xfefefefe,$r2 450 and \$0x1b1b1b1b,$acc 451 mov $sn,$tmp 452 xor $acc,$r2 453 454 xor $r2,$sn 455 rol \$24,$sn 456 xor $r2,$sn 457 ror \$16,$tmp 458 xor $tmp,$sn 459 ror \$8,$tmp 460 xor $tmp,$sn 461___ 462} 463 464# unlike decrypt case it does not pay off to parallelize enctransform 465sub enctransform() 466{ my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d"); 467 468$code.=<<___; 469 mov $s0,$acc0 470 mov $s1,$acc1 471 and \$0x80808080,$acc0 472 and \$0x80808080,$acc1 473 mov $acc0,$t0 474 mov $acc1,$t1 475 shr \$7,$t0 476 lea ($s0,$s0),$r20 477 shr \$7,$t1 478 lea ($s1,$s1),$r21 479 sub $t0,$acc0 480 sub $t1,$acc1 481 and \$0xfefefefe,$r20 482 and \$0xfefefefe,$r21 483 and \$0x1b1b1b1b,$acc0 484 and \$0x1b1b1b1b,$acc1 485 mov $s0,$t0 486 mov $s1,$t1 487 xor $acc0,$r20 488 xor $acc1,$r21 489 490 xor $r20,$s0 491 xor $r21,$s1 492 mov $s2,$acc0 493 mov $s3,$acc1 494 rol \$24,$s0 495 rol \$24,$s1 496 and \$0x80808080,$acc0 497 and \$0x80808080,$acc1 498 xor $r20,$s0 499 xor $r21,$s1 500 mov $acc0,$t2 501 mov $acc1,$t3 502 ror \$16,$t0 503 ror \$16,$t1 504 shr \$7,$t2 505 lea ($s2,$s2),$r20 506 xor $t0,$s0 507 xor $t1,$s1 508 shr \$7,$t3 509 lea ($s3,$s3),$r21 510 ror \$8,$t0 511 ror \$8,$t1 512 sub $t2,$acc0 513 sub $t3,$acc1 514 xor $t0,$s0 515 xor $t1,$s1 516 517 and \$0xfefefefe,$r20 518 and \$0xfefefefe,$r21 519 and \$0x1b1b1b1b,$acc0 520 and \$0x1b1b1b1b,$acc1 521 mov $s2,$t2 522 mov $s3,$t3 523 xor $acc0,$r20 524 xor $acc1,$r21 525 526 xor $r20,$s2 527 xor $r21,$s3 528 rol \$24,$s2 529 rol \$24,$s3 530 xor $r20,$s2 531 xor $r21,$s3 532 mov 0($sbox),$acc0 # prefetch Te4 533 ror \$16,$t2 534 ror \$16,$t3 535 mov 64($sbox),$acc1 536 xor $t2,$s2 537 xor $t3,$s3 538 mov 128($sbox),$r20 539 ror \$8,$t2 540 ror \$8,$t3 541 mov 192($sbox),$r21 542 xor $t2,$s2 543 xor $t3,$s3 544___ 545} 546 547$code.=<<___; 548.type _x86_64_AES_encrypt_compact,\@abi-omnipotent 549.align 16 550_x86_64_AES_encrypt_compact: 551 lea 128($sbox),$inp # size optimization 552 mov 0-128($inp),$acc1 # prefetch Te4 553 mov 32-128($inp),$acc2 554 mov 64-128($inp),$t0 555 mov 96-128($inp),$t1 556 mov 128-128($inp),$acc1 557 mov 160-128($inp),$acc2 558 mov 192-128($inp),$t0 559 mov 224-128($inp),$t1 560 jmp .Lenc_loop_compact 561.align 16 562.Lenc_loop_compact: 563 xor 0($key),$s0 # xor with key 564 xor 4($key),$s1 565 xor 8($key),$s2 566 xor 12($key),$s3 567 lea 16($key),$key 568___ 569 &enccompactvert(); 570$code.=<<___; 571 cmp 16(%rsp),$key 572 je .Lenc_compact_done 573___ 574 &enctransform(); 575$code.=<<___; 576 jmp .Lenc_loop_compact 577.align 16 578.Lenc_compact_done: 579 xor 0($key),$s0 580 xor 4($key),$s1 581 xor 8($key),$s2 582 xor 12($key),$s3 583 .byte 0xf3,0xc3 # rep ret 584.size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact 585___ 586 587# void AES_encrypt (const void *inp,void *out,const AES_KEY *key); 588$code.=<<___; 589.globl AES_encrypt 590.type AES_encrypt,\@function,3 591.align 16 592AES_encrypt: 593 push %rbx 594 push %rbp 595 push %r12 596 push %r13 597 push %r14 598 push %r15 599 600 # allocate frame "above" key schedule 601 mov %rsp,%r10 602 lea -63(%rdx),%rcx # %rdx is key argument 603 and \$-64,%rsp 604 sub %rsp,%rcx 605 neg %rcx 606 and \$0x3c0,%rcx 607 sub %rcx,%rsp 608 sub \$32,%rsp 609 610 mov %rsi,16(%rsp) # save out 611 mov %r10,24(%rsp) # save real stack pointer 612.Lenc_prologue: 613 614 mov %rdx,$key 615 mov 240($key),$rnds # load rounds 616 617 mov 0(%rdi),$s0 # load input vector 618 mov 4(%rdi),$s1 619 mov 8(%rdi),$s2 620 mov 12(%rdi),$s3 621 622 shl \$4,$rnds 623 lea ($key,$rnds),%rbp 624 mov $key,(%rsp) # key schedule 625 mov %rbp,8(%rsp) # end of key schedule 626 627 # pick Te4 copy which can't "overlap" with stack frame or key schedule 628 lea .LAES_Te+2048(%rip),$sbox 629 lea 768(%rsp),%rbp 630 sub $sbox,%rbp 631 and \$0x300,%rbp 632 lea ($sbox,%rbp),$sbox 633 634 call _x86_64_AES_encrypt_compact 635 636 mov 16(%rsp),$out # restore out 637 mov 24(%rsp),%rsi # restore saved stack pointer 638 mov $s0,0($out) # write output vector 639 mov $s1,4($out) 640 mov $s2,8($out) 641 mov $s3,12($out) 642 643 mov (%rsi),%r15 644 mov 8(%rsi),%r14 645 mov 16(%rsi),%r13 646 mov 24(%rsi),%r12 647 mov 32(%rsi),%rbp 648 mov 40(%rsi),%rbx 649 lea 48(%rsi),%rsp 650.Lenc_epilogue: 651 ret 652.size AES_encrypt,.-AES_encrypt 653___ 654 655#------------------------------------------------------------------# 656 657sub decvert() 658{ my $t3="%r8d"; # zaps $inp! 659 660$code.=<<___; 661 # favor 3-way issue Opteron pipeline... 662 movzb `&lo("$s0")`,$acc0 663 movzb `&lo("$s1")`,$acc1 664 movzb `&lo("$s2")`,$acc2 665 mov 0($sbox,$acc0,8),$t0 666 mov 0($sbox,$acc1,8),$t1 667 mov 0($sbox,$acc2,8),$t2 668 669 movzb `&hi("$s3")`,$acc0 670 movzb `&hi("$s0")`,$acc1 671 movzb `&lo("$s3")`,$acc2 672 xor 3($sbox,$acc0,8),$t0 673 xor 3($sbox,$acc1,8),$t1 674 mov 0($sbox,$acc2,8),$t3 675 676 movzb `&hi("$s1")`,$acc0 677 shr \$16,$s0 678 movzb `&hi("$s2")`,$acc2 679 xor 3($sbox,$acc0,8),$t2 680 shr \$16,$s3 681 xor 3($sbox,$acc2,8),$t3 682 683 shr \$16,$s1 684 lea 16($key),$key 685 shr \$16,$s2 686 687 movzb `&lo("$s2")`,$acc0 688 movzb `&lo("$s3")`,$acc1 689 movzb `&lo("$s0")`,$acc2 690 xor 2($sbox,$acc0,8),$t0 691 xor 2($sbox,$acc1,8),$t1 692 xor 2($sbox,$acc2,8),$t2 693 694 movzb `&hi("$s1")`,$acc0 695 movzb `&hi("$s2")`,$acc1 696 movzb `&lo("$s1")`,$acc2 697 xor 1($sbox,$acc0,8),$t0 698 xor 1($sbox,$acc1,8),$t1 699 xor 2($sbox,$acc2,8),$t3 700 701 movzb `&hi("$s3")`,$acc0 702 mov 12($key),$s3 703 movzb `&hi("$s0")`,$acc2 704 xor 1($sbox,$acc0,8),$t2 705 mov 0($key),$s0 706 xor 1($sbox,$acc2,8),$t3 707 708 xor $t0,$s0 709 mov 4($key),$s1 710 mov 8($key),$s2 711 xor $t2,$s2 712 xor $t1,$s1 713 xor $t3,$s3 714___ 715} 716 717sub declastvert() 718{ my $t3="%r8d"; # zaps $inp! 719 720$code.=<<___; 721 lea 2048($sbox),$sbox # size optimization 722 movzb `&lo("$s0")`,$acc0 723 movzb `&lo("$s1")`,$acc1 724 movzb `&lo("$s2")`,$acc2 725 movzb ($sbox,$acc0,1),$t0 726 movzb ($sbox,$acc1,1),$t1 727 movzb ($sbox,$acc2,1),$t2 728 729 movzb `&lo("$s3")`,$acc0 730 movzb `&hi("$s3")`,$acc1 731 movzb `&hi("$s0")`,$acc2 732 movzb ($sbox,$acc0,1),$t3 733 movzb ($sbox,$acc1,1),$acc1 #$t0 734 movzb ($sbox,$acc2,1),$acc2 #$t1 735 736 shl \$8,$acc1 737 shl \$8,$acc2 738 739 xor $acc1,$t0 740 xor $acc2,$t1 741 shr \$16,$s3 742 743 movzb `&hi("$s1")`,$acc0 744 movzb `&hi("$s2")`,$acc1 745 shr \$16,$s0 746 movzb ($sbox,$acc0,1),$acc0 #$t2 747 movzb ($sbox,$acc1,1),$acc1 #$t3 748 749 shl \$8,$acc0 750 shl \$8,$acc1 751 shr \$16,$s1 752 xor $acc0,$t2 753 xor $acc1,$t3 754 shr \$16,$s2 755 756 movzb `&lo("$s2")`,$acc0 757 movzb `&lo("$s3")`,$acc1 758 movzb `&lo("$s0")`,$acc2 759 movzb ($sbox,$acc0,1),$acc0 #$t0 760 movzb ($sbox,$acc1,1),$acc1 #$t1 761 movzb ($sbox,$acc2,1),$acc2 #$t2 762 763 shl \$16,$acc0 764 shl \$16,$acc1 765 shl \$16,$acc2 766 767 xor $acc0,$t0 768 xor $acc1,$t1 769 xor $acc2,$t2 770 771 movzb `&lo("$s1")`,$acc0 772 movzb `&hi("$s1")`,$acc1 773 movzb `&hi("$s2")`,$acc2 774 movzb ($sbox,$acc0,1),$acc0 #$t3 775 movzb ($sbox,$acc1,1),$acc1 #$t0 776 movzb ($sbox,$acc2,1),$acc2 #$t1 777 778 shl \$16,$acc0 779 shl \$24,$acc1 780 shl \$24,$acc2 781 782 xor $acc0,$t3 783 xor $acc1,$t0 784 xor $acc2,$t1 785 786 movzb `&hi("$s3")`,$acc0 787 movzb `&hi("$s0")`,$acc1 788 mov 16+12($key),$s3 789 movzb ($sbox,$acc0,1),$acc0 #$t2 790 movzb ($sbox,$acc1,1),$acc1 #$t3 791 mov 16+0($key),$s0 792 793 shl \$24,$acc0 794 shl \$24,$acc1 795 796 xor $acc0,$t2 797 xor $acc1,$t3 798 799 mov 16+4($key),$s1 800 mov 16+8($key),$s2 801 lea -2048($sbox),$sbox 802 xor $t0,$s0 803 xor $t1,$s1 804 xor $t2,$s2 805 xor $t3,$s3 806___ 807} 808 809sub decstep() 810{ my ($i,@s) = @_; 811 my $tmp0=$acc0; 812 my $tmp1=$acc1; 813 my $tmp2=$acc2; 814 my $out=($t0,$t1,$t2,$s[0])[$i]; 815 816 $code.=" mov $s[0],$out\n" if ($i!=3); 817 $tmp1=$s[2] if ($i==3); 818 $code.=" mov $s[2],$tmp1\n" if ($i!=3); 819 $code.=" and \$0xFF,$out\n"; 820 821 $code.=" mov 0($sbox,$out,8),$out\n"; 822 $code.=" shr \$16,$tmp1\n"; 823 $tmp2=$s[3] if ($i==3); 824 $code.=" mov $s[3],$tmp2\n" if ($i!=3); 825 826 $tmp0=$s[1] if ($i==3); 827 $code.=" movzb ".&hi($s[1]).",$tmp0\n"; 828 $code.=" and \$0xFF,$tmp1\n"; 829 $code.=" shr \$24,$tmp2\n"; 830 831 $code.=" xor 3($sbox,$tmp0,8),$out\n"; 832 $code.=" xor 2($sbox,$tmp1,8),$out\n"; 833 $code.=" xor 1($sbox,$tmp2,8),$out\n"; 834 835 $code.=" mov $t2,$s[1]\n" if ($i==3); 836 $code.=" mov $t1,$s[2]\n" if ($i==3); 837 $code.=" mov $t0,$s[3]\n" if ($i==3); 838 $code.="\n"; 839} 840 841sub declast() 842{ my ($i,@s)=@_; 843 my $tmp0=$acc0; 844 my $tmp1=$acc1; 845 my $tmp2=$acc2; 846 my $out=($t0,$t1,$t2,$s[0])[$i]; 847 848 $code.=" mov $s[0],$out\n" if ($i!=3); 849 $tmp1=$s[2] if ($i==3); 850 $code.=" mov $s[2],$tmp1\n" if ($i!=3); 851 $code.=" and \$0xFF,$out\n"; 852 853 $code.=" movzb 2048($sbox,$out,1),$out\n"; 854 $code.=" shr \$16,$tmp1\n"; 855 $tmp2=$s[3] if ($i==3); 856 $code.=" mov $s[3],$tmp2\n" if ($i!=3); 857 858 $tmp0=$s[1] if ($i==3); 859 $code.=" movzb ".&hi($s[1]).",$tmp0\n"; 860 $code.=" and \$0xFF,$tmp1\n"; 861 $code.=" shr \$24,$tmp2\n"; 862 863 $code.=" movzb 2048($sbox,$tmp0,1),$tmp0\n"; 864 $code.=" movzb 2048($sbox,$tmp1,1),$tmp1\n"; 865 $code.=" movzb 2048($sbox,$tmp2,1),$tmp2\n"; 866 867 $code.=" shl \$8,$tmp0\n"; 868 $code.=" shl \$16,$tmp1\n"; 869 $code.=" shl \$24,$tmp2\n"; 870 871 $code.=" xor $tmp0,$out\n"; 872 $code.=" mov $t2,$s[1]\n" if ($i==3); 873 $code.=" xor $tmp1,$out\n"; 874 $code.=" mov $t1,$s[2]\n" if ($i==3); 875 $code.=" xor $tmp2,$out\n"; 876 $code.=" mov $t0,$s[3]\n" if ($i==3); 877 $code.="\n"; 878} 879 880$code.=<<___; 881.type _x86_64_AES_decrypt,\@abi-omnipotent 882.align 16 883_x86_64_AES_decrypt: 884 xor 0($key),$s0 # xor with key 885 xor 4($key),$s1 886 xor 8($key),$s2 887 xor 12($key),$s3 888 889 mov 240($key),$rnds # load key->rounds 890 sub \$1,$rnds 891 jmp .Ldec_loop 892.align 16 893.Ldec_loop: 894___ 895 if ($verticalspin) { &decvert(); } 896 else { &decstep(0,$s0,$s3,$s2,$s1); 897 &decstep(1,$s1,$s0,$s3,$s2); 898 &decstep(2,$s2,$s1,$s0,$s3); 899 &decstep(3,$s3,$s2,$s1,$s0); 900 $code.=<<___; 901 lea 16($key),$key 902 xor 0($key),$s0 # xor with key 903 xor 4($key),$s1 904 xor 8($key),$s2 905 xor 12($key),$s3 906___ 907 } 908$code.=<<___; 909 sub \$1,$rnds 910 jnz .Ldec_loop 911___ 912 if ($verticalspin) { &declastvert(); } 913 else { &declast(0,$s0,$s3,$s2,$s1); 914 &declast(1,$s1,$s0,$s3,$s2); 915 &declast(2,$s2,$s1,$s0,$s3); 916 &declast(3,$s3,$s2,$s1,$s0); 917 $code.=<<___; 918 xor 16+0($key),$s0 # xor with key 919 xor 16+4($key),$s1 920 xor 16+8($key),$s2 921 xor 16+12($key),$s3 922___ 923 } 924$code.=<<___; 925 .byte 0xf3,0xc3 # rep ret 926.size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt 927___ 928 929sub deccompactvert() 930{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d"); 931 932$code.=<<___; 933 movzb `&lo("$s0")`,$t0 934 movzb `&lo("$s1")`,$t1 935 movzb `&lo("$s2")`,$t2 936 movzb ($sbox,$t0,1),$t0 937 movzb ($sbox,$t1,1),$t1 938 movzb ($sbox,$t2,1),$t2 939 940 movzb `&lo("$s3")`,$t3 941 movzb `&hi("$s3")`,$acc0 942 movzb `&hi("$s0")`,$acc1 943 movzb ($sbox,$t3,1),$t3 944 movzb ($sbox,$acc0,1),$t4 #$t0 945 movzb ($sbox,$acc1,1),$t5 #$t1 946 947 movzb `&hi("$s1")`,$acc2 948 movzb `&hi("$s2")`,$acc0 949 shr \$16,$s2 950 movzb ($sbox,$acc2,1),$acc2 #$t2 951 movzb ($sbox,$acc0,1),$acc0 #$t3 952 shr \$16,$s3 953 954 movzb `&lo("$s2")`,$acc1 955 shl \$8,$t4 956 shl \$8,$t5 957 movzb ($sbox,$acc1,1),$acc1 #$t0 958 xor $t4,$t0 959 xor $t5,$t1 960 961 movzb `&lo("$s3")`,$t4 962 shr \$16,$s0 963 shr \$16,$s1 964 movzb `&lo("$s0")`,$t5 965 shl \$8,$acc2 966 shl \$8,$acc0 967 movzb ($sbox,$t4,1),$t4 #$t1 968 movzb ($sbox,$t5,1),$t5 #$t2 969 xor $acc2,$t2 970 xor $acc0,$t3 971 972 movzb `&lo("$s1")`,$acc2 973 movzb `&hi("$s1")`,$acc0 974 shl \$16,$acc1 975 movzb ($sbox,$acc2,1),$acc2 #$t3 976 movzb ($sbox,$acc0,1),$acc0 #$t0 977 xor $acc1,$t0 978 979 movzb `&hi("$s2")`,$acc1 980 shl \$16,$t4 981 shl \$16,$t5 982 movzb ($sbox,$acc1,1),$s1 #$t1 983 xor $t4,$t1 984 xor $t5,$t2 985 986 movzb `&hi("$s3")`,$acc1 987 shr \$8,$s0 988 shl \$16,$acc2 989 movzb ($sbox,$acc1,1),$s2 #$t2 990 movzb ($sbox,$s0,1),$s3 #$t3 991 xor $acc2,$t3 992 993 shl \$24,$acc0 994 shl \$24,$s1 995 shl \$24,$s2 996 xor $acc0,$t0 997 shl \$24,$s3 998 xor $t1,$s1 999 mov $t0,$s0 1000 xor $t2,$s2 1001 xor $t3,$s3 1002___ 1003} 1004 1005# parallelized version! input is pair of 64-bit values: %rax=s1.s0 1006# and %rcx=s3.s2, output is four 32-bit values in %eax=s0, %ebx=s1, 1007# %ecx=s2 and %edx=s3. 1008sub dectransform() 1009{ my ($tp10,$tp20,$tp40,$tp80,$acc0)=("%rax","%r8", "%r9", "%r10","%rbx"); 1010 my ($tp18,$tp28,$tp48,$tp88,$acc8)=("%rcx","%r11","%r12","%r13","%rdx"); 1011 my $prefetch = shift; 1012 1013$code.=<<___; 1014 mov $tp10,$acc0 1015 mov $tp18,$acc8 1016 and $mask80,$acc0 1017 and $mask80,$acc8 1018 mov $acc0,$tp40 1019 mov $acc8,$tp48 1020 shr \$7,$tp40 1021 lea ($tp10,$tp10),$tp20 1022 shr \$7,$tp48 1023 lea ($tp18,$tp18),$tp28 1024 sub $tp40,$acc0 1025 sub $tp48,$acc8 1026 and $maskfe,$tp20 1027 and $maskfe,$tp28 1028 and $mask1b,$acc0 1029 and $mask1b,$acc8 1030 xor $tp20,$acc0 1031 xor $tp28,$acc8 1032 mov $acc0,$tp20 1033 mov $acc8,$tp28 1034 1035 and $mask80,$acc0 1036 and $mask80,$acc8 1037 mov $acc0,$tp80 1038 mov $acc8,$tp88 1039 shr \$7,$tp80 1040 lea ($tp20,$tp20),$tp40 1041 shr \$7,$tp88 1042 lea ($tp28,$tp28),$tp48 1043 sub $tp80,$acc0 1044 sub $tp88,$acc8 1045 and $maskfe,$tp40 1046 and $maskfe,$tp48 1047 and $mask1b,$acc0 1048 and $mask1b,$acc8 1049 xor $tp40,$acc0 1050 xor $tp48,$acc8 1051 mov $acc0,$tp40 1052 mov $acc8,$tp48 1053 1054 and $mask80,$acc0 1055 and $mask80,$acc8 1056 mov $acc0,$tp80 1057 mov $acc8,$tp88 1058 shr \$7,$tp80 1059 xor $tp10,$tp20 # tp2^=tp1 1060 shr \$7,$tp88 1061 xor $tp18,$tp28 # tp2^=tp1 1062 sub $tp80,$acc0 1063 sub $tp88,$acc8 1064 lea ($tp40,$tp40),$tp80 1065 lea ($tp48,$tp48),$tp88 1066 xor $tp10,$tp40 # tp4^=tp1 1067 xor $tp18,$tp48 # tp4^=tp1 1068 and $maskfe,$tp80 1069 and $maskfe,$tp88 1070 and $mask1b,$acc0 1071 and $mask1b,$acc8 1072 xor $acc0,$tp80 1073 xor $acc8,$tp88 1074 1075 xor $tp80,$tp10 # tp1^=tp8 1076 xor $tp88,$tp18 # tp1^=tp8 1077 xor $tp80,$tp20 # tp2^tp1^=tp8 1078 xor $tp88,$tp28 # tp2^tp1^=tp8 1079 mov $tp10,$acc0 1080 mov $tp18,$acc8 1081 xor $tp80,$tp40 # tp4^tp1^=tp8 1082 xor $tp88,$tp48 # tp4^tp1^=tp8 1083 shr \$32,$acc0 1084 shr \$32,$acc8 1085 xor $tp20,$tp80 # tp8^=tp8^tp2^tp1=tp2^tp1 1086 xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1 1087 rol \$8,`&LO("$tp10")` # ROTATE(tp1^tp8,8) 1088 rol \$8,`&LO("$tp18")` # ROTATE(tp1^tp8,8) 1089 xor $tp40,$tp80 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2 1090 xor $tp48,$tp88 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2 1091 1092 rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8) 1093 rol \$8,`&LO("$acc8")` # ROTATE(tp1^tp8,8) 1094 xor `&LO("$tp80")`,`&LO("$tp10")` 1095 xor `&LO("$tp88")`,`&LO("$tp18")` 1096 shr \$32,$tp80 1097 shr \$32,$tp88 1098 xor `&LO("$tp80")`,`&LO("$acc0")` 1099 xor `&LO("$tp88")`,`&LO("$acc8")` 1100 1101 mov $tp20,$tp80 1102 mov $tp28,$tp88 1103 shr \$32,$tp80 1104 shr \$32,$tp88 1105 rol \$24,`&LO("$tp20")` # ROTATE(tp2^tp1^tp8,24) 1106 rol \$24,`&LO("$tp28")` # ROTATE(tp2^tp1^tp8,24) 1107 rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24) 1108 rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24) 1109 xor `&LO("$tp20")`,`&LO("$tp10")` 1110 xor `&LO("$tp28")`,`&LO("$tp18")` 1111 mov $tp40,$tp20 1112 mov $tp48,$tp28 1113 xor `&LO("$tp80")`,`&LO("$acc0")` 1114 xor `&LO("$tp88")`,`&LO("$acc8")` 1115 1116 `"mov 0($sbox),$mask80" if ($prefetch)` 1117 shr \$32,$tp20 1118 shr \$32,$tp28 1119 `"mov 64($sbox),$maskfe" if ($prefetch)` 1120 rol \$16,`&LO("$tp40")` # ROTATE(tp4^tp1^tp8,16) 1121 rol \$16,`&LO("$tp48")` # ROTATE(tp4^tp1^tp8,16) 1122 `"mov 128($sbox),$mask1b" if ($prefetch)` 1123 rol \$16,`&LO("$tp20")` # ROTATE(tp4^tp1^tp8,16) 1124 rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16) 1125 `"mov 192($sbox),$tp80" if ($prefetch)` 1126 xor `&LO("$tp40")`,`&LO("$tp10")` 1127 xor `&LO("$tp48")`,`&LO("$tp18")` 1128 `"mov 256($sbox),$tp88" if ($prefetch)` 1129 xor `&LO("$tp20")`,`&LO("$acc0")` 1130 xor `&LO("$tp28")`,`&LO("$acc8")` 1131___ 1132} 1133 1134$code.=<<___; 1135.type _x86_64_AES_decrypt_compact,\@abi-omnipotent 1136.align 16 1137_x86_64_AES_decrypt_compact: 1138 lea 128($sbox),$inp # size optimization 1139 mov 0-128($inp),$acc1 # prefetch Td4 1140 mov 32-128($inp),$acc2 1141 mov 64-128($inp),$t0 1142 mov 96-128($inp),$t1 1143 mov 128-128($inp),$acc1 1144 mov 160-128($inp),$acc2 1145 mov 192-128($inp),$t0 1146 mov 224-128($inp),$t1 1147 jmp .Ldec_loop_compact 1148 1149.align 16 1150.Ldec_loop_compact: 1151 xor 0($key),$s0 # xor with key 1152 xor 4($key),$s1 1153 xor 8($key),$s2 1154 xor 12($key),$s3 1155 lea 16($key),$key 1156___ 1157 &deccompactvert(); 1158$code.=<<___; 1159 cmp 16(%rsp),$key 1160 je .Ldec_compact_done 1161 1162 mov 256+0($sbox),$mask80 1163 shl \$32,%rbx 1164 shl \$32,%rdx 1165 mov 256+8($sbox),$maskfe 1166 or %rbx,%rax 1167 or %rdx,%rcx 1168 mov 256+16($sbox),$mask1b 1169___ 1170 &dectransform(1); 1171$code.=<<___; 1172 jmp .Ldec_loop_compact 1173.align 16 1174.Ldec_compact_done: 1175 xor 0($key),$s0 1176 xor 4($key),$s1 1177 xor 8($key),$s2 1178 xor 12($key),$s3 1179 .byte 0xf3,0xc3 # rep ret 1180.size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact 1181___ 1182 1183# void AES_decrypt (const void *inp,void *out,const AES_KEY *key); 1184$code.=<<___; 1185.globl AES_decrypt 1186.type AES_decrypt,\@function,3 1187.align 16 1188AES_decrypt: 1189 push %rbx 1190 push %rbp 1191 push %r12 1192 push %r13 1193 push %r14 1194 push %r15 1195 1196 # allocate frame "above" key schedule 1197 mov %rsp,%r10 1198 lea -63(%rdx),%rcx # %rdx is key argument 1199 and \$-64,%rsp 1200 sub %rsp,%rcx 1201 neg %rcx 1202 and \$0x3c0,%rcx 1203 sub %rcx,%rsp 1204 sub \$32,%rsp 1205 1206 mov %rsi,16(%rsp) # save out 1207 mov %r10,24(%rsp) # save real stack pointer 1208.Ldec_prologue: 1209 1210 mov %rdx,$key 1211 mov 240($key),$rnds # load rounds 1212 1213 mov 0(%rdi),$s0 # load input vector 1214 mov 4(%rdi),$s1 1215 mov 8(%rdi),$s2 1216 mov 12(%rdi),$s3 1217 1218 shl \$4,$rnds 1219 lea ($key,$rnds),%rbp 1220 mov $key,(%rsp) # key schedule 1221 mov %rbp,8(%rsp) # end of key schedule 1222 1223 # pick Td4 copy which can't "overlap" with stack frame or key schedule 1224 lea .LAES_Td+2048(%rip),$sbox 1225 lea 768(%rsp),%rbp 1226 sub $sbox,%rbp 1227 and \$0x300,%rbp 1228 lea ($sbox,%rbp),$sbox 1229 shr \$3,%rbp # recall "magic" constants! 1230 add %rbp,$sbox 1231 1232 call _x86_64_AES_decrypt_compact 1233 1234 mov 16(%rsp),$out # restore out 1235 mov 24(%rsp),%rsi # restore saved stack pointer 1236 mov $s0,0($out) # write output vector 1237 mov $s1,4($out) 1238 mov $s2,8($out) 1239 mov $s3,12($out) 1240 1241 mov (%rsi),%r15 1242 mov 8(%rsi),%r14 1243 mov 16(%rsi),%r13 1244 mov 24(%rsi),%r12 1245 mov 32(%rsi),%rbp 1246 mov 40(%rsi),%rbx 1247 lea 48(%rsi),%rsp 1248.Ldec_epilogue: 1249 ret 1250.size AES_decrypt,.-AES_decrypt 1251___ 1252#------------------------------------------------------------------# 1253 1254sub enckey() 1255{ 1256$code.=<<___; 1257 movz %dl,%esi # rk[i]>>0 1258 movzb -128(%rbp,%rsi),%ebx 1259 movz %dh,%esi # rk[i]>>8 1260 shl \$24,%ebx 1261 xor %ebx,%eax 1262 1263 movzb -128(%rbp,%rsi),%ebx 1264 shr \$16,%edx 1265 movz %dl,%esi # rk[i]>>16 1266 xor %ebx,%eax 1267 1268 movzb -128(%rbp,%rsi),%ebx 1269 movz %dh,%esi # rk[i]>>24 1270 shl \$8,%ebx 1271 xor %ebx,%eax 1272 1273 movzb -128(%rbp,%rsi),%ebx 1274 shl \$16,%ebx 1275 xor %ebx,%eax 1276 1277 xor 1024-128(%rbp,%rcx,4),%eax # rcon 1278___ 1279} 1280 1281# int AES_set_encrypt_key(const unsigned char *userKey, const int bits, 1282# AES_KEY *key) 1283$code.=<<___; 1284.globl AES_set_encrypt_key 1285.type AES_set_encrypt_key,\@function,3 1286.align 16 1287AES_set_encrypt_key: 1288 push %rbx 1289 push %rbp 1290 push %r12 # redundant, but allows to share 1291 push %r13 # exception handler... 1292 push %r14 1293 push %r15 1294 sub \$8,%rsp 1295.Lenc_key_prologue: 1296 1297 call _x86_64_AES_set_encrypt_key 1298 1299 mov 8(%rsp),%r15 1300 mov 16(%rsp),%r14 1301 mov 24(%rsp),%r13 1302 mov 32(%rsp),%r12 1303 mov 40(%rsp),%rbp 1304 mov 48(%rsp),%rbx 1305 add \$56,%rsp 1306.Lenc_key_epilogue: 1307 ret 1308.size AES_set_encrypt_key,.-AES_set_encrypt_key 1309 1310.type _x86_64_AES_set_encrypt_key,\@abi-omnipotent 1311.align 16 1312_x86_64_AES_set_encrypt_key: 1313 mov %esi,%ecx # %ecx=bits 1314 mov %rdi,%rsi # %rsi=userKey 1315 mov %rdx,%rdi # %rdi=key 1316 1317 test \$-1,%rsi 1318 jz .Lbadpointer 1319 test \$-1,%rdi 1320 jz .Lbadpointer 1321 1322 lea .LAES_Te(%rip),%rbp 1323 lea 2048+128(%rbp),%rbp 1324 1325 # prefetch Te4 1326 mov 0-128(%rbp),%eax 1327 mov 32-128(%rbp),%ebx 1328 mov 64-128(%rbp),%r8d 1329 mov 96-128(%rbp),%edx 1330 mov 128-128(%rbp),%eax 1331 mov 160-128(%rbp),%ebx 1332 mov 192-128(%rbp),%r8d 1333 mov 224-128(%rbp),%edx 1334 1335 cmp \$128,%ecx 1336 je .L10rounds 1337 cmp \$192,%ecx 1338 je .L12rounds 1339 cmp \$256,%ecx 1340 je .L14rounds 1341 mov \$-2,%rax # invalid number of bits 1342 jmp .Lexit 1343 1344.L10rounds: 1345 mov 0(%rsi),%rax # copy first 4 dwords 1346 mov 8(%rsi),%rdx 1347 mov %rax,0(%rdi) 1348 mov %rdx,8(%rdi) 1349 1350 shr \$32,%rdx 1351 xor %ecx,%ecx 1352 jmp .L10shortcut 1353.align 4 1354.L10loop: 1355 mov 0(%rdi),%eax # rk[0] 1356 mov 12(%rdi),%edx # rk[3] 1357.L10shortcut: 1358___ 1359 &enckey (); 1360$code.=<<___; 1361 mov %eax,16(%rdi) # rk[4] 1362 xor 4(%rdi),%eax 1363 mov %eax,20(%rdi) # rk[5] 1364 xor 8(%rdi),%eax 1365 mov %eax,24(%rdi) # rk[6] 1366 xor 12(%rdi),%eax 1367 mov %eax,28(%rdi) # rk[7] 1368 add \$1,%ecx 1369 lea 16(%rdi),%rdi 1370 cmp \$10,%ecx 1371 jl .L10loop 1372 1373 movl \$10,80(%rdi) # setup number of rounds 1374 xor %rax,%rax 1375 jmp .Lexit 1376 1377.L12rounds: 1378 mov 0(%rsi),%rax # copy first 6 dwords 1379 mov 8(%rsi),%rbx 1380 mov 16(%rsi),%rdx 1381 mov %rax,0(%rdi) 1382 mov %rbx,8(%rdi) 1383 mov %rdx,16(%rdi) 1384 1385 shr \$32,%rdx 1386 xor %ecx,%ecx 1387 jmp .L12shortcut 1388.align 4 1389.L12loop: 1390 mov 0(%rdi),%eax # rk[0] 1391 mov 20(%rdi),%edx # rk[5] 1392.L12shortcut: 1393___ 1394 &enckey (); 1395$code.=<<___; 1396 mov %eax,24(%rdi) # rk[6] 1397 xor 4(%rdi),%eax 1398 mov %eax,28(%rdi) # rk[7] 1399 xor 8(%rdi),%eax 1400 mov %eax,32(%rdi) # rk[8] 1401 xor 12(%rdi),%eax 1402 mov %eax,36(%rdi) # rk[9] 1403 1404 cmp \$7,%ecx 1405 je .L12break 1406 add \$1,%ecx 1407 1408 xor 16(%rdi),%eax 1409 mov %eax,40(%rdi) # rk[10] 1410 xor 20(%rdi),%eax 1411 mov %eax,44(%rdi) # rk[11] 1412 1413 lea 24(%rdi),%rdi 1414 jmp .L12loop 1415.L12break: 1416 movl \$12,72(%rdi) # setup number of rounds 1417 xor %rax,%rax 1418 jmp .Lexit 1419 1420.L14rounds: 1421 mov 0(%rsi),%rax # copy first 8 dwords 1422 mov 8(%rsi),%rbx 1423 mov 16(%rsi),%rcx 1424 mov 24(%rsi),%rdx 1425 mov %rax,0(%rdi) 1426 mov %rbx,8(%rdi) 1427 mov %rcx,16(%rdi) 1428 mov %rdx,24(%rdi) 1429 1430 shr \$32,%rdx 1431 xor %ecx,%ecx 1432 jmp .L14shortcut 1433.align 4 1434.L14loop: 1435 mov 0(%rdi),%eax # rk[0] 1436 mov 28(%rdi),%edx # rk[4] 1437.L14shortcut: 1438___ 1439 &enckey (); 1440$code.=<<___; 1441 mov %eax,32(%rdi) # rk[8] 1442 xor 4(%rdi),%eax 1443 mov %eax,36(%rdi) # rk[9] 1444 xor 8(%rdi),%eax 1445 mov %eax,40(%rdi) # rk[10] 1446 xor 12(%rdi),%eax 1447 mov %eax,44(%rdi) # rk[11] 1448 1449 cmp \$6,%ecx 1450 je .L14break 1451 add \$1,%ecx 1452 1453 mov %eax,%edx 1454 mov 16(%rdi),%eax # rk[4] 1455 movz %dl,%esi # rk[11]>>0 1456 movzb -128(%rbp,%rsi),%ebx 1457 movz %dh,%esi # rk[11]>>8 1458 xor %ebx,%eax 1459 1460 movzb -128(%rbp,%rsi),%ebx 1461 shr \$16,%edx 1462 shl \$8,%ebx 1463 movz %dl,%esi # rk[11]>>16 1464 xor %ebx,%eax 1465 1466 movzb -128(%rbp,%rsi),%ebx 1467 movz %dh,%esi # rk[11]>>24 1468 shl \$16,%ebx 1469 xor %ebx,%eax 1470 1471 movzb -128(%rbp,%rsi),%ebx 1472 shl \$24,%ebx 1473 xor %ebx,%eax 1474 1475 mov %eax,48(%rdi) # rk[12] 1476 xor 20(%rdi),%eax 1477 mov %eax,52(%rdi) # rk[13] 1478 xor 24(%rdi),%eax 1479 mov %eax,56(%rdi) # rk[14] 1480 xor 28(%rdi),%eax 1481 mov %eax,60(%rdi) # rk[15] 1482 1483 lea 32(%rdi),%rdi 1484 jmp .L14loop 1485.L14break: 1486 movl \$14,48(%rdi) # setup number of rounds 1487 xor %rax,%rax 1488 jmp .Lexit 1489 1490.Lbadpointer: 1491 mov \$-1,%rax 1492.Lexit: 1493 .byte 0xf3,0xc3 # rep ret 1494.size _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key 1495___ 1496 1497sub deckey_ref() 1498{ my ($i,$ptr,$te,$td) = @_; 1499 my ($tp1,$tp2,$tp4,$tp8,$acc)=("%eax","%ebx","%edi","%edx","%r8d"); 1500$code.=<<___; 1501 mov $i($ptr),$tp1 1502 mov $tp1,$acc 1503 and \$0x80808080,$acc 1504 mov $acc,$tp4 1505 shr \$7,$tp4 1506 lea 0($tp1,$tp1),$tp2 1507 sub $tp4,$acc 1508 and \$0xfefefefe,$tp2 1509 and \$0x1b1b1b1b,$acc 1510 xor $tp2,$acc 1511 mov $acc,$tp2 1512 1513 and \$0x80808080,$acc 1514 mov $acc,$tp8 1515 shr \$7,$tp8 1516 lea 0($tp2,$tp2),$tp4 1517 sub $tp8,$acc 1518 and \$0xfefefefe,$tp4 1519 and \$0x1b1b1b1b,$acc 1520 xor $tp1,$tp2 # tp2^tp1 1521 xor $tp4,$acc 1522 mov $acc,$tp4 1523 1524 and \$0x80808080,$acc 1525 mov $acc,$tp8 1526 shr \$7,$tp8 1527 sub $tp8,$acc 1528 lea 0($tp4,$tp4),$tp8 1529 xor $tp1,$tp4 # tp4^tp1 1530 and \$0xfefefefe,$tp8 1531 and \$0x1b1b1b1b,$acc 1532 xor $acc,$tp8 1533 1534 xor $tp8,$tp1 # tp1^tp8 1535 rol \$8,$tp1 # ROTATE(tp1^tp8,8) 1536 xor $tp8,$tp2 # tp2^tp1^tp8 1537 xor $tp8,$tp4 # tp4^tp1^tp8 1538 xor $tp2,$tp8 1539 xor $tp4,$tp8 # tp8^(tp8^tp4^tp1)^(tp8^tp2^tp1)=tp8^tp4^tp2 1540 1541 xor $tp8,$tp1 1542 rol \$24,$tp2 # ROTATE(tp2^tp1^tp8,24) 1543 xor $tp2,$tp1 1544 rol \$16,$tp4 # ROTATE(tp4^tp1^tp8,16) 1545 xor $tp4,$tp1 1546 1547 mov $tp1,$i($ptr) 1548___ 1549} 1550 1551# int AES_set_decrypt_key(const unsigned char *userKey, const int bits, 1552# AES_KEY *key) 1553$code.=<<___; 1554.globl AES_set_decrypt_key 1555.type AES_set_decrypt_key,\@function,3 1556.align 16 1557AES_set_decrypt_key: 1558 push %rbx 1559 push %rbp 1560 push %r12 1561 push %r13 1562 push %r14 1563 push %r15 1564 push %rdx # save key schedule 1565.Ldec_key_prologue: 1566 1567 call _x86_64_AES_set_encrypt_key 1568 mov (%rsp),%r8 # restore key schedule 1569 cmp \$0,%eax 1570 jne .Labort 1571 1572 mov 240(%r8),%r14d # pull number of rounds 1573 xor %rdi,%rdi 1574 lea (%rdi,%r14d,4),%rcx 1575 mov %r8,%rsi 1576 lea (%r8,%rcx,4),%rdi # pointer to last chunk 1577.align 4 1578.Linvert: 1579 mov 0(%rsi),%rax 1580 mov 8(%rsi),%rbx 1581 mov 0(%rdi),%rcx 1582 mov 8(%rdi),%rdx 1583 mov %rax,0(%rdi) 1584 mov %rbx,8(%rdi) 1585 mov %rcx,0(%rsi) 1586 mov %rdx,8(%rsi) 1587 lea 16(%rsi),%rsi 1588 lea -16(%rdi),%rdi 1589 cmp %rsi,%rdi 1590 jne .Linvert 1591 1592 lea .LAES_Te+2048+1024(%rip),%rax # rcon 1593 1594 mov 40(%rax),$mask80 1595 mov 48(%rax),$maskfe 1596 mov 56(%rax),$mask1b 1597 1598 mov %r8,$key 1599 sub \$1,%r14d 1600.align 4 1601.Lpermute: 1602 lea 16($key),$key 1603 mov 0($key),%rax 1604 mov 8($key),%rcx 1605___ 1606 &dectransform (); 1607$code.=<<___; 1608 mov %eax,0($key) 1609 mov %ebx,4($key) 1610 mov %ecx,8($key) 1611 mov %edx,12($key) 1612 sub \$1,%r14d 1613 jnz .Lpermute 1614 1615 xor %rax,%rax 1616.Labort: 1617 mov 8(%rsp),%r15 1618 mov 16(%rsp),%r14 1619 mov 24(%rsp),%r13 1620 mov 32(%rsp),%r12 1621 mov 40(%rsp),%rbp 1622 mov 48(%rsp),%rbx 1623 add \$56,%rsp 1624.Ldec_key_epilogue: 1625 ret 1626.size AES_set_decrypt_key,.-AES_set_decrypt_key 1627___ 1628 1629# void AES_cbc_encrypt (const void char *inp, unsigned char *out, 1630# size_t length, const AES_KEY *key, 1631# unsigned char *ivp,const int enc); 1632{ 1633# stack frame layout 1634# -8(%rsp) return address 1635my $keyp="0(%rsp)"; # one to pass as $key 1636my $keyend="8(%rsp)"; # &(keyp->rd_key[4*keyp->rounds]) 1637my $_rsp="16(%rsp)"; # saved %rsp 1638my $_inp="24(%rsp)"; # copy of 1st parameter, inp 1639my $_out="32(%rsp)"; # copy of 2nd parameter, out 1640my $_len="40(%rsp)"; # copy of 3rd parameter, length 1641my $_key="48(%rsp)"; # copy of 4th parameter, key 1642my $_ivp="56(%rsp)"; # copy of 5th parameter, ivp 1643my $ivec="64(%rsp)"; # ivec[16] 1644my $aes_key="80(%rsp)"; # copy of aes_key 1645my $mark="80+240(%rsp)"; # copy of aes_key->rounds 1646 1647$code.=<<___; 1648.globl AES_cbc_encrypt 1649.type AES_cbc_encrypt,\@function,6 1650.align 16 1651.extern OPENSSL_ia32cap_P 1652AES_cbc_encrypt: 1653 cmp \$0,%rdx # check length 1654 je .Lcbc_epilogue 1655 pushfq 1656 push %rbx 1657 push %rbp 1658 push %r12 1659 push %r13 1660 push %r14 1661 push %r15 1662.Lcbc_prologue: 1663 1664 cld 1665 mov %r9d,%r9d # clear upper half of enc 1666 1667 lea .LAES_Te(%rip),$sbox 1668 cmp \$0,%r9 1669 jne .Lcbc_picked_te 1670 lea .LAES_Td(%rip),$sbox 1671.Lcbc_picked_te: 1672 1673 mov OPENSSL_ia32cap_P(%rip),%r10d 1674 cmp \$$speed_limit,%rdx 1675 jb .Lcbc_slow_prologue 1676 test \$15,%rdx 1677 jnz .Lcbc_slow_prologue 1678 bt \$28,%r10d 1679 jc .Lcbc_slow_prologue 1680 1681 # allocate aligned stack frame... 1682 lea -88-248(%rsp),$key 1683 and \$-64,$key 1684 1685 # ... and make sure it doesn't alias with AES_T[ed] modulo 4096 1686 mov $sbox,%r10 1687 lea 2304($sbox),%r11 1688 mov $key,%r12 1689 and \$0xFFF,%r10 # s = $sbox&0xfff 1690 and \$0xFFF,%r11 # e = ($sbox+2048)&0xfff 1691 and \$0xFFF,%r12 # p = %rsp&0xfff 1692 1693 cmp %r11,%r12 # if (p=>e) %rsp =- (p-e); 1694 jb .Lcbc_te_break_out 1695 sub %r11,%r12 1696 sub %r12,$key 1697 jmp .Lcbc_te_ok 1698.Lcbc_te_break_out: # else %rsp -= (p-s)&0xfff + framesz 1699 sub %r10,%r12 1700 and \$0xFFF,%r12 1701 add \$320,%r12 1702 sub %r12,$key 1703.align 4 1704.Lcbc_te_ok: 1705 1706 xchg %rsp,$key 1707 #add \$8,%rsp # reserve for return address! 1708 mov $key,$_rsp # save %rsp 1709.Lcbc_fast_body: 1710 mov %rdi,$_inp # save copy of inp 1711 mov %rsi,$_out # save copy of out 1712 mov %rdx,$_len # save copy of len 1713 mov %rcx,$_key # save copy of key 1714 mov %r8,$_ivp # save copy of ivp 1715 movl \$0,$mark # copy of aes_key->rounds = 0; 1716 mov %r8,%rbp # rearrange input arguments 1717 mov %r9,%rbx 1718 mov %rsi,$out 1719 mov %rdi,$inp 1720 mov %rcx,$key 1721 1722 mov 240($key),%eax # key->rounds 1723 # do we copy key schedule to stack? 1724 mov $key,%r10 1725 sub $sbox,%r10 1726 and \$0xfff,%r10 1727 cmp \$2304,%r10 1728 jb .Lcbc_do_ecopy 1729 cmp \$4096-248,%r10 1730 jb .Lcbc_skip_ecopy 1731.align 4 1732.Lcbc_do_ecopy: 1733 mov $key,%rsi 1734 lea $aes_key,%rdi 1735 lea $aes_key,$key 1736 mov \$240/8,%ecx 1737 .long 0x90A548F3 # rep movsq 1738 mov %eax,(%rdi) # copy aes_key->rounds 1739.Lcbc_skip_ecopy: 1740 mov $key,$keyp # save key pointer 1741 1742 mov \$18,%ecx 1743.align 4 1744.Lcbc_prefetch_te: 1745 mov 0($sbox),%r10 1746 mov 32($sbox),%r11 1747 mov 64($sbox),%r12 1748 mov 96($sbox),%r13 1749 lea 128($sbox),$sbox 1750 sub \$1,%ecx 1751 jnz .Lcbc_prefetch_te 1752 lea -2304($sbox),$sbox 1753 1754 cmp \$0,%rbx 1755 je .LFAST_DECRYPT 1756 1757#----------------------------- ENCRYPT -----------------------------# 1758 mov 0(%rbp),$s0 # load iv 1759 mov 4(%rbp),$s1 1760 mov 8(%rbp),$s2 1761 mov 12(%rbp),$s3 1762 1763.align 4 1764.Lcbc_fast_enc_loop: 1765 xor 0($inp),$s0 1766 xor 4($inp),$s1 1767 xor 8($inp),$s2 1768 xor 12($inp),$s3 1769 mov $keyp,$key # restore key 1770 mov $inp,$_inp # if ($verticalspin) save inp 1771 1772 call _x86_64_AES_encrypt 1773 1774 mov $_inp,$inp # if ($verticalspin) restore inp 1775 mov $_len,%r10 1776 mov $s0,0($out) 1777 mov $s1,4($out) 1778 mov $s2,8($out) 1779 mov $s3,12($out) 1780 1781 lea 16($inp),$inp 1782 lea 16($out),$out 1783 sub \$16,%r10 1784 test \$-16,%r10 1785 mov %r10,$_len 1786 jnz .Lcbc_fast_enc_loop 1787 mov $_ivp,%rbp # restore ivp 1788 mov $s0,0(%rbp) # save ivec 1789 mov $s1,4(%rbp) 1790 mov $s2,8(%rbp) 1791 mov $s3,12(%rbp) 1792 1793 jmp .Lcbc_fast_cleanup 1794 1795#----------------------------- DECRYPT -----------------------------# 1796.align 16 1797.LFAST_DECRYPT: 1798 cmp $inp,$out 1799 je .Lcbc_fast_dec_in_place 1800 1801 mov %rbp,$ivec 1802.align 4 1803.Lcbc_fast_dec_loop: 1804 mov 0($inp),$s0 # read input 1805 mov 4($inp),$s1 1806 mov 8($inp),$s2 1807 mov 12($inp),$s3 1808 mov $keyp,$key # restore key 1809 mov $inp,$_inp # if ($verticalspin) save inp 1810 1811 call _x86_64_AES_decrypt 1812 1813 mov $ivec,%rbp # load ivp 1814 mov $_inp,$inp # if ($verticalspin) restore inp 1815 mov $_len,%r10 # load len 1816 xor 0(%rbp),$s0 # xor iv 1817 xor 4(%rbp),$s1 1818 xor 8(%rbp),$s2 1819 xor 12(%rbp),$s3 1820 mov $inp,%rbp # current input, next iv 1821 1822 sub \$16,%r10 1823 mov %r10,$_len # update len 1824 mov %rbp,$ivec # update ivp 1825 1826 mov $s0,0($out) # write output 1827 mov $s1,4($out) 1828 mov $s2,8($out) 1829 mov $s3,12($out) 1830 1831 lea 16($inp),$inp 1832 lea 16($out),$out 1833 jnz .Lcbc_fast_dec_loop 1834 mov $_ivp,%r12 # load user ivp 1835 mov 0(%rbp),%r10 # load iv 1836 mov 8(%rbp),%r11 1837 mov %r10,0(%r12) # copy back to user 1838 mov %r11,8(%r12) 1839 jmp .Lcbc_fast_cleanup 1840 1841.align 16 1842.Lcbc_fast_dec_in_place: 1843 mov 0(%rbp),%r10 # copy iv to stack 1844 mov 8(%rbp),%r11 1845 mov %r10,0+$ivec 1846 mov %r11,8+$ivec 1847.align 4 1848.Lcbc_fast_dec_in_place_loop: 1849 mov 0($inp),$s0 # load input 1850 mov 4($inp),$s1 1851 mov 8($inp),$s2 1852 mov 12($inp),$s3 1853 mov $keyp,$key # restore key 1854 mov $inp,$_inp # if ($verticalspin) save inp 1855 1856 call _x86_64_AES_decrypt 1857 1858 mov $_inp,$inp # if ($verticalspin) restore inp 1859 mov $_len,%r10 1860 xor 0+$ivec,$s0 1861 xor 4+$ivec,$s1 1862 xor 8+$ivec,$s2 1863 xor 12+$ivec,$s3 1864 1865 mov 0($inp),%r11 # load input 1866 mov 8($inp),%r12 1867 sub \$16,%r10 1868 jz .Lcbc_fast_dec_in_place_done 1869 1870 mov %r11,0+$ivec # copy input to iv 1871 mov %r12,8+$ivec 1872 1873 mov $s0,0($out) # save output [zaps input] 1874 mov $s1,4($out) 1875 mov $s2,8($out) 1876 mov $s3,12($out) 1877 1878 lea 16($inp),$inp 1879 lea 16($out),$out 1880 mov %r10,$_len 1881 jmp .Lcbc_fast_dec_in_place_loop 1882.Lcbc_fast_dec_in_place_done: 1883 mov $_ivp,%rdi 1884 mov %r11,0(%rdi) # copy iv back to user 1885 mov %r12,8(%rdi) 1886 1887 mov $s0,0($out) # save output [zaps input] 1888 mov $s1,4($out) 1889 mov $s2,8($out) 1890 mov $s3,12($out) 1891 1892.align 4 1893.Lcbc_fast_cleanup: 1894 cmpl \$0,$mark # was the key schedule copied? 1895 lea $aes_key,%rdi 1896 je .Lcbc_exit 1897 mov \$240/8,%ecx 1898 xor %rax,%rax 1899 .long 0x90AB48F3 # rep stosq 1900 1901 jmp .Lcbc_exit 1902 1903#--------------------------- SLOW ROUTINE ---------------------------# 1904.align 16 1905.Lcbc_slow_prologue: 1906 # allocate aligned stack frame... 1907 lea -88(%rsp),%rbp 1908 and \$-64,%rbp 1909 # ... just "above" key schedule 1910 lea -88-63(%rcx),%r10 1911 sub %rbp,%r10 1912 neg %r10 1913 and \$0x3c0,%r10 1914 sub %r10,%rbp 1915 1916 xchg %rsp,%rbp 1917 #add \$8,%rsp # reserve for return address! 1918 mov %rbp,$_rsp # save %rsp 1919.Lcbc_slow_body: 1920 #mov %rdi,$_inp # save copy of inp 1921 #mov %rsi,$_out # save copy of out 1922 #mov %rdx,$_len # save copy of len 1923 #mov %rcx,$_key # save copy of key 1924 mov %r8,$_ivp # save copy of ivp 1925 mov %r8,%rbp # rearrange input arguments 1926 mov %r9,%rbx 1927 mov %rsi,$out 1928 mov %rdi,$inp 1929 mov %rcx,$key 1930 mov %rdx,%r10 1931 1932 mov 240($key),%eax 1933 mov $key,$keyp # save key pointer 1934 shl \$4,%eax 1935 lea ($key,%rax),%rax 1936 mov %rax,$keyend 1937 1938 # pick Te4 copy which can't "overlap" with stack frame or key scdedule 1939 lea 2048($sbox),$sbox 1940 lea 768-8(%rsp),%rax 1941 sub $sbox,%rax 1942 and \$0x300,%rax 1943 lea ($sbox,%rax),$sbox 1944 1945 cmp \$0,%rbx 1946 je .LSLOW_DECRYPT 1947 1948#--------------------------- SLOW ENCRYPT ---------------------------# 1949 test \$-16,%r10 # check upon length 1950 mov 0(%rbp),$s0 # load iv 1951 mov 4(%rbp),$s1 1952 mov 8(%rbp),$s2 1953 mov 12(%rbp),$s3 1954 jz .Lcbc_slow_enc_tail # short input... 1955 1956.align 4 1957.Lcbc_slow_enc_loop: 1958 xor 0($inp),$s0 1959 xor 4($inp),$s1 1960 xor 8($inp),$s2 1961 xor 12($inp),$s3 1962 mov $keyp,$key # restore key 1963 mov $inp,$_inp # save inp 1964 mov $out,$_out # save out 1965 mov %r10,$_len # save len 1966 1967 call _x86_64_AES_encrypt_compact 1968 1969 mov $_inp,$inp # restore inp 1970 mov $_out,$out # restore out 1971 mov $_len,%r10 # restore len 1972 mov $s0,0($out) 1973 mov $s1,4($out) 1974 mov $s2,8($out) 1975 mov $s3,12($out) 1976 1977 lea 16($inp),$inp 1978 lea 16($out),$out 1979 sub \$16,%r10 1980 test \$-16,%r10 1981 jnz .Lcbc_slow_enc_loop 1982 test \$15,%r10 1983 jnz .Lcbc_slow_enc_tail 1984 mov $_ivp,%rbp # restore ivp 1985 mov $s0,0(%rbp) # save ivec 1986 mov $s1,4(%rbp) 1987 mov $s2,8(%rbp) 1988 mov $s3,12(%rbp) 1989 1990 jmp .Lcbc_exit 1991 1992.align 4 1993.Lcbc_slow_enc_tail: 1994 mov %rax,%r11 1995 mov %rcx,%r12 1996 mov %r10,%rcx 1997 mov $inp,%rsi 1998 mov $out,%rdi 1999 .long 0x9066A4F3 # rep movsb 2000 mov \$16,%rcx # zero tail 2001 sub %r10,%rcx 2002 xor %rax,%rax 2003 .long 0x9066AAF3 # rep stosb 2004 mov $out,$inp # this is not a mistake! 2005 mov \$16,%r10 # len=16 2006 mov %r11,%rax 2007 mov %r12,%rcx 2008 jmp .Lcbc_slow_enc_loop # one more spin... 2009#--------------------------- SLOW DECRYPT ---------------------------# 2010.align 16 2011.LSLOW_DECRYPT: 2012 shr \$3,%rax 2013 add %rax,$sbox # recall "magic" constants! 2014 2015 mov 0(%rbp),%r11 # copy iv to stack 2016 mov 8(%rbp),%r12 2017 mov %r11,0+$ivec 2018 mov %r12,8+$ivec 2019 2020.align 4 2021.Lcbc_slow_dec_loop: 2022 mov 0($inp),$s0 # load input 2023 mov 4($inp),$s1 2024 mov 8($inp),$s2 2025 mov 12($inp),$s3 2026 mov $keyp,$key # restore key 2027 mov $inp,$_inp # save inp 2028 mov $out,$_out # save out 2029 mov %r10,$_len # save len 2030 2031 call _x86_64_AES_decrypt_compact 2032 2033 mov $_inp,$inp # restore inp 2034 mov $_out,$out # restore out 2035 mov $_len,%r10 2036 xor 0+$ivec,$s0 2037 xor 4+$ivec,$s1 2038 xor 8+$ivec,$s2 2039 xor 12+$ivec,$s3 2040 2041 mov 0($inp),%r11 # load input 2042 mov 8($inp),%r12 2043 sub \$16,%r10 2044 jc .Lcbc_slow_dec_partial 2045 jz .Lcbc_slow_dec_done 2046 2047 mov %r11,0+$ivec # copy input to iv 2048 mov %r12,8+$ivec 2049 2050 mov $s0,0($out) # save output [can zap input] 2051 mov $s1,4($out) 2052 mov $s2,8($out) 2053 mov $s3,12($out) 2054 2055 lea 16($inp),$inp 2056 lea 16($out),$out 2057 jmp .Lcbc_slow_dec_loop 2058.Lcbc_slow_dec_done: 2059 mov $_ivp,%rdi 2060 mov %r11,0(%rdi) # copy iv back to user 2061 mov %r12,8(%rdi) 2062 2063 mov $s0,0($out) # save output [can zap input] 2064 mov $s1,4($out) 2065 mov $s2,8($out) 2066 mov $s3,12($out) 2067 2068 jmp .Lcbc_exit 2069 2070.align 4 2071.Lcbc_slow_dec_partial: 2072 mov $_ivp,%rdi 2073 mov %r11,0(%rdi) # copy iv back to user 2074 mov %r12,8(%rdi) 2075 2076 mov $s0,0+$ivec # save output to stack 2077 mov $s1,4+$ivec 2078 mov $s2,8+$ivec 2079 mov $s3,12+$ivec 2080 2081 mov $out,%rdi 2082 lea $ivec,%rsi 2083 lea 16(%r10),%rcx 2084 .long 0x9066A4F3 # rep movsb 2085 jmp .Lcbc_exit 2086 2087.align 16 2088.Lcbc_exit: 2089 mov $_rsp,%rsi 2090 mov (%rsi),%r15 2091 mov 8(%rsi),%r14 2092 mov 16(%rsi),%r13 2093 mov 24(%rsi),%r12 2094 mov 32(%rsi),%rbp 2095 mov 40(%rsi),%rbx 2096 lea 48(%rsi),%rsp 2097.Lcbc_popfq: 2098 popfq 2099.Lcbc_epilogue: 2100 ret 2101.size AES_cbc_encrypt,.-AES_cbc_encrypt 2102___ 2103} 2104 2105$code.=<<___; 2106.align 64 2107.LAES_Te: 2108___ 2109 &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6); 2110 &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591); 2111 &_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56); 2112 &_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec); 2113 &_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa); 2114 &_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb); 2115 &_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45); 2116 &_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b); 2117 &_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c); 2118 &_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83); 2119 &_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9); 2120 &_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a); 2121 &_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d); 2122 &_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f); 2123 &_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df); 2124 &_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea); 2125 &_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34); 2126 &_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b); 2127 &_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d); 2128 &_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413); 2129 &_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1); 2130 &_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6); 2131 &_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972); 2132 &_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85); 2133 &_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed); 2134 &_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511); 2135 &_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe); 2136 &_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b); 2137 &_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05); 2138 &_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1); 2139 &_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142); 2140 &_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf); 2141 &_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3); 2142 &_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e); 2143 &_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a); 2144 &_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6); 2145 &_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3); 2146 &_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b); 2147 &_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428); 2148 &_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad); 2149 &_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14); 2150 &_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8); 2151 &_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4); 2152 &_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2); 2153 &_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda); 2154 &_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949); 2155 &_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf); 2156 &_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810); 2157 &_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c); 2158 &_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697); 2159 &_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e); 2160 &_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f); 2161 &_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc); 2162 &_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c); 2163 &_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969); 2164 &_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27); 2165 &_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122); 2166 &_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433); 2167 &_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9); 2168 &_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5); 2169 &_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a); 2170 &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0); 2171 &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e); 2172 &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c); 2173 2174#Te4 # four copies of Te4 to choose from to avoid L1 aliasing 2175 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); 2176 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); 2177 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); 2178 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); 2179 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); 2180 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); 2181 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); 2182 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); 2183 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); 2184 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); 2185 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); 2186 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); 2187 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); 2188 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); 2189 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); 2190 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); 2191 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); 2192 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); 2193 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); 2194 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); 2195 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); 2196 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); 2197 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); 2198 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); 2199 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); 2200 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); 2201 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); 2202 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); 2203 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); 2204 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); 2205 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); 2206 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); 2207 2208 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); 2209 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); 2210 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); 2211 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); 2212 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); 2213 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); 2214 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); 2215 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); 2216 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); 2217 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); 2218 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); 2219 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); 2220 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); 2221 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); 2222 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); 2223 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); 2224 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); 2225 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); 2226 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); 2227 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); 2228 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); 2229 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); 2230 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); 2231 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); 2232 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); 2233 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); 2234 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); 2235 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); 2236 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); 2237 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); 2238 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); 2239 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); 2240 2241 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); 2242 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); 2243 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); 2244 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); 2245 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); 2246 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); 2247 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); 2248 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); 2249 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); 2250 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); 2251 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); 2252 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); 2253 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); 2254 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); 2255 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); 2256 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); 2257 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); 2258 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); 2259 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); 2260 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); 2261 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); 2262 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); 2263 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); 2264 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); 2265 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); 2266 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); 2267 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); 2268 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); 2269 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); 2270 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); 2271 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); 2272 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); 2273 2274 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); 2275 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); 2276 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); 2277 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); 2278 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); 2279 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); 2280 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); 2281 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); 2282 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); 2283 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); 2284 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); 2285 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); 2286 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); 2287 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); 2288 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); 2289 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); 2290 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); 2291 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); 2292 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); 2293 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); 2294 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); 2295 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); 2296 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); 2297 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); 2298 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); 2299 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); 2300 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); 2301 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); 2302 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); 2303 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); 2304 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); 2305 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); 2306#rcon: 2307$code.=<<___; 2308 .long 0x00000001, 0x00000002, 0x00000004, 0x00000008 2309 .long 0x00000010, 0x00000020, 0x00000040, 0x00000080 2310 .long 0x0000001b, 0x00000036, 0x80808080, 0x80808080 2311 .long 0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b 2312___ 2313$code.=<<___; 2314.align 64 2315.LAES_Td: 2316___ 2317 &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a); 2318 &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b); 2319 &_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5); 2320 &_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5); 2321 &_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d); 2322 &_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b); 2323 &_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295); 2324 &_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e); 2325 &_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927); 2326 &_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d); 2327 &_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362); 2328 &_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9); 2329 &_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52); 2330 &_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566); 2331 &_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3); 2332 &_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed); 2333 &_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e); 2334 &_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4); 2335 &_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4); 2336 &_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd); 2337 &_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d); 2338 &_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060); 2339 &_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967); 2340 &_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879); 2341 &_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000); 2342 &_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c); 2343 &_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36); 2344 &_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624); 2345 &_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b); 2346 &_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c); 2347 &_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12); 2348 &_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14); 2349 &_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3); 2350 &_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b); 2351 &_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8); 2352 &_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684); 2353 &_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7); 2354 &_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177); 2355 &_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947); 2356 &_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322); 2357 &_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498); 2358 &_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f); 2359 &_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54); 2360 &_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382); 2361 &_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf); 2362 &_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb); 2363 &_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83); 2364 &_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef); 2365 &_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029); 2366 &_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235); 2367 &_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733); 2368 &_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117); 2369 &_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4); 2370 &_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546); 2371 &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb); 2372 &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d); 2373 &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb); 2374 &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a); 2375 &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773); 2376 &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478); 2377 &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2); 2378 &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff); 2379 &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664); 2380 &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0); 2381 2382#Td4: # four copies of Td4 to choose from to avoid L1 aliasing 2383 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); 2384 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); 2385 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); 2386 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); 2387 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); 2388 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); 2389 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); 2390 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); 2391 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); 2392 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); 2393 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); 2394 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); 2395 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); 2396 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); 2397 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); 2398 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); 2399 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); 2400 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); 2401 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); 2402 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); 2403 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); 2404 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); 2405 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); 2406 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); 2407 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); 2408 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); 2409 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); 2410 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); 2411 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); 2412 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); 2413 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); 2414 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); 2415$code.=<<___; 2416 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe 2417 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 2418___ 2419 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); 2420 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); 2421 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); 2422 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); 2423 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); 2424 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); 2425 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); 2426 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); 2427 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); 2428 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); 2429 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); 2430 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); 2431 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); 2432 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); 2433 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); 2434 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); 2435 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); 2436 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); 2437 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); 2438 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); 2439 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); 2440 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); 2441 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); 2442 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); 2443 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); 2444 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); 2445 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); 2446 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); 2447 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); 2448 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); 2449 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); 2450 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); 2451$code.=<<___; 2452 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe 2453 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 2454___ 2455 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); 2456 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); 2457 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); 2458 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); 2459 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); 2460 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); 2461 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); 2462 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); 2463 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); 2464 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); 2465 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); 2466 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); 2467 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); 2468 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); 2469 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); 2470 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); 2471 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); 2472 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); 2473 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); 2474 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); 2475 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); 2476 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); 2477 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); 2478 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); 2479 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); 2480 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); 2481 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); 2482 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); 2483 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); 2484 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); 2485 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); 2486 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); 2487$code.=<<___; 2488 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe 2489 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 2490___ 2491 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); 2492 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); 2493 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); 2494 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); 2495 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); 2496 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); 2497 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); 2498 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); 2499 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); 2500 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); 2501 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); 2502 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); 2503 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); 2504 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); 2505 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); 2506 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); 2507 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); 2508 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); 2509 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); 2510 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); 2511 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); 2512 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); 2513 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); 2514 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); 2515 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); 2516 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); 2517 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); 2518 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); 2519 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); 2520 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); 2521 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); 2522 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); 2523$code.=<<___; 2524 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe 2525 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 2526.asciz "AES for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 2527.align 64 2528___ 2529 2530# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 2531# CONTEXT *context,DISPATCHER_CONTEXT *disp) 2532if ($win64) { 2533$rec="%rcx"; 2534$frame="%rdx"; 2535$context="%r8"; 2536$disp="%r9"; 2537 2538$code.=<<___; 2539.extern __imp_RtlVirtualUnwind 2540.type block_se_handler,\@abi-omnipotent 2541.align 16 2542block_se_handler: 2543 push %rsi 2544 push %rdi 2545 push %rbx 2546 push %rbp 2547 push %r12 2548 push %r13 2549 push %r14 2550 push %r15 2551 pushfq 2552 sub \$64,%rsp 2553 2554 mov 120($context),%rax # pull context->Rax 2555 mov 248($context),%rbx # pull context->Rip 2556 2557 mov 8($disp),%rsi # disp->ImageBase 2558 mov 56($disp),%r11 # disp->HandlerData 2559 2560 mov 0(%r11),%r10d # HandlerData[0] 2561 lea (%rsi,%r10),%r10 # prologue label 2562 cmp %r10,%rbx # context->Rip<prologue label 2563 jb .Lin_block_prologue 2564 2565 mov 152($context),%rax # pull context->Rsp 2566 2567 mov 4(%r11),%r10d # HandlerData[1] 2568 lea (%rsi,%r10),%r10 # epilogue label 2569 cmp %r10,%rbx # context->Rip>=epilogue label 2570 jae .Lin_block_prologue 2571 2572 mov 24(%rax),%rax # pull saved real stack pointer 2573 lea 48(%rax),%rax # adjust... 2574 2575 mov -8(%rax),%rbx 2576 mov -16(%rax),%rbp 2577 mov -24(%rax),%r12 2578 mov -32(%rax),%r13 2579 mov -40(%rax),%r14 2580 mov -48(%rax),%r15 2581 mov %rbx,144($context) # restore context->Rbx 2582 mov %rbp,160($context) # restore context->Rbp 2583 mov %r12,216($context) # restore context->R12 2584 mov %r13,224($context) # restore context->R13 2585 mov %r14,232($context) # restore context->R14 2586 mov %r15,240($context) # restore context->R15 2587 2588.Lin_block_prologue: 2589 mov 8(%rax),%rdi 2590 mov 16(%rax),%rsi 2591 mov %rax,152($context) # restore context->Rsp 2592 mov %rsi,168($context) # restore context->Rsi 2593 mov %rdi,176($context) # restore context->Rdi 2594 2595 jmp .Lcommon_seh_exit 2596.size block_se_handler,.-block_se_handler 2597 2598.type key_se_handler,\@abi-omnipotent 2599.align 16 2600key_se_handler: 2601 push %rsi 2602 push %rdi 2603 push %rbx 2604 push %rbp 2605 push %r12 2606 push %r13 2607 push %r14 2608 push %r15 2609 pushfq 2610 sub \$64,%rsp 2611 2612 mov 120($context),%rax # pull context->Rax 2613 mov 248($context),%rbx # pull context->Rip 2614 2615 mov 8($disp),%rsi # disp->ImageBase 2616 mov 56($disp),%r11 # disp->HandlerData 2617 2618 mov 0(%r11),%r10d # HandlerData[0] 2619 lea (%rsi,%r10),%r10 # prologue label 2620 cmp %r10,%rbx # context->Rip<prologue label 2621 jb .Lin_key_prologue 2622 2623 mov 152($context),%rax # pull context->Rsp 2624 2625 mov 4(%r11),%r10d # HandlerData[1] 2626 lea (%rsi,%r10),%r10 # epilogue label 2627 cmp %r10,%rbx # context->Rip>=epilogue label 2628 jae .Lin_key_prologue 2629 2630 lea 56(%rax),%rax 2631 2632 mov -8(%rax),%rbx 2633 mov -16(%rax),%rbp 2634 mov -24(%rax),%r12 2635 mov -32(%rax),%r13 2636 mov -40(%rax),%r14 2637 mov -48(%rax),%r15 2638 mov %rbx,144($context) # restore context->Rbx 2639 mov %rbp,160($context) # restore context->Rbp 2640 mov %r12,216($context) # restore context->R12 2641 mov %r13,224($context) # restore context->R13 2642 mov %r14,232($context) # restore context->R14 2643 mov %r15,240($context) # restore context->R15 2644 2645.Lin_key_prologue: 2646 mov 8(%rax),%rdi 2647 mov 16(%rax),%rsi 2648 mov %rax,152($context) # restore context->Rsp 2649 mov %rsi,168($context) # restore context->Rsi 2650 mov %rdi,176($context) # restore context->Rdi 2651 2652 jmp .Lcommon_seh_exit 2653.size key_se_handler,.-key_se_handler 2654 2655.type cbc_se_handler,\@abi-omnipotent 2656.align 16 2657cbc_se_handler: 2658 push %rsi 2659 push %rdi 2660 push %rbx 2661 push %rbp 2662 push %r12 2663 push %r13 2664 push %r14 2665 push %r15 2666 pushfq 2667 sub \$64,%rsp 2668 2669 mov 120($context),%rax # pull context->Rax 2670 mov 248($context),%rbx # pull context->Rip 2671 2672 lea .Lcbc_prologue(%rip),%r10 2673 cmp %r10,%rbx # context->Rip<.Lcbc_prologue 2674 jb .Lin_cbc_prologue 2675 2676 lea .Lcbc_fast_body(%rip),%r10 2677 cmp %r10,%rbx # context->Rip<.Lcbc_fast_body 2678 jb .Lin_cbc_frame_setup 2679 2680 lea .Lcbc_slow_prologue(%rip),%r10 2681 cmp %r10,%rbx # context->Rip<.Lcbc_slow_prologue 2682 jb .Lin_cbc_body 2683 2684 lea .Lcbc_slow_body(%rip),%r10 2685 cmp %r10,%rbx # context->Rip<.Lcbc_slow_body 2686 jb .Lin_cbc_frame_setup 2687 2688.Lin_cbc_body: 2689 mov 152($context),%rax # pull context->Rsp 2690 2691 lea .Lcbc_epilogue(%rip),%r10 2692 cmp %r10,%rbx # context->Rip>=.Lcbc_epilogue 2693 jae .Lin_cbc_prologue 2694 2695 lea 8(%rax),%rax 2696 2697 lea .Lcbc_popfq(%rip),%r10 2698 cmp %r10,%rbx # context->Rip>=.Lcbc_popfq 2699 jae .Lin_cbc_prologue 2700 2701 mov `16-8`(%rax),%rax # biased $_rsp 2702 lea 56(%rax),%rax 2703 2704.Lin_cbc_frame_setup: 2705 mov -16(%rax),%rbx 2706 mov -24(%rax),%rbp 2707 mov -32(%rax),%r12 2708 mov -40(%rax),%r13 2709 mov -48(%rax),%r14 2710 mov -56(%rax),%r15 2711 mov %rbx,144($context) # restore context->Rbx 2712 mov %rbp,160($context) # restore context->Rbp 2713 mov %r12,216($context) # restore context->R12 2714 mov %r13,224($context) # restore context->R13 2715 mov %r14,232($context) # restore context->R14 2716 mov %r15,240($context) # restore context->R15 2717 2718.Lin_cbc_prologue: 2719 mov 8(%rax),%rdi 2720 mov 16(%rax),%rsi 2721 mov %rax,152($context) # restore context->Rsp 2722 mov %rsi,168($context) # restore context->Rsi 2723 mov %rdi,176($context) # restore context->Rdi 2724 2725.Lcommon_seh_exit: 2726 2727 mov 40($disp),%rdi # disp->ContextRecord 2728 mov $context,%rsi # context 2729 mov \$`1232/8`,%ecx # sizeof(CONTEXT) 2730 .long 0xa548f3fc # cld; rep movsq 2731 2732 mov $disp,%rsi 2733 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 2734 mov 8(%rsi),%rdx # arg2, disp->ImageBase 2735 mov 0(%rsi),%r8 # arg3, disp->ControlPc 2736 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 2737 mov 40(%rsi),%r10 # disp->ContextRecord 2738 lea 56(%rsi),%r11 # &disp->HandlerData 2739 lea 24(%rsi),%r12 # &disp->EstablisherFrame 2740 mov %r10,32(%rsp) # arg5 2741 mov %r11,40(%rsp) # arg6 2742 mov %r12,48(%rsp) # arg7 2743 mov %rcx,56(%rsp) # arg8, (NULL) 2744 call *__imp_RtlVirtualUnwind(%rip) 2745 2746 mov \$1,%eax # ExceptionContinueSearch 2747 add \$64,%rsp 2748 popfq 2749 pop %r15 2750 pop %r14 2751 pop %r13 2752 pop %r12 2753 pop %rbp 2754 pop %rbx 2755 pop %rdi 2756 pop %rsi 2757 ret 2758.size cbc_se_handler,.-cbc_se_handler 2759 2760.section .pdata 2761.align 4 2762 .rva .LSEH_begin_AES_encrypt 2763 .rva .LSEH_end_AES_encrypt 2764 .rva .LSEH_info_AES_encrypt 2765 2766 .rva .LSEH_begin_AES_decrypt 2767 .rva .LSEH_end_AES_decrypt 2768 .rva .LSEH_info_AES_decrypt 2769 2770 .rva .LSEH_begin_AES_set_encrypt_key 2771 .rva .LSEH_end_AES_set_encrypt_key 2772 .rva .LSEH_info_AES_set_encrypt_key 2773 2774 .rva .LSEH_begin_AES_set_decrypt_key 2775 .rva .LSEH_end_AES_set_decrypt_key 2776 .rva .LSEH_info_AES_set_decrypt_key 2777 2778 .rva .LSEH_begin_AES_cbc_encrypt 2779 .rva .LSEH_end_AES_cbc_encrypt 2780 .rva .LSEH_info_AES_cbc_encrypt 2781 2782.section .xdata 2783.align 8 2784.LSEH_info_AES_encrypt: 2785 .byte 9,0,0,0 2786 .rva block_se_handler 2787 .rva .Lenc_prologue,.Lenc_epilogue # HandlerData[] 2788.LSEH_info_AES_decrypt: 2789 .byte 9,0,0,0 2790 .rva block_se_handler 2791 .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[] 2792.LSEH_info_AES_set_encrypt_key: 2793 .byte 9,0,0,0 2794 .rva key_se_handler 2795 .rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[] 2796.LSEH_info_AES_set_decrypt_key: 2797 .byte 9,0,0,0 2798 .rva key_se_handler 2799 .rva .Ldec_key_prologue,.Ldec_key_epilogue # HandlerData[] 2800.LSEH_info_AES_cbc_encrypt: 2801 .byte 9,0,0,0 2802 .rva cbc_se_handler 2803___ 2804} 2805 2806$code =~ s/\`([^\`]*)\`/eval($1)/gem; 2807 2808print $code; 2809 2810close STDOUT; 2811