#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Version 2.1.
#
# aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
# Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version
# [you'll notice a lot of resemblance], such as compressed S-boxes
# in little-endian byte order, prefetch of these tables in CBC mode,
# as well as avoiding L1 cache aliasing between stack frame and key
# schedule and already mentioned tables, compressed Td4...
#
# Performance in number of cycles per processed byte for 128-bit key:
#
#		ECB encrypt	ECB decrypt	CBC large chunk
# AMD64		33		41		13.0
# EM64T		38		59		18.6(*)
# Core 2	30		43		14.5(*)
#
# (*) with hyper-threading off

# Command line: [flavour] output. If the first argument contains a dot
# it is taken to be the output file name and no flavour is set.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 conventions are selected by flavour or by an .asm output name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the translator next to this script, then in ../../perlasm/.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator. Fail loudly if the pipe cannot be started instead of
# silently producing no output; the translator path is quoted so that a
# directory name containing spaces does not split the command line.
open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
	or die "can't call $xlate: $!";
*STDOUT=*OUT;

$verticalspin=1;	# unlike 32-bit version $verticalspin performs
			# ~15% better on both AMD and Intel cores
$speed_limit=512;	# see aes-586.pl for details

$code=".text\n";

# Register map. $s0-$s3 hold the four 32-bit state words; $acc0-$acc2
# and $t0-$t2 are scratch. $mask80/$maskfe/$mask1b are the 64-bit names
# of the $acc0-$acc2 registers.
$s0="%eax";
$s1="%ebx";
$s2="%ecx";
$s3="%edx";
$acc0="%esi";	$mask80="%rsi";
$acc1="%edi";	$maskfe="%rdi";
$acc2="%ebp";	$mask1b="%rbp";
$inp="%r8";
$out="%r9";
$t0="%r10d";
$t1="%r11d";
$t2="%r12d";
$rnds="%r13d";
$sbox="%r14";
$key="%r15";

# hi(reg): rewrite %eax/%rax-style names (a-d only) to the high-byte
# name, e.g. %eax -> %ah. Other names are returned unchanged.
sub hi()	{ my $r=shift;	$r =~ s/%[er]([a-d])x/%${1}h/;	$r; }

# lo(reg): rewrite a register name to its low-byte name, e.g.
# %eax -> %al, %esi -> %sil, %r10d -> %r10b.
sub lo()	{ my $r=shift;	$r =~ s/%[er]([a-d])x/%${1}l/;
			$r =~ s/%[er]([sd]i)/%${1}l/;
			$r =~ s/%(r[0-9]+)[d]?/%${1}b/;	$r; }

# LO(reg): rewrite a 64-bit register name to its 32-bit name, e.g.
# %rax -> %eax, %r8 -> %r8d.
sub LO()	{ my $r=shift;	$r =~ s/%r([a-z]+)/%e${1}/;
			$r =~ s/%r([0-9]+)/%r${1}d/;	$r; }

# _data_word(list): append one ".long v,v" line to $code per argument,
# i.e. every value is emitted twice.
sub _data_word()
{ my $i;
	while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
}

# data_word(list): append a single ".long" line holding all arguments
# as comma-separated 32-bit hex values.
sub data_word()
{ my $i;
  my $last=pop(@_);
	$code.=".long\t";
	while(defined($i=shift)) { $code.=sprintf"0x%08x,",$i; }
	$code.=sprintf"0x%08x\n",$last;
}

# data_byte(list): append a single ".byte" line holding all arguments,
# each masked to 8 bits, as comma-separated hex values.
sub data_byte()
{ my $i;
  my $last=pop(@_);
	$code.=".byte\t";
	while(defined($i=shift)) { $code.=sprintf"0x%02x,",$i&0xff; }
	$code.=sprintf"0x%02x\n",$last&0xff;
}

# encvert(): append the code for one encryption round to $code. Each
# state byte indexes the table at $sbox (8-byte entries, read at byte
# offsets 0..3), $key is advanced by 16 and the four words found there
# are xor-ed in. Results end up back in $s0-$s3.
sub encvert()
{ my $t3="%r8d";	# zaps $inp!

$code.=<<___;
	# favor 3-way issue Opteron pipeline...
	movzb	`&lo("$s0")`,$acc0
	movzb	`&lo("$s1")`,$acc1
	movzb	`&lo("$s2")`,$acc2
	mov	0($sbox,$acc0,8),$t0
	mov	0($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t2

	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	movzb	`&lo("$s3")`,$acc2
	xor	3($sbox,$acc0,8),$t0
	xor	3($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t3

	movzb	`&hi("$s3")`,$acc0
	shr	\$16,$s2
	movzb	`&hi("$s0")`,$acc2
	xor	3($sbox,$acc0,8),$t2
	shr	\$16,$s3
	xor	3($sbox,$acc2,8),$t3

	shr	\$16,$s1
	lea	16($key),$key
	shr	\$16,$s0

	movzb	`&lo("$s2")`,$acc0
	movzb	`&lo("$s3")`,$acc1
	movzb	`&lo("$s0")`,$acc2
	xor	2($sbox,$acc0,8),$t0
	xor	2($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t2

	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	movzb	`&lo("$s1")`,$acc2
	xor	1($sbox,$acc0,8),$t0
	xor	1($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t3

	mov	12($key),$s3
	movzb	`&hi("$s1")`,$acc1
	movzb	`&hi("$s2")`,$acc2
	mov	0($key),$s0
	xor	1($sbox,$acc1,8),$t2
	xor	1($sbox,$acc2,8),$t3

	mov	4($key),$s1
	mov	8($key),$s2
	xor	$t0,$s0
	xor	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}

# enclastvert(): append the code for the final encryption round. Table
# entries are loaded and masked down to a single byte lane each before
# being combined, and the result is xor-ed with the words at 16($key).
sub enclastvert()
{ my $t3="%r8d";	# zaps $inp!

$code.=<<___;
	movzb	`&lo("$s0")`,$acc0
	movzb	`&lo("$s1")`,$acc1
	movzb	`&lo("$s2")`,$acc2
	movzb	2($sbox,$acc0,8),$t0
	movzb	2($sbox,$acc1,8),$t1
	movzb	2($sbox,$acc2,8),$t2

	movzb	`&lo("$s3")`,$acc0
	movzb	`&hi("$s1")`,$acc1
	movzb	`&hi("$s2")`,$acc2
	movzb	2($sbox,$acc0,8),$t3
	mov	0($sbox,$acc1,8),$acc1	#$t0
	mov	0($sbox,$acc2,8),$acc2	#$t1

	and	\$0x0000ff00,$acc1
	and	\$0x0000ff00,$acc2

	xor	$acc1,$t0
	xor	$acc2,$t1
	shr	\$16,$s2

	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	shr	\$16,$s3
	mov	0($sbox,$acc0,8),$acc0	#$t2
	mov	0($sbox,$acc1,8),$acc1	#$t3

	and	\$0x0000ff00,$acc0
	and	\$0x0000ff00,$acc1
	shr	\$16,$s1
	xor	$acc0,$t2
	xor	$acc1,$t3
	shr	\$16,$s0

	movzb	`&lo("$s2")`,$acc0
	movzb	`&lo("$s3")`,$acc1
	movzb	`&lo("$s0")`,$acc2
	mov	0($sbox,$acc0,8),$acc0	#$t0
	mov	0($sbox,$acc1,8),$acc1	#$t1
	mov	0($sbox,$acc2,8),$acc2	#$t2

	and	\$0x00ff0000,$acc0
	and	\$0x00ff0000,$acc1
	and	\$0x00ff0000,$acc2

	xor	$acc0,$t0
	xor	$acc1,$t1
	xor	$acc2,$t2

	movzb	`&lo("$s1")`,$acc0
	movzb	`&hi("$s3")`,$acc1
	movzb	`&hi("$s0")`,$acc2
	mov	0($sbox,$acc0,8),$acc0	#$t3
	mov	2($sbox,$acc1,8),$acc1	#$t0
	mov	2($sbox,$acc2,8),$acc2	#$t1

	and	\$0x00ff0000,$acc0
	and	\$0xff000000,$acc1
	and	\$0xff000000,$acc2

	xor	$acc0,$t3
	xor	$acc1,$t0
	xor	$acc2,$t1

	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	mov	16+12($key),$s3
	mov	2($sbox,$acc0,8),$acc0	#$t2
	mov	2($sbox,$acc1,8),$acc1	#$t3
	mov	16+0($key),$s0

	and	\$0xff000000,$acc0
	and	\$0xff000000,$acc1

	xor	$acc0,$t2
	xor	$acc1,$t3

	mov	16+4($key),$s1
	mov	16+8($key),$s2
	xor	$t0,$s0
	xor	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}

# encstep(i, s...): non-"vertical" alternative to encvert(); appends the
# code computing output word i (0..3) of one round. @s is the list of
# state registers, rotated by the caller: byte 0 is taken from $s[0],
# byte 1 from $s[1], byte 2 from $s[2] and byte 3 from $s[3]. Words 0-2
# are produced in $t0-$t2; word 3 is produced in place in $s[0], after
# which $t0-$t2 are copied back into $s[1]-$s[3].
sub encstep()
{ my ($i,@s) = @_;
  my $tmp0=$acc0;
  my $tmp1=$acc1;
  my $tmp2=$acc2;
  my $out=($t0,$t1,$t2,$s[0])[$i];

	if ($i==3) {
		# last word: the remaining state registers are free to clobber
		$tmp0=$s[1];
		$tmp1=$s[2];
		$tmp2=$s[3];
	}
	$code.="\tmovzb\t".&lo($s[0]).",$out\n";
	$code.="\tmov\t$s[2],$tmp1\n"		if ($i!=3);
	$code.="\tlea\t16($key),$key\n"	if ($i==0);

	$code.="\tmovzb\t".&hi($s[1]).",$tmp0\n";
	$code.="\tmov\t0($sbox,$out,8),$out\n";

	$code.="\tshr\t\$16,$tmp1\n";
	$code.="\tmov\t$s[3],$tmp2\n"		if ($i!=3);
	$code.="\txor\t3($sbox,$tmp0,8),$out\n";

	$code.="\tmovzb\t".&lo($tmp1).",$tmp1\n";
	$code.="\tshr\t\$24,$tmp2\n";
	$code.="\txor\t4*$i($key),$out\n";

	$code.="\txor\t2($sbox,$tmp1,8),$out\n";
	$code.="\txor\t1($sbox,$tmp2,8),$out\n";

	$code.="\tmov\t$t0,$s[1]\n"	if ($i==3);
	$code.="\tmov\t$t1,$s[2]\n"	if ($i==3);
	$code.="\tmov\t$t2,$s[3]\n"	if ($i==3);
	$code.="\n";
}

# enclast(i, s...): non-"vertical" alternative to enclastvert(); same
# calling convention as encstep(), but each table entry is masked down
# to one byte lane and no round key is applied here (the caller xors
# the key in afterwards).
sub enclast()
{ my ($i,@s)=@_;
  my $tmp0=$acc0;
  my $tmp1=$acc1;
  my $tmp2=$acc2;
  my $out=($t0,$t1,$t2,$s[0])[$i];

	if ($i==3) {
		# last word: the remaining state registers are free to clobber
		$tmp0=$s[1];
		$tmp1=$s[2];
		$tmp2=$s[3];
	}
	$code.="\tmovzb\t".&lo($s[0]).",$out\n";
	$code.="\tmov\t$s[2],$tmp1\n"		if ($i!=3);

	$code.="\tmov\t2($sbox,$out,8),$out\n";
	$code.="\tshr\t\$16,$tmp1\n";
	$code.="\tmov\t$s[3],$tmp2\n"		if ($i!=3);

	$code.="\tand\t\$0x000000ff,$out\n";
	$code.="\tmovzb\t".&hi($s[1]).",$tmp0\n";
	$code.="\tmovzb\t".&lo($tmp1).",$tmp1\n";
	$code.="\tshr\t\$24,$tmp2\n";

	$code.="\tmov\t0($sbox,$tmp0,8),$tmp0\n";
	$code.="\tmov\t0($sbox,$tmp1,8),$tmp1\n";
	$code.="\tmov\t2($sbox,$tmp2,8),$tmp2\n";

	$code.="\tand\t\$0x0000ff00,$tmp0\n";
	$code.="\tand\t\$0x00ff0000,$tmp1\n";
	$code.="\tand\t\$0xff000000,$tmp2\n";

	$code.="\txor\t$tmp0,$out\n";
	$code.="\tmov\t$t0,$s[1]\n"	if ($i==3);
	$code.="\txor\t$tmp1,$out\n";
	$code.="\tmov\t$t1,$s[2]\n"	if ($i==3);
	$code.="\txor\t$tmp2,$out\n";
	$code.="\tmov\t$t2,$s[3]\n"	if ($i==3);
	$code.="\n";
}

# _x86_64_AES_encrypt: xor the state with the first round key, run
# (rounds-1) table-driven rounds, then the final round.
$code.=<<___;
.type	_x86_64_AES_encrypt,\@abi-omnipotent
.align	16
_x86_64_AES_encrypt:
	xor	0($key),$s0			# xor with key
	xor	4($key),$s1
	xor	8($key),$s2
	xor	12($key),$s3

	mov	240($key),$rnds			# load key->rounds
	sub	\$1,$rnds
	jmp	.Lenc_loop
.align	16
.Lenc_loop:
___
	if ($verticalspin) { &encvert(); }
	else {	&encstep(0,$s0,$s1,$s2,$s3);
		&encstep(1,$s1,$s2,$s3,$s0);
		&encstep(2,$s2,$s3,$s0,$s1);
		&encstep(3,$s3,$s0,$s1,$s2);
	}
$code.=<<___;
	sub	\$1,$rnds
	jnz	.Lenc_loop
___
	if ($verticalspin) { &enclastvert(); }
	else {	&enclast(0,$s0,$s1,$s2,$s3);
		&enclast(1,$s1,$s2,$s3,$s0);
		&enclast(2,$s2,$s3,$s0,$s1);
		&enclast(3,$s3,$s0,$s1,$s2);
		$code.=<<___;
	xor	16+0($key),$s0		# xor with key
	xor	16+4($key),$s1
	xor	16+8($key),$s2
	xor	16+12($key),$s3
___
	}
$code.=<<___;
	.byte	0xf3,0xc3			# rep ret
.size	_x86_64_AES_encrypt,.-_x86_64_AES_encrypt
___

# it's possible to implement this by shifting tN by 8, filling least
# significant byte with byte load and finally bswap-ing at the end,
# but such partial register load kills Core 2...
#
# enccompactvert(): append the byte-substitution part of one "compact"
# round: every state byte is looked up in the 256-byte table at $sbox
# (scale 1) and the results are shifted into position and reassembled
# in $s0-$s3. Uses %r8d, %r9d and %r13d as extra temporaries.
sub enccompactvert()
{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");

$code.=<<___;
	movzb	`&lo("$s0")`,$t0
	movzb	`&lo("$s1")`,$t1
	movzb	`&lo("$s2")`,$t2
	movzb	($sbox,$t0,1),$t0
	movzb	($sbox,$t1,1),$t1
	movzb	($sbox,$t2,1),$t2

	movzb	`&lo("$s3")`,$t3
	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	movzb	($sbox,$t3,1),$t3
	movzb	($sbox,$acc0,1),$t4	#$t0
	movzb	($sbox,$acc1,1),$t5	#$t1

	movzb	`&hi("$s3")`,$acc2
	movzb	`&hi("$s0")`,$acc0
	shr	\$16,$s2
	movzb	($sbox,$acc2,1),$acc2	#$t2
	movzb	($sbox,$acc0,1),$acc0	#$t3
	shr	\$16,$s3

	movzb	`&lo("$s2")`,$acc1
	shl	\$8,$t4
	shl	\$8,$t5
	movzb	($sbox,$acc1,1),$acc1	#$t0
	xor	$t4,$t0
	xor	$t5,$t1

	movzb	`&lo("$s3")`,$t4
	shr	\$16,$s0
	shr	\$16,$s1
	movzb	`&lo("$s0")`,$t5
	shl	\$8,$acc2
	shl	\$8,$acc0
	movzb	($sbox,$t4,1),$t4	#$t1
	movzb	($sbox,$t5,1),$t5	#$t2
	xor	$acc2,$t2
	xor	$acc0,$t3

	movzb	`&lo("$s1")`,$acc2
	movzb	`&hi("$s3")`,$acc0
	shl	\$16,$acc1
	movzb	($sbox,$acc2,1),$acc2	#$t3
	movzb	($sbox,$acc0,1),$acc0	#$t0
	xor	$acc1,$t0

	movzb	`&hi("$s0")`,$acc1
	shr	\$8,$s2
	shr	\$8,$s1
	movzb	($sbox,$acc1,1),$acc1	#$t1
	movzb	($sbox,$s2,1),$s3	#$t3
	movzb	($sbox,$s1,1),$s2	#$t2
	shl	\$16,$t4
	shl	\$16,$t5
	shl	\$16,$acc2
	xor	$t4,$t1
	xor	$t5,$t2
	xor	$acc2,$t3

	shl	\$24,$acc0
	shl	\$24,$acc1
	shl	\$24,$s3
	xor	$acc0,$t0
	shl	\$24,$s2
	xor	$acc1,$t1
	mov	$t0,$s0
	mov	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}

# enctransform_ref(sn): append the single-word reference form of the
# column transform applied between compact rounds to register sn.
# Uses %r8d, %r9d and %r13d as temporaries.
sub enctransform_ref()
{ my $sn = shift;
  my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d");

$code.=<<___;
	mov	$sn,$acc
	and	\$0x80808080,$acc
	mov	$acc,$tmp
	shr	\$7,$tmp
	lea	($sn,$sn),$r2
	sub	$tmp,$acc
	and	\$0xfefefefe,$r2
	and	\$0x1b1b1b1b,$acc
	mov	$sn,$tmp
	xor	$acc,$r2

	xor	$r2,$sn
	rol	\$24,$sn
	xor	$r2,$sn
	ror	\$16,$tmp
	xor	$tmp,$sn
	ror	\$8,$tmp
	xor	$tmp,$sn
___
}

# unlike decrypt case it does not pay off to parallelize enctransform
#
# enctransform(): append the column transform for all four state words,
# two words at a time, followed by loads from $sbox at offsets 0, 64,
# 128 and 192 that serve as a table prefetch.
sub enctransform()
{ my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");

$code.=<<___;
	mov	$s0,$acc0
	mov	$s1,$acc1
	and	\$0x80808080,$acc0
	and	\$0x80808080,$acc1
	mov	$acc0,$t0
	mov	$acc1,$t1
	shr	\$7,$t0
	lea	($s0,$s0),$r20
	shr	\$7,$t1
	lea	($s1,$s1),$r21
	sub	$t0,$acc0
	sub	$t1,$acc1
	and	\$0xfefefefe,$r20
	and	\$0xfefefefe,$r21
	and	\$0x1b1b1b1b,$acc0
	and	\$0x1b1b1b1b,$acc1
	mov	$s0,$t0
	mov	$s1,$t1
	xor	$acc0,$r20
	xor	$acc1,$r21

	xor	$r20,$s0
	xor	$r21,$s1
	mov	$s2,$acc0
	mov	$s3,$acc1
	rol	\$24,$s0
	rol	\$24,$s1
	and	\$0x80808080,$acc0
	and	\$0x80808080,$acc1
	xor	$r20,$s0
	xor	$r21,$s1
	mov	$acc0,$t2
	mov	$acc1,$t3
	ror	\$16,$t0
	ror	\$16,$t1
	shr	\$7,$t2
	lea	($s2,$s2),$r20
	xor	$t0,$s0
	xor	$t1,$s1
	shr	\$7,$t3
	lea	($s3,$s3),$r21
	ror	\$8,$t0
	ror	\$8,$t1
	sub	$t2,$acc0
	sub	$t3,$acc1
	xor	$t0,$s0
	xor	$t1,$s1

	and	\$0xfefefefe,$r20
	and	\$0xfefefefe,$r21
	and	\$0x1b1b1b1b,$acc0
	and	\$0x1b1b1b1b,$acc1
	mov	$s2,$t2
	mov	$s3,$t3
	xor	$acc0,$r20
	xor	$acc1,$r21

	xor	$r20,$s2
	xor	$r21,$s3
	rol	\$24,$s2
	rol	\$24,$s3
	xor	$r20,$s2
	xor	$r21,$s3
	mov	0($sbox),$acc0			# prefetch Te4
	ror	\$16,$t2
	ror	\$16,$t3
	mov	64($sbox),$acc1
	xor	$t2,$s2
	xor	$t3,$s3
	mov	128($sbox),$r20
	ror	\$8,$t2
	ror	\$8,$t3
	mov	192($sbox),$r21
	xor	$t2,$s2
	xor	$t3,$s3
___
}

# _x86_64_AES_encrypt_compact: loop of "xor round key, substitute
# bytes, transform" that stops once $key reaches the end-of-schedule
# pointer read from 16(%rsp), then applies the last round key.
$code.=<<___;
.type	_x86_64_AES_encrypt_compact,\@abi-omnipotent
.align	16
_x86_64_AES_encrypt_compact:
	lea	128($sbox),$inp			# size optimization
	mov	0-128($inp),$acc1		# prefetch Te4
	mov	32-128($inp),$acc2
	mov	64-128($inp),$t0
	mov	96-128($inp),$t1
	mov	128-128($inp),$acc1
	mov	160-128($inp),$acc2
	mov	192-128($inp),$t0
	mov	224-128($inp),$t1
	jmp	.Lenc_loop_compact
.align	16
.Lenc_loop_compact:
		xor	0($key),$s0		# xor with key
		xor	4($key),$s1
		xor	8($key),$s2
		xor	12($key),$s3
		lea	16($key),$key
___
		&enccompactvert();
$code.=<<___;
		cmp	16(%rsp),$key
		je	.Lenc_compact_done
___
		&enctransform();
$code.=<<___;
	jmp	.Lenc_loop_compact
.align	16
.Lenc_compact_done:
	xor	0($key),$s0
	xor	4($key),$s1
	xor	8($key),$s2
	xor	12($key),$s3
	.byte	0xf3,0xc3			# rep ret
.size	_x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
___

# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
$code.=<<___;
.globl	AES_encrypt
.type	AES_encrypt,\@function,3
.align	16
.globl	asm_AES_encrypt
.hidden	asm_AES_encrypt
asm_AES_encrypt:
AES_encrypt:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	# allocate frame "above" key schedule
	mov	%rsp,%r10
	lea	-63(%rdx),%rcx	# %rdx is key argument
	and	\$-64,%rsp
	sub	%rsp,%rcx
	neg	%rcx
	and	\$0x3c0,%rcx
	sub	%rcx,%rsp
	sub	\$32,%rsp

	mov	%rsi,16(%rsp)	# save out
	mov	%r10,24(%rsp)	# save real stack pointer
.Lenc_prologue:

	mov	%rdx,$key
	mov	240($key),$rnds	# load rounds

	mov	0(%rdi),$s0	# load input vector
	mov	4(%rdi),$s1
	mov	8(%rdi),$s2
	mov	12(%rdi),$s3

	shl	\$4,$rnds
	lea	($key,$rnds),%rbp
	mov	$key,(%rsp)	# key schedule
	mov	%rbp,8(%rsp)	# end of key schedule

	# pick Te4 copy which can't "overlap" with stack frame or key schedule
	lea	.LAES_Te+2048(%rip),$sbox
	lea	768(%rsp),%rbp
	sub	$sbox,%rbp
	and	\$0x300,%rbp
	lea	($sbox,%rbp),$sbox

	call	_x86_64_AES_encrypt_compact

	mov	16(%rsp),$out	# restore out
	mov	24(%rsp),%rsi	# restore saved stack pointer
	mov	$s0,0($out)	# write output vector
	mov	$s1,4($out)
	mov	$s2,8($out)
	mov	$s3,12($out)

	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lenc_epilogue:
	ret
.size	AES_encrypt,.-AES_encrypt
___

#------------------------------------------------------------------#

# decvert(): decryption counterpart of encvert(); appends one
# table-driven round that takes its bytes from the state words in the
# order required for decryption, advances $key by 16 and xors in the
# four words found there.
sub decvert()
{ my $t3="%r8d";	# zaps $inp!

$code.=<<___;
	# favor 3-way issue Opteron pipeline...
	movzb	`&lo("$s0")`,$acc0
	movzb	`&lo("$s1")`,$acc1
	movzb	`&lo("$s2")`,$acc2
	mov	0($sbox,$acc0,8),$t0
	mov	0($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t2

	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	movzb	`&lo("$s3")`,$acc2
	xor	3($sbox,$acc0,8),$t0
	xor	3($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t3

	movzb	`&hi("$s1")`,$acc0
	shr	\$16,$s0
	movzb	`&hi("$s2")`,$acc2
	xor	3($sbox,$acc0,8),$t2
	shr	\$16,$s3
	xor	3($sbox,$acc2,8),$t3

	shr	\$16,$s1
	lea	16($key),$key
	shr	\$16,$s2

	movzb	`&lo("$s2")`,$acc0
	movzb	`&lo("$s3")`,$acc1
	movzb	`&lo("$s0")`,$acc2
	xor	2($sbox,$acc0,8),$t0
	xor	2($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t2

	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	movzb	`&lo("$s1")`,$acc2
	xor	1($sbox,$acc0,8),$t0
	xor	1($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t3

	movzb	`&hi("$s3")`,$acc0
	mov	12($key),$s3
	movzb	`&hi("$s0")`,$acc2
	xor	1($sbox,$acc0,8),$t2
	mov	0($key),$s0
	xor	1($sbox,$acc2,8),$t3

	xor	$t0,$s0
	mov	4($key),$s1
	mov	8($key),$s2
	xor	$t2,$s2
	xor	$t1,$s1
	xor	$t3,$s3
___
}

# declastvert(): append the final decryption round. $sbox is moved
# forward by 2048 for the duration of the round so that the byte table
# located there can be addressed without a displacement, and is moved
# back before the round key at 16($key) is applied.
sub declastvert()
{ my $t3="%r8d";	# zaps $inp!

$code.=<<___;
	lea	2048($sbox),$sbox	# size optimization
	movzb	`&lo("$s0")`,$acc0
	movzb	`&lo("$s1")`,$acc1
	movzb	`&lo("$s2")`,$acc2
	movzb	($sbox,$acc0,1),$t0
	movzb	($sbox,$acc1,1),$t1
	movzb	($sbox,$acc2,1),$t2

	movzb	`&lo("$s3")`,$acc0
	movzb	`&hi("$s3")`,$acc1
	movzb	`&hi("$s0")`,$acc2
	movzb	($sbox,$acc0,1),$t3
	movzb	($sbox,$acc1,1),$acc1	#$t0
	movzb	($sbox,$acc2,1),$acc2	#$t1

	shl	\$8,$acc1
	shl	\$8,$acc2

	xor	$acc1,$t0
	xor	$acc2,$t1
	shr	\$16,$s3

	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	shr	\$16,$s0
	movzb	($sbox,$acc0,1),$acc0	#$t2
	movzb	($sbox,$acc1,1),$acc1	#$t3

	shl	\$8,$acc0
	shl	\$8,$acc1
	shr	\$16,$s1
	xor	$acc0,$t2
	xor	$acc1,$t3
	shr	\$16,$s2

	movzb	`&lo("$s2")`,$acc0
	movzb	`&lo("$s3")`,$acc1
	movzb	`&lo("$s0")`,$acc2
	movzb	($sbox,$acc0,1),$acc0	#$t0
	movzb	($sbox,$acc1,1),$acc1	#$t1
	movzb	($sbox,$acc2,1),$acc2	#$t2

	shl	\$16,$acc0
	shl	\$16,$acc1
	shl	\$16,$acc2

	xor	$acc0,$t0
	xor	$acc1,$t1
	xor	$acc2,$t2

	movzb	`&lo("$s1")`,$acc0
	movzb	`&hi("$s1")`,$acc1
	movzb	`&hi("$s2")`,$acc2
	movzb	($sbox,$acc0,1),$acc0	#$t3
	movzb	($sbox,$acc1,1),$acc1	#$t0
	movzb	($sbox,$acc2,1),$acc2	#$t1

	shl	\$16,$acc0
	shl	\$24,$acc1
	shl	\$24,$acc2

	xor	$acc0,$t3
	xor	$acc1,$t0
	xor	$acc2,$t1

	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	mov	16+12($key),$s3
	movzb	($sbox,$acc0,1),$acc0	#$t2
	movzb	($sbox,$acc1,1),$acc1	#$t3
	mov	16+0($key),$s0

	shl	\$24,$acc0
	shl	\$24,$acc1

	xor	$acc0,$t2
	xor	$acc1,$t3

	mov	16+4($key),$s1
	mov	16+8($key),$s2
	lea	-2048($sbox),$sbox
	xor	$t0,$s0
	xor	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}

# decstep(i, s...): non-"vertical" alternative to decvert(); appends the
# code computing output word i (0..3) of one decryption round from the
# state registers in @s (byte 0 from $s[0], byte 1 from $s[1], byte 2
# from $s[2], byte 3 from $s[3]). Words 0-2 are produced in $t0-$t2;
# word 3 is produced in place in $s[0], after which $t2,$t1,$t0 are
# copied into $s[1],$s[2],$s[3]. No round key is applied here.
sub decstep()
{ my ($i,@s) = @_;
  my $tmp0=$acc0;
  my $tmp1=$acc1;
  my $tmp2=$acc2;
  my $out=($t0,$t1,$t2,$s[0])[$i];

	$code.="\tmov\t$s[0],$out\n"		if ($i!=3);
			$tmp1=$s[2]		if ($i==3);
	$code.="\tmov\t$s[2],$tmp1\n"		if ($i!=3);
	$code.="\tand\t\$0xFF,$out\n";

	$code.="\tmov\t0($sbox,$out,8),$out\n";
	$code.="\tshr\t\$16,$tmp1\n";
			$tmp2=$s[3]		if ($i==3);
	$code.="\tmov\t$s[3],$tmp2\n"		if ($i!=3);

			$tmp0=$s[1]		if ($i==3);
	$code.="\tmovzb\t".&hi($s[1]).",$tmp0\n";
	$code.="\tand\t\$0xFF,$tmp1\n";
	$code.="\tshr\t\$24,$tmp2\n";

	$code.="\txor\t3($sbox,$tmp0,8),$out\n";
	$code.="\txor\t2($sbox,$tmp1,8),$out\n";
	$code.="\txor\t1($sbox,$tmp2,8),$out\n";

	$code.="\tmov\t$t2,$s[1]\n"	if ($i==3);
	$code.="\tmov\t$t1,$s[2]\n"	if ($i==3);
	$code.="\tmov\t$t0,$s[3]\n"	if ($i==3);
	$code.="\n";
}

# declast(i, s...): non-"vertical" alternative to declastvert(); same
# calling convention as decstep(), but every byte is looked up in the
# byte table at 2048($sbox) and shifted into its lane.
sub declast()
{ my ($i,@s)=@_;
  my $tmp0=$acc0;
  my $tmp1=$acc1;
  my $tmp2=$acc2;
  my $out=($t0,$t1,$t2,$s[0])[$i];

	$code.="\tmov\t$s[0],$out\n"		if ($i!=3);
			$tmp1=$s[2]		if ($i==3);
	$code.="\tmov\t$s[2],$tmp1\n"		if ($i!=3);
	$code.="\tand\t\$0xFF,$out\n";

	$code.="\tmovzb\t2048($sbox,$out,1),$out\n";
	$code.="\tshr\t\$16,$tmp1\n";
			$tmp2=$s[3]		if ($i==3);
	$code.="\tmov\t$s[3],$tmp2\n"		if ($i!=3);

			$tmp0=$s[1]		if ($i==3);
	$code.="\tmovzb\t".&hi($s[1]).",$tmp0\n";
	$code.="\tand\t\$0xFF,$tmp1\n";
	$code.="\tshr\t\$24,$tmp2\n";

	$code.="\tmovzb\t2048($sbox,$tmp0,1),$tmp0\n";
	$code.="\tmovzb\t2048($sbox,$tmp1,1),$tmp1\n";
	$code.="\tmovzb\t2048($sbox,$tmp2,1),$tmp2\n";

	$code.="\tshl\t\$8,$tmp0\n";
	$code.="\tshl\t\$16,$tmp1\n";
	$code.="\tshl\t\$24,$tmp2\n";

	$code.="\txor\t$tmp0,$out\n";
	$code.="\tmov\t$t2,$s[1]\n"	if ($i==3);
	$code.="\txor\t$tmp1,$out\n";
	$code.="\tmov\t$t1,$s[2]\n"	if ($i==3);
	$code.="\txor\t$tmp2,$out\n";
	$code.="\tmov\t$t0,$s[3]\n"	if ($i==3);
	$code.="\n";
}
# Emit _x86_64_AES_decrypt.  The routine xors the state ($s0..$s3) with the
# first 16 bytes at $key, loads the round count from offset 240 of the key
# schedule and starts the .Ldec_loop counter at rounds-1.  With $verticalspin
# set the loop body and the last round come from decvert()/declastvert();
# otherwise each is built from four decstep()/declast() calls followed by an
# explicit xor with the next round key.  The routine ends with "rep ret".
$code.=<<___;
.type _x86_64_AES_decrypt,\@abi-omnipotent
.align 16
_x86_64_AES_decrypt:
 xor 0($key),$s0 # xor with key
 xor 4($key),$s1
 xor 8($key),$s2
 xor 12($key),$s3

 mov 240($key),$rnds # load key->rounds
 sub \$1,$rnds
 jmp .Ldec_loop
.align 16
.Ldec_loop:
___
	if ($verticalspin) { &decvert(); }
	else { &decstep(0,$s0,$s3,$s2,$s1);
		&decstep(1,$s1,$s0,$s3,$s2);
		&decstep(2,$s2,$s1,$s0,$s3);
		&decstep(3,$s3,$s2,$s1,$s0);
		$code.=<<___;
 lea 16($key),$key
 xor 0($key),$s0 # xor with key
 xor 4($key),$s1
 xor 8($key),$s2
 xor 12($key),$s3
___
	}
$code.=<<___;
 sub \$1,$rnds
 jnz .Ldec_loop
___
	if ($verticalspin) { &declastvert(); }
	else { &declast(0,$s0,$s3,$s2,$s1);
		&declast(1,$s1,$s0,$s3,$s2);
		&declast(2,$s2,$s1,$s0,$s3);
		&declast(3,$s3,$s2,$s1,$s0);
		$code.=<<___;
 xor 16+0($key),$s0 # xor with key
 xor 16+4($key),$s1
 xor 16+8($key),$s2
 xor 16+12($key),$s3
___
	}
$code.=<<___;
 .byte 0xf3,0xc3 # rep ret
.size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt
___

# deccompactvert() - append one byte-substitution step of the "compact"
# decryption round to $code.  Takes no arguments.
#
# Each of the 16 state bytes in $s0..$s3 is used as an index into the byte
# table at ($sbox,index,1); the substituted bytes are shifted by 8/16/24 and
# xored together so that the new state ends up in $s0..$s3.  Besides
# $t0..$t2 and $acc0..$acc2 it uses %r8d, %r9d and %r13d as temporaries
# ($t3..$t5), so the values in $inp/$out/$rnds do not survive it.
# The trailing "#$tN" annotations name the output word a lookup feeds.
# NOTE(review): the `&lo(...)`/`&hi(...)` backtick expressions are
# presumably evaluated when $code is post-processed at the end of the
# file - that pass is not visible in this chunk.
sub deccompactvert()
{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");

$code.=<<___;
 movzb `&lo("$s0")`,$t0
 movzb `&lo("$s1")`,$t1
 movzb `&lo("$s2")`,$t2
 movzb ($sbox,$t0,1),$t0
 movzb ($sbox,$t1,1),$t1
 movzb ($sbox,$t2,1),$t2

 movzb `&lo("$s3")`,$t3
 movzb `&hi("$s3")`,$acc0
 movzb `&hi("$s0")`,$acc1
 movzb ($sbox,$t3,1),$t3
 movzb ($sbox,$acc0,1),$t4 #$t0
 movzb ($sbox,$acc1,1),$t5 #$t1

 movzb `&hi("$s1")`,$acc2
 movzb `&hi("$s2")`,$acc0
 shr \$16,$s2
 movzb ($sbox,$acc2,1),$acc2 #$t2
 movzb ($sbox,$acc0,1),$acc0 #$t3
 shr \$16,$s3

 movzb `&lo("$s2")`,$acc1
 shl \$8,$t4
 shl \$8,$t5
 movzb ($sbox,$acc1,1),$acc1 #$t0
 xor $t4,$t0
 xor $t5,$t1

 movzb `&lo("$s3")`,$t4
 shr \$16,$s0
 shr \$16,$s1
 movzb `&lo("$s0")`,$t5
 shl \$8,$acc2
 shl \$8,$acc0
 movzb ($sbox,$t4,1),$t4 #$t1
 movzb ($sbox,$t5,1),$t5 #$t2
 xor $acc2,$t2
 xor $acc0,$t3

 movzb `&lo("$s1")`,$acc2
 movzb `&hi("$s1")`,$acc0
 shl \$16,$acc1
 movzb ($sbox,$acc2,1),$acc2 #$t3
 movzb ($sbox,$acc0,1),$acc0 #$t0
 xor $acc1,$t0

 movzb `&hi("$s2")`,$acc1
 shl \$16,$t4
 shl \$16,$t5
 movzb ($sbox,$acc1,1),$s1 #$t1
 xor $t4,$t1
 xor $t5,$t2

 movzb `&hi("$s3")`,$acc1
 shr \$8,$s0
 shl \$16,$acc2
 movzb ($sbox,$acc1,1),$s2 #$t2
 movzb ($sbox,$s0,1),$s3 #$t3
 xor $acc2,$t3

 shl \$24,$acc0
 shl \$24,$s1
 shl \$24,$s2
 xor $acc0,$t0
 shl \$24,$s3
 xor $t1,$s1
 mov $t0,$s0
 xor $t2,$s2
 xor $t3,$s3
___
}

# parallelized version! input is pair of 64-bit values: %rax=s1.s0
# and %rcx=s3.s2, output is four 32-bit values in %eax=s0, %ebx=s1,
# %ecx=s2 and %edx=s3.
#
# dectransform([$prefetch]) - append that transform to $code.  The two
# 64-bit halves are processed in lock-step (names ending in 0 and 8).
# tp2, tp4 and tp8 are each derived from the previous value with the
# $mask80/$maskfe/$mask1b and-shift-subtract sequence, then combined with
# 32-bit rotations by 8, 24 and 16.  Note that $acc0 is shadowed locally as
# %rbx.  When $prefetch is true, five extra loads from 0/64/128/192/256($sbox)
# are interleaved into the tail; they overwrite the three mask registers and
# $tp80/$tp88, which the transform no longer needs at that point.
sub dectransform()
{ my ($tp10,$tp20,$tp40,$tp80,$acc0)=("%rax","%r8", "%r9", "%r10","%rbx");
 my ($tp18,$tp28,$tp48,$tp88,$acc8)=("%rcx","%r11","%r12","%r13","%rdx");
 my $prefetch = shift;

$code.=<<___;
 mov $tp10,$acc0
 mov $tp18,$acc8
 and $mask80,$acc0
 and $mask80,$acc8
 mov $acc0,$tp40
 mov $acc8,$tp48
 shr \$7,$tp40
 lea ($tp10,$tp10),$tp20
 shr \$7,$tp48
 lea ($tp18,$tp18),$tp28
 sub $tp40,$acc0
 sub $tp48,$acc8
 and $maskfe,$tp20
 and $maskfe,$tp28
 and $mask1b,$acc0
 and $mask1b,$acc8
 xor $tp20,$acc0
 xor $tp28,$acc8
 mov $acc0,$tp20
 mov $acc8,$tp28

 and $mask80,$acc0
 and $mask80,$acc8
 mov $acc0,$tp80
 mov $acc8,$tp88
 shr \$7,$tp80
 lea ($tp20,$tp20),$tp40
 shr \$7,$tp88
 lea ($tp28,$tp28),$tp48
 sub $tp80,$acc0
 sub $tp88,$acc8
 and $maskfe,$tp40
 and $maskfe,$tp48
 and $mask1b,$acc0
 and $mask1b,$acc8
 xor $tp40,$acc0
 xor $tp48,$acc8
 mov $acc0,$tp40
 mov $acc8,$tp48

 and $mask80,$acc0
 and $mask80,$acc8
 mov $acc0,$tp80
 mov $acc8,$tp88
 shr \$7,$tp80
 xor $tp10,$tp20 # tp2^=tp1
 shr \$7,$tp88
 xor $tp18,$tp28 # tp2^=tp1
 sub $tp80,$acc0
 sub $tp88,$acc8
 lea ($tp40,$tp40),$tp80
 lea ($tp48,$tp48),$tp88
 xor $tp10,$tp40 # tp4^=tp1
 xor $tp18,$tp48 # tp4^=tp1
 and $maskfe,$tp80
 and $maskfe,$tp88
 and $mask1b,$acc0
 and $mask1b,$acc8
 xor $acc0,$tp80
 xor $acc8,$tp88

 xor $tp80,$tp10 # tp1^=tp8
 xor $tp88,$tp18 # tp1^=tp8
 xor $tp80,$tp20 # tp2^tp1^=tp8
 xor $tp88,$tp28 # tp2^tp1^=tp8
 mov $tp10,$acc0
 mov $tp18,$acc8
 xor $tp80,$tp40 # tp4^tp1^=tp8
 xor $tp88,$tp48 # tp4^tp1^=tp8
 shr \$32,$acc0
 shr \$32,$acc8
 xor $tp20,$tp80 # tp8^=tp8^tp2^tp1=tp2^tp1
 xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1
 rol \$8,`&LO("$tp10")` # ROTATE(tp1^tp8,8)
 rol \$8,`&LO("$tp18")` # ROTATE(tp1^tp8,8)
 xor $tp40,$tp80 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
 xor $tp48,$tp88 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2

 rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8)
 rol \$8,`&LO("$acc8")` # ROTATE(tp1^tp8,8)
 xor `&LO("$tp80")`,`&LO("$tp10")`
 xor `&LO("$tp88")`,`&LO("$tp18")`
 shr \$32,$tp80
 shr \$32,$tp88
 xor `&LO("$tp80")`,`&LO("$acc0")`
 xor `&LO("$tp88")`,`&LO("$acc8")`

 mov $tp20,$tp80
 mov $tp28,$tp88
 shr \$32,$tp80
 shr \$32,$tp88
 rol \$24,`&LO("$tp20")` # ROTATE(tp2^tp1^tp8,24)
 rol \$24,`&LO("$tp28")` # ROTATE(tp2^tp1^tp8,24)
 rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24)
 rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24)
 xor `&LO("$tp20")`,`&LO("$tp10")`
 xor `&LO("$tp28")`,`&LO("$tp18")`
 mov $tp40,$tp20
 mov $tp48,$tp28
 xor `&LO("$tp80")`,`&LO("$acc0")`
 xor `&LO("$tp88")`,`&LO("$acc8")`

 `"mov 0($sbox),$mask80" if ($prefetch)`
 shr \$32,$tp20
 shr \$32,$tp28
 `"mov 64($sbox),$maskfe" if ($prefetch)`
 rol \$16,`&LO("$tp40")` # ROTATE(tp4^tp1^tp8,16)
 rol \$16,`&LO("$tp48")` # ROTATE(tp4^tp1^tp8,16)
 `"mov 128($sbox),$mask1b" if ($prefetch)`
 rol \$16,`&LO("$tp20")` # ROTATE(tp4^tp1^tp8,16)
 rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16)
 `"mov 192($sbox),$tp80" if ($prefetch)`
 xor `&LO("$tp40")`,`&LO("$tp10")`
 xor `&LO("$tp48")`,`&LO("$tp18")`
 `"mov 256($sbox),$tp88" if ($prefetch)`
 xor `&LO("$tp20")`,`&LO("$acc0")`
 xor `&LO("$tp28")`,`&LO("$acc8")`
___
}

# Emit _x86_64_AES_decrypt_compact.  On entry it reads eight locations 32
# bytes apart starting at $sbox (the "prefetch Td4" loads; $inp is reused
# as the base pointer for them).  Each loop iteration then xors the state
# with 16 bytes of round key, advances $key by 16 and runs
# deccompactvert().  The loop ends when $key equals the pointer found at
# 16(%rsp): AES_decrypt stores the end-of-key-schedule pointer at 8(%rsp)
# before the call, and the pushed return address moves it to 16(%rsp)
# here.  Otherwise %ebx/%edx are packed into the upper halves of %rax/%rcx,
# the three masks are loaded from 256+0/8/16($sbox) and dectransform(1)
# produces the next state.  After the loop the last round key is xored in.
$code.=<<___;
.type _x86_64_AES_decrypt_compact,\@abi-omnipotent
.align 16
_x86_64_AES_decrypt_compact:
 lea 128($sbox),$inp # size optimization
 mov 0-128($inp),$acc1 # prefetch Td4
 mov 32-128($inp),$acc2
 mov 64-128($inp),$t0
 mov 96-128($inp),$t1
 mov 128-128($inp),$acc1
 mov 160-128($inp),$acc2
 mov 192-128($inp),$t0
 mov 224-128($inp),$t1
 jmp .Ldec_loop_compact

.align 16
.Ldec_loop_compact:
 xor 0($key),$s0 # xor with key
 xor 4($key),$s1
 xor 8($key),$s2
 xor 12($key),$s3
 lea 16($key),$key
___
	&deccompactvert();
$code.=<<___;
 cmp 16(%rsp),$key
 je .Ldec_compact_done

 mov 256+0($sbox),$mask80
 shl \$32,%rbx
 shl \$32,%rdx
 mov 256+8($sbox),$maskfe
 or %rbx,%rax
 or %rdx,%rcx
 mov 256+16($sbox),$mask1b
___
	&dectransform(1);
$code.=<<___;
 jmp .Ldec_loop_compact
.align 16
.Ldec_compact_done:
 xor 0($key),$s0
 xor 4($key),$s1
 xor 8($key),$s2
 xor 12($key),$s3
 .byte 0xf3,0xc3 # rep ret
.size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
___

# void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Emit the public entry point, which is also exported under the hidden
# alias asm_AES_decrypt.  It pushes %rbx, %rbp and %r12-%r15, keeps the
# original %rsp in %r10, aligns %rsp down to 64 bytes and lowers it by an
# amount derived from the key schedule address (masked with 0x3c0) plus a
# 32-byte frame.
$code.=<<___;
.globl AES_decrypt
.type AES_decrypt,\@function,3
.align 16
.globl asm_AES_decrypt
.hidden asm_AES_decrypt
asm_AES_decrypt:
AES_decrypt:
 push %rbx
 push %rbp
 push %r12
 push %r13
 push %r14
 push %r15

 # allocate frame "above" key schedule
 mov %rsp,%r10
 lea -63(%rdx),%rcx # %rdx is key argument
 and \$-64,%rsp
 sub %rsp,%rcx
 neg %rcx
 and \$0x3c0,%rcx
1209238405Sjkim sub %rcx,%rsp 1210238405Sjkim sub \$32,%rsp 1211238405Sjkim 1212238405Sjkim mov %rsi,16(%rsp) # save out 1213238405Sjkim mov %r10,24(%rsp) # save real stack pointer 1214238405Sjkim.Ldec_prologue: 1215238405Sjkim 1216183234Ssimon mov %rdx,$key 1217238405Sjkim mov 240($key),$rnds # load rounds 1218183234Ssimon 1219238405Sjkim mov 0(%rdi),$s0 # load input vector 1220238405Sjkim mov 4(%rdi),$s1 1221238405Sjkim mov 8(%rdi),$s2 1222238405Sjkim mov 12(%rdi),$s3 1223183234Ssimon 1224238405Sjkim shl \$4,$rnds 1225238405Sjkim lea ($key,$rnds),%rbp 1226238405Sjkim mov $key,(%rsp) # key schedule 1227238405Sjkim mov %rbp,8(%rsp) # end of key schedule 1228183234Ssimon 1229238405Sjkim # pick Td4 copy which can't "overlap" with stack frame or key schedule 1230238405Sjkim lea .LAES_Td+2048(%rip),$sbox 1231238405Sjkim lea 768(%rsp),%rbp 1232238405Sjkim sub $sbox,%rbp 1233238405Sjkim and \$0x300,%rbp 1234238405Sjkim lea ($sbox,%rbp),$sbox 1235238405Sjkim shr \$3,%rbp # recall "magic" constants! 
1236238405Sjkim add %rbp,$sbox 1237183234Ssimon 1238238405Sjkim call _x86_64_AES_decrypt_compact 1239183234Ssimon 1240238405Sjkim mov 16(%rsp),$out # restore out 1241238405Sjkim mov 24(%rsp),%rsi # restore saved stack pointer 1242238405Sjkim mov $s0,0($out) # write output vector 1243183234Ssimon mov $s1,4($out) 1244183234Ssimon mov $s2,8($out) 1245183234Ssimon mov $s3,12($out) 1246183234Ssimon 1247238405Sjkim mov (%rsi),%r15 1248238405Sjkim mov 8(%rsi),%r14 1249238405Sjkim mov 16(%rsi),%r13 1250238405Sjkim mov 24(%rsi),%r12 1251238405Sjkim mov 32(%rsi),%rbp 1252238405Sjkim mov 40(%rsi),%rbx 1253238405Sjkim lea 48(%rsi),%rsp 1254238405Sjkim.Ldec_epilogue: 1255183234Ssimon ret 1256183234Ssimon.size AES_decrypt,.-AES_decrypt 1257183234Ssimon___ 1258183234Ssimon#------------------------------------------------------------------# 1259183234Ssimon 1260183234Ssimonsub enckey() 1261183234Ssimon{ 1262183234Ssimon$code.=<<___; 1263183234Ssimon movz %dl,%esi # rk[i]>>0 1264238405Sjkim movzb -128(%rbp,%rsi),%ebx 1265183234Ssimon movz %dh,%esi # rk[i]>>8 1266238405Sjkim shl \$24,%ebx 1267183234Ssimon xor %ebx,%eax 1268183234Ssimon 1269238405Sjkim movzb -128(%rbp,%rsi),%ebx 1270183234Ssimon shr \$16,%edx 1271183234Ssimon movz %dl,%esi # rk[i]>>16 1272183234Ssimon xor %ebx,%eax 1273183234Ssimon 1274238405Sjkim movzb -128(%rbp,%rsi),%ebx 1275183234Ssimon movz %dh,%esi # rk[i]>>24 1276238405Sjkim shl \$8,%ebx 1277183234Ssimon xor %ebx,%eax 1278183234Ssimon 1279238405Sjkim movzb -128(%rbp,%rsi),%ebx 1280238405Sjkim shl \$16,%ebx 1281183234Ssimon xor %ebx,%eax 1282183234Ssimon 1283238405Sjkim xor 1024-128(%rbp,%rcx,4),%eax # rcon 1284183234Ssimon___ 1285183234Ssimon} 1286183234Ssimon 1287238405Sjkim# int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits, 1288183234Ssimon# AES_KEY *key) 1289183234Ssimon$code.=<<___; 1290238405Sjkim.globl private_AES_set_encrypt_key 1291238405Sjkim.type private_AES_set_encrypt_key,\@function,3 1292183234Ssimon.align 16 
private_AES_set_encrypt_key:
 push %rbx
 push %rbp
 push %r12 # redundant, but allows to share
 push %r13 # exception handler...
 push %r14
 push %r15
 sub \$8,%rsp
.Lenc_key_prologue:

 call _x86_64_AES_set_encrypt_key

 mov 8(%rsp),%r15
 mov 16(%rsp),%r14
 mov 24(%rsp),%r13
 mov 32(%rsp),%r12
 mov 40(%rsp),%rbp
 mov 48(%rsp),%rbx
 add \$56,%rsp
.Lenc_key_epilogue:
 ret
.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key

.type _x86_64_AES_set_encrypt_key,\@abi-omnipotent
.align 16
_x86_64_AES_set_encrypt_key:
 mov %esi,%ecx # %ecx=bits
 mov %rdi,%rsi # %rsi=userKey
 mov %rdx,%rdi # %rdi=key

 test \$-1,%rsi
 jz .Lbadpointer
 test \$-1,%rdi
 jz .Lbadpointer

 lea .LAES_Te(%rip),%rbp
 lea 2048+128(%rbp),%rbp

 # prefetch Te4
 mov 0-128(%rbp),%eax
 mov 32-128(%rbp),%ebx
 mov 64-128(%rbp),%r8d
 mov 96-128(%rbp),%edx
 mov 128-128(%rbp),%eax
 mov 160-128(%rbp),%ebx
 mov 192-128(%rbp),%r8d
 mov 224-128(%rbp),%edx

 cmp \$128,%ecx
 je .L10rounds
 cmp \$192,%ecx
 je .L12rounds
 cmp \$256,%ecx
 je .L14rounds
 mov \$-2,%rax # invalid number of bits
 jmp .Lexit

.L10rounds:
 mov 0(%rsi),%rax # copy first 4 dwords
 mov 8(%rsi),%rdx
 mov %rax,0(%rdi)
 mov %rdx,8(%rdi)

 shr \$32,%rdx
 xor %ecx,%ecx
 jmp .L10shortcut
.align 4
.L10loop:
 mov 0(%rdi),%eax # rk[0]
 mov 12(%rdi),%edx # rk[3]
.L10shortcut:
___
	# 128-bit key: every pass derives rk[4..7] from rk[0..3] and then
	# advances %rdi by 16.  %ecx counts the passes (0..9) and is also the
	# rcon index consumed by enckey().
	&enckey ();
$code.=<<___;
 mov %eax,16(%rdi) # rk[4]
 xor 4(%rdi),%eax
 mov %eax,20(%rdi) # rk[5]
 xor 8(%rdi),%eax
 mov %eax,24(%rdi) # rk[6]
 xor 12(%rdi),%eax
 mov %eax,28(%rdi) # rk[7]
 add \$1,%ecx
 lea 16(%rdi),%rdi
 cmp \$10,%ecx
 jl .L10loop

 movl \$10,80(%rdi) # setup number of rounds
 xor %rax,%rax
 jmp .Lexit

.L12rounds:
 mov 0(%rsi),%rax # copy first 6 dwords
 mov 8(%rsi),%rbx
 mov 16(%rsi),%rdx
 mov %rax,0(%rdi)
 mov %rbx,8(%rdi)
 mov %rdx,16(%rdi)

 shr \$32,%rdx
 xor %ecx,%ecx
 jmp .L12shortcut
.align 4
.L12loop:
 mov 0(%rdi),%eax # rk[0]
 mov 20(%rdi),%edx # rk[5]
.L12shortcut:
___
	# 192-bit key: every pass derives rk[6..11] from rk[0..5] and then
	# advances %rdi by 24.  The pass entered with %ecx==7 stops after
	# rk[9].
	&enckey ();
$code.=<<___;
 mov %eax,24(%rdi) # rk[6]
 xor 4(%rdi),%eax
 mov %eax,28(%rdi) # rk[7]
 xor 8(%rdi),%eax
 mov %eax,32(%rdi) # rk[8]
 xor 12(%rdi),%eax
 mov %eax,36(%rdi) # rk[9]

 cmp \$7,%ecx
 je .L12break
 add \$1,%ecx

 xor 16(%rdi),%eax
 mov %eax,40(%rdi) # rk[10]
 xor 20(%rdi),%eax
 mov %eax,44(%rdi) # rk[11]

 lea 24(%rdi),%rdi
 jmp .L12loop
.L12break:
 movl \$12,72(%rdi) # setup number of rounds
 xor %rax,%rax
 jmp .Lexit

.L14rounds:
 mov 0(%rsi),%rax # copy first 8 dwords
 mov 8(%rsi),%rbx
 mov 16(%rsi),%rcx
 mov 24(%rsi),%rdx
 mov %rax,0(%rdi)
 mov %rbx,8(%rdi)
 mov %rcx,16(%rdi)
 mov %rdx,24(%rdi)

 shr \$32,%rdx
 xor %ecx,%ecx
 jmp .L14shortcut
.align 4
.L14loop:
 mov 0(%rdi),%eax # rk[0]
 mov 28(%rdi),%edx # rk[4]
.L14shortcut:
___
	# 256-bit key: every pass derives rk[8..15] from rk[0..7] and then
	# advances %rdi by 32.  The pass entered with %ecx==6 stops after
	# rk[11].  rk[12] is rk[4] xored with a second, inline set of four byte
	# lookups on rk[11] (shifts 0/8/16/24, no rcon).
	# NOTE: in the heredoc just above, the load from 28(%rdi) carries the
	# emitted annotation "rk[4]".  With 4-byte words that offset is rk[7],
	# as in "mov %eax,28(%rdi) # rk[7]" on the 128-bit path.  The emitted
	# text is left as is.
	&enckey ();
$code.=<<___;
 mov %eax,32(%rdi) # rk[8]
 xor 4(%rdi),%eax
 mov %eax,36(%rdi) # rk[9]
 xor 8(%rdi),%eax
 mov %eax,40(%rdi) # rk[10]
 xor 12(%rdi),%eax
 mov %eax,44(%rdi) # rk[11]

 cmp \$6,%ecx
 je .L14break
 add \$1,%ecx

 mov %eax,%edx
 mov 16(%rdi),%eax # rk[4]
 movz %dl,%esi # rk[11]>>0
 movzb -128(%rbp,%rsi),%ebx
 movz %dh,%esi # rk[11]>>8
 xor %ebx,%eax

 movzb -128(%rbp,%rsi),%ebx
 shr \$16,%edx
 shl \$8,%ebx
 movz %dl,%esi # rk[11]>>16
 xor %ebx,%eax

 movzb -128(%rbp,%rsi),%ebx
 movz %dh,%esi # rk[11]>>24
 shl \$16,%ebx
 xor %ebx,%eax

 movzb -128(%rbp,%rsi),%ebx
 shl \$24,%ebx
 xor %ebx,%eax

 mov %eax,48(%rdi) # rk[12]
 xor 20(%rdi),%eax
 mov %eax,52(%rdi) # rk[13]
 xor 24(%rdi),%eax
 mov %eax,56(%rdi) # rk[14]
 xor 28(%rdi),%eax
 mov %eax,60(%rdi) # rk[15]

 lea 32(%rdi),%rdi
 jmp .L14loop
.L14break:
 movl \$14,48(%rdi) # setup number of rounds
 xor %rax,%rax
 jmp .Lexit

.Lbadpointer:
 mov \$-1,%rax
.Lexit:
 .byte 0xf3,0xc3 # rep ret
.size _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key
___

# deckey_ref($i,$ptr,$te,$td) - scalar reference version of the
# decryption-key transform, handling one 32-bit word per call.
#
#   $i    byte offset of the word, used as the displacement in $i($ptr)
#   $ptr  register holding the base address
#   $te, $td  accepted but not used by the body
#
# The word is loaded into %eax.  tp2, tp4 and tp8 are derived from it by
# three rounds of the 0x80808080 / 0xfefefefe / 0x1b1b1b1b mask-and-shift
# sequence, combined with rotations by 8, 24 and 16, and the result is
# stored back to $i($ptr).  %ebx, %edi, %edx and %r8d are clobbered.
# No call to this sub is visible in this chunk; the key setup code further
# down uses dectransform() instead.
sub deckey_ref()
{ my ($i,$ptr,$te,$td) = @_;
 my ($tp1,$tp2,$tp4,$tp8,$acc)=("%eax","%ebx","%edi","%edx","%r8d");
$code.=<<___;
 mov $i($ptr),$tp1
 mov $tp1,$acc
 and \$0x80808080,$acc
 mov $acc,$tp4
 shr \$7,$tp4
 lea 0($tp1,$tp1),$tp2
 sub $tp4,$acc
 and \$0xfefefefe,$tp2
 and \$0x1b1b1b1b,$acc
 xor $tp2,$acc
 mov $acc,$tp2

 and \$0x80808080,$acc
 mov $acc,$tp8
 shr \$7,$tp8
 lea 0($tp2,$tp2),$tp4
 sub $tp8,$acc
 and \$0xfefefefe,$tp4
 and \$0x1b1b1b1b,$acc
 xor $tp1,$tp2 # tp2^tp1
 xor $tp4,$acc
 mov $acc,$tp4

 and \$0x80808080,$acc
 mov $acc,$tp8
 shr \$7,$tp8
 sub $tp8,$acc
 lea 0($tp4,$tp4),$tp8
 xor $tp1,$tp4 # tp4^tp1
 and \$0xfefefefe,$tp8
 and \$0x1b1b1b1b,$acc
 xor $acc,$tp8

 xor $tp8,$tp1 # tp1^tp8
 rol \$8,$tp1 # ROTATE(tp1^tp8,8)
 xor $tp8,$tp2 # tp2^tp1^tp8
 xor $tp8,$tp4 # tp4^tp1^tp8
 xor $tp2,$tp8
 xor $tp4,$tp8 # tp8^(tp8^tp4^tp1)^(tp8^tp2^tp1)=tp8^tp4^tp2

 xor $tp8,$tp1
 rol \$24,$tp2 # ROTATE(tp2^tp1^tp8,24)
 xor $tp2,$tp1
 rol \$16,$tp4 # ROTATE(tp4^tp1^tp8,16)
 xor $tp4,$tp1

 mov $tp1,$i($ptr)
___
}

# int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
# AES_KEY *key)
#
# Emit the decryption key setup in three stages:
#  1. build the encryption schedule with _x86_64_AES_set_encrypt_key; a
#     non-zero status is returned to the caller unchanged via .Labort;
#  2. .Linvert swaps the 16-byte round keys end for end, walking %rsi up
#     from the start and %rdi down from the last one until they meet;
#  3. .Lpermute runs dectransform() (no prefetch) over rounds-1 round keys
#     starting with the second one, so the first and last are left as is.
# The three masks used by dectransform() are loaded from 40/48/56 bytes
# past the rcon table.  Returns 0 on success.
$code.=<<___;
.globl private_AES_set_decrypt_key
.type private_AES_set_decrypt_key,\@function,3
.align 16
private_AES_set_decrypt_key:
 push %rbx
 push %rbp
 push %r12
 push %r13
 push %r14
 push %r15
 push %rdx # save key schedule
.Ldec_key_prologue:

 call _x86_64_AES_set_encrypt_key
 mov (%rsp),%r8 # restore key schedule
 cmp \$0,%eax
 jne .Labort

 mov 240(%r8),%r14d # pull number of rounds
 xor %rdi,%rdi
 lea (%rdi,%r14d,4),%rcx
 mov %r8,%rsi
 lea (%r8,%rcx,4),%rdi # pointer to last chunk
.align 4
.Linvert:
 mov 0(%rsi),%rax
 mov 8(%rsi),%rbx
 mov 0(%rdi),%rcx
 mov 8(%rdi),%rdx
 mov %rax,0(%rdi)
 mov %rbx,8(%rdi)
 mov %rcx,0(%rsi)
 mov %rdx,8(%rsi)
 lea 16(%rsi),%rsi
 lea -16(%rdi),%rdi
 cmp %rsi,%rdi
 jne .Linvert

 lea .LAES_Te+2048+1024(%rip),%rax # rcon

 mov 40(%rax),$mask80
 mov 48(%rax),$maskfe
 mov 56(%rax),$mask1b

 mov %r8,$key
 sub \$1,%r14d
.align 4
.Lpermute:
 lea 16($key),$key
 mov 0($key),%rax
 mov 8($key),%rcx
___
	&dectransform ();
$code.=<<___;
 mov %eax,0($key)
 mov %ebx,4($key)
 mov %ecx,8($key)
 mov %edx,12($key)
 sub \$1,%r14d
 jnz .Lpermute

 xor %rax,%rax
.Labort:
 mov 8(%rsp),%r15
 mov 16(%rsp),%r14
 mov 24(%rsp),%r13
 mov 32(%rsp),%r12
 mov 40(%rsp),%rbp
 mov 48(%rsp),%rbx
 add \$56,%rsp
.Ldec_key_epilogue:
 ret
.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
___

# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
# size_t length, const AES_KEY *key,
# unsigned char *ivp,const int enc);
{
# stack frame layout
# -8(%rsp) return address
my $keyp="0(%rsp)"; # one to pass as $key
my $keyend="8(%rsp)"; #
&(keyp->rd_key[4*keyp->rounds]) 1643238405Sjkimmy $_rsp="16(%rsp)"; # saved %rsp 1644238405Sjkimmy $_inp="24(%rsp)"; # copy of 1st parameter, inp 1645238405Sjkimmy $_out="32(%rsp)"; # copy of 2nd parameter, out 1646238405Sjkimmy $_len="40(%rsp)"; # copy of 3rd parameter, length 1647238405Sjkimmy $_key="48(%rsp)"; # copy of 4th parameter, key 1648238405Sjkimmy $_ivp="56(%rsp)"; # copy of 5th parameter, ivp 1649238405Sjkimmy $ivec="64(%rsp)"; # ivec[16] 1650238405Sjkimmy $aes_key="80(%rsp)"; # copy of aes_key 1651238405Sjkimmy $mark="80+240(%rsp)"; # copy of aes_key->rounds 1652183234Ssimon 1653183234Ssimon$code.=<<___; 1654183234Ssimon.globl AES_cbc_encrypt 1655183234Ssimon.type AES_cbc_encrypt,\@function,6 1656183234Ssimon.align 16 1657238405Sjkim.extern OPENSSL_ia32cap_P 1658238405Sjkim.globl asm_AES_cbc_encrypt 1659238405Sjkim.hidden asm_AES_cbc_encrypt 1660238405Sjkimasm_AES_cbc_encrypt: 1661183234SsimonAES_cbc_encrypt: 1662183234Ssimon cmp \$0,%rdx # check length 1663238405Sjkim je .Lcbc_epilogue 1664238405Sjkim pushfq 1665183234Ssimon push %rbx 1666183234Ssimon push %rbp 1667183234Ssimon push %r12 1668183234Ssimon push %r13 1669183234Ssimon push %r14 1670183234Ssimon push %r15 1671238405Sjkim.Lcbc_prologue: 1672238405Sjkim 1673183234Ssimon cld 1674183234Ssimon mov %r9d,%r9d # clear upper half of enc 1675183234Ssimon 1676238405Sjkim lea .LAES_Te(%rip),$sbox 1677183234Ssimon cmp \$0,%r9 1678238405Sjkim jne .Lcbc_picked_te 1679238405Sjkim lea .LAES_Td(%rip),$sbox 1680238405Sjkim.Lcbc_picked_te: 1681183234Ssimon 1682238405Sjkim mov OPENSSL_ia32cap_P(%rip),%r10d 1683238405Sjkim cmp \$$speed_limit,%rdx 1684238405Sjkim jb .Lcbc_slow_prologue 1685238405Sjkim test \$15,%rdx 1686238405Sjkim jnz .Lcbc_slow_prologue 1687238405Sjkim bt \$28,%r10d 1688238405Sjkim jc .Lcbc_slow_prologue 1689183234Ssimon 1690183234Ssimon # allocate aligned stack frame... 1691238405Sjkim lea -88-248(%rsp),$key 1692183234Ssimon and \$-64,$key 1693183234Ssimon 1694238405Sjkim # ... 
and make sure it doesn't alias with AES_T[ed] modulo 4096 1695183234Ssimon mov $sbox,%r10 1696238405Sjkim lea 2304($sbox),%r11 1697183234Ssimon mov $key,%r12 1698183234Ssimon and \$0xFFF,%r10 # s = $sbox&0xfff 1699183234Ssimon and \$0xFFF,%r11 # e = ($sbox+2048)&0xfff 1700183234Ssimon and \$0xFFF,%r12 # p = %rsp&0xfff 1701183234Ssimon 1702183234Ssimon cmp %r11,%r12 # if (p=>e) %rsp =- (p-e); 1703183234Ssimon jb .Lcbc_te_break_out 1704183234Ssimon sub %r11,%r12 1705183234Ssimon sub %r12,$key 1706183234Ssimon jmp .Lcbc_te_ok 1707183234Ssimon.Lcbc_te_break_out: # else %rsp -= (p-s)&0xfff + framesz 1708183234Ssimon sub %r10,%r12 1709183234Ssimon and \$0xFFF,%r12 1710183234Ssimon add \$320,%r12 1711183234Ssimon sub %r12,$key 1712183234Ssimon.align 4 1713183234Ssimon.Lcbc_te_ok: 1714183234Ssimon 1715183234Ssimon xchg %rsp,$key 1716238405Sjkim #add \$8,%rsp # reserve for return address! 1717183234Ssimon mov $key,$_rsp # save %rsp 1718238405Sjkim.Lcbc_fast_body: 1719238405Sjkim mov %rdi,$_inp # save copy of inp 1720238405Sjkim mov %rsi,$_out # save copy of out 1721183234Ssimon mov %rdx,$_len # save copy of len 1722183234Ssimon mov %rcx,$_key # save copy of key 1723183234Ssimon mov %r8,$_ivp # save copy of ivp 1724183234Ssimon movl \$0,$mark # copy of aes_key->rounds = 0; 1725183234Ssimon mov %r8,%rbp # rearrange input arguments 1726238405Sjkim mov %r9,%rbx 1727183234Ssimon mov %rsi,$out 1728183234Ssimon mov %rdi,$inp 1729183234Ssimon mov %rcx,$key 1730183234Ssimon 1731238405Sjkim mov 240($key),%eax # key->rounds 1732183234Ssimon # do we copy key schedule to stack? 
1733183234Ssimon mov $key,%r10 1734183234Ssimon sub $sbox,%r10 1735183234Ssimon and \$0xfff,%r10 1736238405Sjkim cmp \$2304,%r10 1737183234Ssimon jb .Lcbc_do_ecopy 1738183234Ssimon cmp \$4096-248,%r10 1739183234Ssimon jb .Lcbc_skip_ecopy 1740183234Ssimon.align 4 1741183234Ssimon.Lcbc_do_ecopy: 1742183234Ssimon mov $key,%rsi 1743183234Ssimon lea $aes_key,%rdi 1744183234Ssimon lea $aes_key,$key 1745183234Ssimon mov \$240/8,%ecx 1746183234Ssimon .long 0x90A548F3 # rep movsq 1747238405Sjkim mov %eax,(%rdi) # copy aes_key->rounds 1748183234Ssimon.Lcbc_skip_ecopy: 1749183234Ssimon mov $key,$keyp # save key pointer 1750183234Ssimon 1751238405Sjkim mov \$18,%ecx 1752183234Ssimon.align 4 1753183234Ssimon.Lcbc_prefetch_te: 1754183234Ssimon mov 0($sbox),%r10 1755183234Ssimon mov 32($sbox),%r11 1756183234Ssimon mov 64($sbox),%r12 1757183234Ssimon mov 96($sbox),%r13 1758183234Ssimon lea 128($sbox),$sbox 1759183234Ssimon sub \$1,%ecx 1760183234Ssimon jnz .Lcbc_prefetch_te 1761238405Sjkim lea -2304($sbox),$sbox 1762183234Ssimon 1763238405Sjkim cmp \$0,%rbx 1764238405Sjkim je .LFAST_DECRYPT 1765238405Sjkim 1766238405Sjkim#----------------------------- ENCRYPT -----------------------------# 1767183234Ssimon mov 0(%rbp),$s0 # load iv 1768183234Ssimon mov 4(%rbp),$s1 1769183234Ssimon mov 8(%rbp),$s2 1770183234Ssimon mov 12(%rbp),$s3 1771183234Ssimon 1772183234Ssimon.align 4 1773238405Sjkim.Lcbc_fast_enc_loop: 1774183234Ssimon xor 0($inp),$s0 1775183234Ssimon xor 4($inp),$s1 1776183234Ssimon xor 8($inp),$s2 1777183234Ssimon xor 12($inp),$s3 1778238405Sjkim mov $keyp,$key # restore key 1779238405Sjkim mov $inp,$_inp # if ($verticalspin) save inp 1780183234Ssimon 1781183234Ssimon call _x86_64_AES_encrypt 1782183234Ssimon 1783238405Sjkim mov $_inp,$inp # if ($verticalspin) restore inp 1784238405Sjkim mov $_len,%r10 1785183234Ssimon mov $s0,0($out) 1786183234Ssimon mov $s1,4($out) 1787183234Ssimon mov $s2,8($out) 1788183234Ssimon mov $s3,12($out) 1789183234Ssimon 1790183234Ssimon lea 
16($inp),$inp 1791183234Ssimon lea 16($out),$out 1792183234Ssimon sub \$16,%r10 1793183234Ssimon test \$-16,%r10 1794183234Ssimon mov %r10,$_len 1795238405Sjkim jnz .Lcbc_fast_enc_loop 1796183234Ssimon mov $_ivp,%rbp # restore ivp 1797183234Ssimon mov $s0,0(%rbp) # save ivec 1798183234Ssimon mov $s1,4(%rbp) 1799183234Ssimon mov $s2,8(%rbp) 1800183234Ssimon mov $s3,12(%rbp) 1801183234Ssimon 1802238405Sjkim jmp .Lcbc_fast_cleanup 1803238405Sjkim 1804183234Ssimon#----------------------------- DECRYPT -----------------------------# 1805183234Ssimon.align 16 1806238405Sjkim.LFAST_DECRYPT: 1807183234Ssimon cmp $inp,$out 1808238405Sjkim je .Lcbc_fast_dec_in_place 1809183234Ssimon 1810183234Ssimon mov %rbp,$ivec 1811183234Ssimon.align 4 1812238405Sjkim.Lcbc_fast_dec_loop: 1813238405Sjkim mov 0($inp),$s0 # read input 1814183234Ssimon mov 4($inp),$s1 1815183234Ssimon mov 8($inp),$s2 1816183234Ssimon mov 12($inp),$s3 1817238405Sjkim mov $keyp,$key # restore key 1818238405Sjkim mov $inp,$_inp # if ($verticalspin) save inp 1819183234Ssimon 1820183234Ssimon call _x86_64_AES_decrypt 1821183234Ssimon 1822183234Ssimon mov $ivec,%rbp # load ivp 1823238405Sjkim mov $_inp,$inp # if ($verticalspin) restore inp 1824238405Sjkim mov $_len,%r10 # load len 1825183234Ssimon xor 0(%rbp),$s0 # xor iv 1826183234Ssimon xor 4(%rbp),$s1 1827183234Ssimon xor 8(%rbp),$s2 1828183234Ssimon xor 12(%rbp),$s3 1829183234Ssimon mov $inp,%rbp # current input, next iv 1830183234Ssimon 1831183234Ssimon sub \$16,%r10 1832183234Ssimon mov %r10,$_len # update len 1833183234Ssimon mov %rbp,$ivec # update ivp 1834183234Ssimon 1835183234Ssimon mov $s0,0($out) # write output 1836183234Ssimon mov $s1,4($out) 1837183234Ssimon mov $s2,8($out) 1838183234Ssimon mov $s3,12($out) 1839183234Ssimon 1840183234Ssimon lea 16($inp),$inp 1841183234Ssimon lea 16($out),$out 1842238405Sjkim jnz .Lcbc_fast_dec_loop 1843183234Ssimon mov $_ivp,%r12 # load user ivp 1844183234Ssimon mov 0(%rbp),%r10 # load iv 1845183234Ssimon mov 
8(%rbp),%r11 1846183234Ssimon mov %r10,0(%r12) # copy back to user 1847183234Ssimon mov %r11,8(%r12) 1848238405Sjkim jmp .Lcbc_fast_cleanup 1849183234Ssimon 1850238405Sjkim.align 16 1851238405Sjkim.Lcbc_fast_dec_in_place: 1852238405Sjkim mov 0(%rbp),%r10 # copy iv to stack 1853238405Sjkim mov 8(%rbp),%r11 1854238405Sjkim mov %r10,0+$ivec 1855238405Sjkim mov %r11,8+$ivec 1856183234Ssimon.align 4 1857238405Sjkim.Lcbc_fast_dec_in_place_loop: 1858183234Ssimon mov 0($inp),$s0 # load input 1859183234Ssimon mov 4($inp),$s1 1860183234Ssimon mov 8($inp),$s2 1861183234Ssimon mov 12($inp),$s3 1862238405Sjkim mov $keyp,$key # restore key 1863238405Sjkim mov $inp,$_inp # if ($verticalspin) save inp 1864183234Ssimon 1865183234Ssimon call _x86_64_AES_decrypt 1866183234Ssimon 1867238405Sjkim mov $_inp,$inp # if ($verticalspin) restore inp 1868238405Sjkim mov $_len,%r10 1869238405Sjkim xor 0+$ivec,$s0 1870238405Sjkim xor 4+$ivec,$s1 1871238405Sjkim xor 8+$ivec,$s2 1872238405Sjkim xor 12+$ivec,$s3 1873183234Ssimon 1874238405Sjkim mov 0($inp),%r11 # load input 1875238405Sjkim mov 8($inp),%r12 1876238405Sjkim sub \$16,%r10 1877238405Sjkim jz .Lcbc_fast_dec_in_place_done 1878183234Ssimon 1879238405Sjkim mov %r11,0+$ivec # copy input to iv 1880238405Sjkim mov %r12,8+$ivec 1881238405Sjkim 1882183234Ssimon mov $s0,0($out) # save output [zaps input] 1883183234Ssimon mov $s1,4($out) 1884183234Ssimon mov $s2,8($out) 1885183234Ssimon mov $s3,12($out) 1886183234Ssimon 1887183234Ssimon lea 16($inp),$inp 1888183234Ssimon lea 16($out),$out 1889238405Sjkim mov %r10,$_len 1890238405Sjkim jmp .Lcbc_fast_dec_in_place_loop 1891238405Sjkim.Lcbc_fast_dec_in_place_done: 1892238405Sjkim mov $_ivp,%rdi 1893238405Sjkim mov %r11,0(%rdi) # copy iv back to user 1894238405Sjkim mov %r12,8(%rdi) 1895183234Ssimon 1896238405Sjkim mov $s0,0($out) # save output [zaps input] 1897238405Sjkim mov $s1,4($out) 1898238405Sjkim mov $s2,8($out) 1899238405Sjkim mov $s3,12($out) 1900238405Sjkim 1901183234Ssimon.align 4 
1902238405Sjkim.Lcbc_fast_cleanup: 1903238405Sjkim cmpl \$0,$mark # was the key schedule copied? 1904238405Sjkim lea $aes_key,%rdi 1905238405Sjkim je .Lcbc_exit 1906238405Sjkim mov \$240/8,%ecx 1907238405Sjkim xor %rax,%rax 1908238405Sjkim .long 0x90AB48F3 # rep stosq 1909238405Sjkim 1910238405Sjkim jmp .Lcbc_exit 1911238405Sjkim 1912238405Sjkim#--------------------------- SLOW ROUTINE ---------------------------# 1913238405Sjkim.align 16 1914238405Sjkim.Lcbc_slow_prologue: 1915238405Sjkim # allocate aligned stack frame... 1916238405Sjkim lea -88(%rsp),%rbp 1917238405Sjkim and \$-64,%rbp 1918238405Sjkim # ... just "above" key schedule 1919238405Sjkim lea -88-63(%rcx),%r10 1920238405Sjkim sub %rbp,%r10 1921238405Sjkim neg %r10 1922238405Sjkim and \$0x3c0,%r10 1923238405Sjkim sub %r10,%rbp 1924238405Sjkim 1925238405Sjkim xchg %rsp,%rbp 1926238405Sjkim #add \$8,%rsp # reserve for return address! 1927238405Sjkim mov %rbp,$_rsp # save %rsp 1928238405Sjkim.Lcbc_slow_body: 1929238405Sjkim #mov %rdi,$_inp # save copy of inp 1930238405Sjkim #mov %rsi,$_out # save copy of out 1931238405Sjkim #mov %rdx,$_len # save copy of len 1932238405Sjkim #mov %rcx,$_key # save copy of key 1933238405Sjkim mov %r8,$_ivp # save copy of ivp 1934238405Sjkim mov %r8,%rbp # rearrange input arguments 1935238405Sjkim mov %r9,%rbx 1936238405Sjkim mov %rsi,$out 1937238405Sjkim mov %rdi,$inp 1938238405Sjkim mov %rcx,$key 1939238405Sjkim mov %rdx,%r10 1940238405Sjkim 1941238405Sjkim mov 240($key),%eax 1942238405Sjkim mov $key,$keyp # save key pointer 1943238405Sjkim shl \$4,%eax 1944238405Sjkim lea ($key,%rax),%rax 1945238405Sjkim mov %rax,$keyend 1946238405Sjkim 1947238405Sjkim # pick Te4 copy which can't "overlap" with stack frame or key scdedule 1948238405Sjkim lea 2048($sbox),$sbox 1949238405Sjkim lea 768-8(%rsp),%rax 1950238405Sjkim sub $sbox,%rax 1951238405Sjkim and \$0x300,%rax 1952238405Sjkim lea ($sbox,%rax),$sbox 1953238405Sjkim 1954238405Sjkim cmp \$0,%rbx 1955238405Sjkim je .LSLOW_DECRYPT 
1956238405Sjkim 1957238405Sjkim#--------------------------- SLOW ENCRYPT ---------------------------# 1958238405Sjkim test \$-16,%r10 # check upon length 1959238405Sjkim mov 0(%rbp),$s0 # load iv 1960238405Sjkim mov 4(%rbp),$s1 1961238405Sjkim mov 8(%rbp),$s2 1962238405Sjkim mov 12(%rbp),$s3 1963238405Sjkim jz .Lcbc_slow_enc_tail # short input... 1964238405Sjkim 1965238405Sjkim.align 4 1966238405Sjkim.Lcbc_slow_enc_loop: 1967238405Sjkim xor 0($inp),$s0 1968238405Sjkim xor 4($inp),$s1 1969238405Sjkim xor 8($inp),$s2 1970238405Sjkim xor 12($inp),$s3 1971238405Sjkim mov $keyp,$key # restore key 1972238405Sjkim mov $inp,$_inp # save inp 1973238405Sjkim mov $out,$_out # save out 1974238405Sjkim mov %r10,$_len # save len 1975238405Sjkim 1976238405Sjkim call _x86_64_AES_encrypt_compact 1977238405Sjkim 1978238405Sjkim mov $_inp,$inp # restore inp 1979238405Sjkim mov $_out,$out # restore out 1980238405Sjkim mov $_len,%r10 # restore len 1981238405Sjkim mov $s0,0($out) 1982238405Sjkim mov $s1,4($out) 1983238405Sjkim mov $s2,8($out) 1984238405Sjkim mov $s3,12($out) 1985238405Sjkim 1986238405Sjkim lea 16($inp),$inp 1987238405Sjkim lea 16($out),$out 1988238405Sjkim sub \$16,%r10 1989238405Sjkim test \$-16,%r10 1990238405Sjkim jnz .Lcbc_slow_enc_loop 1991238405Sjkim test \$15,%r10 1992238405Sjkim jnz .Lcbc_slow_enc_tail 1993238405Sjkim mov $_ivp,%rbp # restore ivp 1994238405Sjkim mov $s0,0(%rbp) # save ivec 1995238405Sjkim mov $s1,4(%rbp) 1996238405Sjkim mov $s2,8(%rbp) 1997238405Sjkim mov $s3,12(%rbp) 1998238405Sjkim 1999238405Sjkim jmp .Lcbc_exit 2000238405Sjkim 2001238405Sjkim.align 4 2002238405Sjkim.Lcbc_slow_enc_tail: 2003238405Sjkim mov %rax,%r11 2004238405Sjkim mov %rcx,%r12 2005238405Sjkim mov %r10,%rcx 2006238405Sjkim mov $inp,%rsi 2007238405Sjkim mov $out,%rdi 2008238405Sjkim .long 0x9066A4F3 # rep movsb 2009238405Sjkim mov \$16,%rcx # zero tail 2010238405Sjkim sub %r10,%rcx 2011238405Sjkim xor %rax,%rax 2012238405Sjkim .long 0x9066AAF3 # rep stosb 2013238405Sjkim mov 
$out,$inp # this is not a mistake! 2014238405Sjkim mov \$16,%r10 # len=16 2015238405Sjkim mov %r11,%rax 2016238405Sjkim mov %r12,%rcx 2017238405Sjkim jmp .Lcbc_slow_enc_loop # one more spin... 2018238405Sjkim#--------------------------- SLOW DECRYPT ---------------------------# 2019238405Sjkim.align 16 2020238405Sjkim.LSLOW_DECRYPT: 2021238405Sjkim shr \$3,%rax 2022238405Sjkim add %rax,$sbox # recall "magic" constants! 2023238405Sjkim 2024238405Sjkim mov 0(%rbp),%r11 # copy iv to stack 2025238405Sjkim mov 8(%rbp),%r12 2026238405Sjkim mov %r11,0+$ivec 2027238405Sjkim mov %r12,8+$ivec 2028238405Sjkim 2029238405Sjkim.align 4 2030238405Sjkim.Lcbc_slow_dec_loop: 2031238405Sjkim mov 0($inp),$s0 # load input 2032238405Sjkim mov 4($inp),$s1 2033238405Sjkim mov 8($inp),$s2 2034238405Sjkim mov 12($inp),$s3 2035238405Sjkim mov $keyp,$key # restore key 2036238405Sjkim mov $inp,$_inp # save inp 2037238405Sjkim mov $out,$_out # save out 2038238405Sjkim mov %r10,$_len # save len 2039238405Sjkim 2040238405Sjkim call _x86_64_AES_decrypt_compact 2041238405Sjkim 2042238405Sjkim mov $_inp,$inp # restore inp 2043238405Sjkim mov $_out,$out # restore out 2044238405Sjkim mov $_len,%r10 2045238405Sjkim xor 0+$ivec,$s0 2046238405Sjkim xor 4+$ivec,$s1 2047238405Sjkim xor 8+$ivec,$s2 2048238405Sjkim xor 12+$ivec,$s3 2049238405Sjkim 2050238405Sjkim mov 0($inp),%r11 # load input 2051238405Sjkim mov 8($inp),%r12 2052238405Sjkim sub \$16,%r10 2053238405Sjkim jc .Lcbc_slow_dec_partial 2054238405Sjkim jz .Lcbc_slow_dec_done 2055238405Sjkim 2056238405Sjkim mov %r11,0+$ivec # copy input to iv 2057238405Sjkim mov %r12,8+$ivec 2058238405Sjkim 2059238405Sjkim mov $s0,0($out) # save output [can zap input] 2060238405Sjkim mov $s1,4($out) 2061238405Sjkim mov $s2,8($out) 2062238405Sjkim mov $s3,12($out) 2063238405Sjkim 2064238405Sjkim lea 16($inp),$inp 2065238405Sjkim lea 16($out),$out 2066238405Sjkim jmp .Lcbc_slow_dec_loop 2067238405Sjkim.Lcbc_slow_dec_done: 2068238405Sjkim mov $_ivp,%rdi 2069238405Sjkim 
mov %r11,0(%rdi) # copy iv back to user 2070238405Sjkim mov %r12,8(%rdi) 2071238405Sjkim 2072238405Sjkim mov $s0,0($out) # save output [can zap input] 2073238405Sjkim mov $s1,4($out) 2074238405Sjkim mov $s2,8($out) 2075238405Sjkim mov $s3,12($out) 2076238405Sjkim 2077238405Sjkim jmp .Lcbc_exit 2078238405Sjkim 2079238405Sjkim.align 4 2080238405Sjkim.Lcbc_slow_dec_partial: 2081238405Sjkim mov $_ivp,%rdi 2082238405Sjkim mov %r11,0(%rdi) # copy iv back to user 2083238405Sjkim mov %r12,8(%rdi) 2084238405Sjkim 2085238405Sjkim mov $s0,0+$ivec # save output to stack 2086238405Sjkim mov $s1,4+$ivec 2087238405Sjkim mov $s2,8+$ivec 2088238405Sjkim mov $s3,12+$ivec 2089238405Sjkim 2090238405Sjkim mov $out,%rdi 2091238405Sjkim lea $ivec,%rsi 2092238405Sjkim lea 16(%r10),%rcx 2093238405Sjkim .long 0x9066A4F3 # rep movsb 2094238405Sjkim jmp .Lcbc_exit 2095238405Sjkim 2096238405Sjkim.align 16 2097238405Sjkim.Lcbc_exit: 2098238405Sjkim mov $_rsp,%rsi 2099238405Sjkim mov (%rsi),%r15 2100238405Sjkim mov 8(%rsi),%r14 2101238405Sjkim mov 16(%rsi),%r13 2102238405Sjkim mov 24(%rsi),%r12 2103238405Sjkim mov 32(%rsi),%rbp 2104238405Sjkim mov 40(%rsi),%rbx 2105238405Sjkim lea 48(%rsi),%rsp 2106238405Sjkim.Lcbc_popfq: 2107238405Sjkim popfq 2108238405Sjkim.Lcbc_epilogue: 2109238405Sjkim ret 2110183234Ssimon.size AES_cbc_encrypt,.-AES_cbc_encrypt 2111183234Ssimon___ 2112183234Ssimon} 2113183234Ssimon 2114183234Ssimon$code.=<<___; 2115183234Ssimon.align 64 2116238405Sjkim.LAES_Te: 2117183234Ssimon___ 2118183234Ssimon &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6); 2119183234Ssimon &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591); 2120183234Ssimon &_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56); 2121183234Ssimon &_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec); 2122183234Ssimon &_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa); 2123183234Ssimon &_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb); 2124183234Ssimon 
&_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45); 2125183234Ssimon &_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b); 2126183234Ssimon &_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c); 2127183234Ssimon &_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83); 2128183234Ssimon &_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9); 2129183234Ssimon &_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a); 2130183234Ssimon &_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d); 2131183234Ssimon &_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f); 2132183234Ssimon &_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df); 2133183234Ssimon &_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea); 2134183234Ssimon &_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34); 2135183234Ssimon &_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b); 2136183234Ssimon &_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d); 2137183234Ssimon &_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413); 2138183234Ssimon &_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1); 2139183234Ssimon &_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6); 2140183234Ssimon &_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972); 2141183234Ssimon &_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85); 2142183234Ssimon &_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed); 2143183234Ssimon &_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511); 2144183234Ssimon &_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe); 2145183234Ssimon &_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b); 2146183234Ssimon &_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05); 2147183234Ssimon &_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1); 2148183234Ssimon &_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142); 2149183234Ssimon &_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 
0x6dd2d2bf); 2150183234Ssimon &_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3); 2151183234Ssimon &_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e); 2152183234Ssimon &_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a); 2153183234Ssimon &_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6); 2154183234Ssimon &_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3); 2155183234Ssimon &_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b); 2156183234Ssimon &_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428); 2157183234Ssimon &_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad); 2158183234Ssimon &_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14); 2159183234Ssimon &_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8); 2160183234Ssimon &_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4); 2161183234Ssimon &_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2); 2162183234Ssimon &_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda); 2163183234Ssimon &_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949); 2164183234Ssimon &_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf); 2165183234Ssimon &_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810); 2166183234Ssimon &_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c); 2167183234Ssimon &_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697); 2168183234Ssimon &_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e); 2169183234Ssimon &_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f); 2170183234Ssimon &_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc); 2171183234Ssimon &_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c); 2172183234Ssimon &_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969); 2173183234Ssimon &_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27); 2174183234Ssimon &_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122); 2175183234Ssimon 
&_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433); 2176183234Ssimon &_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9); 2177183234Ssimon &_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5); 2178183234Ssimon &_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a); 2179183234Ssimon &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0); 2180183234Ssimon &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e); 2181183234Ssimon &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c); 2182238405Sjkim 2183238405Sjkim#Te4 # four copies of Te4 to choose from to avoid L1 aliasing 2184238405Sjkim &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); 2185238405Sjkim &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); 2186238405Sjkim &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); 2187238405Sjkim &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); 2188238405Sjkim &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); 2189238405Sjkim &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); 2190238405Sjkim &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); 2191238405Sjkim &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); 2192238405Sjkim &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); 2193238405Sjkim &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); 2194238405Sjkim &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); 2195238405Sjkim &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); 2196238405Sjkim &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); 2197238405Sjkim &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); 2198238405Sjkim &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); 2199238405Sjkim &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); 2200238405Sjkim &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); 2201238405Sjkim &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); 
2202238405Sjkim &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); 2203238405Sjkim &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); 2204238405Sjkim &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); 2205238405Sjkim &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); 2206238405Sjkim &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); 2207238405Sjkim &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); 2208238405Sjkim &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); 2209238405Sjkim &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); 2210238405Sjkim &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); 2211238405Sjkim &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); 2212238405Sjkim &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); 2213238405Sjkim &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); 2214238405Sjkim &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); 2215238405Sjkim &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); 2216238405Sjkim 2217238405Sjkim &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); 2218238405Sjkim &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); 2219238405Sjkim &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); 2220238405Sjkim &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); 2221238405Sjkim &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); 2222238405Sjkim &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); 2223238405Sjkim &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); 2224238405Sjkim &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); 2225238405Sjkim &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); 2226238405Sjkim &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); 2227238405Sjkim &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); 2228238405Sjkim &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); 
2229238405Sjkim &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); 2230238405Sjkim &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); 2231238405Sjkim &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); 2232238405Sjkim &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); 2233238405Sjkim &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); 2234238405Sjkim &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); 2235238405Sjkim &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); 2236238405Sjkim &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); 2237238405Sjkim &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); 2238238405Sjkim &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); 2239238405Sjkim &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); 2240238405Sjkim &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); 2241238405Sjkim &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); 2242238405Sjkim &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); 2243238405Sjkim &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); 2244238405Sjkim &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); 2245238405Sjkim &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); 2246238405Sjkim &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); 2247238405Sjkim &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); 2248238405Sjkim &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); 2249238405Sjkim 2250238405Sjkim &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); 2251238405Sjkim &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); 2252238405Sjkim &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); 2253238405Sjkim &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); 2254238405Sjkim &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); 2255238405Sjkim &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); 
2256238405Sjkim &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); 2257238405Sjkim &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); 2258238405Sjkim &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); 2259238405Sjkim &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); 2260238405Sjkim &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); 2261238405Sjkim &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); 2262238405Sjkim &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); 2263238405Sjkim &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); 2264238405Sjkim &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); 2265238405Sjkim &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); 2266238405Sjkim &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); 2267238405Sjkim &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); 2268238405Sjkim &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); 2269238405Sjkim &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); 2270238405Sjkim &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); 2271238405Sjkim &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); 2272238405Sjkim &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); 2273238405Sjkim &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); 2274238405Sjkim &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); 2275238405Sjkim &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); 2276238405Sjkim &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); 2277238405Sjkim &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); 2278238405Sjkim &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); 2279238405Sjkim &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); 2280238405Sjkim &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); 2281238405Sjkim &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); 2282238405Sjkim 
2283238405Sjkim &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); 2284238405Sjkim &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); 2285238405Sjkim &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); 2286238405Sjkim &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); 2287238405Sjkim &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); 2288238405Sjkim &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); 2289238405Sjkim &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); 2290238405Sjkim &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); 2291238405Sjkim &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); 2292238405Sjkim &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); 2293238405Sjkim &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); 2294238405Sjkim &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); 2295238405Sjkim &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); 2296238405Sjkim &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); 2297238405Sjkim &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); 2298238405Sjkim &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); 2299238405Sjkim &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); 2300238405Sjkim &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); 2301238405Sjkim &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); 2302238405Sjkim &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); 2303238405Sjkim &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); 2304238405Sjkim &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); 2305238405Sjkim &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); 2306238405Sjkim &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); 2307238405Sjkim &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); 2308238405Sjkim &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); 2309238405Sjkim 
&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); 2310238405Sjkim &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); 2311238405Sjkim &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); 2312238405Sjkim &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); 2313238405Sjkim &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); 2314238405Sjkim &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); 2315183234Ssimon#rcon: 2316183234Ssimon$code.=<<___; 2317183234Ssimon .long 0x00000001, 0x00000002, 0x00000004, 0x00000008 2318183234Ssimon .long 0x00000010, 0x00000020, 0x00000040, 0x00000080 2319238405Sjkim .long 0x0000001b, 0x00000036, 0x80808080, 0x80808080 2320238405Sjkim .long 0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b 2321183234Ssimon___ 2322183234Ssimon$code.=<<___; 2323183234Ssimon.align 64 2324238405Sjkim.LAES_Td: 2325183234Ssimon___ 2326183234Ssimon &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a); 2327183234Ssimon &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b); 2328183234Ssimon &_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5); 2329183234Ssimon &_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5); 2330183234Ssimon &_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d); 2331183234Ssimon &_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b); 2332183234Ssimon &_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295); 2333183234Ssimon &_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e); 2334183234Ssimon &_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927); 2335183234Ssimon &_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d); 2336183234Ssimon &_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362); 2337183234Ssimon &_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9); 2338183234Ssimon &_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52); 2339183234Ssimon &_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566); 
2340183234Ssimon &_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3); 2341183234Ssimon &_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed); 2342183234Ssimon &_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e); 2343183234Ssimon &_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4); 2344183234Ssimon &_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4); 2345183234Ssimon &_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd); 2346183234Ssimon &_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d); 2347183234Ssimon &_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060); 2348183234Ssimon &_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967); 2349183234Ssimon &_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879); 2350183234Ssimon &_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000); 2351183234Ssimon &_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c); 2352183234Ssimon &_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36); 2353183234Ssimon &_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624); 2354183234Ssimon &_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b); 2355183234Ssimon &_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c); 2356183234Ssimon &_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12); 2357183234Ssimon &_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14); 2358183234Ssimon &_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3); 2359183234Ssimon &_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b); 2360183234Ssimon &_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8); 2361183234Ssimon &_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684); 2362183234Ssimon &_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7); 2363183234Ssimon &_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177); 2364183234Ssimon &_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947); 2365183234Ssimon &_data_word(0xc48cfca8, 
0x1a3ff0a0, 0xd82c7d56, 0xef903322); 2366183234Ssimon &_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498); 2367183234Ssimon &_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f); 2368183234Ssimon &_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54); 2369183234Ssimon &_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382); 2370183234Ssimon &_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf); 2371183234Ssimon &_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb); 2372183234Ssimon &_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83); 2373183234Ssimon &_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef); 2374183234Ssimon &_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029); 2375183234Ssimon &_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235); 2376183234Ssimon &_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733); 2377183234Ssimon &_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117); 2378183234Ssimon &_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4); 2379183234Ssimon &_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546); 2380183234Ssimon &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb); 2381183234Ssimon &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d); 2382183234Ssimon &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb); 2383183234Ssimon &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a); 2384183234Ssimon &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773); 2385183234Ssimon &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478); 2386183234Ssimon &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2); 2387183234Ssimon &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff); 2388183234Ssimon &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664); 2389183234Ssimon &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0); 2390238405Sjkim 2391238405Sjkim#Td4: # four copies of Td4 to choose from to avoid L1 
#	aliasing
# NOTE(review): "aliasing" above is the tail of a comment line that starts
# before this block; it is kept as a comment so it is not parsed as code.
#
# Byte table: 32 rows of 8 bytes (256 bytes), emitted four times through
# data_byte().  The values match the AES inverse S-box of FIPS-197.
# Every copy is followed by the same two .long lines; their values
# (0x80808080, 0xfefefefe, 0x1b1b1b1b) line up with the $mask80, $maskfe
# and $mask1b register names declared at the top of this file --
# presumably the constants loaded into those registers; verify at the
# point of use.
my @td4_rows = (
	[0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38],
	[0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb],
	[0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87],
	[0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb],
	[0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d],
	[0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e],
	[0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2],
	[0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25],
	[0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16],
	[0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92],
	[0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda],
	[0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84],
	[0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a],
	[0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06],
	[0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02],
	[0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b],
	[0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea],
	[0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73],
	[0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85],
	[0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e],
	[0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89],
	[0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b],
	[0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20],
	[0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4],
	[0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31],
	[0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f],
	[0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d],
	[0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef],
	[0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0],
	[0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61],
	[0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26],
	[0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d],
);

# Trailer appended after every copy of the table.
my $td4_masks = <<___;
	.long	0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
	.long	0x1b1b1b1b, 0x1b1b1b1b, 0, 0
___

# Emit four identical copies (table rows followed by the mask trailer).
# The '&' call form is kept on purpose: helpers visible at the top of this
# file (hi(), lo(), LO()) are declared with an empty prototype, and '&'
# bypasses prototype checking.  Presumably data_byte() is declared the same
# way, in which case a plain call with arguments would not compile --
# verify against its definition before changing the call form.
foreach my $copy (1 .. 4) {
	foreach my $row (@td4_rows) {
		&data_byte(@$row);
	}
	$code .= $td4_masks;
}

# Identification string and final alignment of the table area.
$code.=<<___;
.asciz	"AES for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64-only section: three structured-exception handlers plus the
# .pdata/.xdata records that attach them to the AES entry points.  All
# handlers share one tail (.Lcommon_seh_exit, located inside
# cbc_se_handler) which copies the patched CONTEXT into
# disp->ContextRecord, calls RtlVirtualUnwind and returns 1
# (ExceptionContinueSearch).
if ($win64) {
# Handler arguments as they arrive in registers.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

# block_se_handler: used by AES_encrypt/AES_decrypt (see .xdata below).
# HandlerData[0] and HandlerData[1] are image-relative addresses of the
# prologue and epilogue labels.  If context->Rip is below the prologue
# label, context->Rax is taken as the frame address.  If it is at or past
# the epilogue label, context->Rsp is used as-is.  Otherwise the saved
# stack pointer is fetched from 24(Rsp), advanced by 48, and
# rbx/rbp/r12-r15 are reloaded from just below it into the CONTEXT.  In
# all cases rdi/rsi are reloaded from 8/16 off the resulting address and
# that address is stored as context->Rsp.
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	block_se_handler,\@abi-omnipotent
.align	16
block_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_block_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_block_prologue

	mov	24(%rax),%rax		# pull saved real stack pointer
	lea	48(%rax),%rax		# adjust...

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_block_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	jmp	.Lcommon_seh_exit
.size	block_se_handler,.-block_se_handler

___

# key_se_handler: used by the private_AES_set_{en,de}crypt_key entries.
# Same range checks as block_se_handler, but inside the function body the
# callee-saved registers are found relative to context->Rsp+56 directly;
# no saved stack pointer is dereferenced.
$code.=<<___;
.type	key_se_handler,\@abi-omnipotent
.align	16
key_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_key_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_key_prologue

	lea	56(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_key_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	jmp	.Lcommon_seh_exit
.size	key_se_handler,.-key_se_handler

___

# cbc_se_handler: used by AES_cbc_encrypt.  It compares context->Rip
# against fixed labels of that function (.Lcbc_prologue, .Lcbc_fast_body,
# .Lcbc_slow_prologue, .Lcbc_slow_body, .Lcbc_epilogue, .Lcbc_popfq)
# instead of reading HandlerData, and then picks the matching recovery
# path.  The shared exit sequence .Lcommon_seh_exit is part of this
# handler.
# NOTE: "\$_rsp" in the assembler comment below is written unescaped in
# the original, so Perl interpolates it when the heredoc is built; it is
# left exactly as it was.
$code.=<<___;
.type	cbc_se_handler,\@abi-omnipotent
.align	16
cbc_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lcbc_prologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_prologue
	jb	.Lin_cbc_prologue

	lea	.Lcbc_fast_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_fast_body
	jb	.Lin_cbc_frame_setup

	lea	.Lcbc_slow_prologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_slow_prologue
	jb	.Lin_cbc_body

	lea	.Lcbc_slow_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_slow_body
	jb	.Lin_cbc_frame_setup

.Lin_cbc_body:
	mov	152($context),%rax	# pull context->Rsp

	lea	.Lcbc_epilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lcbc_epilogue
	jae	.Lin_cbc_prologue

	lea	8(%rax),%rax

	lea	.Lcbc_popfq(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lcbc_popfq
	jae	.Lin_cbc_prologue

	mov	`16-8`(%rax),%rax	# biased $_rsp
	lea	56(%rax),%rax

.Lin_cbc_frame_setup:
	mov	-16(%rax),%rbx
	mov	-24(%rax),%rbp
	mov	-32(%rax),%r12
	mov	-40(%rax),%r13
	mov	-48(%rax),%r14
	mov	-56(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_cbc_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

.Lcommon_seh_exit:

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	cbc_se_handler,.-cbc_se_handler

___

# Unwind tables.  .pdata holds one (begin, end, info) triple per exported
# function; .xdata holds the matching info records, each naming its
# handler.  The block and key handlers additionally get their
# prologue/epilogue labels as HandlerData[]; the CBC record carries none.
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_AES_encrypt
	.rva	.LSEH_end_AES_encrypt
	.rva	.LSEH_info_AES_encrypt

	.rva	.LSEH_begin_AES_decrypt
	.rva	.LSEH_end_AES_decrypt
	.rva	.LSEH_info_AES_decrypt

	.rva	.LSEH_begin_private_AES_set_encrypt_key
	.rva	.LSEH_end_private_AES_set_encrypt_key
	.rva	.LSEH_info_private_AES_set_encrypt_key

	.rva	.LSEH_begin_private_AES_set_decrypt_key
	.rva	.LSEH_end_private_AES_set_decrypt_key
	.rva	.LSEH_info_private_AES_set_decrypt_key

	.rva	.LSEH_begin_AES_cbc_encrypt
	.rva	.LSEH_end_AES_cbc_encrypt
	.rva	.LSEH_info_AES_cbc_encrypt

.section	.xdata
.align	8
.LSEH_info_AES_encrypt:
	.byte	9,0,0,0
	.rva	block_se_handler
	.rva	.Lenc_prologue,.Lenc_epilogue	# HandlerData[]
.LSEH_info_AES_decrypt:
	.byte	9,0,0,0
	.rva	block_se_handler
	.rva	.Ldec_prologue,.Ldec_epilogue	# HandlerData[]
.LSEH_info_private_AES_set_encrypt_key:
	.byte	9,0,0,0
	.rva	key_se_handler
	.rva	.Lenc_key_prologue,.Lenc_key_epilogue	# HandlerData[]
.LSEH_info_private_AES_set_decrypt_key:
	.byte	9,0,0,0
	.rva	key_se_handler
	.rva	.Ldec_key_prologue,.Ldec_key_epilogue	# HandlerData[]
.LSEH_info_AES_cbc_encrypt:
	.byte	9,0,0,0
	.rva	cbc_se_handler
___
}

# Replace every `expr` in the generated text with the value of the Perl
# expression between the backticks (e.g. `16-8`, `1232/8`).
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

# STDOUT was redirected into a pipe to x86_64-xlate.pl at the top of this
# file.  close() on a pipe waits for the child and returns false if the
# child failed or buffered output could not be flushed, so check it: an
# unchecked close lets the build continue with truncated or empty output.
close STDOUT or die "error closing STDOUT: $!";