#!/usr/bin/env perl

###################################################################
### AES-128 [originally in CTR mode]				###
### bitsliced implementation for Intel Core 2 processors	###
### requires support of SSE extensions up to SSSE3		###
### Author: Emilia Käsper and Peter Schwabe			###
### Date:   2009-03-19						###
### Public domain						###
###								###
### See http://homes.esat.kuleuven.be/~ekasper/#software for	###
### further information.					###
###################################################################
#
# September 2011.
#
# Started as transliteration to "perlasm" the original code has
# undergone following changes:
#
# - code was made position-independent;
# - rounds were folded into a loop resulting in >5x size reduction
#   from 12.5KB to 2.2KB;
# - above was possible thanks to mixcolumns() modification that
#   allowed to feed its output back to aesenc[last], this was
#   achieved at cost of two additional inter-registers moves;
# - some instruction reordering and interleaving;
# - this module doesn't implement key setup subroutine, instead it
#   relies on conversion of "conventional" key schedule as returned
#   by AES_set_encrypt_key (see discussion below);
# - first and last round keys are treated differently, which allowed
#   to skip one shiftrows(), reduce bit-sliced key schedule and
#   speed-up conversion by 22%;
# - support for 192- and 256-bit keys was added;
#
# Resulting performance in CPU cycles spent to encrypt one byte out
# of 4096-byte buffer with 128-bit key is:
#
#		Emilia's	this(*)		difference
#
# Core 2	9.30		8.69		+7%
# Nehalem(**)	7.63		6.98		+9%
# Atom		17.1		17.4		-2%(***)
#
# (*)	Comparison is not completely fair, because "this" is ECB,
#	i.e. no extra processing such as counter values calculation
#	and xor-ing input as in Emilia's CTR implementation is
#	performed. However, the CTR calculations stand for not more
#	than 1% of total time, so comparison is *rather* fair.
#
# (**)	Results were collected on Westmere, which is considered to
#	be equivalent to Nehalem for this code.
#
# (***)	Slowdown on Atom is rather strange per se, because original
#	implementation has a number of 9+-bytes instructions, which
#	are bad for Atom front-end, and which I eliminated completely.
#	In attempt to address deterioration sbox() was tested in FP
#	SIMD "domain" (movaps instead of movdqa, xorps instead of
#	pxor, etc.). While it resulted in nominal 4% improvement on
#	Atom, it hurt Westmere by more than 2x factor.
#
# As for key schedule conversion subroutine. Interface to OpenSSL
# relies on per-invocation on-the-fly conversion. This naturally
# has impact on performance, especially for short inputs. Conversion
# time in CPU cycles and its ratio to CPU cycles spent in 8x block
# function is:
#
#		conversion	conversion/8x block
# Core 2	240		0.22
# Nehalem	180		0.20
# Atom		430		0.19
#
# The ratio values mean that 128-byte blocks will be processed
# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
# etc. Then keep in mind that input sizes not divisible by 128 are
# *effectively* slower, especially shortest ones, e.g. consecutive
# 144-byte blocks are processed 44% slower than one would expect,
# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
# it's still faster than ["hyper-threading-safe" code path in]
# aes-x86_64.pl on all lengths above 64 bytes...
#
# October 2011.
#
# Add decryption procedure. Performance in CPU cycles spent to decrypt
# one byte out of 4096-byte buffer with 128-bit key is:
#
#		Core 2	9.83
#		Nehalem	7.74
#		Atom	19.0
#
# November 2011.
#
# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
# suboptimal, but XTS is meant to be used with larger blocks...
#
#						<appro@openssl.org>

# Command line: [flavour] output. A single argument containing a dot
# is taken to be the output file name and the flavour is left undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the translator next to this script first, then in the
# shared perlasm directory two levels up.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator. The translator and output paths are quoted so that
# directories containing spaces survive the shell, and a failure to
# start the pipe is fatal rather than silently yielding no output.
# $flavour stays unquoted on purpose: when it is undefined no empty
# argument must be passed in its place.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Only four values are supplied, so $ivp starts out undefined here.
my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
my $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...
# Code generators for the bit-sliced AES core. Each sub appends
# assembly text to $code; arguments are register names. Inside this
# scope $key, $rounds and $const name the registers used by the
# internal _bsaes_{en|de}crypt8 routines.
{
my ($key,$rounds,$const)=("%rax","%r10d","%r11");

# Sbox(b0..b7, t0..t3, s0..s3): chains InBasisChange, Inv_GF256 and
# OutBasisChange over the eight state registers, using eight temporaries.
sub Sbox {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InBasisChange	(@b);
	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);
}

# InBasisChange(b0..b7): emits the pxor network applied before the
# inversion in Sbox.
sub InBasisChange {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[5]
	pxor	@b[1], @b[2]
	pxor	@b[0], @b[3]
	pxor	@b[2], @b[6]
	pxor	@b[0], @b[5]

	pxor	@b[3], @b[6]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[1], @b[3]

	pxor	@b[7], @b[2]
	pxor	@b[5], @b[1]
___
}

# OutBasisChange(b0..b7): emits the pxor network applied after the
# inversion in Sbox.
sub OutBasisChange {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[1], @b[6]

	pxor	@b[5], @b[1]
	pxor	@b[3], @b[5]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]

	pxor	@b[7], @b[4]
___
}

# InvSbox(b0..b7, t0..t3, s0..s3): decryption counterpart of Sbox;
# chains InvInBasisChange, Inv_GF256 and InvOutBasisChange.
sub InvSbox {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InvInBasisChange	(@b);
	&Inv_GF256		(@b[5,1,2,6,3,7,0,4],@t,@s);
	&InvOutBasisChange	(@b[3,7,0,4,5,1,2,6]);
}

sub InvInBasisChange {		# OutBasisChange in reverse
my @b=@_[5,1,2,6,3,7,0,4];
$code.=<<___;
	pxor	@b[7], @b[4]

	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]
	pxor	@b[7], @b[3]
	pxor	@b[3], @b[5]
	pxor	@b[5], @b[1]

	pxor	@b[1], @b[6]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
___
}

sub InvOutBasisChange {		# InBasisChange in reverse
my @b=@_[2,5,7,3,6,1,0,4];
$code.=<<___;
	pxor	@b[5], @b[1]
	pxor	@b[7], @b[2]

	pxor	@b[1], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	pxor	@b[0], @b[5]
	pxor	@b[7], @b[3]
	pxor	@b[2], @b[6]
	pxor	@b[1], @b[2]
	pxor	@b[3], @b[6]

	pxor	@b[0], @b[3]
	pxor	@b[6], @b[5]
___
}

sub Mul_GF4 {
#;*************************************************************
#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
#;*************************************************************
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x1, $x0
	pxor	$t0, $x1
___
}

sub Mul_GF4_N {				# not used, see next subroutine
# multiply and scale by N
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x0, $x1
	pxor	$t0, $x0
___
}

sub Mul_GF4_N_GF4 {
# interleaved Mul_GF4_N and Mul_GF4
my ($x0,$x1,$y0,$y1,$t0,
    $x2,$x3,$y2,$y3,$t1)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	movdqa	$y2, $t1
	pxor	$y1, $t0
	pxor	$y3, $t1
	pand	$x0, $t0
	pand	$x2, $t1
	pxor	$x1, $x0
	pxor	$x3, $x2
	pand	$y0, $x1
	pand	$y2, $x3
	pand	$y1, $x0
	pand	$y3, $x2
	pxor	$x0, $x1
	pxor	$x3, $x2
	pxor	$t0, $x0
	pxor	$t1, $x3
___
}

# Mul_GF16_2(x0..x7, y0..y3, t0..t3): emits two multiplications by the
# same y operand, one on x0..x3 and one on x4..x7, built from Mul_GF4
# and Mul_GF4_N_GF4. Note that y0/y1 are modified and restored on the
# way (see the paired "pxor @y[2], @y[0]" sequences).
sub Mul_GF16_2 {
my @x=@_[0..7];
my @y=@_[8..11];
my @t=@_[12..15];
$code.=<<___;
	movdqa	@x[0], @t[0]
	movdqa	@x[1], @t[1]
___
	&Mul_GF4  	(@x[0], @x[1], @y[0], @y[1], @t[2]);
$code.=<<___;
	pxor	@x[2], @t[0]
	pxor	@x[3], @t[1]
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[2], @x[3], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@t[0], @x[0]
	pxor	@t[0], @x[2]
	pxor	@t[1], @x[1]
	pxor	@t[1], @x[3]

	movdqa	@x[4], @t[0]
	movdqa	@x[5], @t[1]
	pxor	@x[6], @t[0]
	pxor	@x[7], @t[1]
___
	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[6], @x[7], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	&Mul_GF4  	(@x[4], @x[5], @y[0], @y[1], @t[3]);
$code.=<<___;
	pxor	@t[0], @x[4]
	pxor	@t[0], @x[6]
	pxor	@t[1], @x[5]
	pxor	@t[1], @x[7]
___
}

sub Inv_GF256 {
#;********************************************************************
#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
#;********************************************************************
my @x=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
# direct optimizations from hardware
$code.=<<___;
	movdqa	@x[4], @t[3]
	movdqa	@x[5], @t[2]
	movdqa	@x[1], @t[1]
	movdqa	@x[7], @s[1]
	movdqa	@x[0], @s[0]

	pxor	@x[6], @t[3]
	pxor	@x[7], @t[2]
	pxor	@x[3], @t[1]
	movdqa	@t[3], @s[2]
	pxor	@x[6], @s[1]
	movdqa	@t[2], @t[0]
	pxor	@x[2], @s[0]
	movdqa	@t[3], @s[3]

	por	@t[1], @t[2]
	por	@s[0], @t[3]
	pxor	@t[0], @s[3]
	pand	@s[0], @s[2]
	pxor	@t[1], @s[0]
	pand	@t[1], @t[0]
	pand	@s[0], @s[3]
	movdqa	@x[3], @s[0]
	pxor	@x[2], @s[0]
	pand	@s[0], @s[1]
	pxor	@s[1], @t[3]
	pxor	@s[1], @t[2]
	movdqa	@x[4], @s[1]
	movdqa	@x[1], @s[0]
	pxor	@x[5], @s[1]
	pxor	@x[0], @s[0]
	movdqa	@s[1], @t[1]
	pand	@s[0], @s[1]
	por	@s[0], @t[1]
	pxor	@s[1], @t[0]
	pxor	@s[3], @t[3]
	pxor	@s[2], @t[2]
	pxor	@s[3], @t[1]
	movdqa	@x[7], @s[0]
	pxor	@s[2], @t[0]
	movdqa	@x[6], @s[1]
	pxor	@s[2], @t[1]
	movdqa	@x[5], @s[2]
	pand	@x[3], @s[0]
	movdqa	@x[4], @s[3]
	pand	@x[2], @s[1]
	pand	@x[1], @s[2]
	por	@x[0], @s[3]
	pxor	@s[0], @t[3]
	pxor	@s[1], @t[2]
	pxor	@s[2], @t[1]
	pxor	@s[3], @t[0]

	#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3

	# new smaller inversion

	movdqa	@t[3], @s[0]
	pand	@t[1], @t[3]
	pxor	@t[2], @s[0]

	movdqa	@t[0], @s[2]
	movdqa	@s[0], @s[3]
	pxor	@t[3], @s[2]
	pand	@s[2], @s[3]

	movdqa	@t[1], @s[1]
	pxor	@t[2], @s[3]
	pxor	@t[0], @s[1]

	pxor	@t[2], @t[3]

	pand	@t[3], @s[1]

	movdqa	@s[2], @t[2]
	pxor	@t[0], @s[1]

	pxor	@s[1], @t[2]
	pxor	@s[1], @t[1]

	pand	@t[0], @t[2]

	pxor	@t[2], @s[2]
	pxor	@t[2], @t[1]

	pand	@s[3], @s[2]

	pxor	@s[0], @s[2]
___
# output in s3, s2, s1, t1

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);

### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
}

# AES linear components

# ShiftRows(x0..x7, mask): the mask register is the LAST argument.
# Emits an xor of eight consecutive 16-byte round-key slices at $key
# into the state, a pshufb of every state register by the mask, and
# advances $key by 0x80.
sub ShiftRows {
my @x=@_[0..7];
my $mask=pop;
$code.=<<___;
	pxor	0x00($key),@x[0]
	pxor	0x10($key),@x[1]
	pshufb	$mask,@x[0]
	pxor	0x20($key),@x[2]
	pshufb	$mask,@x[1]
	pxor	0x30($key),@x[3]
	pshufb	$mask,@x[2]
	pxor	0x40($key),@x[4]
	pshufb	$mask,@x[3]
	pxor	0x50($key),@x[5]
	pshufb	$mask,@x[4]
	pxor	0x60($key),@x[6]
	pshufb	$mask,@x[5]
	pxor	0x70($key),@x[7]
	pshufb	$mask,@x[6]
	lea	0x80($key),$key
	pshufb	$mask,@x[7]
___
}

# MixColumns(x0..x7, t0..t7 [, inv]): the optional 17th argument selects
# the alternative tail used when called from InvMixColumns, where the
# results land in a different register order.
sub MixColumns {
# modified to emit output in order suitable for feeding back to aesenc[last]
my @x=@_[0..7];
my @t=@_[8..15];
my $inv=$_[16];	# optional
$code.=<<___;
	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
	pshufd	\$0x93, @x[1], @t[1]
	pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
	pshufd	\$0x93, @x[2], @t[2]
	pxor	@t[1], @x[1]
	pshufd	\$0x93, @x[3], @t[3]
	pxor	@t[2], @x[2]
	pshufd	\$0x93, @x[4], @t[4]
	pxor	@t[3], @x[3]
	pshufd	\$0x93, @x[5], @t[5]
	pxor	@t[4], @x[4]
	pshufd	\$0x93, @x[6], @t[6]
	pxor	@t[5], @x[5]
	pshufd	\$0x93, @x[7], @t[7]
	pxor	@t[6], @x[6]
	pxor	@t[7], @x[7]

	pxor	@x[0], @t[1]
	pxor	@x[7], @t[0]
	pxor	@x[7], @t[1]
	pshufd	\$0x4E, @x[0], @x[0]	# (x0 ^ (x0 <<< 32)) <<< 64)
	pxor	@x[1], @t[2]
	pshufd	\$0x4E, @x[1], @x[1]
	pxor	@x[4], @t[5]
	pxor	@t[0], @x[0]
	pxor	@x[5], @t[6]
	pxor	@t[1], @x[1]
	pxor	@x[3], @t[4]
	pshufd	\$0x4E, @x[4], @t[0]
	pxor	@x[6], @t[7]
	pshufd	\$0x4E, @x[5], @t[1]
	pxor	@x[2], @t[3]
	pshufd	\$0x4E, @x[3], @x[4]
	pxor	@x[7], @t[3]
	pshufd	\$0x4E, @x[7], @x[5]
	pxor	@x[7], @t[4]
	pshufd	\$0x4E, @x[6], @x[3]
	pxor	@t[4], @t[0]
	pshufd	\$0x4E, @x[2], @x[6]
	pxor	@t[5], @t[1]
___
$code.=<<___ if (!$inv);
	pxor	@t[3], @x[4]
	pxor	@t[7], @x[5]
	pxor	@t[6], @x[3]
	movdqa	@t[0], @x[2]
	pxor	@t[2], @x[6]
	movdqa	@t[1], @x[7]
___
$code.=<<___ if ($inv);
	pxor	@x[4], @t[3]
	pxor	@t[7], @x[5]
	pxor	@x[3], @t[6]
	movdqa	@t[0], @x[3]
	pxor	@t[2], @x[6]
	movdqa	@t[6], @x[2]
	movdqa	@t[1], @x[7]
	movdqa	@x[6], @x[4]
	movdqa	@t[3], @x[6]
___
}

# InvMixColumns_orig(x0..x7, t0..t7): earlier formulation built from
# explicit multiplications by 0x0e/0x0b/0x0d/0x09. The decrypt loop
# below calls InvMixColumns instead; no call to this sub is visible in
# this part of the file. Results are moved into @XMM[0..7] at the end.
sub InvMixColumns_orig {
my @x=@_[0..7];
my @t=@_[8..15];

$code.=<<___;
	# multiplication by 0x0e
	pshufd	\$0x93, @x[7], @t[7]
	movdqa	@x[2], @t[2]
	pxor	@x[5], @x[7]		# 7 5
	pxor	@x[5], @x[2]		# 2 5
	pshufd	\$0x93, @x[0], @t[0]
	movdqa	@x[5], @t[5]
	pxor	@x[0], @x[5]		# 5 0 [1]
	pxor	@x[1], @x[0]		# 0 1
	pshufd	\$0x93, @x[1], @t[1]
	pxor	@x[2], @x[1]		# 1 25
	pxor	@x[6], @x[0]		# 01 6 [2]
	pxor	@x[3], @x[1]		# 125 3 [4]
	pshufd	\$0x93, @x[3], @t[3]
	pxor	@x[0], @x[2]		# 25 016 [3]
	pxor	@x[7], @x[3]		# 3 75
	pxor	@x[6], @x[7]		# 75 6 [0]
	pshufd	\$0x93, @x[6], @t[6]
	movdqa	@x[4], @t[4]
	pxor	@x[4], @x[6]		# 6 4
	pxor	@x[3], @x[4]		# 4 375 [6]
	pxor	@x[7], @x[3]		# 375 756=36
	pxor	@t[5], @x[6]		# 64 5 [7]
	pxor	@t[2], @x[3]		# 36 2
	pxor	@t[4], @x[3]		# 362 4 [5]
	pshufd	\$0x93, @t[5], @t[5]
___
					my @y = @x[7,5,0,2,1,3,4,6];
$code.=<<___;
	# multiplication by 0x0b
	pxor	@y[0], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[1], @y[1]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[5], @y[0]
	pxor	@t[6], @y[1]
	pxor	@t[7], @y[0]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @t[7]		# clobber t[7]
	pxor	@y[0], @y[1]

	pxor	@t[0], @y[3]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[1], @y[2]
	pxor	@t[1], @y[4]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[1], @t[1]
	pxor	@t[2], @y[3]
	pxor	@t[2], @y[5]
	pxor	@t[7], @y[2]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[3], @y[3]
	pxor	@t[3], @y[6]
	pxor	@t[3], @y[4]
	pshufd	\$0x93, @t[3], @t[3]
	pxor	@t[4], @y[7]
	pxor	@t[4], @y[5]
	pxor	@t[7], @y[7]
	pxor	@t[5], @y[3]
	pxor	@t[4], @y[4]
	pxor	@t[5], @t[7]		# clobber t[7] even more

	pxor	@t[7], @y[5]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[7], @y[6]
	pxor	@t[7], @y[4]

	pxor	@t[5], @t[7]
	pshufd	\$0x93, @t[5], @t[5]
	pxor	@t[6], @t[7]		# restore t[7]

	# multiplication by 0x0d
	pxor	@y[7], @y[4]
	pxor	@t[4], @y[7]
	pshufd	\$0x93, @t[6], @t[6]
	pxor	@t[0], @y[2]
	pxor	@t[5], @y[7]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[7], @t[7]

	pxor	@y[1], @y[3]
	pxor	@t[1], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[0], @y[3]
	pxor	@t[5], @y[1]
	pxor	@t[5], @y[0]
	pxor	@t[7], @y[1]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[6], @y[0]
	pxor	@y[1], @y[3]
	pxor	@t[1], @y[4]
	pshufd	\$0x93, @t[1], @t[1]

	pxor	@t[7], @y[7]
	pxor	@t[2], @y[4]
	pxor	@t[2], @y[5]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[6], @y[2]
	pxor	@t[3], @t[6]		# clobber t[6]
	pxor	@y[7], @y[4]
	pxor	@t[6], @y[3]

	pxor	@t[6], @y[6]
	pxor	@t[5], @y[5]
	pxor	@t[4], @y[6]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @y[5]
	pxor	@t[7], @y[6]
	pxor	@t[3], @t[6]		# restore t[6]

	pshufd	\$0x93, @t[5], @t[5]
	pshufd	\$0x93, @t[6], @t[6]
	pshufd	\$0x93, @t[7], @t[7]
	pshufd	\$0x93, @t[3], @t[3]

	# multiplication by 0x09
	pxor	@y[1], @y[4]
	pxor	@y[1], @t[1]		# t[1]=y[1]
	pxor	@t[5], @t[0]		# clobber t[0]
	pxor	@t[5], @t[1]
	pxor	@t[0], @y[3]
	pxor	@y[0], @t[0]		# t[0]=y[0]
	pxor	@t[6], @t[1]
	pxor	@t[7], @t[6]		# clobber t[6]
	pxor	@t[1], @y[4]
	pxor	@t[4], @y[7]
	pxor	@y[4], @t[4]		# t[4]=y[4]
	pxor	@t[3], @y[6]
	pxor	@y[3], @t[3]		# t[3]=y[3]
	pxor	@t[2], @y[5]
	pxor	@y[2], @t[2]		# t[2]=y[2]
	pxor	@t[7], @t[3]
	pxor	@y[5], @t[5]		# t[5]=y[5]
	pxor	@t[6], @t[2]
	pxor	@t[6], @t[5]
	pxor	@y[6], @t[6]		# t[6]=y[6]
	pxor	@y[7], @t[7]		# t[7]=y[7]

	movdqa	@t[0],@XMM[0]
	movdqa	@t[1],@XMM[1]
	movdqa	@t[2],@XMM[2]
	movdqa	@t[3],@XMM[3]
	movdqa	@t[4],@XMM[4]
	movdqa	@t[5],@XMM[5]
	movdqa	@t[6],@XMM[6]
	movdqa	@t[7],@XMM[7]
___
}

# InvMixColumns(x0..x7, t0..t7): emits the 0x05-0x00-0x04-0x00
# pre-multiplication and then reuses MixColumns in its "inv" mode,
# following the matrix factorisation shown below.
sub InvMixColumns {
my @x=@_[0..7];
my @t=@_[8..15];

# Thanks to Jussi Kivilinna for providing pointer to
#
# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |

$code.=<<___;
	# multiplication by 0x05-0x00-0x04-0x00
	pshufd	\$0x4E, @x[0], @t[0]
	pshufd	\$0x4E, @x[6], @t[6]
	pxor	@x[0], @t[0]
	pshufd	\$0x4E, @x[7], @t[7]
	pxor	@x[6], @t[6]
	pshufd	\$0x4E, @x[1], @t[1]
	pxor	@x[7], @t[7]
	pshufd	\$0x4E, @x[2], @t[2]
	pxor	@x[1], @t[1]
	pshufd	\$0x4E, @x[3], @t[3]
	pxor	@x[2], @t[2]
	pxor	@t[6], @x[0]
	pxor	@t[6], @x[1]
	pshufd	\$0x4E, @x[4], @t[4]
	pxor	@x[3], @t[3]
	pxor	@t[0], @x[2]
	pxor	@t[1], @x[3]
	pshufd	\$0x4E, @x[5], @t[5]
	pxor	@x[4], @t[4]
	pxor	@t[7], @x[1]
	pxor	@t[2], @x[4]
	pxor	@x[5], @t[5]

	pxor	@t[7], @x[2]
	pxor	@t[6], @x[3]
	pxor	@t[6], @x[4]
	pxor	@t[3], @x[5]
	pxor	@t[4], @x[6]
	pxor	@t[7], @x[4]
	pxor	@t[7], @x[5]
	pxor	@t[5], @x[7]
___
	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
}

sub aesenc {				# not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x30($const),@t[0]	# .LSR
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
	&MixColumns	(@b[0,1,4,6,3,7,2,5],@t);
}

sub aesenclast {			# not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x40($const),@t[0]	# .LSRM0
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
$code.=<<___;
	pxor	0x00($key),@b[0]
	pxor	0x10($key),@b[1]
	pxor	0x20($key),@b[4]
	pxor	0x30($key),@b[6]
	pxor	0x40($key),@b[3]
	pxor	0x50($key),@b[7]
	pxor	0x60($key),@b[2]
	pxor	0x70($key),@b[5]
___
}

# swapmove(a, b, n, mask, t): emits the shift/xor/mask sequence on
# registers a and b with shift count n, using t as scratch.
sub swapmove {
my ($a,$b,$n,$mask,$t)=@_;
$code.=<<___;
	movdqa	$b,$t
	psrlq	\$$n,$b
	pxor  	$a,$b
	pand	$mask,$b
	pxor	$b,$a
	psllq	\$$n,$b
	pxor	$t,$b
___
}
# swapmove2x(a0, b0, a1, b1, n, mask, t0, t1): two interleaved copies of
# the swapmove sequence sharing one shift count and mask.
sub swapmove2x {
my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
$code.=<<___;
	movdqa	$b0,$t0
	psrlq	\$$n,$b0
	movdqa	$b1,$t1
	psrlq	\$$n,$b1
	pxor  	$a0,$b0
	pxor  	$a1,$b1
	pand	$mask,$b0
	pand	$mask,$b1
	pxor	$b0,$a0
	psllq	\$$n,$b0
	pxor	$b1,$a1
	psllq	\$$n,$b1
	pxor	$t0,$b0
	pxor	$t1,$b1
___
}

# bitslice(x0..x7, t0..t3): processes the eight registers in reverse
# order with swapmove2x passes at shift counts 1, 2 and 4; the masks
# are loaded from offsets 0x00/0x10/0x20 of the table at $const.
sub bitslice {
my @x=reverse(@_[0..7]);
my ($t0,$t1,$t2,$t3)=@_[8..11];
$code.=<<___;
	movdqa	0x00($const),$t0	# .LBS0
	movdqa	0x10($const),$t1	# .LBS1
___
	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
$code.=<<___;
	movdqa	0x20($const),$t0	# .LBS2
___
	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);

	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
}

# Internal 8-block routines. Both use $key, $rounds and $const as set
# at the top of this scope and operate on @XMM[0..7] in place.
$code.=<<___;
.text

.extern	asm_AES_encrypt
.extern	asm_AES_decrypt

.type	_bsaes_encrypt8,\@abi-omnipotent
.align	64
_bsaes_encrypt8:
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	0x50($const), @XMM[8]	# .LM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[6]
	pshufb	@XMM[8], @XMM[7]
_bsaes_encrypt8_bitslice:
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Lenc_sbox
.align	16
.Lenc_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Lenc_sbox:\n";
	&Sbox		(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Lenc_done
___
	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
$code.=<<___;
	movdqa	0x30($const), @XMM[8]	# .LSR
	jnz	.Lenc_loop
	movdqa	0x40($const), @XMM[8]	# .LSRM0
	jmp	.Lenc_loop
.align	16
.Lenc_done:
___
	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.size	_bsaes_encrypt8,.-_bsaes_encrypt8

.type	_bsaes_decrypt8,\@abi-omnipotent
.align	64
_bsaes_decrypt8:
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[6]
	pshufb	@XMM[8], @XMM[7]
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Ldec_sbox
.align	16
.Ldec_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Ldec_sbox:\n";
	&InvSbox	(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Ldec_done
___
	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
$code.=<<___;
	movdqa	-0x10($const), @XMM[8]	# .LISR
	jnz	.Ldec_loop
	movdqa	-0x20($const), @XMM[8]	# .LISRM0
	jmp	.Ldec_loop
.align	16
.Ldec_done:
___
	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.size	_bsaes_decrypt8,.-_bsaes_decrypt8
___
}
# Key schedule conversion. In this scope $out/$inp/$rounds/$const name
# the registers used by _bsaes_key_convert.
{
my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");

# bitslice_key(x0..x7, bs0, bs1, bs2, t2, t3): variant of bitslice in
# which some swapmove passes are replaced by plain register copies (the
# replaced calls are kept as comments). The three masks are passed in
# as registers rather than loaded from memory.
sub bitslice_key {
my @x=reverse(@_[0..7]);
my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];

	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
$code.=<<___;
	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
	movdqa	@x[0], @x[2]
	movdqa	@x[1], @x[3]
___
	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);

	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
$code.=<<___;
	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
	movdqa	@x[0], @x[4]
	movdqa	@x[2], @x[6]
	movdqa	@x[1], @x[5]
	movdqa	@x[3], @x[7]
___
	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
}

$code.=<<___;
.type	_bsaes_key_convert,\@abi-omnipotent
.align	16
_bsaes_key_convert:
	lea	.Lmasks(%rip), $const
	movdqu	($inp), %xmm7		# load round 0 key
	lea	0x10($inp), $inp
	movdqa	0x00($const), %xmm0	# 0x01...
	movdqa	0x10($const), %xmm1	# 0x02...
	movdqa	0x20($const), %xmm2	# 0x04...
	movdqa	0x30($const), %xmm3	# 0x08...
	movdqa	0x40($const), %xmm4	# .LM0
	pcmpeqd	%xmm5, %xmm5		# .LNOT

	movdqu	($inp), %xmm6		# load round 1 key
	movdqa	%xmm7, ($out)		# save round 0 key
	lea	0x10($out), $out
	dec	$rounds
	jmp	.Lkey_loop
.align	16
.Lkey_loop:
	pshufb	%xmm4, %xmm6		# .LM0

	movdqa	%xmm0,	%xmm8
	movdqa	%xmm1,	%xmm9

	pand	%xmm6,	%xmm8
	pand	%xmm6,	%xmm9
	movdqa	%xmm2,	%xmm10
	pcmpeqb	%xmm0,	%xmm8
	psllq	\$4,	%xmm0		# 0x10...
	movdqa	%xmm3,	%xmm11
	pcmpeqb	%xmm1,	%xmm9
	psllq	\$4,	%xmm1		# 0x20...

	pand	%xmm6,	%xmm10
	pand	%xmm6,	%xmm11
	movdqa	%xmm0,	%xmm12
	pcmpeqb	%xmm2,	%xmm10
	psllq	\$4,	%xmm2		# 0x40...
	movdqa	%xmm1,	%xmm13
	pcmpeqb	%xmm3,	%xmm11
	psllq	\$4,	%xmm3		# 0x80...

	movdqa	%xmm2,	%xmm14
	movdqa	%xmm3,	%xmm15
	 pxor	%xmm5,	%xmm8		# "pnot"
	 pxor	%xmm5,	%xmm9

	pand	%xmm6,	%xmm12
	pand	%xmm6,	%xmm13
	 movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
	pcmpeqb	%xmm0,	%xmm12
	psrlq	\$4,	%xmm0		# 0x01...
	 movdqa	%xmm9, 0x10($out)
	pcmpeqb	%xmm1,	%xmm13
	psrlq	\$4,	%xmm1		# 0x02...
	 lea	0x10($inp), $inp

	pand	%xmm6,	%xmm14
	pand	%xmm6,	%xmm15
	 movdqa	%xmm10, 0x20($out)
	pcmpeqb	%xmm2,	%xmm14
	psrlq	\$4,	%xmm2		# 0x04...
1031238384Sjkim movdqa %xmm11, 0x30($out) 1032238384Sjkim pcmpeqb %xmm3, %xmm15 1033238384Sjkim psrlq \$4, %xmm3 # 0x08... 1034238384Sjkim movdqu ($inp), %xmm6 # load next round key 1035238384Sjkim 1036238384Sjkim pxor %xmm5, %xmm13 # "pnot" 1037238384Sjkim pxor %xmm5, %xmm14 1038238384Sjkim movdqa %xmm12, 0x40($out) 1039238384Sjkim movdqa %xmm13, 0x50($out) 1040238384Sjkim movdqa %xmm14, 0x60($out) 1041238384Sjkim movdqa %xmm15, 0x70($out) 1042238384Sjkim lea 0x80($out),$out 1043238384Sjkim dec $rounds 1044238384Sjkim jnz .Lkey_loop 1045238384Sjkim 1046238384Sjkim movdqa 0x50($const), %xmm7 # .L63 1047238384Sjkim #movdqa %xmm6, ($out) # don't save last round key 1048238384Sjkim ret 1049238384Sjkim.size _bsaes_key_convert,.-_bsaes_key_convert 1050238384Sjkim___ 1051238384Sjkim} 1052238384Sjkim 1053238384Sjkimif (0 && !$win64) { # following four functions are unsupported interface 1054238384Sjkim # used for benchmarking... 1055238384Sjkim$code.=<<___; 1056238384Sjkim.globl bsaes_enc_key_convert 1057238384Sjkim.type bsaes_enc_key_convert,\@function,2 1058238384Sjkim.align 16 1059238384Sjkimbsaes_enc_key_convert: 1060238384Sjkim mov 240($inp),%r10d # pass rounds 1061238384Sjkim mov $inp,%rcx # pass key 1062238384Sjkim mov $out,%rax # pass key schedule 1063238384Sjkim call _bsaes_key_convert 1064238384Sjkim pxor %xmm6,%xmm7 # fix up last round key 1065238384Sjkim movdqa %xmm7,(%rax) # save last round key 1066238384Sjkim ret 1067238384Sjkim.size bsaes_enc_key_convert,.-bsaes_enc_key_convert 1068238384Sjkim 1069238384Sjkim.globl bsaes_encrypt_128 1070238384Sjkim.type bsaes_encrypt_128,\@function,4 1071238384Sjkim.align 16 1072238384Sjkimbsaes_encrypt_128: 1073238384Sjkim.Lenc128_loop: 1074238384Sjkim movdqu 0x00($inp), @XMM[0] # load input 1075238384Sjkim movdqu 0x10($inp), @XMM[1] 1076238384Sjkim movdqu 0x20($inp), @XMM[2] 1077238384Sjkim movdqu 0x30($inp), @XMM[3] 1078238384Sjkim movdqu 0x40($inp), @XMM[4] 1079238384Sjkim movdqu 0x50($inp), @XMM[5] 1080238384Sjkim 
movdqu 0x60($inp), @XMM[6] 1081238384Sjkim movdqu 0x70($inp), @XMM[7] 1082238384Sjkim mov $key, %rax # pass the $key 1083238384Sjkim lea 0x80($inp), $inp 1084238384Sjkim mov \$10,%r10d 1085238384Sjkim 1086238384Sjkim call _bsaes_encrypt8 1087238384Sjkim 1088238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1089238384Sjkim movdqu @XMM[1], 0x10($out) 1090238384Sjkim movdqu @XMM[4], 0x20($out) 1091238384Sjkim movdqu @XMM[6], 0x30($out) 1092238384Sjkim movdqu @XMM[3], 0x40($out) 1093238384Sjkim movdqu @XMM[7], 0x50($out) 1094238384Sjkim movdqu @XMM[2], 0x60($out) 1095238384Sjkim movdqu @XMM[5], 0x70($out) 1096238384Sjkim lea 0x80($out), $out 1097238384Sjkim sub \$0x80,$len 1098238384Sjkim ja .Lenc128_loop 1099238384Sjkim ret 1100238384Sjkim.size bsaes_encrypt_128,.-bsaes_encrypt_128 1101238384Sjkim 1102238384Sjkim.globl bsaes_dec_key_convert 1103238384Sjkim.type bsaes_dec_key_convert,\@function,2 1104238384Sjkim.align 16 1105238384Sjkimbsaes_dec_key_convert: 1106238384Sjkim mov 240($inp),%r10d # pass rounds 1107238384Sjkim mov $inp,%rcx # pass key 1108238384Sjkim mov $out,%rax # pass key schedule 1109238384Sjkim call _bsaes_key_convert 1110238384Sjkim pxor ($out),%xmm7 # fix up round 0 key 1111238384Sjkim movdqa %xmm6,(%rax) # save last round key 1112238384Sjkim movdqa %xmm7,($out) 1113238384Sjkim ret 1114238384Sjkim.size bsaes_dec_key_convert,.-bsaes_dec_key_convert 1115238384Sjkim 1116238384Sjkim.globl bsaes_decrypt_128 1117238384Sjkim.type bsaes_decrypt_128,\@function,4 1118238384Sjkim.align 16 1119238384Sjkimbsaes_decrypt_128: 1120238384Sjkim.Ldec128_loop: 1121238384Sjkim movdqu 0x00($inp), @XMM[0] # load input 1122238384Sjkim movdqu 0x10($inp), @XMM[1] 1123238384Sjkim movdqu 0x20($inp), @XMM[2] 1124238384Sjkim movdqu 0x30($inp), @XMM[3] 1125238384Sjkim movdqu 0x40($inp), @XMM[4] 1126238384Sjkim movdqu 0x50($inp), @XMM[5] 1127238384Sjkim movdqu 0x60($inp), @XMM[6] 1128238384Sjkim movdqu 0x70($inp), @XMM[7] 1129238384Sjkim mov $key, %rax # pass the $key 
1130238384Sjkim lea 0x80($inp), $inp 1131238384Sjkim mov \$10,%r10d 1132238384Sjkim 1133238384Sjkim call _bsaes_decrypt8 1134238384Sjkim 1135238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1136238384Sjkim movdqu @XMM[1], 0x10($out) 1137238384Sjkim movdqu @XMM[6], 0x20($out) 1138238384Sjkim movdqu @XMM[4], 0x30($out) 1139238384Sjkim movdqu @XMM[2], 0x40($out) 1140238384Sjkim movdqu @XMM[7], 0x50($out) 1141238384Sjkim movdqu @XMM[3], 0x60($out) 1142238384Sjkim movdqu @XMM[5], 0x70($out) 1143238384Sjkim lea 0x80($out), $out 1144238384Sjkim sub \$0x80,$len 1145238384Sjkim ja .Ldec128_loop 1146238384Sjkim ret 1147238384Sjkim.size bsaes_decrypt_128,.-bsaes_decrypt_128 1148238384Sjkim___ 1149238384Sjkim} 1150238384Sjkim{ 1151238384Sjkim###################################################################### 1152238384Sjkim# 1153238384Sjkim# OpenSSL interface 1154238384Sjkim# 1155238384Sjkimmy ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d") 1156238384Sjkim : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); 1157238384Sjkimmy ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15"); 1158238384Sjkim 1159238384Sjkimif ($ecb) { 1160238384Sjkim$code.=<<___; 1161238384Sjkim.globl bsaes_ecb_encrypt_blocks 1162238384Sjkim.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent 1163238384Sjkim.align 16 1164238384Sjkimbsaes_ecb_encrypt_blocks: 1165238384Sjkim mov %rsp, %rax 1166238384Sjkim.Lecb_enc_prologue: 1167238384Sjkim push %rbp 1168238384Sjkim push %rbx 1169238384Sjkim push %r12 1170238384Sjkim push %r13 1171238384Sjkim push %r14 1172238384Sjkim push %r15 1173238384Sjkim lea -0x48(%rsp),%rsp 1174238384Sjkim___ 1175238384Sjkim$code.=<<___ if ($win64); 1176238384Sjkim lea -0xa0(%rsp), %rsp 1177238384Sjkim movaps %xmm6, 0x40(%rsp) 1178238384Sjkim movaps %xmm7, 0x50(%rsp) 1179238384Sjkim movaps %xmm8, 0x60(%rsp) 1180238384Sjkim movaps %xmm9, 0x70(%rsp) 1181238384Sjkim movaps %xmm10, 0x80(%rsp) 1182238384Sjkim movaps %xmm11, 0x90(%rsp) 1183238384Sjkim 
movaps %xmm12, 0xa0(%rsp) 1184238384Sjkim movaps %xmm13, 0xb0(%rsp) 1185238384Sjkim movaps %xmm14, 0xc0(%rsp) 1186238384Sjkim movaps %xmm15, 0xd0(%rsp) 1187238384Sjkim.Lecb_enc_body: 1188238384Sjkim___ 1189238384Sjkim$code.=<<___; 1190238384Sjkim mov %rsp,%rbp # backup %rsp 1191238384Sjkim mov 240($arg4),%eax # rounds 1192238384Sjkim mov $arg1,$inp # backup arguments 1193238384Sjkim mov $arg2,$out 1194238384Sjkim mov $arg3,$len 1195238384Sjkim mov $arg4,$key 1196238384Sjkim cmp \$8,$arg3 1197238384Sjkim jb .Lecb_enc_short 1198238384Sjkim 1199238384Sjkim mov %eax,%ebx # backup rounds 1200238384Sjkim shl \$7,%rax # 128 bytes per inner round key 1201238384Sjkim sub \$`128-32`,%rax # size of bit-sliced key schedule 1202238384Sjkim sub %rax,%rsp 1203238384Sjkim mov %rsp,%rax # pass key schedule 1204238384Sjkim mov $key,%rcx # pass key 1205238384Sjkim mov %ebx,%r10d # pass rounds 1206238384Sjkim call _bsaes_key_convert 1207238384Sjkim pxor %xmm6,%xmm7 # fix up last round key 1208238384Sjkim movdqa %xmm7,(%rax) # save last round key 1209238384Sjkim 1210238384Sjkim sub \$8,$len 1211238384Sjkim.Lecb_enc_loop: 1212238384Sjkim movdqu 0x00($inp), @XMM[0] # load input 1213238384Sjkim movdqu 0x10($inp), @XMM[1] 1214238384Sjkim movdqu 0x20($inp), @XMM[2] 1215238384Sjkim movdqu 0x30($inp), @XMM[3] 1216238384Sjkim movdqu 0x40($inp), @XMM[4] 1217238384Sjkim movdqu 0x50($inp), @XMM[5] 1218238384Sjkim mov %rsp, %rax # pass key schedule 1219238384Sjkim movdqu 0x60($inp), @XMM[6] 1220238384Sjkim mov %ebx,%r10d # pass rounds 1221238384Sjkim movdqu 0x70($inp), @XMM[7] 1222238384Sjkim lea 0x80($inp), $inp 1223238384Sjkim 1224238384Sjkim call _bsaes_encrypt8 1225238384Sjkim 1226238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1227238384Sjkim movdqu @XMM[1], 0x10($out) 1228238384Sjkim movdqu @XMM[4], 0x20($out) 1229238384Sjkim movdqu @XMM[6], 0x30($out) 1230238384Sjkim movdqu @XMM[3], 0x40($out) 1231238384Sjkim movdqu @XMM[7], 0x50($out) 1232238384Sjkim movdqu @XMM[2], 0x60($out) 
1233238384Sjkim movdqu @XMM[5], 0x70($out) 1234238384Sjkim lea 0x80($out), $out 1235238384Sjkim sub \$8,$len 1236238384Sjkim jnc .Lecb_enc_loop 1237238384Sjkim 1238238384Sjkim add \$8,$len 1239238384Sjkim jz .Lecb_enc_done 1240238384Sjkim 1241238384Sjkim movdqu 0x00($inp), @XMM[0] # load input 1242238384Sjkim mov %rsp, %rax # pass key schedule 1243238384Sjkim mov %ebx,%r10d # pass rounds 1244238384Sjkim cmp \$2,$len 1245238384Sjkim jb .Lecb_enc_one 1246238384Sjkim movdqu 0x10($inp), @XMM[1] 1247238384Sjkim je .Lecb_enc_two 1248238384Sjkim movdqu 0x20($inp), @XMM[2] 1249238384Sjkim cmp \$4,$len 1250238384Sjkim jb .Lecb_enc_three 1251238384Sjkim movdqu 0x30($inp), @XMM[3] 1252238384Sjkim je .Lecb_enc_four 1253238384Sjkim movdqu 0x40($inp), @XMM[4] 1254238384Sjkim cmp \$6,$len 1255238384Sjkim jb .Lecb_enc_five 1256238384Sjkim movdqu 0x50($inp), @XMM[5] 1257238384Sjkim je .Lecb_enc_six 1258238384Sjkim movdqu 0x60($inp), @XMM[6] 1259238384Sjkim call _bsaes_encrypt8 1260238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1261238384Sjkim movdqu @XMM[1], 0x10($out) 1262238384Sjkim movdqu @XMM[4], 0x20($out) 1263238384Sjkim movdqu @XMM[6], 0x30($out) 1264238384Sjkim movdqu @XMM[3], 0x40($out) 1265238384Sjkim movdqu @XMM[7], 0x50($out) 1266238384Sjkim movdqu @XMM[2], 0x60($out) 1267238384Sjkim jmp .Lecb_enc_done 1268238384Sjkim.align 16 1269238384Sjkim.Lecb_enc_six: 1270238384Sjkim call _bsaes_encrypt8 1271238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1272238384Sjkim movdqu @XMM[1], 0x10($out) 1273238384Sjkim movdqu @XMM[4], 0x20($out) 1274238384Sjkim movdqu @XMM[6], 0x30($out) 1275238384Sjkim movdqu @XMM[3], 0x40($out) 1276238384Sjkim movdqu @XMM[7], 0x50($out) 1277238384Sjkim jmp .Lecb_enc_done 1278238384Sjkim.align 16 1279238384Sjkim.Lecb_enc_five: 1280238384Sjkim call _bsaes_encrypt8 1281238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1282238384Sjkim movdqu @XMM[1], 0x10($out) 1283238384Sjkim movdqu @XMM[4], 0x20($out) 1284238384Sjkim movdqu @XMM[6], 
0x30($out) 1285238384Sjkim movdqu @XMM[3], 0x40($out) 1286238384Sjkim jmp .Lecb_enc_done 1287238384Sjkim.align 16 1288238384Sjkim.Lecb_enc_four: 1289238384Sjkim call _bsaes_encrypt8 1290238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1291238384Sjkim movdqu @XMM[1], 0x10($out) 1292238384Sjkim movdqu @XMM[4], 0x20($out) 1293238384Sjkim movdqu @XMM[6], 0x30($out) 1294238384Sjkim jmp .Lecb_enc_done 1295238384Sjkim.align 16 1296238384Sjkim.Lecb_enc_three: 1297238384Sjkim call _bsaes_encrypt8 1298238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1299238384Sjkim movdqu @XMM[1], 0x10($out) 1300238384Sjkim movdqu @XMM[4], 0x20($out) 1301238384Sjkim jmp .Lecb_enc_done 1302238384Sjkim.align 16 1303238384Sjkim.Lecb_enc_two: 1304238384Sjkim call _bsaes_encrypt8 1305238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1306238384Sjkim movdqu @XMM[1], 0x10($out) 1307238384Sjkim jmp .Lecb_enc_done 1308238384Sjkim.align 16 1309238384Sjkim.Lecb_enc_one: 1310238384Sjkim call _bsaes_encrypt8 1311238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1312238384Sjkim jmp .Lecb_enc_done 1313238384Sjkim.align 16 1314238384Sjkim.Lecb_enc_short: 1315238384Sjkim lea ($inp), $arg1 1316238384Sjkim lea ($out), $arg2 1317238384Sjkim lea ($key), $arg3 1318238384Sjkim call asm_AES_encrypt 1319238384Sjkim lea 16($inp), $inp 1320238384Sjkim lea 16($out), $out 1321238384Sjkim dec $len 1322238384Sjkim jnz .Lecb_enc_short 1323238384Sjkim 1324238384Sjkim.Lecb_enc_done: 1325238384Sjkim lea (%rsp),%rax 1326238384Sjkim pxor %xmm0, %xmm0 1327238384Sjkim.Lecb_enc_bzero: # wipe key schedule [if any] 1328238384Sjkim movdqa %xmm0, 0x00(%rax) 1329238384Sjkim movdqa %xmm0, 0x10(%rax) 1330238384Sjkim lea 0x20(%rax), %rax 1331238384Sjkim cmp %rax, %rbp 1332238384Sjkim ja .Lecb_enc_bzero # keep wiping while %rax is still below %rbp 1333238384Sjkim 1334238384Sjkim lea (%rbp),%rsp # restore %rsp 1335238384Sjkim___ 1336238384Sjkim$code.=<<___ if ($win64); 1337238384Sjkim movaps 0x40(%rbp), %xmm6 1338238384Sjkim movaps 0x50(%rbp), %xmm7 1339238384Sjkim
movaps 0x60(%rbp), %xmm8 1340238384Sjkim movaps 0x70(%rbp), %xmm9 1341238384Sjkim movaps 0x80(%rbp), %xmm10 1342238384Sjkim movaps 0x90(%rbp), %xmm11 1343238384Sjkim movaps 0xa0(%rbp), %xmm12 1344238384Sjkim movaps 0xb0(%rbp), %xmm13 1345238384Sjkim movaps 0xc0(%rbp), %xmm14 1346238384Sjkim movaps 0xd0(%rbp), %xmm15 1347238384Sjkim lea 0xa0(%rbp), %rsp 1348238384Sjkim___ 1349238384Sjkim$code.=<<___; 1350238384Sjkim mov 0x48(%rsp), %r15 1351238384Sjkim mov 0x50(%rsp), %r14 1352238384Sjkim mov 0x58(%rsp), %r13 1353238384Sjkim mov 0x60(%rsp), %r12 1354238384Sjkim mov 0x68(%rsp), %rbx 1355238384Sjkim mov 0x70(%rsp), %rax 1356238384Sjkim lea 0x78(%rsp), %rsp 1357238384Sjkim mov %rax, %rbp 1358238384Sjkim.Lecb_enc_epilogue: 1359238384Sjkim ret 1360238384Sjkim.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks 1361238384Sjkim 1362238384Sjkim.globl bsaes_ecb_decrypt_blocks 1363238384Sjkim.type bsaes_ecb_decrypt_blocks,\@abi-omnipotent 1364238384Sjkim.align 16 1365238384Sjkimbsaes_ecb_decrypt_blocks: 1366238384Sjkim mov %rsp, %rax 1367238384Sjkim.Lecb_dec_prologue: 1368238384Sjkim push %rbp 1369238384Sjkim push %rbx 1370238384Sjkim push %r12 1371238384Sjkim push %r13 1372238384Sjkim push %r14 1373238384Sjkim push %r15 1374238384Sjkim lea -0x48(%rsp),%rsp 1375238384Sjkim___ 1376238384Sjkim$code.=<<___ if ($win64); 1377238384Sjkim lea -0xa0(%rsp), %rsp 1378238384Sjkim movaps %xmm6, 0x40(%rsp) 1379238384Sjkim movaps %xmm7, 0x50(%rsp) 1380238384Sjkim movaps %xmm8, 0x60(%rsp) 1381238384Sjkim movaps %xmm9, 0x70(%rsp) 1382238384Sjkim movaps %xmm10, 0x80(%rsp) 1383238384Sjkim movaps %xmm11, 0x90(%rsp) 1384238384Sjkim movaps %xmm12, 0xa0(%rsp) 1385238384Sjkim movaps %xmm13, 0xb0(%rsp) 1386238384Sjkim movaps %xmm14, 0xc0(%rsp) 1387238384Sjkim movaps %xmm15, 0xd0(%rsp) 1388238384Sjkim.Lecb_dec_body: 1389238384Sjkim___ 1390238384Sjkim$code.=<<___; 1391238384Sjkim mov %rsp,%rbp # backup %rsp 1392238384Sjkim mov 240($arg4),%eax # rounds 1393238384Sjkim mov $arg1,$inp # backup 
arguments 1394238384Sjkim mov $arg2,$out 1395238384Sjkim mov $arg3,$len 1396238384Sjkim mov $arg4,$key 1397238384Sjkim cmp \$8,$arg3 1398238384Sjkim jb .Lecb_dec_short 1399238384Sjkim 1400238384Sjkim mov %eax,%ebx # backup rounds 1401238384Sjkim shl \$7,%rax # 128 bytes per inner round key 1402238384Sjkim sub \$`128-32`,%rax # size of bit-sliced key schedule 1403238384Sjkim sub %rax,%rsp 1404238384Sjkim mov %rsp,%rax # pass key schedule 1405238384Sjkim mov $key,%rcx # pass key 1406238384Sjkim mov %ebx,%r10d # pass rounds 1407238384Sjkim call _bsaes_key_convert 1408238384Sjkim pxor (%rsp),%xmm7 # fix up 0 round key 1409238384Sjkim movdqa %xmm6,(%rax) # save last round key 1410238384Sjkim movdqa %xmm7,(%rsp) 1411238384Sjkim 1412238384Sjkim sub \$8,$len 1413238384Sjkim.Lecb_dec_loop: 1414238384Sjkim movdqu 0x00($inp), @XMM[0] # load input 1415238384Sjkim movdqu 0x10($inp), @XMM[1] 1416238384Sjkim movdqu 0x20($inp), @XMM[2] 1417238384Sjkim movdqu 0x30($inp), @XMM[3] 1418238384Sjkim movdqu 0x40($inp), @XMM[4] 1419238384Sjkim movdqu 0x50($inp), @XMM[5] 1420238384Sjkim mov %rsp, %rax # pass key schedule 1421238384Sjkim movdqu 0x60($inp), @XMM[6] 1422238384Sjkim mov %ebx,%r10d # pass rounds 1423238384Sjkim movdqu 0x70($inp), @XMM[7] 1424238384Sjkim lea 0x80($inp), $inp 1425238384Sjkim 1426238384Sjkim call _bsaes_decrypt8 1427238384Sjkim 1428238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1429238384Sjkim movdqu @XMM[1], 0x10($out) 1430238384Sjkim movdqu @XMM[6], 0x20($out) 1431238384Sjkim movdqu @XMM[4], 0x30($out) 1432238384Sjkim movdqu @XMM[2], 0x40($out) 1433238384Sjkim movdqu @XMM[7], 0x50($out) 1434238384Sjkim movdqu @XMM[3], 0x60($out) 1435238384Sjkim movdqu @XMM[5], 0x70($out) 1436238384Sjkim lea 0x80($out), $out 1437238384Sjkim sub \$8,$len 1438238384Sjkim jnc .Lecb_dec_loop 1439238384Sjkim 1440238384Sjkim add \$8,$len 1441238384Sjkim jz .Lecb_dec_done 1442238384Sjkim 1443238384Sjkim movdqu 0x00($inp), @XMM[0] # load input 1444238384Sjkim mov %rsp, %rax # pass 
key schedule 1445238384Sjkim mov %ebx,%r10d # pass rounds 1446238384Sjkim cmp \$2,$len 1447238384Sjkim jb .Lecb_dec_one 1448238384Sjkim movdqu 0x10($inp), @XMM[1] 1449238384Sjkim je .Lecb_dec_two 1450238384Sjkim movdqu 0x20($inp), @XMM[2] 1451238384Sjkim cmp \$4,$len 1452238384Sjkim jb .Lecb_dec_three 1453238384Sjkim movdqu 0x30($inp), @XMM[3] 1454238384Sjkim je .Lecb_dec_four 1455238384Sjkim movdqu 0x40($inp), @XMM[4] 1456238384Sjkim cmp \$6,$len 1457238384Sjkim jb .Lecb_dec_five 1458238384Sjkim movdqu 0x50($inp), @XMM[5] 1459238384Sjkim je .Lecb_dec_six 1460238384Sjkim movdqu 0x60($inp), @XMM[6] 1461238384Sjkim call _bsaes_decrypt8 1462238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1463238384Sjkim movdqu @XMM[1], 0x10($out) 1464238384Sjkim movdqu @XMM[6], 0x20($out) 1465238384Sjkim movdqu @XMM[4], 0x30($out) 1466238384Sjkim movdqu @XMM[2], 0x40($out) 1467238384Sjkim movdqu @XMM[7], 0x50($out) 1468238384Sjkim movdqu @XMM[3], 0x60($out) 1469238384Sjkim jmp .Lecb_dec_done 1470238384Sjkim.align 16 1471238384Sjkim.Lecb_dec_six: 1472238384Sjkim call _bsaes_decrypt8 1473238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1474238384Sjkim movdqu @XMM[1], 0x10($out) 1475238384Sjkim movdqu @XMM[6], 0x20($out) 1476238384Sjkim movdqu @XMM[4], 0x30($out) 1477238384Sjkim movdqu @XMM[2], 0x40($out) 1478238384Sjkim movdqu @XMM[7], 0x50($out) 1479238384Sjkim jmp .Lecb_dec_done 1480238384Sjkim.align 16 1481238384Sjkim.Lecb_dec_five: 1482238384Sjkim call _bsaes_decrypt8 1483238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1484238384Sjkim movdqu @XMM[1], 0x10($out) 1485238384Sjkim movdqu @XMM[6], 0x20($out) 1486238384Sjkim movdqu @XMM[4], 0x30($out) 1487238384Sjkim movdqu @XMM[2], 0x40($out) 1488238384Sjkim jmp .Lecb_dec_done 1489238384Sjkim.align 16 1490238384Sjkim.Lecb_dec_four: 1491238384Sjkim call _bsaes_decrypt8 1492238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1493238384Sjkim movdqu @XMM[1], 0x10($out) 1494238384Sjkim movdqu @XMM[6], 0x20($out) 
1495238384Sjkim movdqu @XMM[4], 0x30($out) 1496238384Sjkim jmp .Lecb_dec_done 1497238384Sjkim.align 16 1498238384Sjkim.Lecb_dec_three: 1499238384Sjkim call _bsaes_decrypt8 1500238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1501238384Sjkim movdqu @XMM[1], 0x10($out) 1502238384Sjkim movdqu @XMM[6], 0x20($out) 1503238384Sjkim jmp .Lecb_dec_done 1504238384Sjkim.align 16 1505238384Sjkim.Lecb_dec_two: 1506238384Sjkim call _bsaes_decrypt8 1507238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1508238384Sjkim movdqu @XMM[1], 0x10($out) 1509238384Sjkim jmp .Lecb_dec_done 1510238384Sjkim.align 16 1511238384Sjkim.Lecb_dec_one: 1512238384Sjkim call _bsaes_decrypt8 1513238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1514238384Sjkim jmp .Lecb_dec_done 1515238384Sjkim.align 16 1516238384Sjkim.Lecb_dec_short: 1517238384Sjkim lea ($inp), $arg1 1518238384Sjkim lea ($out), $arg2 1519238384Sjkim lea ($key), $arg3 1520238384Sjkim call asm_AES_decrypt 1521238384Sjkim lea 16($inp), $inp 1522238384Sjkim lea 16($out), $out 1523238384Sjkim dec $len 1524238384Sjkim jnz .Lecb_dec_short 1525238384Sjkim 1526238384Sjkim.Lecb_dec_done: 1527238384Sjkim lea (%rsp),%rax 1528238384Sjkim pxor %xmm0, %xmm0 1529238384Sjkim.Lecb_dec_bzero: # wipe key schedule [if any] 1530238384Sjkim movdqa %xmm0, 0x00(%rax) 1531238384Sjkim movdqa %xmm0, 0x10(%rax) 1532238384Sjkim lea 0x20(%rax), %rax 1533238384Sjkim cmp %rax, %rbp 1534238384Sjkim ja .Lecb_dec_bzero # keep wiping while %rax is still below %rbp 1535238384Sjkim 1536238384Sjkim lea (%rbp),%rsp # restore %rsp 1537238384Sjkim___ 1538238384Sjkim$code.=<<___ if ($win64); 1539238384Sjkim movaps 0x40(%rbp), %xmm6 1540238384Sjkim movaps 0x50(%rbp), %xmm7 1541238384Sjkim movaps 0x60(%rbp), %xmm8 1542238384Sjkim movaps 0x70(%rbp), %xmm9 1543238384Sjkim movaps 0x80(%rbp), %xmm10 1544238384Sjkim movaps 0x90(%rbp), %xmm11 1545238384Sjkim movaps 0xa0(%rbp), %xmm12 1546238384Sjkim movaps 0xb0(%rbp), %xmm13 1547238384Sjkim movaps 0xc0(%rbp), %xmm14 1548238384Sjkim movaps 0xd0(%rbp), %xmm15
1549238384Sjkim lea 0xa0(%rbp), %rsp 1550238384Sjkim___ 1551238384Sjkim$code.=<<___; 1552238384Sjkim mov 0x48(%rsp), %r15 1553238384Sjkim mov 0x50(%rsp), %r14 1554238384Sjkim mov 0x58(%rsp), %r13 1555238384Sjkim mov 0x60(%rsp), %r12 1556238384Sjkim mov 0x68(%rsp), %rbx 1557238384Sjkim mov 0x70(%rsp), %rax 1558238384Sjkim lea 0x78(%rsp), %rsp 1559238384Sjkim mov %rax, %rbp 1560238384Sjkim.Lecb_dec_epilogue: 1561238384Sjkim ret 1562238384Sjkim.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks 1563238384Sjkim___ 1564238384Sjkim} 1565238384Sjkim$code.=<<___; 1566238384Sjkim.extern asm_AES_cbc_encrypt 1567238384Sjkim.globl bsaes_cbc_encrypt 1568238384Sjkim.type bsaes_cbc_encrypt,\@abi-omnipotent 1569238384Sjkim.align 16 1570238384Sjkimbsaes_cbc_encrypt: 1571238384Sjkim___ 1572238384Sjkim$code.=<<___ if ($win64); 1573238384Sjkim mov 48(%rsp),$arg6 # pull direction flag 1574238384Sjkim___ 1575238384Sjkim$code.=<<___; 1576238384Sjkim cmp \$0,$arg6 1577238384Sjkim jne asm_AES_cbc_encrypt 1578238384Sjkim cmp \$128,$arg3 1579238384Sjkim jb asm_AES_cbc_encrypt 1580238384Sjkim 1581238384Sjkim mov %rsp, %rax 1582238384Sjkim.Lcbc_dec_prologue: 1583238384Sjkim push %rbp 1584238384Sjkim push %rbx 1585238384Sjkim push %r12 1586238384Sjkim push %r13 1587238384Sjkim push %r14 1588238384Sjkim push %r15 1589238384Sjkim lea -0x48(%rsp), %rsp 1590238384Sjkim___ 1591238384Sjkim$code.=<<___ if ($win64); 1592238384Sjkim mov 0xa0(%rsp),$arg5 # pull ivp 1593238384Sjkim lea -0xa0(%rsp), %rsp 1594238384Sjkim movaps %xmm6, 0x40(%rsp) 1595238384Sjkim movaps %xmm7, 0x50(%rsp) 1596238384Sjkim movaps %xmm8, 0x60(%rsp) 1597238384Sjkim movaps %xmm9, 0x70(%rsp) 1598238384Sjkim movaps %xmm10, 0x80(%rsp) 1599238384Sjkim movaps %xmm11, 0x90(%rsp) 1600238384Sjkim movaps %xmm12, 0xa0(%rsp) 1601238384Sjkim movaps %xmm13, 0xb0(%rsp) 1602238384Sjkim movaps %xmm14, 0xc0(%rsp) 1603238384Sjkim movaps %xmm15, 0xd0(%rsp) 1604238384Sjkim.Lcbc_dec_body: 1605238384Sjkim___ 1606238384Sjkim$code.=<<___; 
1607238384Sjkim mov %rsp, %rbp # backup %rsp 1608238384Sjkim mov 240($arg4), %eax # rounds 1609238384Sjkim mov $arg1, $inp # backup arguments 1610238384Sjkim mov $arg2, $out 1611238384Sjkim mov $arg3, $len 1612238384Sjkim mov $arg4, $key 1613238384Sjkim mov $arg5, %rbx 1614238384Sjkim shr \$4, $len # bytes to blocks 1615238384Sjkim 1616238384Sjkim mov %eax, %edx # rounds 1617238384Sjkim shl \$7, %rax # 128 bytes per inner round key 1618238384Sjkim sub \$`128-32`, %rax # size of bit-sliced key schedule 1619238384Sjkim sub %rax, %rsp 1620238384Sjkim 1621238384Sjkim mov %rsp, %rax # pass key schedule 1622238384Sjkim mov $key, %rcx # pass key 1623238384Sjkim mov %edx, %r10d # pass rounds 1624238384Sjkim call _bsaes_key_convert 1625238384Sjkim pxor (%rsp),%xmm7 # fix up 0 round key 1626238384Sjkim movdqa %xmm6,(%rax) # save last round key 1627238384Sjkim movdqa %xmm7,(%rsp) 1628238384Sjkim 1629238384Sjkim movdqu (%rbx), @XMM[15] # load IV 1630238384Sjkim sub \$8,$len 1631238384Sjkim.Lcbc_dec_loop: 1632238384Sjkim movdqu 0x00($inp), @XMM[0] # load input 1633238384Sjkim movdqu 0x10($inp), @XMM[1] 1634238384Sjkim movdqu 0x20($inp), @XMM[2] 1635238384Sjkim movdqu 0x30($inp), @XMM[3] 1636238384Sjkim movdqu 0x40($inp), @XMM[4] 1637238384Sjkim movdqu 0x50($inp), @XMM[5] 1638238384Sjkim mov %rsp, %rax # pass key schedule 1639238384Sjkim movdqu 0x60($inp), @XMM[6] 1640238384Sjkim mov %edx,%r10d # pass rounds 1641238384Sjkim movdqu 0x70($inp), @XMM[7] 1642238384Sjkim movdqa @XMM[15], 0x20(%rbp) # put aside IV 1643238384Sjkim 1644238384Sjkim call _bsaes_decrypt8 1645238384Sjkim 1646238384Sjkim pxor 0x20(%rbp), @XMM[0] # ^= IV 1647238384Sjkim movdqu 0x00($inp), @XMM[8] # re-load input 1648238384Sjkim movdqu 0x10($inp), @XMM[9] 1649238384Sjkim pxor @XMM[8], @XMM[1] 1650238384Sjkim movdqu 0x20($inp), @XMM[10] 1651238384Sjkim pxor @XMM[9], @XMM[6] 1652238384Sjkim movdqu 0x30($inp), @XMM[11] 1653238384Sjkim pxor @XMM[10], @XMM[4] 1654238384Sjkim movdqu 0x40($inp), @XMM[12] 
1655238384Sjkim pxor @XMM[11], @XMM[2] 1656238384Sjkim movdqu 0x50($inp), @XMM[13] 1657238384Sjkim pxor @XMM[12], @XMM[7] 1658238384Sjkim movdqu 0x60($inp), @XMM[14] 1659238384Sjkim pxor @XMM[13], @XMM[3] 1660238384Sjkim movdqu 0x70($inp), @XMM[15] # IV 1661238384Sjkim pxor @XMM[14], @XMM[5] 1662238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1663238384Sjkim lea 0x80($inp), $inp 1664238384Sjkim movdqu @XMM[1], 0x10($out) 1665238384Sjkim movdqu @XMM[6], 0x20($out) 1666238384Sjkim movdqu @XMM[4], 0x30($out) 1667238384Sjkim movdqu @XMM[2], 0x40($out) 1668238384Sjkim movdqu @XMM[7], 0x50($out) 1669238384Sjkim movdqu @XMM[3], 0x60($out) 1670238384Sjkim movdqu @XMM[5], 0x70($out) 1671238384Sjkim lea 0x80($out), $out 1672238384Sjkim sub \$8,$len 1673238384Sjkim jnc .Lcbc_dec_loop 1674238384Sjkim 1675238384Sjkim add \$8,$len 1676238384Sjkim jz .Lcbc_dec_done 1677238384Sjkim 1678238384Sjkim movdqu 0x00($inp), @XMM[0] # load input 1679238384Sjkim mov %rsp, %rax # pass key schedule 1680238384Sjkim mov %edx, %r10d # pass rounds 1681238384Sjkim cmp \$2,$len 1682238384Sjkim jb .Lcbc_dec_one 1683238384Sjkim movdqu 0x10($inp), @XMM[1] 1684238384Sjkim je .Lcbc_dec_two 1685238384Sjkim movdqu 0x20($inp), @XMM[2] 1686238384Sjkim cmp \$4,$len 1687238384Sjkim jb .Lcbc_dec_three 1688238384Sjkim movdqu 0x30($inp), @XMM[3] 1689238384Sjkim je .Lcbc_dec_four 1690238384Sjkim movdqu 0x40($inp), @XMM[4] 1691238384Sjkim cmp \$6,$len 1692238384Sjkim jb .Lcbc_dec_five 1693238384Sjkim movdqu 0x50($inp), @XMM[5] 1694238384Sjkim je .Lcbc_dec_six 1695238384Sjkim movdqu 0x60($inp), @XMM[6] 1696238384Sjkim movdqa @XMM[15], 0x20(%rbp) # put aside IV 1697238384Sjkim call _bsaes_decrypt8 1698238384Sjkim pxor 0x20(%rbp), @XMM[0] # ^= IV 1699238384Sjkim movdqu 0x00($inp), @XMM[8] # re-load input 1700238384Sjkim movdqu 0x10($inp), @XMM[9] 1701238384Sjkim pxor @XMM[8], @XMM[1] 1702238384Sjkim movdqu 0x20($inp), @XMM[10] 1703238384Sjkim pxor @XMM[9], @XMM[6] 1704238384Sjkim movdqu 0x30($inp), @XMM[11] 
1705238384Sjkim pxor @XMM[10], @XMM[4] 1706238384Sjkim movdqu 0x40($inp), @XMM[12] 1707238384Sjkim pxor @XMM[11], @XMM[2] 1708238384Sjkim movdqu 0x50($inp), @XMM[13] 1709238384Sjkim pxor @XMM[12], @XMM[7] 1710238384Sjkim movdqu 0x60($inp), @XMM[15] # IV 1711238384Sjkim pxor @XMM[13], @XMM[3] 1712238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1713238384Sjkim movdqu @XMM[1], 0x10($out) 1714238384Sjkim movdqu @XMM[6], 0x20($out) 1715238384Sjkim movdqu @XMM[4], 0x30($out) 1716238384Sjkim movdqu @XMM[2], 0x40($out) 1717238384Sjkim movdqu @XMM[7], 0x50($out) 1718238384Sjkim movdqu @XMM[3], 0x60($out) 1719238384Sjkim jmp .Lcbc_dec_done 1720238384Sjkim.align 16 1721238384Sjkim.Lcbc_dec_six: 1722238384Sjkim movdqa @XMM[15], 0x20(%rbp) # put aside IV 1723238384Sjkim call _bsaes_decrypt8 1724238384Sjkim pxor 0x20(%rbp), @XMM[0] # ^= IV 1725238384Sjkim movdqu 0x00($inp), @XMM[8] # re-load input 1726238384Sjkim movdqu 0x10($inp), @XMM[9] 1727238384Sjkim pxor @XMM[8], @XMM[1] 1728238384Sjkim movdqu 0x20($inp), @XMM[10] 1729238384Sjkim pxor @XMM[9], @XMM[6] 1730238384Sjkim movdqu 0x30($inp), @XMM[11] 1731238384Sjkim pxor @XMM[10], @XMM[4] 1732238384Sjkim movdqu 0x40($inp), @XMM[12] 1733238384Sjkim pxor @XMM[11], @XMM[2] 1734238384Sjkim movdqu 0x50($inp), @XMM[15] # IV 1735238384Sjkim pxor @XMM[12], @XMM[7] 1736238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1737238384Sjkim movdqu @XMM[1], 0x10($out) 1738238384Sjkim movdqu @XMM[6], 0x20($out) 1739238384Sjkim movdqu @XMM[4], 0x30($out) 1740238384Sjkim movdqu @XMM[2], 0x40($out) 1741238384Sjkim movdqu @XMM[7], 0x50($out) 1742238384Sjkim jmp .Lcbc_dec_done 1743238384Sjkim.align 16 1744238384Sjkim.Lcbc_dec_five: 1745238384Sjkim movdqa @XMM[15], 0x20(%rbp) # put aside IV 1746238384Sjkim call _bsaes_decrypt8 1747238384Sjkim pxor 0x20(%rbp), @XMM[0] # ^= IV 1748238384Sjkim movdqu 0x00($inp), @XMM[8] # re-load input 1749238384Sjkim movdqu 0x10($inp), @XMM[9] 1750238384Sjkim pxor @XMM[8], @XMM[1] 1751238384Sjkim movdqu 
0x20($inp), @XMM[10] 1752238384Sjkim pxor @XMM[9], @XMM[6] 1753238384Sjkim movdqu 0x30($inp), @XMM[11] 1754238384Sjkim pxor @XMM[10], @XMM[4] 1755238384Sjkim movdqu 0x40($inp), @XMM[15] # IV 1756238384Sjkim pxor @XMM[11], @XMM[2] 1757238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1758238384Sjkim movdqu @XMM[1], 0x10($out) 1759238384Sjkim movdqu @XMM[6], 0x20($out) 1760238384Sjkim movdqu @XMM[4], 0x30($out) 1761238384Sjkim movdqu @XMM[2], 0x40($out) 1762238384Sjkim jmp .Lcbc_dec_done 1763238384Sjkim.align 16 1764238384Sjkim.Lcbc_dec_four: 1765238384Sjkim movdqa @XMM[15], 0x20(%rbp) # put aside IV 1766238384Sjkim call _bsaes_decrypt8 1767238384Sjkim pxor 0x20(%rbp), @XMM[0] # ^= IV 1768238384Sjkim movdqu 0x00($inp), @XMM[8] # re-load input 1769238384Sjkim movdqu 0x10($inp), @XMM[9] 1770238384Sjkim pxor @XMM[8], @XMM[1] 1771238384Sjkim movdqu 0x20($inp), @XMM[10] 1772238384Sjkim pxor @XMM[9], @XMM[6] 1773238384Sjkim movdqu 0x30($inp), @XMM[15] # IV 1774238384Sjkim pxor @XMM[10], @XMM[4] 1775238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1776238384Sjkim movdqu @XMM[1], 0x10($out) 1777238384Sjkim movdqu @XMM[6], 0x20($out) 1778238384Sjkim movdqu @XMM[4], 0x30($out) 1779238384Sjkim jmp .Lcbc_dec_done 1780238384Sjkim.align 16 1781238384Sjkim.Lcbc_dec_three: 1782238384Sjkim movdqa @XMM[15], 0x20(%rbp) # put aside IV 1783238384Sjkim call _bsaes_decrypt8 1784238384Sjkim pxor 0x20(%rbp), @XMM[0] # ^= IV 1785238384Sjkim movdqu 0x00($inp), @XMM[8] # re-load input 1786238384Sjkim movdqu 0x10($inp), @XMM[9] 1787238384Sjkim pxor @XMM[8], @XMM[1] 1788238384Sjkim movdqu 0x20($inp), @XMM[15] # IV 1789238384Sjkim pxor @XMM[9], @XMM[6] 1790238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1791238384Sjkim movdqu @XMM[1], 0x10($out) 1792238384Sjkim movdqu @XMM[6], 0x20($out) 1793238384Sjkim jmp .Lcbc_dec_done 1794238384Sjkim.align 16 1795238384Sjkim.Lcbc_dec_two: 1796238384Sjkim movdqa @XMM[15], 0x20(%rbp) # put aside IV 1797238384Sjkim call _bsaes_decrypt8 
1798238384Sjkim pxor 0x20(%rbp), @XMM[0] # ^= IV 1799238384Sjkim movdqu 0x00($inp), @XMM[8] # re-load input 1800238384Sjkim movdqu 0x10($inp), @XMM[15] # IV 1801238384Sjkim pxor @XMM[8], @XMM[1] 1802238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1803238384Sjkim movdqu @XMM[1], 0x10($out) 1804238384Sjkim jmp .Lcbc_dec_done 1805238384Sjkim.align 16 1806238384Sjkim.Lcbc_dec_one: 1807238384Sjkim lea ($inp), $arg1 1808238384Sjkim lea 0x20(%rbp), $arg2 # buffer output 1809238384Sjkim lea ($key), $arg3 1810238384Sjkim call asm_AES_decrypt # doesn't touch %xmm 1811238384Sjkim pxor 0x20(%rbp), @XMM[15] # ^= IV 1812238384Sjkim movdqu @XMM[15], ($out) # write output 1813238384Sjkim movdqa @XMM[0], @XMM[15] # IV 1814238384Sjkim 1815238384Sjkim.Lcbc_dec_done: 1816238384Sjkim movdqu @XMM[15], (%rbx) # return IV 1817238384Sjkim lea (%rsp), %rax 1818238384Sjkim pxor %xmm0, %xmm0 1819238384Sjkim.Lcbc_dec_bzero: # wipe key schedule [if any] 1820238384Sjkim movdqa %xmm0, 0x00(%rax) 1821238384Sjkim movdqa %xmm0, 0x10(%rax) 1822238384Sjkim lea 0x20(%rax), %rax 1823238384Sjkim cmp %rax, %rbp 1824238384Sjkim ja .Lcbc_dec_bzero 1825238384Sjkim 1826238384Sjkim lea (%rbp),%rsp # restore %rsp 1827238384Sjkim___ 1828238384Sjkim$code.=<<___ if ($win64); 1829238384Sjkim movaps 0x40(%rbp), %xmm6 1830238384Sjkim movaps 0x50(%rbp), %xmm7 1831238384Sjkim movaps 0x60(%rbp), %xmm8 1832238384Sjkim movaps 0x70(%rbp), %xmm9 1833238384Sjkim movaps 0x80(%rbp), %xmm10 1834238384Sjkim movaps 0x90(%rbp), %xmm11 1835238384Sjkim movaps 0xa0(%rbp), %xmm12 1836238384Sjkim movaps 0xb0(%rbp), %xmm13 1837238384Sjkim movaps 0xc0(%rbp), %xmm14 1838238384Sjkim movaps 0xd0(%rbp), %xmm15 1839238384Sjkim lea 0xa0(%rbp), %rsp 1840238384Sjkim___ 1841238384Sjkim$code.=<<___; 1842238384Sjkim mov 0x48(%rsp), %r15 1843238384Sjkim mov 0x50(%rsp), %r14 1844238384Sjkim mov 0x58(%rsp), %r13 1845238384Sjkim mov 0x60(%rsp), %r12 1846238384Sjkim mov 0x68(%rsp), %rbx 1847238384Sjkim mov 0x70(%rsp), %rax 1848238384Sjkim lea 
0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lcbc_dec_epilogue:
	ret
.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
___
######################################################################
# bsaes_ctr32_encrypt_blocks
#
# Argument roles, as used by the code below (prototype presumably
# matches OpenSSL's ctr128_f callback -- confirm against the caller):
#
#	arg1	input pointer
#	arg2	output pointer
#	arg3	number of 16-byte blocks (NOT bytes)
#	arg4	AES key schedule; number of rounds is read at offset 240
#	arg5	pointer to the 16-byte counter block (on Win64 it is
#		pulled from the caller's stack)
#
# Fewer than 8 blocks are handled one at a time through
# asm_AES_encrypt, bumping the big-endian 32-bit word in the last four
# bytes of the stack copy of the counter.  Otherwise the key schedule
# is converted to bit-sliced form on the stack and 8 counter values
# are encrypted per iteration; whatever lives between %rsp and %rbp is
# zeroed before returning.  The caller's counter block is only read,
# never written back.
#
# A block count of zero returns immediately.  Without that check the
# one-block loop would run once unconditionally and its "dec/jnz"
# would then wrap the count around.
$code.=<<___;

.globl	bsaes_ctr32_encrypt_blocks
.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ctr32_encrypt_blocks:
	mov	%rsp, %rax
.Lctr_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lctr_enc_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	movdqu	($arg5), %xmm0		# load counter
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key
	movdqa	%xmm0, 0x20(%rbp)	# copy counter
	test	$arg3, $arg3		# zero blocks requested?
	jz	.Lctr_enc_done		# nothing to do, just unwind
	cmp	\$8, $arg3
	jb	.Lctr_enc_short

	mov	%eax, %ebx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%ebx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key

	movdqa	(%rsp), @XMM[9]		# load round0 key
	lea	.LADD1(%rip), %r11
	movdqa	0x20(%rbp), @XMM[0]	# counter copy
	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
	pshufb	@XMM[8], @XMM[0]
	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
	jmp	.Lctr_enc_loop
.align	16
.Lctr_enc_loop:
	movdqa	@XMM[0], 0x20(%rbp)	# save counter
	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
	movdqa	@XMM[0], @XMM[2]
	paddd	0x00(%r11), @XMM[1]	# .LADD1
	movdqa	@XMM[0], @XMM[3]
	paddd	0x10(%r11), @XMM[2]	# .LADD2
	movdqa	@XMM[0], @XMM[4]
	paddd	0x20(%r11), @XMM[3]	# .LADD3
	movdqa	@XMM[0], @XMM[5]
	paddd	0x30(%r11), @XMM[4]	# .LADD4
	movdqa	@XMM[0], @XMM[6]
	paddd	0x40(%r11), @XMM[5]	# .LADD5
	movdqa	@XMM[0], @XMM[7]
	paddd	0x50(%r11), @XMM[6]	# .LADD6
	paddd	0x60(%r11), @XMM[7]	# .LADD7

	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
	# to flip byte order in 32-bit counter
	movdqa	(%rsp), @XMM[9]		# round 0 key
	lea	0x10(%rsp), %rax	# pass key schedule
	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[6]
	lea	.LBS0(%rip), %r11	# constants table
	pshufb	@XMM[8], @XMM[7]
	mov	%ebx,%r10d		# pass rounds

	call	_bsaes_encrypt8_bitslice

	sub	\$8,$len
	jc	.Lctr_enc_loop_done

	movdqu	0x00($inp), @XMM[8]	# load input
	movdqu	0x10($inp), @XMM[9]
	movdqu	0x20($inp), @XMM[10]
	movdqu	0x30($inp), @XMM[11]
	movdqu	0x40($inp), @XMM[12]
	movdqu	0x50($inp), @XMM[13]
	movdqu	0x60($inp), @XMM[14]
	movdqu	0x70($inp), @XMM[15]
	lea	0x80($inp),$inp
	pxor	@XMM[0], @XMM[8]
	movdqa	0x20(%rbp), @XMM[0]	# load counter
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[8], 0x00($out)	# write output
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	@XMM[15], @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	lea	.LADD1(%rip), %r11
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	paddd	0x70(%r11), @XMM[0]	# .LADD8
	jnz	.Lctr_enc_loop		# flags still from "sub 8" above

	jmp	.Lctr_enc_done
.align	16
.Lctr_enc_loop_done:
	add	\$8, $len
	movdqu	0x00($inp), @XMM[8]	# load input
	pxor	@XMM[8], @XMM[0]
	movdqu	@XMM[0], 0x00($out)	# write output
	cmp	\$2,$len
	jb	.Lctr_enc_done
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[1], 0x10($out)
	je	.Lctr_enc_done
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[4], 0x20($out)
	cmp	\$4,$len
	jb	.Lctr_enc_done
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[6], 0x30($out)
	je	.Lctr_enc_done
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[3], 0x40($out)
	cmp	\$6,$len
	jb	.Lctr_enc_done
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[7], 0x50($out)
	je	.Lctr_enc_done
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lctr_enc_done

.align	16
.Lctr_enc_short:
	lea	0x20(%rbp), $arg1
	lea	0x30(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	movdqu	($inp), @XMM[1]
	lea	16($inp), $inp
	mov	0x2c(%rbp), %eax	# load 32-bit counter
	bswap	%eax
	pxor	0x30(%rbp), @XMM[1]
	inc	%eax			# increment
	movdqu	@XMM[1], ($out)
	bswap	%eax
	lea	16($out), $out
	mov	%eax, 0x2c(%rbp)	# save 32-bit counter
	dec	$len
	jnz	.Lctr_enc_short

.Lctr_enc_done:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lctr_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lctr_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lctr_enc_epilogue:
	ret
.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
___
######################################################################
# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
#
my ($twmask,$twres,$twtmp)=@XMM[13..15];
$arg6=~s/d$//;

$code.=<<___;
.globl	bsaes_xts_encrypt
.type	bsaes_xts_encrypt,\@abi-omnipotent
.align	16
bsaes_xts_encrypt:
2099238384Sjkim mov %rsp, %rax 2100238384Sjkim.Lxts_enc_prologue: 2101238384Sjkim push %rbp 2102238384Sjkim push %rbx 2103238384Sjkim push %r12 2104238384Sjkim push %r13 2105238384Sjkim push %r14 2106238384Sjkim push %r15 2107238384Sjkim lea -0x48(%rsp), %rsp 2108238384Sjkim___ 2109238384Sjkim$code.=<<___ if ($win64); 2110238384Sjkim mov 0xa0(%rsp),$arg5 # pull key2 2111238384Sjkim mov 0xa8(%rsp),$arg6 # pull ivp 2112238384Sjkim lea -0xa0(%rsp), %rsp 2113238384Sjkim movaps %xmm6, 0x40(%rsp) 2114238384Sjkim movaps %xmm7, 0x50(%rsp) 2115238384Sjkim movaps %xmm8, 0x60(%rsp) 2116238384Sjkim movaps %xmm9, 0x70(%rsp) 2117238384Sjkim movaps %xmm10, 0x80(%rsp) 2118238384Sjkim movaps %xmm11, 0x90(%rsp) 2119238384Sjkim movaps %xmm12, 0xa0(%rsp) 2120238384Sjkim movaps %xmm13, 0xb0(%rsp) 2121238384Sjkim movaps %xmm14, 0xc0(%rsp) 2122238384Sjkim movaps %xmm15, 0xd0(%rsp) 2123238384Sjkim.Lxts_enc_body: 2124238384Sjkim___ 2125238384Sjkim$code.=<<___; 2126238384Sjkim mov %rsp, %rbp # backup %rsp 2127238384Sjkim mov $arg1, $inp # backup arguments 2128238384Sjkim mov $arg2, $out 2129238384Sjkim mov $arg3, $len 2130238384Sjkim mov $arg4, $key 2131238384Sjkim 2132238384Sjkim lea ($arg6), $arg1 2133238384Sjkim lea 0x20(%rbp), $arg2 2134238384Sjkim lea ($arg5), $arg3 2135238384Sjkim call asm_AES_encrypt # generate initial tweak 2136238384Sjkim 2137238384Sjkim mov 240($key), %eax # rounds 2138238384Sjkim mov $len, %rbx # backup $len 2139238384Sjkim 2140238384Sjkim mov %eax, %edx # rounds 2141238384Sjkim shl \$7, %rax # 128 bytes per inner round key 2142238384Sjkim sub \$`128-32`, %rax # size of bit-sliced key schedule 2143238384Sjkim sub %rax, %rsp 2144238384Sjkim 2145238384Sjkim mov %rsp, %rax # pass key schedule 2146238384Sjkim mov $key, %rcx # pass key 2147238384Sjkim mov %edx, %r10d # pass rounds 2148238384Sjkim call _bsaes_key_convert 2149238384Sjkim pxor %xmm6, %xmm7 # fix up last round key 2150238384Sjkim movdqa %xmm7, (%rax) # save last round key 2151238384Sjkim 2152238384Sjkim 
and \$-16, $len 2153238384Sjkim sub \$0x80, %rsp # place for tweak[8] 2154238384Sjkim movdqa 0x20(%rbp), @XMM[7] # initial tweak 2155238384Sjkim 2156238384Sjkim pxor $twtmp, $twtmp 2157238384Sjkim movdqa .Lxts_magic(%rip), $twmask 2158238384Sjkim pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2159238384Sjkim 2160238384Sjkim sub \$0x80, $len 2161238384Sjkim jc .Lxts_enc_short 2162238384Sjkim jmp .Lxts_enc_loop 2163238384Sjkim 2164238384Sjkim.align 16 2165238384Sjkim.Lxts_enc_loop: 2166238384Sjkim___ 2167238384Sjkim for ($i=0;$i<7;$i++) { 2168238384Sjkim $code.=<<___; 2169238384Sjkim pshufd \$0x13, $twtmp, $twres 2170238384Sjkim pxor $twtmp, $twtmp 2171238384Sjkim movdqa @XMM[7], @XMM[$i] 2172238384Sjkim movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2173238384Sjkim paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2174238384Sjkim pand $twmask, $twres # isolate carry and residue 2175238384Sjkim pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2176238384Sjkim pxor $twres, @XMM[7] 2177238384Sjkim___ 2178238384Sjkim $code.=<<___ if ($i>=1); 2179238384Sjkim movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2180238384Sjkim___ 2181238384Sjkim $code.=<<___ if ($i>=2); 2182238384Sjkim pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2183238384Sjkim___ 2184238384Sjkim } 2185238384Sjkim$code.=<<___; 2186238384Sjkim movdqu 0x60($inp), @XMM[8+6] 2187238384Sjkim pxor @XMM[8+5], @XMM[5] 2188238384Sjkim movdqu 0x70($inp), @XMM[8+7] 2189238384Sjkim lea 0x80($inp), $inp 2190238384Sjkim movdqa @XMM[7], 0x70(%rsp) 2191238384Sjkim pxor @XMM[8+6], @XMM[6] 2192238384Sjkim lea 0x80(%rsp), %rax # pass key schedule 2193238384Sjkim pxor @XMM[8+7], @XMM[7] 2194238384Sjkim mov %edx, %r10d # pass rounds 2195238384Sjkim 2196238384Sjkim call _bsaes_encrypt8 2197238384Sjkim 2198238384Sjkim pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2199238384Sjkim pxor 0x10(%rsp), @XMM[1] 2200238384Sjkim movdqu @XMM[0], 0x00($out) # write output 2201238384Sjkim pxor 0x20(%rsp), @XMM[4] 2202238384Sjkim movdqu @XMM[1], 0x10($out) 
2203238384Sjkim pxor 0x30(%rsp), @XMM[6] 2204238384Sjkim movdqu @XMM[4], 0x20($out) 2205238384Sjkim pxor 0x40(%rsp), @XMM[3] 2206238384Sjkim movdqu @XMM[6], 0x30($out) 2207238384Sjkim pxor 0x50(%rsp), @XMM[7] 2208238384Sjkim movdqu @XMM[3], 0x40($out) 2209238384Sjkim pxor 0x60(%rsp), @XMM[2] 2210238384Sjkim movdqu @XMM[7], 0x50($out) 2211238384Sjkim pxor 0x70(%rsp), @XMM[5] 2212238384Sjkim movdqu @XMM[2], 0x60($out) 2213238384Sjkim movdqu @XMM[5], 0x70($out) 2214238384Sjkim lea 0x80($out), $out 2215238384Sjkim 2216238384Sjkim movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak 2217238384Sjkim pxor $twtmp, $twtmp 2218238384Sjkim movdqa .Lxts_magic(%rip), $twmask 2219238384Sjkim pcmpgtd @XMM[7], $twtmp 2220238384Sjkim pshufd \$0x13, $twtmp, $twres 2221238384Sjkim pxor $twtmp, $twtmp 2222238384Sjkim paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2223238384Sjkim pand $twmask, $twres # isolate carry and residue 2224238384Sjkim pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2225238384Sjkim pxor $twres, @XMM[7] 2226238384Sjkim 2227238384Sjkim sub \$0x80,$len 2228238384Sjkim jnc .Lxts_enc_loop 2229238384Sjkim 2230238384Sjkim.Lxts_enc_short: 2231238384Sjkim add \$0x80, $len 2232238384Sjkim jz .Lxts_enc_done 2233238384Sjkim___ 2234238384Sjkim for ($i=0;$i<7;$i++) { 2235238384Sjkim $code.=<<___; 2236238384Sjkim pshufd \$0x13, $twtmp, $twres 2237238384Sjkim pxor $twtmp, $twtmp 2238238384Sjkim movdqa @XMM[7], @XMM[$i] 2239238384Sjkim movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2240238384Sjkim paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2241238384Sjkim pand $twmask, $twres # isolate carry and residue 2242238384Sjkim pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2243238384Sjkim pxor $twres, @XMM[7] 2244238384Sjkim___ 2245238384Sjkim $code.=<<___ if ($i>=1); 2246238384Sjkim movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2247238384Sjkim cmp \$`0x10*$i`,$len 2248238384Sjkim je .Lxts_enc_$i 2249238384Sjkim___ 2250238384Sjkim $code.=<<___ if ($i>=2); 2251238384Sjkim pxor @XMM[8+$i-2], 
@XMM[$i-2]# input[] ^ tweak[] 2252238384Sjkim___ 2253238384Sjkim } 2254238384Sjkim$code.=<<___; 2255238384Sjkim movdqu 0x60($inp), @XMM[8+6] 2256238384Sjkim pxor @XMM[8+5], @XMM[5] 2257238384Sjkim movdqa @XMM[7], 0x70(%rsp) 2258238384Sjkim lea 0x70($inp), $inp 2259238384Sjkim pxor @XMM[8+6], @XMM[6] 2260238384Sjkim lea 0x80(%rsp), %rax # pass key schedule 2261238384Sjkim mov %edx, %r10d # pass rounds 2262238384Sjkim 2263238384Sjkim call _bsaes_encrypt8 2264238384Sjkim 2265238384Sjkim pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2266238384Sjkim pxor 0x10(%rsp), @XMM[1] 2267238384Sjkim movdqu @XMM[0], 0x00($out) # write output 2268238384Sjkim pxor 0x20(%rsp), @XMM[4] 2269238384Sjkim movdqu @XMM[1], 0x10($out) 2270238384Sjkim pxor 0x30(%rsp), @XMM[6] 2271238384Sjkim movdqu @XMM[4], 0x20($out) 2272238384Sjkim pxor 0x40(%rsp), @XMM[3] 2273238384Sjkim movdqu @XMM[6], 0x30($out) 2274238384Sjkim pxor 0x50(%rsp), @XMM[7] 2275238384Sjkim movdqu @XMM[3], 0x40($out) 2276238384Sjkim pxor 0x60(%rsp), @XMM[2] 2277238384Sjkim movdqu @XMM[7], 0x50($out) 2278238384Sjkim movdqu @XMM[2], 0x60($out) 2279238384Sjkim lea 0x70($out), $out 2280238384Sjkim 2281238384Sjkim movdqa 0x70(%rsp), @XMM[7] # next iteration tweak 2282238384Sjkim jmp .Lxts_enc_done 2283238384Sjkim.align 16 2284238384Sjkim.Lxts_enc_6: 2285238384Sjkim pxor @XMM[8+4], @XMM[4] 2286238384Sjkim lea 0x60($inp), $inp 2287238384Sjkim pxor @XMM[8+5], @XMM[5] 2288238384Sjkim lea 0x80(%rsp), %rax # pass key schedule 2289238384Sjkim mov %edx, %r10d # pass rounds 2290238384Sjkim 2291238384Sjkim call _bsaes_encrypt8 2292238384Sjkim 2293238384Sjkim pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2294238384Sjkim pxor 0x10(%rsp), @XMM[1] 2295238384Sjkim movdqu @XMM[0], 0x00($out) # write output 2296238384Sjkim pxor 0x20(%rsp), @XMM[4] 2297238384Sjkim movdqu @XMM[1], 0x10($out) 2298238384Sjkim pxor 0x30(%rsp), @XMM[6] 2299238384Sjkim movdqu @XMM[4], 0x20($out) 2300238384Sjkim pxor 0x40(%rsp), @XMM[3] 2301238384Sjkim movdqu @XMM[6], 0x30($out) 
2302238384Sjkim pxor 0x50(%rsp), @XMM[7] 2303238384Sjkim movdqu @XMM[3], 0x40($out) 2304238384Sjkim movdqu @XMM[7], 0x50($out) 2305238384Sjkim lea 0x60($out), $out 2306238384Sjkim 2307238384Sjkim movdqa 0x60(%rsp), @XMM[7] # next iteration tweak 2308238384Sjkim jmp .Lxts_enc_done 2309238384Sjkim.align 16 2310238384Sjkim.Lxts_enc_5: 2311238384Sjkim pxor @XMM[8+3], @XMM[3] 2312238384Sjkim lea 0x50($inp), $inp 2313238384Sjkim pxor @XMM[8+4], @XMM[4] 2314238384Sjkim lea 0x80(%rsp), %rax # pass key schedule 2315238384Sjkim mov %edx, %r10d # pass rounds 2316238384Sjkim 2317238384Sjkim call _bsaes_encrypt8 2318238384Sjkim 2319238384Sjkim pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2320238384Sjkim pxor 0x10(%rsp), @XMM[1] 2321238384Sjkim movdqu @XMM[0], 0x00($out) # write output 2322238384Sjkim pxor 0x20(%rsp), @XMM[4] 2323238384Sjkim movdqu @XMM[1], 0x10($out) 2324238384Sjkim pxor 0x30(%rsp), @XMM[6] 2325238384Sjkim movdqu @XMM[4], 0x20($out) 2326238384Sjkim pxor 0x40(%rsp), @XMM[3] 2327238384Sjkim movdqu @XMM[6], 0x30($out) 2328238384Sjkim movdqu @XMM[3], 0x40($out) 2329238384Sjkim lea 0x50($out), $out 2330238384Sjkim 2331238384Sjkim movdqa 0x50(%rsp), @XMM[7] # next iteration tweak 2332238384Sjkim jmp .Lxts_enc_done 2333238384Sjkim.align 16 2334238384Sjkim.Lxts_enc_4: 2335238384Sjkim pxor @XMM[8+2], @XMM[2] 2336238384Sjkim lea 0x40($inp), $inp 2337238384Sjkim pxor @XMM[8+3], @XMM[3] 2338238384Sjkim lea 0x80(%rsp), %rax # pass key schedule 2339238384Sjkim mov %edx, %r10d # pass rounds 2340238384Sjkim 2341238384Sjkim call _bsaes_encrypt8 2342238384Sjkim 2343238384Sjkim pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2344238384Sjkim pxor 0x10(%rsp), @XMM[1] 2345238384Sjkim movdqu @XMM[0], 0x00($out) # write output 2346238384Sjkim pxor 0x20(%rsp), @XMM[4] 2347238384Sjkim movdqu @XMM[1], 0x10($out) 2348238384Sjkim pxor 0x30(%rsp), @XMM[6] 2349238384Sjkim movdqu @XMM[4], 0x20($out) 2350238384Sjkim movdqu @XMM[6], 0x30($out) 2351238384Sjkim lea 0x40($out), $out 2352238384Sjkim 2353238384Sjkim 
movdqa 0x40(%rsp), @XMM[7] # next iteration tweak 2354238384Sjkim jmp .Lxts_enc_done 2355238384Sjkim.align 16 2356238384Sjkim.Lxts_enc_3: 2357238384Sjkim pxor @XMM[8+1], @XMM[1] 2358238384Sjkim lea 0x30($inp), $inp 2359238384Sjkim pxor @XMM[8+2], @XMM[2] 2360238384Sjkim lea 0x80(%rsp), %rax # pass key schedule 2361238384Sjkim mov %edx, %r10d # pass rounds 2362238384Sjkim 2363238384Sjkim call _bsaes_encrypt8 2364238384Sjkim 2365238384Sjkim pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2366238384Sjkim pxor 0x10(%rsp), @XMM[1] 2367238384Sjkim movdqu @XMM[0], 0x00($out) # write output 2368238384Sjkim pxor 0x20(%rsp), @XMM[4] 2369238384Sjkim movdqu @XMM[1], 0x10($out) 2370238384Sjkim movdqu @XMM[4], 0x20($out) 2371238384Sjkim lea 0x30($out), $out 2372238384Sjkim 2373238384Sjkim movdqa 0x30(%rsp), @XMM[7] # next iteration tweak 2374238384Sjkim jmp .Lxts_enc_done 2375238384Sjkim.align 16 2376238384Sjkim.Lxts_enc_2: 2377238384Sjkim pxor @XMM[8+0], @XMM[0] 2378238384Sjkim lea 0x20($inp), $inp 2379238384Sjkim pxor @XMM[8+1], @XMM[1] 2380238384Sjkim lea 0x80(%rsp), %rax # pass key schedule 2381238384Sjkim mov %edx, %r10d # pass rounds 2382238384Sjkim 2383238384Sjkim call _bsaes_encrypt8 2384238384Sjkim 2385238384Sjkim pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2386238384Sjkim pxor 0x10(%rsp), @XMM[1] 2387238384Sjkim movdqu @XMM[0], 0x00($out) # write output 2388238384Sjkim movdqu @XMM[1], 0x10($out) 2389238384Sjkim lea 0x20($out), $out 2390238384Sjkim 2391238384Sjkim movdqa 0x20(%rsp), @XMM[7] # next iteration tweak 2392238384Sjkim jmp .Lxts_enc_done 2393238384Sjkim.align 16 2394238384Sjkim.Lxts_enc_1: 2395238384Sjkim pxor @XMM[0], @XMM[8] 2396238384Sjkim lea 0x10($inp), $inp 2397238384Sjkim movdqa @XMM[8], 0x20(%rbp) 2398238384Sjkim lea 0x20(%rbp), $arg1 2399238384Sjkim lea 0x20(%rbp), $arg2 2400238384Sjkim lea ($key), $arg3 2401238384Sjkim call asm_AES_encrypt # doesn't touch %xmm 2402238384Sjkim pxor 0x20(%rbp), @XMM[0] # ^= tweak[] 2403238384Sjkim #pxor @XMM[8], @XMM[0] 
2404238384Sjkim #lea 0x80(%rsp), %rax # pass key schedule 2405238384Sjkim #mov %edx, %r10d # pass rounds 2406238384Sjkim #call _bsaes_encrypt8 2407238384Sjkim #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2408238384Sjkim movdqu @XMM[0], 0x00($out) # write output 2409238384Sjkim lea 0x10($out), $out 2410238384Sjkim 2411238384Sjkim movdqa 0x10(%rsp), @XMM[7] # next iteration tweak 2412238384Sjkim 2413238384Sjkim.Lxts_enc_done: 2414238384Sjkim and \$15, %ebx 2415238384Sjkim jz .Lxts_enc_ret 2416238384Sjkim mov $out, %rdx 2417238384Sjkim 2418238384Sjkim.Lxts_enc_steal: 2419238384Sjkim movzb ($inp), %eax 2420238384Sjkim movzb -16(%rdx), %ecx 2421238384Sjkim lea 1($inp), $inp 2422238384Sjkim mov %al, -16(%rdx) 2423238384Sjkim mov %cl, 0(%rdx) 2424238384Sjkim lea 1(%rdx), %rdx 2425238384Sjkim sub \$1,%ebx 2426238384Sjkim jnz .Lxts_enc_steal 2427238384Sjkim 2428238384Sjkim movdqu -16($out), @XMM[0] 2429238384Sjkim lea 0x20(%rbp), $arg1 2430238384Sjkim pxor @XMM[7], @XMM[0] 2431238384Sjkim lea 0x20(%rbp), $arg2 2432238384Sjkim movdqa @XMM[0], 0x20(%rbp) 2433238384Sjkim lea ($key), $arg3 2434238384Sjkim call asm_AES_encrypt # doesn't touch %xmm 2435238384Sjkim pxor 0x20(%rbp), @XMM[7] 2436238384Sjkim movdqu @XMM[7], -16($out) 2437238384Sjkim 2438238384Sjkim.Lxts_enc_ret: 2439238384Sjkim lea (%rsp), %rax 2440238384Sjkim pxor %xmm0, %xmm0 2441238384Sjkim.Lxts_enc_bzero: # wipe key schedule [if any] 2442238384Sjkim movdqa %xmm0, 0x00(%rax) 2443238384Sjkim movdqa %xmm0, 0x10(%rax) 2444238384Sjkim lea 0x20(%rax), %rax 2445238384Sjkim cmp %rax, %rbp 2446238384Sjkim ja .Lxts_enc_bzero 2447238384Sjkim 2448238384Sjkim lea (%rbp),%rsp # restore %rsp 2449238384Sjkim___ 2450238384Sjkim$code.=<<___ if ($win64); 2451238384Sjkim movaps 0x40(%rbp), %xmm6 2452238384Sjkim movaps 0x50(%rbp), %xmm7 2453238384Sjkim movaps 0x60(%rbp), %xmm8 2454238384Sjkim movaps 0x70(%rbp), %xmm9 2455238384Sjkim movaps 0x80(%rbp), %xmm10 2456238384Sjkim movaps 0x90(%rbp), %xmm11 2457238384Sjkim movaps 0xa0(%rbp), 
%xmm12 2458238384Sjkim movaps 0xb0(%rbp), %xmm13 2459238384Sjkim movaps 0xc0(%rbp), %xmm14 2460238384Sjkim movaps 0xd0(%rbp), %xmm15 2461238384Sjkim lea 0xa0(%rbp), %rsp 2462238384Sjkim___ 2463238384Sjkim$code.=<<___; 2464238384Sjkim mov 0x48(%rsp), %r15 2465238384Sjkim mov 0x50(%rsp), %r14 2466238384Sjkim mov 0x58(%rsp), %r13 2467238384Sjkim mov 0x60(%rsp), %r12 2468238384Sjkim mov 0x68(%rsp), %rbx 2469238384Sjkim mov 0x70(%rsp), %rax 2470238384Sjkim lea 0x78(%rsp), %rsp 2471238384Sjkim mov %rax, %rbp 2472238384Sjkim.Lxts_enc_epilogue: 2473238384Sjkim ret 2474238384Sjkim.size bsaes_xts_encrypt,.-bsaes_xts_encrypt 2475238384Sjkim 2476238384Sjkim.globl bsaes_xts_decrypt 2477238384Sjkim.type bsaes_xts_decrypt,\@abi-omnipotent 2478238384Sjkim.align 16 2479238384Sjkimbsaes_xts_decrypt: 2480238384Sjkim mov %rsp, %rax 2481238384Sjkim.Lxts_dec_prologue: 2482238384Sjkim push %rbp 2483238384Sjkim push %rbx 2484238384Sjkim push %r12 2485238384Sjkim push %r13 2486238384Sjkim push %r14 2487238384Sjkim push %r15 2488238384Sjkim lea -0x48(%rsp), %rsp 2489238384Sjkim___ 2490238384Sjkim$code.=<<___ if ($win64); 2491238384Sjkim mov 0xa0(%rsp),$arg5 # pull key2 2492238384Sjkim mov 0xa8(%rsp),$arg6 # pull ivp 2493238384Sjkim lea -0xa0(%rsp), %rsp 2494238384Sjkim movaps %xmm6, 0x40(%rsp) 2495238384Sjkim movaps %xmm7, 0x50(%rsp) 2496238384Sjkim movaps %xmm8, 0x60(%rsp) 2497238384Sjkim movaps %xmm9, 0x70(%rsp) 2498238384Sjkim movaps %xmm10, 0x80(%rsp) 2499238384Sjkim movaps %xmm11, 0x90(%rsp) 2500238384Sjkim movaps %xmm12, 0xa0(%rsp) 2501238384Sjkim movaps %xmm13, 0xb0(%rsp) 2502238384Sjkim movaps %xmm14, 0xc0(%rsp) 2503238384Sjkim movaps %xmm15, 0xd0(%rsp) 2504238384Sjkim.Lxts_dec_body: 2505238384Sjkim___ 2506238384Sjkim$code.=<<___; 2507238384Sjkim mov %rsp, %rbp # backup %rsp 2508238384Sjkim mov $arg1, $inp # backup arguments 2509238384Sjkim mov $arg2, $out 2510238384Sjkim mov $arg3, $len 2511238384Sjkim mov $arg4, $key 2512238384Sjkim 2513238384Sjkim lea ($arg6), $arg1 
2514238384Sjkim lea 0x20(%rbp), $arg2 2515238384Sjkim lea ($arg5), $arg3 2516238384Sjkim call asm_AES_encrypt # generate initial tweak 2517238384Sjkim 2518238384Sjkim mov 240($key), %eax # rounds 2519238384Sjkim mov $len, %rbx # backup $len 2520238384Sjkim 2521238384Sjkim mov %eax, %edx # rounds 2522238384Sjkim shl \$7, %rax # 128 bytes per inner round key 2523238384Sjkim sub \$`128-32`, %rax # size of bit-sliced key schedule 2524238384Sjkim sub %rax, %rsp 2525238384Sjkim 2526238384Sjkim mov %rsp, %rax # pass key schedule 2527238384Sjkim mov $key, %rcx # pass key 2528238384Sjkim mov %edx, %r10d # pass rounds 2529238384Sjkim call _bsaes_key_convert 2530238384Sjkim pxor (%rsp), %xmm7 # fix up round 0 key 2531238384Sjkim movdqa %xmm6, (%rax) # save last round key 2532238384Sjkim movdqa %xmm7, (%rsp) 2533238384Sjkim 2534238384Sjkim xor %eax, %eax # if ($len%16) len-=16; 2535238384Sjkim and \$-16, $len 2536238384Sjkim test \$15, %ebx 2537238384Sjkim setnz %al 2538238384Sjkim shl \$4, %rax 2539238384Sjkim sub %rax, $len 2540238384Sjkim 2541238384Sjkim sub \$0x80, %rsp # place for tweak[8] 2542238384Sjkim movdqa 0x20(%rbp), @XMM[7] # initial tweak 2543238384Sjkim 2544238384Sjkim pxor $twtmp, $twtmp 2545238384Sjkim movdqa .Lxts_magic(%rip), $twmask 2546238384Sjkim pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2547238384Sjkim 2548238384Sjkim sub \$0x80, $len 2549238384Sjkim jc .Lxts_dec_short 2550238384Sjkim jmp .Lxts_dec_loop 2551238384Sjkim 2552238384Sjkim.align 16 2553238384Sjkim.Lxts_dec_loop: 2554238384Sjkim___ 2555238384Sjkim for ($i=0;$i<7;$i++) { 2556238384Sjkim $code.=<<___; 2557238384Sjkim pshufd \$0x13, $twtmp, $twres 2558238384Sjkim pxor $twtmp, $twtmp 2559238384Sjkim movdqa @XMM[7], @XMM[$i] 2560238384Sjkim movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2561238384Sjkim paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2562238384Sjkim pand $twmask, $twres # isolate carry and residue 2563238384Sjkim pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2564238384Sjkim pxor 
$twres, @XMM[7] 2565238384Sjkim___ 2566238384Sjkim $code.=<<___ if ($i>=1); 2567238384Sjkim movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2568238384Sjkim___ 2569238384Sjkim $code.=<<___ if ($i>=2); 2570238384Sjkim pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2571238384Sjkim___ 2572238384Sjkim } 2573238384Sjkim$code.=<<___; 2574238384Sjkim movdqu 0x60($inp), @XMM[8+6] 2575238384Sjkim pxor @XMM[8+5], @XMM[5] 2576238384Sjkim movdqu 0x70($inp), @XMM[8+7] 2577238384Sjkim lea 0x80($inp), $inp 2578238384Sjkim movdqa @XMM[7], 0x70(%rsp) 2579238384Sjkim pxor @XMM[8+6], @XMM[6] 2580238384Sjkim lea 0x80(%rsp), %rax # pass key schedule 2581238384Sjkim pxor @XMM[8+7], @XMM[7] 2582238384Sjkim mov %edx, %r10d # pass rounds 2583238384Sjkim 2584238384Sjkim call _bsaes_decrypt8 2585238384Sjkim 2586238384Sjkim pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2587238384Sjkim pxor 0x10(%rsp), @XMM[1] 2588238384Sjkim movdqu @XMM[0], 0x00($out) # write output 2589238384Sjkim pxor 0x20(%rsp), @XMM[6] 2590238384Sjkim movdqu @XMM[1], 0x10($out) 2591238384Sjkim pxor 0x30(%rsp), @XMM[4] 2592238384Sjkim movdqu @XMM[6], 0x20($out) 2593238384Sjkim pxor 0x40(%rsp), @XMM[2] 2594238384Sjkim movdqu @XMM[4], 0x30($out) 2595238384Sjkim pxor 0x50(%rsp), @XMM[7] 2596238384Sjkim movdqu @XMM[2], 0x40($out) 2597238384Sjkim pxor 0x60(%rsp), @XMM[3] 2598238384Sjkim movdqu @XMM[7], 0x50($out) 2599238384Sjkim pxor 0x70(%rsp), @XMM[5] 2600238384Sjkim movdqu @XMM[3], 0x60($out) 2601238384Sjkim movdqu @XMM[5], 0x70($out) 2602238384Sjkim lea 0x80($out), $out 2603238384Sjkim 2604238384Sjkim movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak 2605238384Sjkim pxor $twtmp, $twtmp 2606238384Sjkim movdqa .Lxts_magic(%rip), $twmask 2607238384Sjkim pcmpgtd @XMM[7], $twtmp 2608238384Sjkim pshufd \$0x13, $twtmp, $twres 2609238384Sjkim pxor $twtmp, $twtmp 2610238384Sjkim paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2611238384Sjkim pand $twmask, $twres # isolate carry and residue 2612238384Sjkim pcmpgtd @XMM[7], $twtmp # broadcast upper 
bits 2613238384Sjkim pxor $twres, @XMM[7] 2614238384Sjkim 2615238384Sjkim sub \$0x80,$len 2616238384Sjkim jnc .Lxts_dec_loop 2617238384Sjkim 2618238384Sjkim.Lxts_dec_short: 2619238384Sjkim add \$0x80, $len 2620238384Sjkim jz .Lxts_dec_done 2621238384Sjkim___ 2622238384Sjkim for ($i=0;$i<7;$i++) { 2623238384Sjkim $code.=<<___; 2624238384Sjkim pshufd \$0x13, $twtmp, $twres 2625238384Sjkim pxor $twtmp, $twtmp 2626238384Sjkim movdqa @XMM[7], @XMM[$i] 2627238384Sjkim movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2628238384Sjkim paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2629238384Sjkim pand $twmask, $twres # isolate carry and residue 2630238384Sjkim pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2631238384Sjkim pxor $twres, @XMM[7] 2632238384Sjkim___ 2633238384Sjkim $code.=<<___ if ($i>=1); 2634238384Sjkim movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2635238384Sjkim cmp \$`0x10*$i`,$len 2636238384Sjkim je .Lxts_dec_$i 2637238384Sjkim___ 2638238384Sjkim $code.=<<___ if ($i>=2); 2639238384Sjkim pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2640238384Sjkim___ 2641238384Sjkim } 2642238384Sjkim$code.=<<___; 2643238384Sjkim movdqu 0x60($inp), @XMM[8+6] 2644238384Sjkim pxor @XMM[8+5], @XMM[5] 2645238384Sjkim movdqa @XMM[7], 0x70(%rsp) 2646238384Sjkim lea 0x70($inp), $inp 2647238384Sjkim pxor @XMM[8+6], @XMM[6] 2648238384Sjkim lea 0x80(%rsp), %rax # pass key schedule 2649238384Sjkim mov %edx, %r10d # pass rounds 2650238384Sjkim 2651238384Sjkim call _bsaes_decrypt8 2652238384Sjkim 2653238384Sjkim pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2654238384Sjkim pxor 0x10(%rsp), @XMM[1] 2655238384Sjkim movdqu @XMM[0], 0x00($out) # write output 2656238384Sjkim pxor 0x20(%rsp), @XMM[6] 2657238384Sjkim movdqu @XMM[1], 0x10($out) 2658238384Sjkim pxor 0x30(%rsp), @XMM[4] 2659238384Sjkim movdqu @XMM[6], 0x20($out) 2660238384Sjkim pxor 0x40(%rsp), @XMM[2] 2661238384Sjkim movdqu @XMM[4], 0x30($out) 2662238384Sjkim pxor 0x50(%rsp), @XMM[7] 2663238384Sjkim movdqu @XMM[2], 0x40($out) 2664238384Sjkim pxor 
0x60(%rsp), @XMM[3] 2665238384Sjkim movdqu @XMM[7], 0x50($out) 2666238384Sjkim movdqu @XMM[3], 0x60($out) 2667238384Sjkim lea 0x70($out), $out 2668238384Sjkim 2669238384Sjkim movdqa 0x70(%rsp), @XMM[7] # next iteration tweak 2670238384Sjkim jmp .Lxts_dec_done 2671238384Sjkim.align 16 2672238384Sjkim.Lxts_dec_6: 2673238384Sjkim pxor @XMM[8+4], @XMM[4] 2674238384Sjkim lea 0x60($inp), $inp 2675238384Sjkim pxor @XMM[8+5], @XMM[5] 2676238384Sjkim lea 0x80(%rsp), %rax # pass key schedule 2677238384Sjkim mov %edx, %r10d # pass rounds 2678238384Sjkim 2679238384Sjkim call _bsaes_decrypt8 2680238384Sjkim 2681238384Sjkim pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2682238384Sjkim pxor 0x10(%rsp), @XMM[1] 2683238384Sjkim movdqu @XMM[0], 0x00($out) # write output 2684238384Sjkim pxor 0x20(%rsp), @XMM[6] 2685238384Sjkim movdqu @XMM[1], 0x10($out) 2686238384Sjkim pxor 0x30(%rsp), @XMM[4] 2687238384Sjkim movdqu @XMM[6], 0x20($out) 2688238384Sjkim pxor 0x40(%rsp), @XMM[2] 2689238384Sjkim movdqu @XMM[4], 0x30($out) 2690238384Sjkim pxor 0x50(%rsp), @XMM[7] 2691238384Sjkim movdqu @XMM[2], 0x40($out) 2692238384Sjkim movdqu @XMM[7], 0x50($out) 2693238384Sjkim lea 0x60($out), $out 2694238384Sjkim 2695238384Sjkim movdqa 0x60(%rsp), @XMM[7] # next iteration tweak 2696238384Sjkim jmp .Lxts_dec_done 2697238384Sjkim.align 16 2698238384Sjkim.Lxts_dec_5: 2699238384Sjkim pxor @XMM[8+3], @XMM[3] 2700238384Sjkim lea 0x50($inp), $inp 2701238384Sjkim pxor @XMM[8+4], @XMM[4] 2702238384Sjkim lea 0x80(%rsp), %rax # pass key schedule 2703238384Sjkim mov %edx, %r10d # pass rounds 2704238384Sjkim 2705238384Sjkim call _bsaes_decrypt8 2706238384Sjkim 2707238384Sjkim pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2708238384Sjkim pxor 0x10(%rsp), @XMM[1] 2709238384Sjkim movdqu @XMM[0], 0x00($out) # write output 2710238384Sjkim pxor 0x20(%rsp), @XMM[6] 2711238384Sjkim movdqu @XMM[1], 0x10($out) 2712238384Sjkim pxor 0x30(%rsp), @XMM[4] 2713238384Sjkim movdqu @XMM[6], 0x20($out) 2714238384Sjkim pxor 0x40(%rsp), @XMM[2] 
2715238384Sjkim movdqu @XMM[4], 0x30($out) 2716238384Sjkim movdqu @XMM[2], 0x40($out) 2717238384Sjkim lea 0x50($out), $out 2718238384Sjkim 2719238384Sjkim movdqa 0x50(%rsp), @XMM[7] # next iteration tweak 2720238384Sjkim jmp .Lxts_dec_done 2721238384Sjkim.align 16 2722238384Sjkim.Lxts_dec_4: 2723238384Sjkim pxor @XMM[8+2], @XMM[2] 2724238384Sjkim lea 0x40($inp), $inp 2725238384Sjkim pxor @XMM[8+3], @XMM[3] 2726238384Sjkim lea 0x80(%rsp), %rax # pass key schedule 2727238384Sjkim mov %edx, %r10d # pass rounds 2728238384Sjkim 2729238384Sjkim call _bsaes_decrypt8 2730238384Sjkim 2731238384Sjkim pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2732238384Sjkim pxor 0x10(%rsp), @XMM[1] 2733238384Sjkim movdqu @XMM[0], 0x00($out) # write output 2734238384Sjkim pxor 0x20(%rsp), @XMM[6] 2735238384Sjkim movdqu @XMM[1], 0x10($out) 2736238384Sjkim pxor 0x30(%rsp), @XMM[4] 2737238384Sjkim movdqu @XMM[6], 0x20($out) 2738238384Sjkim movdqu @XMM[4], 0x30($out) 2739238384Sjkim lea 0x40($out), $out 2740238384Sjkim 2741238384Sjkim movdqa 0x40(%rsp), @XMM[7] # next iteration tweak 2742238384Sjkim jmp .Lxts_dec_done 2743238384Sjkim.align 16 2744238384Sjkim.Lxts_dec_3: 2745238384Sjkim pxor @XMM[8+1], @XMM[1] 2746238384Sjkim lea 0x30($inp), $inp 2747238384Sjkim pxor @XMM[8+2], @XMM[2] 2748238384Sjkim lea 0x80(%rsp), %rax # pass key schedule 2749238384Sjkim mov %edx, %r10d # pass rounds 2750238384Sjkim 2751238384Sjkim call _bsaes_decrypt8 2752238384Sjkim 2753238384Sjkim pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2754238384Sjkim pxor 0x10(%rsp), @XMM[1] 2755238384Sjkim movdqu @XMM[0], 0x00($out) # write output 2756238384Sjkim pxor 0x20(%rsp), @XMM[6] 2757238384Sjkim movdqu @XMM[1], 0x10($out) 2758238384Sjkim movdqu @XMM[6], 0x20($out) 2759238384Sjkim lea 0x30($out), $out 2760238384Sjkim 2761238384Sjkim movdqa 0x30(%rsp), @XMM[7] # next iteration tweak 2762238384Sjkim jmp .Lxts_dec_done 2763238384Sjkim.align 16 2764238384Sjkim.Lxts_dec_2: 2765238384Sjkim pxor @XMM[8+0], @XMM[0] 2766238384Sjkim lea 
0x20($inp), $inp 2767238384Sjkim pxor @XMM[8+1], @XMM[1] 2768238384Sjkim lea 0x80(%rsp), %rax # pass key schedule 2769238384Sjkim mov %edx, %r10d # pass rounds 2770238384Sjkim 2771238384Sjkim call _bsaes_decrypt8 2772238384Sjkim 2773238384Sjkim pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2774238384Sjkim pxor 0x10(%rsp), @XMM[1] 2775238384Sjkim movdqu @XMM[0], 0x00($out) # write output 2776238384Sjkim movdqu @XMM[1], 0x10($out) 2777238384Sjkim lea 0x20($out), $out 2778238384Sjkim 2779238384Sjkim movdqa 0x20(%rsp), @XMM[7] # next iteration tweak 2780238384Sjkim jmp .Lxts_dec_done 2781238384Sjkim.align 16 2782238384Sjkim.Lxts_dec_1: 2783238384Sjkim pxor @XMM[0], @XMM[8] 2784238384Sjkim lea 0x10($inp), $inp 2785238384Sjkim movdqa @XMM[8], 0x20(%rbp) 2786238384Sjkim lea 0x20(%rbp), $arg1 2787238384Sjkim lea 0x20(%rbp), $arg2 2788238384Sjkim lea ($key), $arg3 2789238384Sjkim call asm_AES_decrypt # doesn't touch %xmm 2790238384Sjkim pxor 0x20(%rbp), @XMM[0] # ^= tweak[] 2791238384Sjkim #pxor @XMM[8], @XMM[0] 2792238384Sjkim #lea 0x80(%rsp), %rax # pass key schedule 2793238384Sjkim #mov %edx, %r10d # pass rounds 2794238384Sjkim #call _bsaes_decrypt8 2795238384Sjkim #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2796238384Sjkim movdqu @XMM[0], 0x00($out) # write output 2797238384Sjkim lea 0x10($out), $out 2798238384Sjkim 2799238384Sjkim movdqa 0x10(%rsp), @XMM[7] # next iteration tweak 2800238384Sjkim 2801238384Sjkim.Lxts_dec_done: 2802238384Sjkim and \$15, %ebx 2803238384Sjkim jz .Lxts_dec_ret 2804238384Sjkim 2805238384Sjkim pxor $twtmp, $twtmp 2806238384Sjkim movdqa .Lxts_magic(%rip), $twmask 2807238384Sjkim pcmpgtd @XMM[7], $twtmp 2808238384Sjkim pshufd \$0x13, $twtmp, $twres 2809238384Sjkim movdqa @XMM[7], @XMM[6] 2810238384Sjkim paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2811238384Sjkim pand $twmask, $twres # isolate carry and residue 2812238384Sjkim movdqu ($inp), @XMM[0] 2813238384Sjkim pxor $twres, @XMM[7] 2814238384Sjkim 2815238384Sjkim lea 0x20(%rbp), $arg1 2816238384Sjkim pxor 
@XMM[7], @XMM[0] 2817238384Sjkim lea 0x20(%rbp), $arg2 2818238384Sjkim movdqa @XMM[0], 0x20(%rbp) 2819238384Sjkim lea ($key), $arg3 2820238384Sjkim call asm_AES_decrypt # doesn't touch %xmm 2821238384Sjkim pxor 0x20(%rbp), @XMM[7] 2822238384Sjkim mov $out, %rdx 2823238384Sjkim movdqu @XMM[7], ($out) 2824238384Sjkim 2825238384Sjkim.Lxts_dec_steal: 2826238384Sjkim movzb 16($inp), %eax 2827238384Sjkim movzb (%rdx), %ecx 2828238384Sjkim lea 1($inp), $inp 2829238384Sjkim mov %al, (%rdx) 2830238384Sjkim mov %cl, 16(%rdx) 2831238384Sjkim lea 1(%rdx), %rdx 2832238384Sjkim sub \$1,%ebx 2833238384Sjkim jnz .Lxts_dec_steal 2834238384Sjkim 2835238384Sjkim movdqu ($out), @XMM[0] 2836238384Sjkim lea 0x20(%rbp), $arg1 2837238384Sjkim pxor @XMM[6], @XMM[0] 2838238384Sjkim lea 0x20(%rbp), $arg2 2839238384Sjkim movdqa @XMM[0], 0x20(%rbp) 2840238384Sjkim lea ($key), $arg3 2841238384Sjkim call asm_AES_decrypt # doesn't touch %xmm 2842238384Sjkim pxor 0x20(%rbp), @XMM[6] 2843238384Sjkim movdqu @XMM[6], ($out) 2844238384Sjkim 2845238384Sjkim.Lxts_dec_ret: 2846238384Sjkim lea (%rsp), %rax 2847238384Sjkim pxor %xmm0, %xmm0 2848238384Sjkim.Lxts_dec_bzero: # wipe key schedule [if any] 2849238384Sjkim movdqa %xmm0, 0x00(%rax) 2850238384Sjkim movdqa %xmm0, 0x10(%rax) 2851238384Sjkim lea 0x20(%rax), %rax 2852238384Sjkim cmp %rax, %rbp 2853238384Sjkim ja .Lxts_dec_bzero 2854238384Sjkim 2855238384Sjkim lea (%rbp),%rsp # restore %rsp 2856238384Sjkim___ 2857238384Sjkim$code.=<<___ if ($win64); 2858238384Sjkim movaps 0x40(%rbp), %xmm6 2859238384Sjkim movaps 0x50(%rbp), %xmm7 2860238384Sjkim movaps 0x60(%rbp), %xmm8 2861238384Sjkim movaps 0x70(%rbp), %xmm9 2862238384Sjkim movaps 0x80(%rbp), %xmm10 2863238384Sjkim movaps 0x90(%rbp), %xmm11 2864238384Sjkim movaps 0xa0(%rbp), %xmm12 2865238384Sjkim movaps 0xb0(%rbp), %xmm13 2866238384Sjkim movaps 0xc0(%rbp), %xmm14 2867238384Sjkim movaps 0xd0(%rbp), %xmm15 2868238384Sjkim lea 0xa0(%rbp), %rsp 2869238384Sjkim___ 2870238384Sjkim$code.=<<___; 
2871238384Sjkim mov 0x48(%rsp), %r15 2872238384Sjkim mov 0x50(%rsp), %r14 2873238384Sjkim mov 0x58(%rsp), %r13 2874238384Sjkim mov 0x60(%rsp), %r12 2875238384Sjkim mov 0x68(%rsp), %rbx 2876238384Sjkim mov 0x70(%rsp), %rax 2877238384Sjkim lea 0x78(%rsp), %rsp 2878238384Sjkim mov %rax, %rbp 2879238384Sjkim.Lxts_dec_epilogue: 2880238384Sjkim ret 2881238384Sjkim.size bsaes_xts_decrypt,.-bsaes_xts_decrypt 2882238384Sjkim___ 2883238384Sjkim} 2884238384Sjkim$code.=<<___; 2885238384Sjkim.type _bsaes_const,\@object 2886238384Sjkim.align 64 2887238384Sjkim_bsaes_const: 2888238384Sjkim.LM0ISR: # InvShiftRows constants 2889238384Sjkim .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 2890238384Sjkim.LISRM0: 2891238384Sjkim .quad 0x01040b0e0205080f, 0x0306090c00070a0d 2892238384Sjkim.LISR: 2893238384Sjkim .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 2894238384Sjkim.LBS0: # bit-slice constants 2895238384Sjkim .quad 0x5555555555555555, 0x5555555555555555 2896238384Sjkim.LBS1: 2897238384Sjkim .quad 0x3333333333333333, 0x3333333333333333 2898238384Sjkim.LBS2: 2899238384Sjkim .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f 2900238384Sjkim.LSR: # shiftrows constants 2901238384Sjkim .quad 0x0504070600030201, 0x0f0e0d0c0a09080b 2902238384Sjkim.LSRM0: 2903238384Sjkim .quad 0x0304090e00050a0f, 0x01060b0c0207080d 2904238384Sjkim.LM0SR: 2905238384Sjkim .quad 0x0a0e02060f03070b, 0x0004080c05090d01 2906238384Sjkim.LSWPUP: # byte-swap upper dword 2907238384Sjkim .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 2908238384Sjkim.LSWPUPM0SR: 2909238384Sjkim .quad 0x0a0d02060c03070b, 0x0004080f05090e01 2910238384Sjkim.LADD1: # counter increment constants 2911238384Sjkim .quad 0x0000000000000000, 0x0000000100000000 2912238384Sjkim.LADD2: 2913238384Sjkim .quad 0x0000000000000000, 0x0000000200000000 2914238384Sjkim.LADD3: 2915238384Sjkim .quad 0x0000000000000000, 0x0000000300000000 2916238384Sjkim.LADD4: 2917238384Sjkim .quad 0x0000000000000000, 0x0000000400000000 2918238384Sjkim.LADD5: 2919238384Sjkim .quad 
0x0000000000000000, 0x0000000500000000 2920238384Sjkim.LADD6: 2921238384Sjkim .quad 0x0000000000000000, 0x0000000600000000 2922238384Sjkim.LADD7: 2923238384Sjkim .quad 0x0000000000000000, 0x0000000700000000 2924238384Sjkim.LADD8: 2925238384Sjkim .quad 0x0000000000000000, 0x0000000800000000 2926238384Sjkim.Lxts_magic: 2927238384Sjkim .long 0x87,0,1,0 2928238384Sjkim.Lmasks: 2929238384Sjkim .quad 0x0101010101010101, 0x0101010101010101 2930238384Sjkim .quad 0x0202020202020202, 0x0202020202020202 2931238384Sjkim .quad 0x0404040404040404, 0x0404040404040404 2932238384Sjkim .quad 0x0808080808080808, 0x0808080808080808 2933238384Sjkim.LM0: 2934238384Sjkim .quad 0x02060a0e03070b0f, 0x0004080c0105090d 2935238384Sjkim.L63: 2936238384Sjkim .quad 0x6363636363636363, 0x6363636363636363 2937238384Sjkim.asciz "Bit-sliced AES for x86_64/SSSE3, Emilia K��sper, Peter Schwabe, Andy Polyakov" 2938238384Sjkim.align 64 2939238384Sjkim.size _bsaes_const,.-_bsaes_const 2940238384Sjkim___ 2941238384Sjkim 2942238384Sjkim# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 2943238384Sjkim# CONTEXT *context,DISPATCHER_CONTEXT *disp) 2944238384Sjkimif ($win64) { 2945238384Sjkim$rec="%rcx"; 2946238384Sjkim$frame="%rdx"; 2947238384Sjkim$context="%r8"; 2948238384Sjkim$disp="%r9"; 2949238384Sjkim 2950238384Sjkim$code.=<<___; 2951238384Sjkim.extern __imp_RtlVirtualUnwind 2952238384Sjkim.type se_handler,\@abi-omnipotent 2953238384Sjkim.align 16 2954238384Sjkimse_handler: 2955238384Sjkim push %rsi 2956238384Sjkim push %rdi 2957238384Sjkim push %rbx 2958238384Sjkim push %rbp 2959238384Sjkim push %r12 2960238384Sjkim push %r13 2961238384Sjkim push %r14 2962238384Sjkim push %r15 2963238384Sjkim pushfq 2964238384Sjkim sub \$64,%rsp 2965238384Sjkim 2966238384Sjkim mov 120($context),%rax # pull context->Rax 2967238384Sjkim mov 248($context),%rbx # pull context->Rip 2968238384Sjkim 2969238384Sjkim mov 8($disp),%rsi # disp->ImageBase 2970238384Sjkim mov 56($disp),%r11 # disp->HandlerData 
2971238384Sjkim 2972238384Sjkim mov 0(%r11),%r10d # HandlerData[0] 2973238384Sjkim lea (%rsi,%r10),%r10 # prologue label 2974238384Sjkim cmp %r10,%rbx # context->Rip<prologue label 2975238384Sjkim jb .Lin_prologue 2976238384Sjkim 2977238384Sjkim mov 152($context),%rax # pull context->Rsp 2978238384Sjkim 2979238384Sjkim mov 4(%r11),%r10d # HandlerData[1] 2980238384Sjkim lea (%rsi,%r10),%r10 # epilogue label 2981238384Sjkim cmp %r10,%rbx # context->Rip>=epilogue label 2982238384Sjkim jae .Lin_prologue 2983238384Sjkim 2984238384Sjkim mov 160($context),%rax # pull context->Rbp 2985238384Sjkim 2986238384Sjkim lea 0x40(%rax),%rsi # %xmm save area 2987238384Sjkim lea 512($context),%rdi # &context.Xmm6 2988238384Sjkim mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 2989238384Sjkim .long 0xa548f3fc # cld; rep movsq 2990238384Sjkim lea 0xa0(%rax),%rax # adjust stack pointer 2991238384Sjkim 2992238384Sjkim mov 0x70(%rax),%rbp 2993238384Sjkim mov 0x68(%rax),%rbx 2994238384Sjkim mov 0x60(%rax),%r12 2995238384Sjkim mov 0x58(%rax),%r13 2996238384Sjkim mov 0x50(%rax),%r14 2997238384Sjkim mov 0x48(%rax),%r15 2998238384Sjkim lea 0x78(%rax),%rax # adjust stack pointer 2999238384Sjkim mov %rbx,144($context) # restore context->Rbx 3000238384Sjkim mov %rbp,160($context) # restore context->Rbp 3001238384Sjkim mov %r12,216($context) # restore context->R12 3002238384Sjkim mov %r13,224($context) # restore context->R13 3003238384Sjkim mov %r14,232($context) # restore context->R14 3004238384Sjkim mov %r15,240($context) # restore context->R15 3005238384Sjkim 3006238384Sjkim.Lin_prologue: 3007238384Sjkim mov %rax,152($context) # restore context->Rsp 3008238384Sjkim 3009238384Sjkim mov 40($disp),%rdi # disp->ContextRecord 3010238384Sjkim mov $context,%rsi # context 3011238384Sjkim mov \$`1232/8`,%ecx # sizeof(CONTEXT) 3012238384Sjkim .long 0xa548f3fc # cld; rep movsq 3013238384Sjkim 3014238384Sjkim mov $disp,%rsi 3015238384Sjkim xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 3016238384Sjkim mov 
8(%rsi),%rdx # arg2, disp->ImageBase 3017238384Sjkim mov 0(%rsi),%r8 # arg3, disp->ControlPc 3018238384Sjkim mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 3019238384Sjkim mov 40(%rsi),%r10 # disp->ContextRecord 3020238384Sjkim lea 56(%rsi),%r11 # &disp->HandlerData 3021238384Sjkim lea 24(%rsi),%r12 # &disp->EstablisherFrame 3022238384Sjkim mov %r10,32(%rsp) # arg5 3023238384Sjkim mov %r11,40(%rsp) # arg6 3024238384Sjkim mov %r12,48(%rsp) # arg7 3025238384Sjkim mov %rcx,56(%rsp) # arg8, (NULL) 3026238384Sjkim call *__imp_RtlVirtualUnwind(%rip) 3027238384Sjkim 3028238384Sjkim mov \$1,%eax # ExceptionContinueSearch 3029238384Sjkim add \$64,%rsp 3030238384Sjkim popfq 3031238384Sjkim pop %r15 3032238384Sjkim pop %r14 3033238384Sjkim pop %r13 3034238384Sjkim pop %r12 3035238384Sjkim pop %rbp 3036238384Sjkim pop %rbx 3037238384Sjkim pop %rdi 3038238384Sjkim pop %rsi 3039238384Sjkim ret 3040238384Sjkim.size se_handler,.-se_handler 3041238384Sjkim 3042238384Sjkim.section .pdata 3043238384Sjkim.align 4 3044238384Sjkim___ 3045238384Sjkim$code.=<<___ if ($ecb); 3046238384Sjkim .rva .Lecb_enc_prologue 3047238384Sjkim .rva .Lecb_enc_epilogue 3048238384Sjkim .rva .Lecb_enc_info 3049238384Sjkim 3050238384Sjkim .rva .Lecb_dec_prologue 3051238384Sjkim .rva .Lecb_dec_epilogue 3052238384Sjkim .rva .Lecb_dec_info 3053238384Sjkim___ 3054238384Sjkim$code.=<<___; 3055238384Sjkim .rva .Lcbc_dec_prologue 3056238384Sjkim .rva .Lcbc_dec_epilogue 3057238384Sjkim .rva .Lcbc_dec_info 3058238384Sjkim 3059238384Sjkim .rva .Lctr_enc_prologue 3060238384Sjkim .rva .Lctr_enc_epilogue 3061238384Sjkim .rva .Lctr_enc_info 3062238384Sjkim 3063238384Sjkim .rva .Lxts_enc_prologue 3064238384Sjkim .rva .Lxts_enc_epilogue 3065238384Sjkim .rva .Lxts_enc_info 3066238384Sjkim 3067238384Sjkim .rva .Lxts_dec_prologue 3068238384Sjkim .rva .Lxts_dec_epilogue 3069238384Sjkim .rva .Lxts_dec_info 3070238384Sjkim 3071238384Sjkim.section .xdata 3072238384Sjkim.align 8 3073238384Sjkim___ 3074238384Sjkim$code.=<<___ if 
($ecb); 3075238384Sjkim.Lecb_enc_info: 3076238384Sjkim .byte 9,0,0,0 3077238384Sjkim .rva se_handler 3078238384Sjkim .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[] 3079238384Sjkim.Lecb_dec_info: 3080238384Sjkim .byte 9,0,0,0 3081238384Sjkim .rva se_handler 3082238384Sjkim .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[] 3083238384Sjkim___ 3084238384Sjkim$code.=<<___; 3085238384Sjkim.Lcbc_dec_info: 3086238384Sjkim .byte 9,0,0,0 3087238384Sjkim .rva se_handler 3088238384Sjkim .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[] 3089238384Sjkim.Lctr_enc_info: 3090238384Sjkim .byte 9,0,0,0 3091238384Sjkim .rva se_handler 3092238384Sjkim .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[] 3093238384Sjkim.Lxts_enc_info: 3094238384Sjkim .byte 9,0,0,0 3095238384Sjkim .rva se_handler 3096238384Sjkim .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] 3097238384Sjkim.Lxts_dec_info: 3098238384Sjkim .byte 9,0,0,0 3099238384Sjkim .rva se_handler 3100238384Sjkim .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] 3101238384Sjkim___ 3102238384Sjkim} 3103238384Sjkim 3104238384Sjkim$code =~ s/\`([^\`]*)\`/eval($1)/gem; 3105238384Sjkim 3106238384Sjkimprint $code; 3107238384Sjkim 3108238384Sjkimclose STDOUT; 3109