1238384Sjkim#!/usr/bin/env perl 2238384Sjkim 3238384Sjkim################################################################### 4238384Sjkim### AES-128 [originally in CTR mode] ### 5238384Sjkim### bitsliced implementation for Intel Core 2 processors ### 6238384Sjkim### requires support of SSE extensions up to SSSE3 ### 7238384Sjkim### Author: Emilia Käsper and Peter Schwabe ### 8238384Sjkim### Date: 2009-03-19 ### 9238384Sjkim### Public domain ### 10238384Sjkim### ### 11238384Sjkim### See http://homes.esat.kuleuven.be/~ekasper/#software for ### 12238384Sjkim### further information. ### 13238384Sjkim################################################################### 14238384Sjkim# 15238384Sjkim# September 2011. 16238384Sjkim# 17238384Sjkim# Started as transliteration to "perlasm" the original code has 18238384Sjkim# undergone following changes: 19238384Sjkim# 20238384Sjkim# - code was made position-independent; 21238384Sjkim# - rounds were folded into a loop resulting in >5x size reduction 22238384Sjkim# from 12.5KB to 2.2KB; 23238384Sjkim# - above was possible thanks to mixcolumns() modification that 24238384Sjkim# allowed to feed its output back to aesenc[last], this was 25238384Sjkim# achieved at cost of two additional inter-registers moves; 26238384Sjkim# - some instruction reordering and interleaving; 27238384Sjkim# - this module doesn't implement key setup subroutine, instead it 28238384Sjkim# relies on conversion of "conventional" key schedule as returned 29238384Sjkim# by AES_set_encrypt_key (see discussion below); 30238384Sjkim# - first and last round keys are treated differently, which allowed 31238384Sjkim# to skip one shiftrows(), reduce bit-sliced key schedule and 32238384Sjkim# speed-up conversion by 22%; 33238384Sjkim# - support for 192- and 256-bit keys was added; 34238384Sjkim# 35238384Sjkim# Resulting performance in CPU cycles spent to encrypt one byte out 36238384Sjkim# of 4096-byte buffer with 128-bit key is: 37238384Sjkim# 38238384Sjkim# Emilia's 
this(*) difference 39238384Sjkim# 40238384Sjkim# Core 2 9.30 8.69 +7% 41290207Sjkim# Nehalem(**) 7.63 6.88 +11% 42290207Sjkim# Atom 17.1 16.4 +4% 43290207Sjkim# Silvermont - 12.9 44238384Sjkim# 45238384Sjkim# (*) Comparison is not completely fair, because "this" is ECB, 46238384Sjkim# i.e. no extra processing such as counter values calculation 47238384Sjkim# and xor-ing input as in Emilia's CTR implementation is 48238384Sjkim# performed. However, the CTR calculations stand for not more 49238384Sjkim# than 1% of total time, so comparison is *rather* fair. 50238384Sjkim# 51238384Sjkim# (**) Results were collected on Westmere, which is considered to 52238384Sjkim# be equivalent to Nehalem for this code. 53238384Sjkim# 54238384Sjkim# As for key schedule conversion subroutine. Interface to OpenSSL 55238384Sjkim# relies on per-invocation on-the-fly conversion. This naturally 56238384Sjkim# has impact on performance, especially for short inputs. Conversion 57238384Sjkim# time in CPU cycles and its ratio to CPU cycles spent in 8x block 58238384Sjkim# function is: 59238384Sjkim# 60238384Sjkim# conversion conversion/8x block 61238384Sjkim# Core 2 240 0.22 62238384Sjkim# Nehalem 180 0.20 63290207Sjkim# Atom 430 0.20 64238384Sjkim# 65238384Sjkim# The ratio values mean that 128-byte blocks will be processed 66238384Sjkim# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%, 67238384Sjkim# etc. Then keep in mind that input sizes not divisible by 128 are 68238384Sjkim# *effectively* slower, especially shortest ones, e.g. consecutive 69238384Sjkim# 144-byte blocks are processed 44% slower than one would expect, 70238384Sjkim# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings" 71238384Sjkim# it's still faster than ["hyper-threading-safe" code path in] 72238384Sjkim# aes-x86_64.pl on all lengths above 64 bytes... 73238384Sjkim# 74238384Sjkim# October 2011. 75238384Sjkim# 76238384Sjkim# Add decryption procedure. 
# Performance in CPU cycles spent to decrypt
# one byte out of 4096-byte buffer with 128-bit key is:
#
# Core 2	9.98
# Nehalem	7.80
# Atom		17.9
# Silvermont	14.0
#
# November 2011.
#
# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
# suboptimal, but XTS is meant to be used with larger blocks...
#
#						<appro@openssl.org>

# Command line: [flavour] [output]. Both are taken from @ARGV; if the
# first argument contains a dot it is treated as the output file name
# and no flavour is assumed.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 code paths are selected for nasm/masm/mingw64 flavours, or when
# the output file name ends in ".asm".
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator: first next to this script,
# then in ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator. The open is checked so that a failure to start the
# translator is reported instead of silently producing no output, and
# the translator path is quoted so directories containing spaces work.
# $flavour and $output are deliberately left unquoted: when undefined
# they must vanish from the command line rather than become empty
# arguments.
open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Register names used by the code emitted below. Only four values are
# supplied for five names, so $ivp is left undefined at this point.
# NOTE(review): presumably $ivp is assigned further down the file -- confirm.
my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
my $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...
108238384Sjkim 109238384Sjkim{ 110238384Sjkimmy ($key,$rounds,$const)=("%rax","%r10d","%r11"); 111238384Sjkim 112238384Sjkimsub Sbox { 113238384Sjkim# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 114238384Sjkim# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb 115238384Sjkimmy @b=@_[0..7]; 116238384Sjkimmy @t=@_[8..11]; 117238384Sjkimmy @s=@_[12..15]; 118238384Sjkim &InBasisChange (@b); 119238384Sjkim &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s); 120238384Sjkim &OutBasisChange (@b[7,1,4,2,6,5,0,3]); 121238384Sjkim} 122238384Sjkim 123238384Sjkimsub InBasisChange { 124238384Sjkim# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 125238384Sjkim# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb 126238384Sjkimmy @b=@_[0..7]; 127238384Sjkim$code.=<<___; 128238384Sjkim pxor @b[6], @b[5] 129238384Sjkim pxor @b[1], @b[2] 130238384Sjkim pxor @b[0], @b[3] 131238384Sjkim pxor @b[2], @b[6] 132238384Sjkim pxor @b[0], @b[5] 133238384Sjkim 134238384Sjkim pxor @b[3], @b[6] 135238384Sjkim pxor @b[7], @b[3] 136238384Sjkim pxor @b[5], @b[7] 137238384Sjkim pxor @b[4], @b[3] 138238384Sjkim pxor @b[5], @b[4] 139238384Sjkim pxor @b[1], @b[3] 140238384Sjkim 141238384Sjkim pxor @b[7], @b[2] 142238384Sjkim pxor @b[5], @b[1] 143238384Sjkim___ 144238384Sjkim} 145238384Sjkim 146238384Sjkimsub OutBasisChange { 147238384Sjkim# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 148238384Sjkim# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb 149238384Sjkimmy @b=@_[0..7]; 150238384Sjkim$code.=<<___; 151238384Sjkim pxor @b[6], @b[0] 152238384Sjkim pxor @b[4], @b[1] 153238384Sjkim pxor @b[0], @b[2] 154238384Sjkim pxor @b[6], @b[4] 155238384Sjkim pxor @b[1], @b[6] 156238384Sjkim 157238384Sjkim pxor @b[5], @b[1] 158238384Sjkim pxor @b[3], @b[5] 159238384Sjkim pxor @b[7], @b[3] 160238384Sjkim pxor @b[5], @b[7] 161238384Sjkim pxor @b[5], @b[2] 162238384Sjkim 163238384Sjkim pxor @b[7], @b[4] 164238384Sjkim___ 165238384Sjkim} 166238384Sjkim 167238384Sjkimsub InvSbox { 
168238384Sjkim# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 169238384Sjkim# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb 170238384Sjkimmy @b=@_[0..7]; 171238384Sjkimmy @t=@_[8..11]; 172238384Sjkimmy @s=@_[12..15]; 173238384Sjkim &InvInBasisChange (@b); 174238384Sjkim &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s); 175238384Sjkim &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]); 176238384Sjkim} 177238384Sjkim 178238384Sjkimsub InvInBasisChange { # OutBasisChange in reverse 179238384Sjkimmy @b=@_[5,1,2,6,3,7,0,4]; 180238384Sjkim$code.=<<___ 181238384Sjkim pxor @b[7], @b[4] 182238384Sjkim 183238384Sjkim pxor @b[5], @b[7] 184238384Sjkim pxor @b[5], @b[2] 185238384Sjkim pxor @b[7], @b[3] 186238384Sjkim pxor @b[3], @b[5] 187238384Sjkim pxor @b[5], @b[1] 188238384Sjkim 189238384Sjkim pxor @b[1], @b[6] 190238384Sjkim pxor @b[0], @b[2] 191238384Sjkim pxor @b[6], @b[4] 192238384Sjkim pxor @b[6], @b[0] 193238384Sjkim pxor @b[4], @b[1] 194238384Sjkim___ 195238384Sjkim} 196238384Sjkim 197238384Sjkimsub InvOutBasisChange { # InBasisChange in reverse 198238384Sjkimmy @b=@_[2,5,7,3,6,1,0,4]; 199238384Sjkim$code.=<<___; 200238384Sjkim pxor @b[5], @b[1] 201238384Sjkim pxor @b[7], @b[2] 202238384Sjkim 203238384Sjkim pxor @b[1], @b[3] 204238384Sjkim pxor @b[5], @b[4] 205238384Sjkim pxor @b[5], @b[7] 206238384Sjkim pxor @b[4], @b[3] 207238384Sjkim pxor @b[0], @b[5] 208238384Sjkim pxor @b[7], @b[3] 209238384Sjkim pxor @b[2], @b[6] 210238384Sjkim pxor @b[1], @b[2] 211238384Sjkim pxor @b[3], @b[6] 212238384Sjkim 213238384Sjkim pxor @b[0], @b[3] 214238384Sjkim pxor @b[6], @b[5] 215238384Sjkim___ 216238384Sjkim} 217238384Sjkim 218238384Sjkimsub Mul_GF4 { 219238384Sjkim#;************************************************************* 220238384Sjkim#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) * 221238384Sjkim#;************************************************************* 222238384Sjkimmy ($x0,$x1,$y0,$y1,$t0)=@_; 223238384Sjkim$code.=<<___; 224238384Sjkim movdqa $y0, $t0 
225238384Sjkim pxor $y1, $t0 226238384Sjkim pand $x0, $t0 227238384Sjkim pxor $x1, $x0 228238384Sjkim pand $y0, $x1 229238384Sjkim pand $y1, $x0 230238384Sjkim pxor $x1, $x0 231238384Sjkim pxor $t0, $x1 232238384Sjkim___ 233238384Sjkim} 234238384Sjkim 235238384Sjkimsub Mul_GF4_N { # not used, see next subroutine 236238384Sjkim# multiply and scale by N 237238384Sjkimmy ($x0,$x1,$y0,$y1,$t0)=@_; 238238384Sjkim$code.=<<___; 239238384Sjkim movdqa $y0, $t0 240238384Sjkim pxor $y1, $t0 241238384Sjkim pand $x0, $t0 242238384Sjkim pxor $x1, $x0 243238384Sjkim pand $y0, $x1 244238384Sjkim pand $y1, $x0 245238384Sjkim pxor $x0, $x1 246238384Sjkim pxor $t0, $x0 247238384Sjkim___ 248238384Sjkim} 249238384Sjkim 250238384Sjkimsub Mul_GF4_N_GF4 { 251238384Sjkim# interleaved Mul_GF4_N and Mul_GF4 252238384Sjkimmy ($x0,$x1,$y0,$y1,$t0, 253238384Sjkim $x2,$x3,$y2,$y3,$t1)=@_; 254238384Sjkim$code.=<<___; 255238384Sjkim movdqa $y0, $t0 256238384Sjkim movdqa $y2, $t1 257238384Sjkim pxor $y1, $t0 258238384Sjkim pxor $y3, $t1 259238384Sjkim pand $x0, $t0 260238384Sjkim pand $x2, $t1 261238384Sjkim pxor $x1, $x0 262238384Sjkim pxor $x3, $x2 263238384Sjkim pand $y0, $x1 264238384Sjkim pand $y2, $x3 265238384Sjkim pand $y1, $x0 266238384Sjkim pand $y3, $x2 267238384Sjkim pxor $x0, $x1 268238384Sjkim pxor $x3, $x2 269238384Sjkim pxor $t0, $x0 270238384Sjkim pxor $t1, $x3 271238384Sjkim___ 272238384Sjkim} 273238384Sjkimsub Mul_GF16_2 { 274238384Sjkimmy @x=@_[0..7]; 275238384Sjkimmy @y=@_[8..11]; 276238384Sjkimmy @t=@_[12..15]; 277238384Sjkim$code.=<<___; 278238384Sjkim movdqa @x[0], @t[0] 279238384Sjkim movdqa @x[1], @t[1] 280238384Sjkim___ 281238384Sjkim &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]); 282238384Sjkim$code.=<<___; 283238384Sjkim pxor @x[2], @t[0] 284238384Sjkim pxor @x[3], @t[1] 285238384Sjkim pxor @y[2], @y[0] 286238384Sjkim pxor @y[3], @y[1] 287238384Sjkim___ 288238384Sjkim Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], 289238384Sjkim @x[2], @x[3], @y[2], @y[3], @t[2]); 
290238384Sjkim$code.=<<___; 291238384Sjkim pxor @t[0], @x[0] 292238384Sjkim pxor @t[0], @x[2] 293238384Sjkim pxor @t[1], @x[1] 294238384Sjkim pxor @t[1], @x[3] 295238384Sjkim 296238384Sjkim movdqa @x[4], @t[0] 297238384Sjkim movdqa @x[5], @t[1] 298238384Sjkim pxor @x[6], @t[0] 299238384Sjkim pxor @x[7], @t[1] 300238384Sjkim___ 301238384Sjkim &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], 302238384Sjkim @x[6], @x[7], @y[2], @y[3], @t[2]); 303238384Sjkim$code.=<<___; 304238384Sjkim pxor @y[2], @y[0] 305238384Sjkim pxor @y[3], @y[1] 306238384Sjkim___ 307238384Sjkim &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]); 308238384Sjkim$code.=<<___; 309238384Sjkim pxor @t[0], @x[4] 310238384Sjkim pxor @t[0], @x[6] 311238384Sjkim pxor @t[1], @x[5] 312238384Sjkim pxor @t[1], @x[7] 313238384Sjkim___ 314238384Sjkim} 315238384Sjkimsub Inv_GF256 { 316238384Sjkim#;******************************************************************** 317238384Sjkim#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) * 318238384Sjkim#;******************************************************************** 319238384Sjkimmy @x=@_[0..7]; 320238384Sjkimmy @t=@_[8..11]; 321238384Sjkimmy @s=@_[12..15]; 322238384Sjkim# direct optimizations from hardware 323238384Sjkim$code.=<<___; 324238384Sjkim movdqa @x[4], @t[3] 325238384Sjkim movdqa @x[5], @t[2] 326238384Sjkim movdqa @x[1], @t[1] 327238384Sjkim movdqa @x[7], @s[1] 328238384Sjkim movdqa @x[0], @s[0] 329238384Sjkim 330238384Sjkim pxor @x[6], @t[3] 331238384Sjkim pxor @x[7], @t[2] 332238384Sjkim pxor @x[3], @t[1] 333238384Sjkim movdqa @t[3], @s[2] 334238384Sjkim pxor @x[6], @s[1] 335238384Sjkim movdqa @t[2], @t[0] 336238384Sjkim pxor @x[2], @s[0] 337238384Sjkim movdqa @t[3], @s[3] 338238384Sjkim 339238384Sjkim por @t[1], @t[2] 340238384Sjkim por @s[0], @t[3] 341238384Sjkim pxor @t[0], @s[3] 342238384Sjkim pand @s[0], @s[2] 343238384Sjkim pxor @t[1], @s[0] 344238384Sjkim pand @t[1], @t[0] 345238384Sjkim pand @s[0], @s[3] 346238384Sjkim movdqa @x[3], 
@s[0] 347238384Sjkim pxor @x[2], @s[0] 348238384Sjkim pand @s[0], @s[1] 349238384Sjkim pxor @s[1], @t[3] 350238384Sjkim pxor @s[1], @t[2] 351238384Sjkim movdqa @x[4], @s[1] 352238384Sjkim movdqa @x[1], @s[0] 353238384Sjkim pxor @x[5], @s[1] 354238384Sjkim pxor @x[0], @s[0] 355238384Sjkim movdqa @s[1], @t[1] 356238384Sjkim pand @s[0], @s[1] 357238384Sjkim por @s[0], @t[1] 358238384Sjkim pxor @s[1], @t[0] 359238384Sjkim pxor @s[3], @t[3] 360238384Sjkim pxor @s[2], @t[2] 361238384Sjkim pxor @s[3], @t[1] 362238384Sjkim movdqa @x[7], @s[0] 363238384Sjkim pxor @s[2], @t[0] 364238384Sjkim movdqa @x[6], @s[1] 365238384Sjkim pxor @s[2], @t[1] 366238384Sjkim movdqa @x[5], @s[2] 367238384Sjkim pand @x[3], @s[0] 368238384Sjkim movdqa @x[4], @s[3] 369238384Sjkim pand @x[2], @s[1] 370238384Sjkim pand @x[1], @s[2] 371238384Sjkim por @x[0], @s[3] 372238384Sjkim pxor @s[0], @t[3] 373238384Sjkim pxor @s[1], @t[2] 374238384Sjkim pxor @s[2], @t[1] 375238384Sjkim pxor @s[3], @t[0] 376238384Sjkim 377238384Sjkim #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 378238384Sjkim 379238384Sjkim # new smaller inversion 380238384Sjkim 381238384Sjkim movdqa @t[3], @s[0] 382238384Sjkim pand @t[1], @t[3] 383238384Sjkim pxor @t[2], @s[0] 384238384Sjkim 385238384Sjkim movdqa @t[0], @s[2] 386238384Sjkim movdqa @s[0], @s[3] 387238384Sjkim pxor @t[3], @s[2] 388238384Sjkim pand @s[2], @s[3] 389238384Sjkim 390238384Sjkim movdqa @t[1], @s[1] 391238384Sjkim pxor @t[2], @s[3] 392238384Sjkim pxor @t[0], @s[1] 393238384Sjkim 394238384Sjkim pxor @t[2], @t[3] 395238384Sjkim 396238384Sjkim pand @t[3], @s[1] 397238384Sjkim 398238384Sjkim movdqa @s[2], @t[2] 399238384Sjkim pxor @t[0], @s[1] 400238384Sjkim 401238384Sjkim pxor @s[1], @t[2] 402238384Sjkim pxor @s[1], @t[1] 403238384Sjkim 404238384Sjkim pand @t[0], @t[2] 405238384Sjkim 406238384Sjkim pxor @t[2], @s[2] 407238384Sjkim pxor @t[2], @t[1] 408238384Sjkim 409238384Sjkim pand @s[3], @s[2] 410238384Sjkim 411238384Sjkim pxor @s[0], @s[2] 412238384Sjkim___ 
413238384Sjkim# output in s3, s2, s1, t1 414238384Sjkim 415238384Sjkim# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3 416238384Sjkim 417238384Sjkim# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 418238384Sjkim &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]); 419238384Sjkim 420238384Sjkim### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb 421238384Sjkim} 422238384Sjkim 423238384Sjkim# AES linear components 424238384Sjkim 425238384Sjkimsub ShiftRows { 426238384Sjkimmy @x=@_[0..7]; 427238384Sjkimmy $mask=pop; 428238384Sjkim$code.=<<___; 429238384Sjkim pxor 0x00($key),@x[0] 430238384Sjkim pxor 0x10($key),@x[1] 431290207Sjkim pxor 0x20($key),@x[2] 432290207Sjkim pxor 0x30($key),@x[3] 433238384Sjkim pshufb $mask,@x[0] 434238384Sjkim pshufb $mask,@x[1] 435290207Sjkim pxor 0x40($key),@x[4] 436290207Sjkim pxor 0x50($key),@x[5] 437238384Sjkim pshufb $mask,@x[2] 438238384Sjkim pshufb $mask,@x[3] 439290207Sjkim pxor 0x60($key),@x[6] 440290207Sjkim pxor 0x70($key),@x[7] 441238384Sjkim pshufb $mask,@x[4] 442238384Sjkim pshufb $mask,@x[5] 443238384Sjkim pshufb $mask,@x[6] 444290207Sjkim pshufb $mask,@x[7] 445238384Sjkim lea 0x80($key),$key 446238384Sjkim___ 447238384Sjkim} 448238384Sjkim 449238384Sjkimsub MixColumns { 450238384Sjkim# modified to emit output in order suitable for feeding back to aesenc[last] 451238384Sjkimmy @x=@_[0..7]; 452238384Sjkimmy @t=@_[8..15]; 453261037Sjkimmy $inv=@_[16]; # optional 454238384Sjkim$code.=<<___; 455238384Sjkim pshufd \$0x93, @x[0], @t[0] # x0 <<< 32 456238384Sjkim pshufd \$0x93, @x[1], @t[1] 457238384Sjkim pxor @t[0], @x[0] # x0 ^ (x0 <<< 32) 458238384Sjkim pshufd \$0x93, @x[2], @t[2] 459238384Sjkim pxor @t[1], @x[1] 460238384Sjkim pshufd \$0x93, @x[3], @t[3] 461238384Sjkim pxor @t[2], @x[2] 462238384Sjkim pshufd \$0x93, @x[4], @t[4] 463238384Sjkim pxor @t[3], @x[3] 464238384Sjkim pshufd \$0x93, @x[5], @t[5] 465238384Sjkim pxor @t[4], @x[4] 466238384Sjkim 
pshufd \$0x93, @x[6], @t[6] 467238384Sjkim pxor @t[5], @x[5] 468238384Sjkim pshufd \$0x93, @x[7], @t[7] 469238384Sjkim pxor @t[6], @x[6] 470238384Sjkim pxor @t[7], @x[7] 471238384Sjkim 472238384Sjkim pxor @x[0], @t[1] 473238384Sjkim pxor @x[7], @t[0] 474238384Sjkim pxor @x[7], @t[1] 475238384Sjkim pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64) 476238384Sjkim pxor @x[1], @t[2] 477238384Sjkim pshufd \$0x4E, @x[1], @x[1] 478238384Sjkim pxor @x[4], @t[5] 479238384Sjkim pxor @t[0], @x[0] 480238384Sjkim pxor @x[5], @t[6] 481238384Sjkim pxor @t[1], @x[1] 482238384Sjkim pxor @x[3], @t[4] 483238384Sjkim pshufd \$0x4E, @x[4], @t[0] 484238384Sjkim pxor @x[6], @t[7] 485238384Sjkim pshufd \$0x4E, @x[5], @t[1] 486238384Sjkim pxor @x[2], @t[3] 487238384Sjkim pshufd \$0x4E, @x[3], @x[4] 488238384Sjkim pxor @x[7], @t[3] 489238384Sjkim pshufd \$0x4E, @x[7], @x[5] 490238384Sjkim pxor @x[7], @t[4] 491238384Sjkim pshufd \$0x4E, @x[6], @x[3] 492238384Sjkim pxor @t[4], @t[0] 493238384Sjkim pshufd \$0x4E, @x[2], @x[6] 494238384Sjkim pxor @t[5], @t[1] 495261037Sjkim___ 496261037Sjkim$code.=<<___ if (!$inv); 497238384Sjkim pxor @t[3], @x[4] 498238384Sjkim pxor @t[7], @x[5] 499238384Sjkim pxor @t[6], @x[3] 500238384Sjkim movdqa @t[0], @x[2] 501238384Sjkim pxor @t[2], @x[6] 502238384Sjkim movdqa @t[1], @x[7] 503238384Sjkim___ 504261037Sjkim$code.=<<___ if ($inv); 505261037Sjkim pxor @x[4], @t[3] 506261037Sjkim pxor @t[7], @x[5] 507261037Sjkim pxor @x[3], @t[6] 508261037Sjkim movdqa @t[0], @x[3] 509261037Sjkim pxor @t[2], @x[6] 510261037Sjkim movdqa @t[6], @x[2] 511261037Sjkim movdqa @t[1], @x[7] 512261037Sjkim movdqa @x[6], @x[4] 513261037Sjkim movdqa @t[3], @x[6] 514261037Sjkim___ 515238384Sjkim} 516238384Sjkim 517261037Sjkimsub InvMixColumns_orig { 518238384Sjkimmy @x=@_[0..7]; 519238384Sjkimmy @t=@_[8..15]; 520238384Sjkim 521238384Sjkim$code.=<<___; 522238384Sjkim # multiplication by 0x0e 523238384Sjkim pshufd \$0x93, @x[7], @t[7] 524238384Sjkim movdqa @x[2], @t[2] 525238384Sjkim 
pxor @x[5], @x[7] # 7 5 526238384Sjkim pxor @x[5], @x[2] # 2 5 527238384Sjkim pshufd \$0x93, @x[0], @t[0] 528238384Sjkim movdqa @x[5], @t[5] 529238384Sjkim pxor @x[0], @x[5] # 5 0 [1] 530238384Sjkim pxor @x[1], @x[0] # 0 1 531238384Sjkim pshufd \$0x93, @x[1], @t[1] 532238384Sjkim pxor @x[2], @x[1] # 1 25 533238384Sjkim pxor @x[6], @x[0] # 01 6 [2] 534238384Sjkim pxor @x[3], @x[1] # 125 3 [4] 535238384Sjkim pshufd \$0x93, @x[3], @t[3] 536238384Sjkim pxor @x[0], @x[2] # 25 016 [3] 537238384Sjkim pxor @x[7], @x[3] # 3 75 538238384Sjkim pxor @x[6], @x[7] # 75 6 [0] 539238384Sjkim pshufd \$0x93, @x[6], @t[6] 540238384Sjkim movdqa @x[4], @t[4] 541238384Sjkim pxor @x[4], @x[6] # 6 4 542238384Sjkim pxor @x[3], @x[4] # 4 375 [6] 543238384Sjkim pxor @x[7], @x[3] # 375 756=36 544238384Sjkim pxor @t[5], @x[6] # 64 5 [7] 545238384Sjkim pxor @t[2], @x[3] # 36 2 546238384Sjkim pxor @t[4], @x[3] # 362 4 [5] 547238384Sjkim pshufd \$0x93, @t[5], @t[5] 548238384Sjkim___ 549238384Sjkim my @y = @x[7,5,0,2,1,3,4,6]; 550238384Sjkim$code.=<<___; 551238384Sjkim # multiplication by 0x0b 552238384Sjkim pxor @y[0], @y[1] 553238384Sjkim pxor @t[0], @y[0] 554238384Sjkim pxor @t[1], @y[1] 555238384Sjkim pshufd \$0x93, @t[2], @t[2] 556238384Sjkim pxor @t[5], @y[0] 557238384Sjkim pxor @t[6], @y[1] 558238384Sjkim pxor @t[7], @y[0] 559238384Sjkim pshufd \$0x93, @t[4], @t[4] 560238384Sjkim pxor @t[6], @t[7] # clobber t[7] 561238384Sjkim pxor @y[0], @y[1] 562238384Sjkim 563238384Sjkim pxor @t[0], @y[3] 564238384Sjkim pshufd \$0x93, @t[0], @t[0] 565238384Sjkim pxor @t[1], @y[2] 566238384Sjkim pxor @t[1], @y[4] 567238384Sjkim pxor @t[2], @y[2] 568238384Sjkim pshufd \$0x93, @t[1], @t[1] 569238384Sjkim pxor @t[2], @y[3] 570238384Sjkim pxor @t[2], @y[5] 571238384Sjkim pxor @t[7], @y[2] 572238384Sjkim pshufd \$0x93, @t[2], @t[2] 573238384Sjkim pxor @t[3], @y[3] 574238384Sjkim pxor @t[3], @y[6] 575238384Sjkim pxor @t[3], @y[4] 576238384Sjkim pshufd \$0x93, @t[3], @t[3] 577238384Sjkim pxor @t[4], @y[7] 
578238384Sjkim pxor @t[4], @y[5] 579238384Sjkim pxor @t[7], @y[7] 580238384Sjkim pxor @t[5], @y[3] 581238384Sjkim pxor @t[4], @y[4] 582238384Sjkim pxor @t[5], @t[7] # clobber t[7] even more 583238384Sjkim 584238384Sjkim pxor @t[7], @y[5] 585238384Sjkim pshufd \$0x93, @t[4], @t[4] 586238384Sjkim pxor @t[7], @y[6] 587238384Sjkim pxor @t[7], @y[4] 588238384Sjkim 589238384Sjkim pxor @t[5], @t[7] 590238384Sjkim pshufd \$0x93, @t[5], @t[5] 591238384Sjkim pxor @t[6], @t[7] # restore t[7] 592238384Sjkim 593238384Sjkim # multiplication by 0x0d 594238384Sjkim pxor @y[7], @y[4] 595238384Sjkim pxor @t[4], @y[7] 596238384Sjkim pshufd \$0x93, @t[6], @t[6] 597238384Sjkim pxor @t[0], @y[2] 598238384Sjkim pxor @t[5], @y[7] 599238384Sjkim pxor @t[2], @y[2] 600238384Sjkim pshufd \$0x93, @t[7], @t[7] 601238384Sjkim 602238384Sjkim pxor @y[1], @y[3] 603238384Sjkim pxor @t[1], @y[1] 604238384Sjkim pxor @t[0], @y[0] 605238384Sjkim pxor @t[0], @y[3] 606238384Sjkim pxor @t[5], @y[1] 607238384Sjkim pxor @t[5], @y[0] 608238384Sjkim pxor @t[7], @y[1] 609238384Sjkim pshufd \$0x93, @t[0], @t[0] 610238384Sjkim pxor @t[6], @y[0] 611238384Sjkim pxor @y[1], @y[3] 612238384Sjkim pxor @t[1], @y[4] 613238384Sjkim pshufd \$0x93, @t[1], @t[1] 614238384Sjkim 615238384Sjkim pxor @t[7], @y[7] 616238384Sjkim pxor @t[2], @y[4] 617238384Sjkim pxor @t[2], @y[5] 618238384Sjkim pshufd \$0x93, @t[2], @t[2] 619238384Sjkim pxor @t[6], @y[2] 620238384Sjkim pxor @t[3], @t[6] # clobber t[6] 621238384Sjkim pxor @y[7], @y[4] 622238384Sjkim pxor @t[6], @y[3] 623238384Sjkim 624238384Sjkim pxor @t[6], @y[6] 625238384Sjkim pxor @t[5], @y[5] 626238384Sjkim pxor @t[4], @y[6] 627238384Sjkim pshufd \$0x93, @t[4], @t[4] 628238384Sjkim pxor @t[6], @y[5] 629238384Sjkim pxor @t[7], @y[6] 630238384Sjkim pxor @t[3], @t[6] # restore t[6] 631238384Sjkim 632238384Sjkim pshufd \$0x93, @t[5], @t[5] 633238384Sjkim pshufd \$0x93, @t[6], @t[6] 634238384Sjkim pshufd \$0x93, @t[7], @t[7] 635238384Sjkim pshufd \$0x93, @t[3], @t[3] 636238384Sjkim 
637238384Sjkim # multiplication by 0x09 638238384Sjkim pxor @y[1], @y[4] 639238384Sjkim pxor @y[1], @t[1] # t[1]=y[1] 640238384Sjkim pxor @t[5], @t[0] # clobber t[0] 641238384Sjkim pxor @t[5], @t[1] 642238384Sjkim pxor @t[0], @y[3] 643238384Sjkim pxor @y[0], @t[0] # t[0]=y[0] 644238384Sjkim pxor @t[6], @t[1] 645238384Sjkim pxor @t[7], @t[6] # clobber t[6] 646238384Sjkim pxor @t[1], @y[4] 647238384Sjkim pxor @t[4], @y[7] 648238384Sjkim pxor @y[4], @t[4] # t[4]=y[4] 649238384Sjkim pxor @t[3], @y[6] 650238384Sjkim pxor @y[3], @t[3] # t[3]=y[3] 651238384Sjkim pxor @t[2], @y[5] 652238384Sjkim pxor @y[2], @t[2] # t[2]=y[2] 653238384Sjkim pxor @t[7], @t[3] 654238384Sjkim pxor @y[5], @t[5] # t[5]=y[5] 655238384Sjkim pxor @t[6], @t[2] 656238384Sjkim pxor @t[6], @t[5] 657238384Sjkim pxor @y[6], @t[6] # t[6]=y[6] 658238384Sjkim pxor @y[7], @t[7] # t[7]=y[7] 659238384Sjkim 660238384Sjkim movdqa @t[0],@XMM[0] 661238384Sjkim movdqa @t[1],@XMM[1] 662238384Sjkim movdqa @t[2],@XMM[2] 663238384Sjkim movdqa @t[3],@XMM[3] 664238384Sjkim movdqa @t[4],@XMM[4] 665238384Sjkim movdqa @t[5],@XMM[5] 666238384Sjkim movdqa @t[6],@XMM[6] 667238384Sjkim movdqa @t[7],@XMM[7] 668238384Sjkim___ 669238384Sjkim} 670238384Sjkim 671261037Sjkimsub InvMixColumns { 672261037Sjkimmy @x=@_[0..7]; 673261037Sjkimmy @t=@_[8..15]; 674261037Sjkim 675261037Sjkim# Thanks to Jussi Kivilinna for providing pointer to 676261037Sjkim# 677261037Sjkim# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 | 678261037Sjkim# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 | 679261037Sjkim# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 | 680261037Sjkim# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 | 681261037Sjkim 682261037Sjkim$code.=<<___; 683261037Sjkim # multiplication by 0x05-0x00-0x04-0x00 684261037Sjkim pshufd \$0x4E, @x[0], @t[0] 685261037Sjkim pshufd \$0x4E, @x[6], @t[6] 686261037Sjkim pxor @x[0], @t[0] 687261037Sjkim pshufd \$0x4E, @x[7], @t[7] 688261037Sjkim pxor @x[6], @t[6] 689261037Sjkim pshufd \$0x4E, @x[1], @t[1] 
690261037Sjkim pxor @x[7], @t[7] 691261037Sjkim pshufd \$0x4E, @x[2], @t[2] 692261037Sjkim pxor @x[1], @t[1] 693261037Sjkim pshufd \$0x4E, @x[3], @t[3] 694261037Sjkim pxor @x[2], @t[2] 695261037Sjkim pxor @t[6], @x[0] 696261037Sjkim pxor @t[6], @x[1] 697261037Sjkim pshufd \$0x4E, @x[4], @t[4] 698261037Sjkim pxor @x[3], @t[3] 699261037Sjkim pxor @t[0], @x[2] 700261037Sjkim pxor @t[1], @x[3] 701261037Sjkim pshufd \$0x4E, @x[5], @t[5] 702261037Sjkim pxor @x[4], @t[4] 703261037Sjkim pxor @t[7], @x[1] 704261037Sjkim pxor @t[2], @x[4] 705261037Sjkim pxor @x[5], @t[5] 706261037Sjkim 707261037Sjkim pxor @t[7], @x[2] 708261037Sjkim pxor @t[6], @x[3] 709261037Sjkim pxor @t[6], @x[4] 710261037Sjkim pxor @t[3], @x[5] 711261037Sjkim pxor @t[4], @x[6] 712261037Sjkim pxor @t[7], @x[4] 713261037Sjkim pxor @t[7], @x[5] 714261037Sjkim pxor @t[5], @x[7] 715261037Sjkim___ 716261037Sjkim &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6 717261037Sjkim} 718261037Sjkim 719238384Sjkimsub aesenc { # not used 720238384Sjkimmy @b=@_[0..7]; 721238384Sjkimmy @t=@_[8..15]; 722238384Sjkim$code.=<<___; 723238384Sjkim movdqa 0x30($const),@t[0] # .LSR 724238384Sjkim___ 725238384Sjkim &ShiftRows (@b,@t[0]); 726238384Sjkim &Sbox (@b,@t); 727238384Sjkim &MixColumns (@b[0,1,4,6,3,7,2,5],@t); 728238384Sjkim} 729238384Sjkim 730238384Sjkimsub aesenclast { # not used 731238384Sjkimmy @b=@_[0..7]; 732238384Sjkimmy @t=@_[8..15]; 733238384Sjkim$code.=<<___; 734238384Sjkim movdqa 0x40($const),@t[0] # .LSRM0 735238384Sjkim___ 736238384Sjkim &ShiftRows (@b,@t[0]); 737238384Sjkim &Sbox (@b,@t); 738238384Sjkim$code.=<<___ 739238384Sjkim pxor 0x00($key),@b[0] 740238384Sjkim pxor 0x10($key),@b[1] 741238384Sjkim pxor 0x20($key),@b[4] 742238384Sjkim pxor 0x30($key),@b[6] 743238384Sjkim pxor 0x40($key),@b[3] 744238384Sjkim pxor 0x50($key),@b[7] 745238384Sjkim pxor 0x60($key),@b[2] 746238384Sjkim pxor 0x70($key),@b[5] 747238384Sjkim___ 748238384Sjkim} 749238384Sjkim 750238384Sjkimsub swapmove { 751238384Sjkimmy 
($a,$b,$n,$mask,$t)=@_; 752238384Sjkim$code.=<<___; 753238384Sjkim movdqa $b,$t 754238384Sjkim psrlq \$$n,$b 755238384Sjkim pxor $a,$b 756238384Sjkim pand $mask,$b 757238384Sjkim pxor $b,$a 758238384Sjkim psllq \$$n,$b 759238384Sjkim pxor $t,$b 760238384Sjkim___ 761238384Sjkim} 762238384Sjkimsub swapmove2x { 763238384Sjkimmy ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_; 764238384Sjkim$code.=<<___; 765238384Sjkim movdqa $b0,$t0 766238384Sjkim psrlq \$$n,$b0 767238384Sjkim movdqa $b1,$t1 768238384Sjkim psrlq \$$n,$b1 769238384Sjkim pxor $a0,$b0 770238384Sjkim pxor $a1,$b1 771238384Sjkim pand $mask,$b0 772238384Sjkim pand $mask,$b1 773238384Sjkim pxor $b0,$a0 774238384Sjkim psllq \$$n,$b0 775238384Sjkim pxor $b1,$a1 776238384Sjkim psllq \$$n,$b1 777238384Sjkim pxor $t0,$b0 778238384Sjkim pxor $t1,$b1 779238384Sjkim___ 780238384Sjkim} 781238384Sjkim 782238384Sjkimsub bitslice { 783238384Sjkimmy @x=reverse(@_[0..7]); 784238384Sjkimmy ($t0,$t1,$t2,$t3)=@_[8..11]; 785238384Sjkim$code.=<<___; 786238384Sjkim movdqa 0x00($const),$t0 # .LBS0 787238384Sjkim movdqa 0x10($const),$t1 # .LBS1 788238384Sjkim___ 789238384Sjkim &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3); 790238384Sjkim &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); 791238384Sjkim$code.=<<___; 792238384Sjkim movdqa 0x20($const),$t0 # .LBS2 793238384Sjkim___ 794238384Sjkim &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3); 795238384Sjkim &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); 796238384Sjkim 797238384Sjkim &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3); 798238384Sjkim &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3); 799238384Sjkim} 800238384Sjkim 801238384Sjkim$code.=<<___; 802238384Sjkim.text 803238384Sjkim 804238384Sjkim.extern asm_AES_encrypt 805238384Sjkim.extern asm_AES_decrypt 806238384Sjkim 807238384Sjkim.type _bsaes_encrypt8,\@abi-omnipotent 808238384Sjkim.align 64 809238384Sjkim_bsaes_encrypt8: 810238384Sjkim lea .LBS0(%rip), $const # constants table 811238384Sjkim 812238384Sjkim movdqa ($key), @XMM[9] # round 0 key 813238384Sjkim lea 0x10($key), $key 
814238384Sjkim movdqa 0x50($const), @XMM[8] # .LM0SR 815238384Sjkim pxor @XMM[9], @XMM[0] # xor with round0 key 816238384Sjkim pxor @XMM[9], @XMM[1] 817290207Sjkim pxor @XMM[9], @XMM[2] 818290207Sjkim pxor @XMM[9], @XMM[3] 819238384Sjkim pshufb @XMM[8], @XMM[0] 820238384Sjkim pshufb @XMM[8], @XMM[1] 821290207Sjkim pxor @XMM[9], @XMM[4] 822290207Sjkim pxor @XMM[9], @XMM[5] 823238384Sjkim pshufb @XMM[8], @XMM[2] 824238384Sjkim pshufb @XMM[8], @XMM[3] 825290207Sjkim pxor @XMM[9], @XMM[6] 826290207Sjkim pxor @XMM[9], @XMM[7] 827238384Sjkim pshufb @XMM[8], @XMM[4] 828238384Sjkim pshufb @XMM[8], @XMM[5] 829238384Sjkim pshufb @XMM[8], @XMM[6] 830238384Sjkim pshufb @XMM[8], @XMM[7] 831238384Sjkim_bsaes_encrypt8_bitslice: 832238384Sjkim___ 833238384Sjkim &bitslice (@XMM[0..7, 8..11]); 834238384Sjkim$code.=<<___; 835238384Sjkim dec $rounds 836238384Sjkim jmp .Lenc_sbox 837238384Sjkim.align 16 838238384Sjkim.Lenc_loop: 839238384Sjkim___ 840238384Sjkim &ShiftRows (@XMM[0..7, 8]); 841238384Sjkim$code.=".Lenc_sbox:\n"; 842238384Sjkim &Sbox (@XMM[0..7, 8..15]); 843238384Sjkim$code.=<<___; 844238384Sjkim dec $rounds 845238384Sjkim jl .Lenc_done 846238384Sjkim___ 847238384Sjkim &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]); 848238384Sjkim$code.=<<___; 849238384Sjkim movdqa 0x30($const), @XMM[8] # .LSR 850238384Sjkim jnz .Lenc_loop 851238384Sjkim movdqa 0x40($const), @XMM[8] # .LSRM0 852238384Sjkim jmp .Lenc_loop 853238384Sjkim.align 16 854238384Sjkim.Lenc_done: 855238384Sjkim___ 856238384Sjkim # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb 857238384Sjkim &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]); 858238384Sjkim$code.=<<___; 859238384Sjkim movdqa ($key), @XMM[8] # last round key 860238384Sjkim pxor @XMM[8], @XMM[4] 861238384Sjkim pxor @XMM[8], @XMM[6] 862238384Sjkim pxor @XMM[8], @XMM[3] 863238384Sjkim pxor @XMM[8], @XMM[7] 864238384Sjkim pxor @XMM[8], @XMM[2] 865238384Sjkim pxor @XMM[8], @XMM[5] 866238384Sjkim pxor @XMM[8], @XMM[0] 867238384Sjkim pxor @XMM[8], @XMM[1] 
	ret
.size	_bsaes_encrypt8,.-_bsaes_encrypt8

.type	_bsaes_decrypt8,\@abi-omnipotent
.align	64
_bsaes_decrypt8:
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pxor	@XMM[9], @XMM[2]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[0]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[4]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[2]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[6]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[4]
	pshufb	@XMM[8], @XMM[5]
	pshufb	@XMM[8], @XMM[6]
	pshufb	@XMM[8], @XMM[7]
___
	# Append the output of bitslice() for the eight state registers;
	# @XMM[8..11] are handed over as its additional arguments.
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Ldec_sbox
.align	16
.Ldec_loop:
___
	# The emitted code enters the loop at .Ldec_sbox, so the ShiftRows()
	# output placed at .Ldec_loop is skipped on the very first pass.
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Ldec_sbox:\n";
	&InvSbox	(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Ldec_done
___
	# From here on the state registers are referenced in the permuted
	# order 0,1,6,4,2,7,3,5 (same order is used for the final bitslice()
	# call and for the last-round-key xor below).
	# NOTE(review): the jnz that follows tests the flags left by the
	# preceding `dec`; this relies on the InvMixColumns() output leaving
	# the flags untouched, which cannot be confirmed from this chunk.
	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
$code.=<<___;
	movdqa	-0x10($const), @XMM[8]	# .LISR
	jnz	.Ldec_loop
	movdqa	-0x20($const), @XMM[8]	# .LISRM0
	jmp	.Ldec_loop
.align	16
.Ldec_done:
___
	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.size	_bsaes_decrypt8,.-_bsaes_decrypt8
___
}
{
# Register names used by the key-conversion code in this scope.
my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");

# bitslice_key(x0..x7, bs0, bs1, bs2, t2, t3)
#
# Appends a shortened bitslice sequence to $code.  The first eight arguments
# are taken in reverse order as @x.  Where the commented-out swapmove calls
# are shown, plain movdqa register copies are emitted instead.
# Not called anywhere within this chunk of the file.
sub bitslice_key {
my @x=reverse(@_[0..7]);
my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];

	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
$code.=<<___;
	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
	movdqa	@x[0], @x[2]
	movdqa	@x[1], @x[3]
___
	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);

	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
$code.=<<___;
	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
	movdqa	@x[0], @x[4]
	movdqa	@x[2], @x[6]
	movdqa	@x[1], @x[5]
	movdqa	@x[3], @x[7]
___
	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
}

# _bsaes_key_convert: reads round keys from ($inp), writes the converted
# schedule to ($out); $rounds holds the round count on entry.
$code.=<<___;
.type	_bsaes_key_convert,\@abi-omnipotent
.align	16
_bsaes_key_convert:
	lea	.Lmasks(%rip), $const
	movdqu	($inp), %xmm7		# load round 0 key
	lea	0x10($inp), $inp
	movdqa	0x00($const), %xmm0	# 0x01...
	movdqa	0x10($const), %xmm1	# 0x02...
	movdqa	0x20($const), %xmm2	# 0x04...
	movdqa	0x30($const), %xmm3	# 0x08...
	movdqa	0x40($const), %xmm4	# .LM0
	pcmpeqd	%xmm5, %xmm5		# .LNOT

	movdqu	($inp), %xmm6		# load round 1 key
	movdqa	%xmm7, ($out)		# save round 0 key
	lea	0x10($out), $out
	dec	$rounds
	jmp	.Lkey_loop
.align	16
.Lkey_loop:
	pshufb	%xmm4, %xmm6		# .LM0

	movdqa	%xmm0, %xmm8
	movdqa	%xmm1, %xmm9

	pand	%xmm6, %xmm8
	pand	%xmm6, %xmm9
	movdqa	%xmm2, %xmm10
	pcmpeqb	%xmm0, %xmm8
	psllq	\$4, %xmm0		# 0x10...
	movdqa	%xmm3, %xmm11
	pcmpeqb	%xmm1, %xmm9
	psllq	\$4, %xmm1		# 0x20...

	pand	%xmm6, %xmm10
	pand	%xmm6, %xmm11
	movdqa	%xmm0, %xmm12
	pcmpeqb	%xmm2, %xmm10
	psllq	\$4, %xmm2		# 0x40...
	movdqa	%xmm1, %xmm13
	pcmpeqb	%xmm3, %xmm11
	psllq	\$4, %xmm3		# 0x80...

	movdqa	%xmm2, %xmm14
	movdqa	%xmm3, %xmm15
	pxor	%xmm5, %xmm8		# "pnot"
	pxor	%xmm5, %xmm9

	pand	%xmm6, %xmm12
	pand	%xmm6, %xmm13
	movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
	pcmpeqb	%xmm0, %xmm12
	psrlq	\$4, %xmm0		# 0x01...
	movdqa	%xmm9, 0x10($out)
	pcmpeqb	%xmm1, %xmm13
	psrlq	\$4, %xmm1		# 0x02...
	lea	0x10($inp), $inp

	pand	%xmm6, %xmm14
	pand	%xmm6, %xmm15
	movdqa	%xmm10, 0x20($out)
	pcmpeqb	%xmm2, %xmm14
	psrlq	\$4, %xmm2		# 0x04...
	movdqa	%xmm11, 0x30($out)
	pcmpeqb	%xmm3, %xmm15
	psrlq	\$4, %xmm3		# 0x08...
	movdqu	($inp), %xmm6		# load next round key

	pxor	%xmm5, %xmm13		# "pnot"
	pxor	%xmm5, %xmm14
	movdqa	%xmm12, 0x40($out)
	movdqa	%xmm13, 0x50($out)
	movdqa	%xmm14, 0x60($out)
	movdqa	%xmm15, 0x70($out)
	lea	0x80($out),$out
	dec	$rounds
	jnz	.Lkey_loop

	movdqa	0x50($const), %xmm7	# .L63
	#movdqa	%xmm6, ($out)		# don't save last round key
	ret
.size	_bsaes_key_convert,.-_bsaes_key_convert
___
}

# The condition below is constant-false (`0 && ...`), so none of the code in
# this block is ever appended to $code.  Callers of _bsaes_key_convert shown
# here finish the schedule themselves: the routine returns with the last
# loaded round key in %xmm6 and the 0x50 entry of the masks table in %xmm7.
if (0 && !$win64) {	# following four functions are unsupported interface
			# used for benchmarking...
$code.=<<___;
.globl	bsaes_enc_key_convert
.type	bsaes_enc_key_convert,\@function,2
.align	16
bsaes_enc_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key
	ret
.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert

.globl	bsaes_encrypt_128
.type	bsaes_encrypt_128,\@function,4
.align	16
bsaes_encrypt_128:
.Lenc128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Lenc128_loop
	ret
.size	bsaes_encrypt_128,.-bsaes_encrypt_128

.globl	bsaes_dec_key_convert
.type	bsaes_dec_key_convert,\@function,2
.align	16
bsaes_dec_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	($out),%xmm7		# fix up round 0 key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,($out)
	ret
.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert

.globl	bsaes_decrypt_128
.type	bsaes_decrypt_128,\@function,4
.align	16
bsaes_decrypt_128:
.Ldec128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Ldec128_loop
	ret
.size	bsaes_decrypt_128,.-bsaes_decrypt_128
___
}
{
######################################################################
#
# OpenSSL interface
#
# $arg1..$arg6 name the registers the six arguments are read from; the set
# differs depending on $win64.  On Win64 the emitted code loads $arg5 and
# $arg6 from the stack itself before using them (see the "pull ..." lines).
my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64	? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
						: ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
# Registers the arguments are copied into for the lifetime of each function.
my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");

# The two ECB entry points are only generated when $ecb is true.
if ($ecb) {
$code.=<<___;
.globl	bsaes_ecb_encrypt_blocks
.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_encrypt_blocks:
	mov	%rsp, %rax
.Lecb_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp),%rsp
___
# Win64 only: grow the frame by another 0xa0 bytes and store %xmm6-%xmm15
# at offsets 0x40..0xd0 from the new stack pointer.
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_enc_body:
___
# Common body: %rbp remembers the frame base; when at least 8 blocks are
# requested a bit-sliced key schedule is carved out below it on the stack,
# otherwise control goes to .Lecb_enc_short.
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_enc_short

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key

	sub	\$8,$len
.Lecb_enc_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
1227238384Sjkim movdqu @XMM[5], 0x70($out) 1228238384Sjkim lea 0x80($out), $out 1229238384Sjkim sub \$8,$len 1230238384Sjkim jnc .Lecb_enc_loop 1231238384Sjkim 1232238384Sjkim add \$8,$len 1233238384Sjkim jz .Lecb_enc_done 1234238384Sjkim 1235238384Sjkim movdqu 0x00($inp), @XMM[0] # load input 1236238384Sjkim mov %rsp, %rax # pass key schedule 1237238384Sjkim mov %ebx,%r10d # pass rounds 1238238384Sjkim cmp \$2,$len 1239238384Sjkim jb .Lecb_enc_one 1240238384Sjkim movdqu 0x10($inp), @XMM[1] 1241238384Sjkim je .Lecb_enc_two 1242238384Sjkim movdqu 0x20($inp), @XMM[2] 1243238384Sjkim cmp \$4,$len 1244238384Sjkim jb .Lecb_enc_three 1245238384Sjkim movdqu 0x30($inp), @XMM[3] 1246238384Sjkim je .Lecb_enc_four 1247238384Sjkim movdqu 0x40($inp), @XMM[4] 1248238384Sjkim cmp \$6,$len 1249238384Sjkim jb .Lecb_enc_five 1250238384Sjkim movdqu 0x50($inp), @XMM[5] 1251238384Sjkim je .Lecb_enc_six 1252238384Sjkim movdqu 0x60($inp), @XMM[6] 1253238384Sjkim call _bsaes_encrypt8 1254238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1255238384Sjkim movdqu @XMM[1], 0x10($out) 1256238384Sjkim movdqu @XMM[4], 0x20($out) 1257238384Sjkim movdqu @XMM[6], 0x30($out) 1258238384Sjkim movdqu @XMM[3], 0x40($out) 1259238384Sjkim movdqu @XMM[7], 0x50($out) 1260238384Sjkim movdqu @XMM[2], 0x60($out) 1261238384Sjkim jmp .Lecb_enc_done 1262238384Sjkim.align 16 1263238384Sjkim.Lecb_enc_six: 1264238384Sjkim call _bsaes_encrypt8 1265238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1266238384Sjkim movdqu @XMM[1], 0x10($out) 1267238384Sjkim movdqu @XMM[4], 0x20($out) 1268238384Sjkim movdqu @XMM[6], 0x30($out) 1269238384Sjkim movdqu @XMM[3], 0x40($out) 1270238384Sjkim movdqu @XMM[7], 0x50($out) 1271238384Sjkim jmp .Lecb_enc_done 1272238384Sjkim.align 16 1273238384Sjkim.Lecb_enc_five: 1274238384Sjkim call _bsaes_encrypt8 1275238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1276238384Sjkim movdqu @XMM[1], 0x10($out) 1277238384Sjkim movdqu @XMM[4], 0x20($out) 1278238384Sjkim movdqu @XMM[6], 
0x30($out) 1279238384Sjkim movdqu @XMM[3], 0x40($out) 1280238384Sjkim jmp .Lecb_enc_done 1281238384Sjkim.align 16 1282238384Sjkim.Lecb_enc_four: 1283238384Sjkim call _bsaes_encrypt8 1284238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1285238384Sjkim movdqu @XMM[1], 0x10($out) 1286238384Sjkim movdqu @XMM[4], 0x20($out) 1287238384Sjkim movdqu @XMM[6], 0x30($out) 1288238384Sjkim jmp .Lecb_enc_done 1289238384Sjkim.align 16 1290238384Sjkim.Lecb_enc_three: 1291238384Sjkim call _bsaes_encrypt8 1292238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1293238384Sjkim movdqu @XMM[1], 0x10($out) 1294238384Sjkim movdqu @XMM[4], 0x20($out) 1295238384Sjkim jmp .Lecb_enc_done 1296238384Sjkim.align 16 1297238384Sjkim.Lecb_enc_two: 1298238384Sjkim call _bsaes_encrypt8 1299238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1300238384Sjkim movdqu @XMM[1], 0x10($out) 1301238384Sjkim jmp .Lecb_enc_done 1302238384Sjkim.align 16 1303238384Sjkim.Lecb_enc_one: 1304238384Sjkim call _bsaes_encrypt8 1305238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1306238384Sjkim jmp .Lecb_enc_done 1307238384Sjkim.align 16 1308238384Sjkim.Lecb_enc_short: 1309238384Sjkim lea ($inp), $arg1 1310238384Sjkim lea ($out), $arg2 1311238384Sjkim lea ($key), $arg3 1312238384Sjkim call asm_AES_encrypt 1313238384Sjkim lea 16($inp), $inp 1314238384Sjkim lea 16($out), $out 1315238384Sjkim dec $len 1316238384Sjkim jnz .Lecb_enc_short 1317238384Sjkim 1318238384Sjkim.Lecb_enc_done: 1319238384Sjkim lea (%rsp),%rax 1320238384Sjkim pxor %xmm0, %xmm0 1321238384Sjkim.Lecb_enc_bzero: # wipe key schedule [if any] 1322238384Sjkim movdqa %xmm0, 0x00(%rax) 1323238384Sjkim movdqa %xmm0, 0x10(%rax) 1324238384Sjkim lea 0x20(%rax), %rax 1325238384Sjkim cmp %rax, %rbp 1326238384Sjkim jb .Lecb_enc_bzero 1327238384Sjkim 1328238384Sjkim lea (%rbp),%rsp # restore %rsp 1329238384Sjkim___ 1330238384Sjkim$code.=<<___ if ($win64); 1331238384Sjkim movaps 0x40(%rbp), %xmm6 1332238384Sjkim movaps 0x50(%rbp), %xmm7 1333238384Sjkim 
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
# Common epilogue of bsaes_ecb_encrypt_blocks: reload the general-purpose
# registers pushed by the prologue from the frame and drop the frame.
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lecb_enc_epilogue:
	ret
.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks

.globl	bsaes_ecb_decrypt_blocks
.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_decrypt_blocks:
	mov	%rsp, %rax
.Lecb_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp),%rsp
___
# Win64 only: grow the frame by another 0xa0 bytes and store %xmm6-%xmm15
# at offsets 0x40..0xd0 from the new stack pointer.
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_dec_body:
___
# Common body, mirroring the encrypt variant.  The differences visible here:
# after _bsaes_key_convert the round-0 slot at (%rsp) is the one xor-ed with
# %xmm7, %xmm6 is stored as the last round key, and the decrypted blocks come
# back in register order 0,1,6,4,2,7,3,5.
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_dec_short

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up 0 round key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)

	sub	\$8,$len
.Lecb_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_dec_loop

	add	\$8,$len
	jz	.Lecb_dec_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_dec_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_dec_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_six:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_five:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_four:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
1489238384Sjkim movdqu @XMM[4], 0x30($out) 1490238384Sjkim jmp .Lecb_dec_done 1491238384Sjkim.align 16 1492238384Sjkim.Lecb_dec_three: 1493238384Sjkim call _bsaes_decrypt8 1494238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1495238384Sjkim movdqu @XMM[1], 0x10($out) 1496238384Sjkim movdqu @XMM[6], 0x20($out) 1497238384Sjkim jmp .Lecb_dec_done 1498238384Sjkim.align 16 1499238384Sjkim.Lecb_dec_two: 1500238384Sjkim call _bsaes_decrypt8 1501238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1502238384Sjkim movdqu @XMM[1], 0x10($out) 1503238384Sjkim jmp .Lecb_dec_done 1504238384Sjkim.align 16 1505238384Sjkim.Lecb_dec_one: 1506238384Sjkim call _bsaes_decrypt8 1507238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1508238384Sjkim jmp .Lecb_dec_done 1509238384Sjkim.align 16 1510238384Sjkim.Lecb_dec_short: 1511238384Sjkim lea ($inp), $arg1 1512238384Sjkim lea ($out), $arg2 1513238384Sjkim lea ($key), $arg3 1514238384Sjkim call asm_AES_decrypt 1515238384Sjkim lea 16($inp), $inp 1516238384Sjkim lea 16($out), $out 1517238384Sjkim dec $len 1518238384Sjkim jnz .Lecb_dec_short 1519238384Sjkim 1520238384Sjkim.Lecb_dec_done: 1521238384Sjkim lea (%rsp),%rax 1522238384Sjkim pxor %xmm0, %xmm0 1523238384Sjkim.Lecb_dec_bzero: # wipe key schedule [if any] 1524238384Sjkim movdqa %xmm0, 0x00(%rax) 1525238384Sjkim movdqa %xmm0, 0x10(%rax) 1526238384Sjkim lea 0x20(%rax), %rax 1527238384Sjkim cmp %rax, %rbp 1528238384Sjkim jb .Lecb_dec_bzero 1529238384Sjkim 1530238384Sjkim lea (%rbp),%rsp # restore %rsp 1531238384Sjkim___ 1532238384Sjkim$code.=<<___ if ($win64); 1533238384Sjkim movaps 0x40(%rbp), %xmm6 1534238384Sjkim movaps 0x50(%rbp), %xmm7 1535238384Sjkim movaps 0x60(%rbp), %xmm8 1536238384Sjkim movaps 0x70(%rbp), %xmm9 1537238384Sjkim movaps 0x80(%rbp), %xmm10 1538238384Sjkim movaps 0x90(%rbp), %xmm11 1539238384Sjkim movaps 0xa0(%rbp), %xmm12 1540238384Sjkim movaps 0xb0(%rbp), %xmm13 1541238384Sjkim movaps 0xc0(%rbp), %xmm14 1542238384Sjkim movaps 0xd0(%rbp), %xmm15 
	lea	0xa0(%rbp), %rsp
___
# Common epilogue of bsaes_ecb_decrypt_blocks: reload the general-purpose
# registers pushed by the prologue from the frame and drop the frame.
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lecb_dec_epilogue:
	ret
.size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
___
}
# bsaes_cbc_encrypt: the emitted code tail-jumps to asm_AES_cbc_encrypt when
# the sixth argument (direction flag) is non-zero or when the length in $arg3
# is below 128; only the remaining case is handled by the bit-sliced
# decryption path that follows.
$code.=<<___;
.extern	asm_AES_cbc_encrypt
.globl	bsaes_cbc_encrypt
.type	bsaes_cbc_encrypt,\@abi-omnipotent
.align	16
bsaes_cbc_encrypt:
___
# Win64 only: the sixth argument is fetched from the stack.
$code.=<<___ if ($win64);
	mov	48(%rsp),$arg6	# pull direction flag
___
$code.=<<___;
	cmp	\$0,$arg6
	jne	asm_AES_cbc_encrypt
	cmp	\$128,$arg3
	jb	asm_AES_cbc_encrypt

	mov	%rsp, %rax
.Lcbc_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
# Win64 only: fetch the fifth argument (ivp) from the stack, then grow the
# frame by 0xa0 bytes and store %xmm6-%xmm15 at offsets 0x40..0xd0.
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lcbc_dec_body:
___
# Common body.  %rbx keeps the IV pointer, %edx the round count, and the
# running IV lives in @XMM[15]; it is parked at 0x20(%rbp) across each call
# to _bsaes_decrypt8.  The input blocks are re-loaded after the call to serve
# as the xor values for the following blocks.
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key
	mov	$arg5, %rbx
	shr	\$4, $len		# bytes to blocks

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up 0 round key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)

	movdqu	(%rbx), @XMM[15]	# load IV
	sub	\$8,$len
.Lcbc_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%edx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV

	call	_bsaes_decrypt8

	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[12], @XMM[7]
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[13], @XMM[3]
	movdqu	0x70($inp), @XMM[15]	# IV
	pxor	@XMM[14], @XMM[5]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x80($inp), $inp
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lcbc_dec_loop

	add	\$8,$len
	jz	.Lcbc_dec_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%edx, %r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lcbc_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lcbc_dec_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lcbc_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lcbc_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lcbc_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lcbc_dec_six
	movdqu	0x60($inp), @XMM[6]
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
1699238384Sjkim pxor @XMM[10], @XMM[4] 1700238384Sjkim movdqu 0x40($inp), @XMM[12] 1701238384Sjkim pxor @XMM[11], @XMM[2] 1702238384Sjkim movdqu 0x50($inp), @XMM[13] 1703238384Sjkim pxor @XMM[12], @XMM[7] 1704238384Sjkim movdqu 0x60($inp), @XMM[15] # IV 1705238384Sjkim pxor @XMM[13], @XMM[3] 1706238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1707238384Sjkim movdqu @XMM[1], 0x10($out) 1708238384Sjkim movdqu @XMM[6], 0x20($out) 1709238384Sjkim movdqu @XMM[4], 0x30($out) 1710238384Sjkim movdqu @XMM[2], 0x40($out) 1711238384Sjkim movdqu @XMM[7], 0x50($out) 1712238384Sjkim movdqu @XMM[3], 0x60($out) 1713238384Sjkim jmp .Lcbc_dec_done 1714238384Sjkim.align 16 1715238384Sjkim.Lcbc_dec_six: 1716238384Sjkim movdqa @XMM[15], 0x20(%rbp) # put aside IV 1717238384Sjkim call _bsaes_decrypt8 1718238384Sjkim pxor 0x20(%rbp), @XMM[0] # ^= IV 1719238384Sjkim movdqu 0x00($inp), @XMM[8] # re-load input 1720238384Sjkim movdqu 0x10($inp), @XMM[9] 1721238384Sjkim pxor @XMM[8], @XMM[1] 1722238384Sjkim movdqu 0x20($inp), @XMM[10] 1723238384Sjkim pxor @XMM[9], @XMM[6] 1724238384Sjkim movdqu 0x30($inp), @XMM[11] 1725238384Sjkim pxor @XMM[10], @XMM[4] 1726238384Sjkim movdqu 0x40($inp), @XMM[12] 1727238384Sjkim pxor @XMM[11], @XMM[2] 1728238384Sjkim movdqu 0x50($inp), @XMM[15] # IV 1729238384Sjkim pxor @XMM[12], @XMM[7] 1730238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1731238384Sjkim movdqu @XMM[1], 0x10($out) 1732238384Sjkim movdqu @XMM[6], 0x20($out) 1733238384Sjkim movdqu @XMM[4], 0x30($out) 1734238384Sjkim movdqu @XMM[2], 0x40($out) 1735238384Sjkim movdqu @XMM[7], 0x50($out) 1736238384Sjkim jmp .Lcbc_dec_done 1737238384Sjkim.align 16 1738238384Sjkim.Lcbc_dec_five: 1739238384Sjkim movdqa @XMM[15], 0x20(%rbp) # put aside IV 1740238384Sjkim call _bsaes_decrypt8 1741238384Sjkim pxor 0x20(%rbp), @XMM[0] # ^= IV 1742238384Sjkim movdqu 0x00($inp), @XMM[8] # re-load input 1743238384Sjkim movdqu 0x10($inp), @XMM[9] 1744238384Sjkim pxor @XMM[8], @XMM[1] 1745238384Sjkim movdqu 
0x20($inp), @XMM[10] 1746238384Sjkim pxor @XMM[9], @XMM[6] 1747238384Sjkim movdqu 0x30($inp), @XMM[11] 1748238384Sjkim pxor @XMM[10], @XMM[4] 1749238384Sjkim movdqu 0x40($inp), @XMM[15] # IV 1750238384Sjkim pxor @XMM[11], @XMM[2] 1751238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1752238384Sjkim movdqu @XMM[1], 0x10($out) 1753238384Sjkim movdqu @XMM[6], 0x20($out) 1754238384Sjkim movdqu @XMM[4], 0x30($out) 1755238384Sjkim movdqu @XMM[2], 0x40($out) 1756238384Sjkim jmp .Lcbc_dec_done 1757238384Sjkim.align 16 1758238384Sjkim.Lcbc_dec_four: 1759238384Sjkim movdqa @XMM[15], 0x20(%rbp) # put aside IV 1760238384Sjkim call _bsaes_decrypt8 1761238384Sjkim pxor 0x20(%rbp), @XMM[0] # ^= IV 1762238384Sjkim movdqu 0x00($inp), @XMM[8] # re-load input 1763238384Sjkim movdqu 0x10($inp), @XMM[9] 1764238384Sjkim pxor @XMM[8], @XMM[1] 1765238384Sjkim movdqu 0x20($inp), @XMM[10] 1766238384Sjkim pxor @XMM[9], @XMM[6] 1767238384Sjkim movdqu 0x30($inp), @XMM[15] # IV 1768238384Sjkim pxor @XMM[10], @XMM[4] 1769238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1770238384Sjkim movdqu @XMM[1], 0x10($out) 1771238384Sjkim movdqu @XMM[6], 0x20($out) 1772238384Sjkim movdqu @XMM[4], 0x30($out) 1773238384Sjkim jmp .Lcbc_dec_done 1774238384Sjkim.align 16 1775238384Sjkim.Lcbc_dec_three: 1776238384Sjkim movdqa @XMM[15], 0x20(%rbp) # put aside IV 1777238384Sjkim call _bsaes_decrypt8 1778238384Sjkim pxor 0x20(%rbp), @XMM[0] # ^= IV 1779238384Sjkim movdqu 0x00($inp), @XMM[8] # re-load input 1780238384Sjkim movdqu 0x10($inp), @XMM[9] 1781238384Sjkim pxor @XMM[8], @XMM[1] 1782238384Sjkim movdqu 0x20($inp), @XMM[15] # IV 1783238384Sjkim pxor @XMM[9], @XMM[6] 1784238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1785238384Sjkim movdqu @XMM[1], 0x10($out) 1786238384Sjkim movdqu @XMM[6], 0x20($out) 1787238384Sjkim jmp .Lcbc_dec_done 1788238384Sjkim.align 16 1789238384Sjkim.Lcbc_dec_two: 1790238384Sjkim movdqa @XMM[15], 0x20(%rbp) # put aside IV 1791238384Sjkim call _bsaes_decrypt8 
1792238384Sjkim pxor 0x20(%rbp), @XMM[0] # ^= IV 1793238384Sjkim movdqu 0x00($inp), @XMM[8] # re-load input 1794238384Sjkim movdqu 0x10($inp), @XMM[15] # IV 1795238384Sjkim pxor @XMM[8], @XMM[1] 1796238384Sjkim movdqu @XMM[0], 0x00($out) # write output 1797238384Sjkim movdqu @XMM[1], 0x10($out) 1798238384Sjkim jmp .Lcbc_dec_done 1799238384Sjkim.align 16 1800238384Sjkim.Lcbc_dec_one: 1801238384Sjkim lea ($inp), $arg1 1802238384Sjkim lea 0x20(%rbp), $arg2 # buffer output 1803238384Sjkim lea ($key), $arg3 1804238384Sjkim call asm_AES_decrypt # doesn't touch %xmm 1805238384Sjkim pxor 0x20(%rbp), @XMM[15] # ^= IV 1806238384Sjkim movdqu @XMM[15], ($out) # write output 1807238384Sjkim movdqa @XMM[0], @XMM[15] # IV 1808238384Sjkim 1809238384Sjkim.Lcbc_dec_done: 1810238384Sjkim movdqu @XMM[15], (%rbx) # return IV 1811238384Sjkim lea (%rsp), %rax 1812238384Sjkim pxor %xmm0, %xmm0 1813238384Sjkim.Lcbc_dec_bzero: # wipe key schedule [if any] 1814238384Sjkim movdqa %xmm0, 0x00(%rax) 1815238384Sjkim movdqa %xmm0, 0x10(%rax) 1816238384Sjkim lea 0x20(%rax), %rax 1817238384Sjkim cmp %rax, %rbp 1818238384Sjkim ja .Lcbc_dec_bzero 1819238384Sjkim 1820238384Sjkim lea (%rbp),%rsp # restore %rsp 1821238384Sjkim___ 1822238384Sjkim$code.=<<___ if ($win64); 1823238384Sjkim movaps 0x40(%rbp), %xmm6 1824238384Sjkim movaps 0x50(%rbp), %xmm7 1825238384Sjkim movaps 0x60(%rbp), %xmm8 1826238384Sjkim movaps 0x70(%rbp), %xmm9 1827238384Sjkim movaps 0x80(%rbp), %xmm10 1828238384Sjkim movaps 0x90(%rbp), %xmm11 1829238384Sjkim movaps 0xa0(%rbp), %xmm12 1830238384Sjkim movaps 0xb0(%rbp), %xmm13 1831238384Sjkim movaps 0xc0(%rbp), %xmm14 1832238384Sjkim movaps 0xd0(%rbp), %xmm15 1833238384Sjkim lea 0xa0(%rbp), %rsp 1834238384Sjkim___ 1835238384Sjkim$code.=<<___; 1836238384Sjkim mov 0x48(%rsp), %r15 1837238384Sjkim mov 0x50(%rsp), %r14 1838238384Sjkim mov 0x58(%rsp), %r13 1839238384Sjkim mov 0x60(%rsp), %r12 1840238384Sjkim mov 0x68(%rsp), %rbx 1841238384Sjkim mov 0x70(%rsp), %rax 1842238384Sjkim lea 
0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lcbc_dec_epilogue:
	ret
.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt

	# bsaes_ctr32_encrypt_blocks: CTR-mode encryption of whole 16-byte
	# blocks with a 32-bit big-endian block counter.
	#   arg1  input pointer
	#   arg2  output pointer
	#   arg3  number of 16-byte blocks
	#   arg4  key schedule; the round count is read at byte offset 240
	#   arg5  pointer to the 16-byte counter block (only read here)
	# Fewer than 8 blocks are handled one at a time through
	# asm_AES_encrypt; otherwise the key is converted to bit-sliced form
	# on the stack and 8 counter blocks are encrypted per iteration.
.globl	bsaes_ctr32_encrypt_blocks
.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ctr32_encrypt_blocks:
	mov	%rsp, %rax
.Lctr_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp	# 0x48 bytes of scratch below the saved registers
___
# Win64 only: the fifth argument lives on the caller's stack, and
# %xmm6-%xmm15 are saved in the frame before they are used.
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lctr_enc_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	movdqu	($arg5), %xmm0		# load counter
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key
	movdqa	%xmm0, 0x20(%rbp)	# copy counter
	cmp	\$8, $arg3
	jb	.Lctr_enc_short

	mov	%eax, %ebx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%ebx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key

	movdqa	(%rsp), @XMM[9]		# load round0 key
	lea	.LADD1(%rip), %r11
	movdqa	0x20(%rbp), @XMM[0]	# counter copy
	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
	pshufb	@XMM[8], @XMM[0]
	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
	jmp	.Lctr_enc_loop
.align	16
.Lctr_enc_loop:
	movdqa	@XMM[0], 0x20(%rbp)	# save counter
	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
	movdqa	@XMM[0], @XMM[2]
	paddd	0x00(%r11), @XMM[1]	# .LADD1
	movdqa	@XMM[0], @XMM[3]
	paddd	0x10(%r11), @XMM[2]	# .LADD2
	movdqa	@XMM[0], @XMM[4]
	paddd	0x20(%r11), @XMM[3]	# .LADD3
	movdqa	@XMM[0], @XMM[5]
	paddd	0x30(%r11), @XMM[4]	# .LADD4
	movdqa	@XMM[0], @XMM[6]
	paddd	0x40(%r11), @XMM[5]	# .LADD5
	movdqa	@XMM[0], @XMM[7]
	paddd	0x50(%r11), @XMM[6]	# .LADD6
	paddd	0x60(%r11), @XMM[7]	# .LADD7

	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
	# to flip byte order in 32-bit counter
	movdqa	(%rsp), @XMM[9]		# round 0 key
	lea	0x10(%rsp), %rax	# pass key schedule
	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pxor	@XMM[9], @XMM[2]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[0]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[4]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[2]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[6]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[4]
	pshufb	@XMM[8], @XMM[5]
	pshufb	@XMM[8], @XMM[6]
	pshufb	@XMM[8], @XMM[7]
	lea	.LBS0(%rip), %r11	# constants table
	mov	%ebx,%r10d		# pass rounds

	call	_bsaes_encrypt8_bitslice

	# Keystream comes back in register order 0,1,4,6,3,7,2,5, as the
	# xor/store pairs below show.
	sub	\$8,$len
	jc	.Lctr_enc_loop_done	# fewer than 8 blocks were left

	movdqu	0x00($inp), @XMM[8]	# load input
	movdqu	0x10($inp), @XMM[9]
	movdqu	0x20($inp), @XMM[10]
	movdqu	0x30($inp), @XMM[11]
	movdqu	0x40($inp), @XMM[12]
	movdqu	0x50($inp), @XMM[13]
	movdqu	0x60($inp), @XMM[14]
	movdqu	0x70($inp), @XMM[15]
	lea	0x80($inp),$inp
	pxor	@XMM[0], @XMM[8]
	movdqa	0x20(%rbp), @XMM[0]	# load counter
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[8], 0x00($out)	# write output
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	@XMM[15], @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	lea	.LADD1(%rip), %r11
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	paddd	0x70(%r11), @XMM[0]	# .LADD8
	# The jnz still sees the flags of the sub above: the SSE ops and
	# lea in between leave the flags untouched.
	jnz	.Lctr_enc_loop

	jmp	.Lctr_enc_done
.align	16
.Lctr_enc_loop_done:
	# 1 to 7 blocks remain; 8 keystream blocks were computed and only
	# the needed ones are used.  Each cmp feeds both the jb and the
	# later je.
	add	\$8, $len
	movdqu	0x00($inp), @XMM[8]	# load input
	pxor	@XMM[8], @XMM[0]
	movdqu	@XMM[0], 0x00($out)	# write output
	cmp	\$2,$len
	jb	.Lctr_enc_done
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[1], 0x10($out)
	je	.Lctr_enc_done
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[4], 0x20($out)
	cmp	\$4,$len
	jb	.Lctr_enc_done
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[6], 0x30($out)
	je	.Lctr_enc_done
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[3], 0x40($out)
	cmp	\$6,$len
	jb	.Lctr_enc_done
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[7], 0x50($out)
	je	.Lctr_enc_done
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lctr_enc_done

.align	16
.Lctr_enc_short:
	# One block per pass: encrypt the counter copy at 0x20(%rbp) into
	# 0x30(%rbp), xor it with the input, then bump the 32-bit word at
	# byte offset 12 of the counter as a big-endian number.  %rsp equals
	# %rbp on this path, so the 0x2c(%rsp) store hits the same word
	# that was loaded through %rbp.
	lea	0x20(%rbp), $arg1
	lea	0x30(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	movdqu	($inp), @XMM[1]
	lea	16($inp), $inp
	mov	0x2c(%rbp), %eax	# load 32-bit counter
	bswap	%eax
	pxor	0x30(%rbp), @XMM[1]
	inc	%eax			# increment
	movdqu	@XMM[1], ($out)
	bswap	%eax
	lea	16($out), $out
	mov	%eax, 0x2c(%rsp)	# save 32-bit counter
	dec	$len
	jnz	.Lctr_enc_short

.Lctr_enc_done:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lctr_enc_bzero:			# wipe key schedule [if any]
	# Zeroes 32 bytes per pass from %rsp upwards until %rbp is
	# reached; the body runs at least once.
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lctr_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
# Win64 only: restore %xmm6-%xmm15 and drop the extra 0xa0 bytes of frame.
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15	# reload the registers pushed by the prologue
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax	# saved %rbp, moved into place below
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lctr_enc_epilogue:
	ret
.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
___
######################################################################
# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
#
# Scratch registers used by the XTS tweak update.
my ($twmask,$twres,$twtmp)=@XMM[13..15];
# Drop a trailing "d" from the sixth-argument register name so that the
# full 64-bit register is used when the ivp pointer is loaded.
$arg6=~s/d$//;

$code.=<<___;
.globl	bsaes_xts_encrypt
.type	bsaes_xts_encrypt,\@abi-omnipotent
.align	16
bsaes_xts_encrypt:
# Body of bsaes_xts_encrypt.
# XTS encryption: arg1 input, arg2 output, arg3 length in bytes,
# arg4 data key schedule, arg5 tweak key schedule, arg6 pointer to the
# 16-byte IV.  The IV is encrypted under the tweak key to form the first
# tweak; full blocks are then processed 8 at a time with the bit-sliced
# core, and a trailing partial block is handled by ciphertext stealing.
	mov	%rsp, %rax
.Lxts_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
# Win64 only: arguments five and six are fetched from the caller's stack
# and %xmm6-%xmm15 are saved in the frame.
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull key2
	mov	0xa8(%rsp),$arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lxts_enc_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key

	lea	($arg6), $arg1
	lea	0x20(%rbp), $arg2
	lea	($arg5), $arg3
	call	asm_AES_encrypt		# generate initial tweak

	mov	240($key), %eax		# rounds
	mov	$len, %rbx		# backup $len

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6, %xmm7		# fix up last round key
	movdqa	%xmm7, (%rax)		# save last round key

	and	\$-16, $len
	sub	\$0x80, %rsp		# place for tweak[8]
	movdqa	0x20(%rbp), @XMM[7]	# initial tweak

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits

	sub	\$0x80, $len
	jc	.Lxts_enc_short
	jmp	.Lxts_enc_loop

.align	16
.Lxts_enc_loop:
___
    # Unrolled 7 times: copy the running tweak (kept in XMM[7]) into lane
    # $i and into the tweak[] array on the stack, then advance the tweak.
    # paddq doubles both 64-bit halves and the pcmpgtd/pshufd/pand sequence
    # folds the shifted-out bits back in using .Lxts_magic (defined outside
    # this section; presumably the GF(2^128) reduction constant -- confirm).
    # The input loads and the input^tweak XORs are software-pipelined one
    # and two iterations behind the tweak computation.
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
# Drain the pipeline: the last two loads and three XORs, with XMM[7]
# itself serving as the eighth tweak.
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	# Results come back in register order 0,1,4,6,3,7,2,5.
	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80,$len
	jnc	.Lxts_enc_loop

.Lxts_enc_short:
	add	\$0x80, $len
	jz	.Lxts_enc_done
___
    # Same tweak/load/XOR pipeline as the main loop, but after each load
    # the remaining byte count is compared with 0x10*$i and control leaves
    # for .Lxts_enc_$i when exactly $i blocks are left.  Falling out of the
    # loop means 7 blocks remain.
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`,$len
	je	.Lxts_enc_$i
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_6:
	# The two newest lanes were loaded but not yet XORed with their
	# tweaks when the dispatch branch was taken; same in the cases below.
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_1:
	# A single block goes through asm_AES_encrypt instead of the 8-way
	# core: input^tweak is staged at 0x20(%rbp), encrypted in place, and
	# XORed with the tweak still held in XMM[0].
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_encrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

.Lxts_enc_done:
	and	\$15, %ebx		# trailing bytes beyond the last full block
	jz	.Lxts_enc_ret
	mov	$out, %rdx

.Lxts_enc_steal:
	# Ciphertext stealing, one byte per pass: the tail input byte
	# replaces a byte of the last full output block, and the displaced
	# output byte becomes the final partial output.
	movzb	($inp), %eax
	movzb	-16(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, -16(%rdx)
	mov	%cl, 0(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_enc_steal

	# Re-encrypt the patched block at -16(out) under the next tweak.
	movdqu	-16($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	movdqu	@XMM[7], -16($out)

.Lxts_enc_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_enc_bzero:			# wipe key schedule [if any]
	# Zeroes 32 bytes per pass from %rsp upwards until %rbp is reached.
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
# Win64 only: restore %xmm6-%xmm15 and drop the extra 0xa0 bytes of frame.
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15	# reload the registers pushed by the prologue
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax	# saved %rbp, moved into place below
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lxts_enc_epilogue:
	ret
.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt

.globl	bsaes_xts_decrypt
.type	bsaes_xts_decrypt,\@abi-omnipotent
.align	16
bsaes_xts_decrypt:
	mov	%rsp, %rax
.Lxts_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull key2
	mov	0xa8(%rsp),$arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lxts_dec_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key

	lea	($arg6), $arg1
2508238384Sjkim lea 0x20(%rbp), $arg2 2509238384Sjkim lea ($arg5), $arg3 2510238384Sjkim call asm_AES_encrypt # generate initial tweak 2511238384Sjkim 2512238384Sjkim mov 240($key), %eax # rounds 2513238384Sjkim mov $len, %rbx # backup $len 2514238384Sjkim 2515238384Sjkim mov %eax, %edx # rounds 2516238384Sjkim shl \$7, %rax # 128 bytes per inner round key 2517238384Sjkim sub \$`128-32`, %rax # size of bit-sliced key schedule 2518238384Sjkim sub %rax, %rsp 2519238384Sjkim 2520238384Sjkim mov %rsp, %rax # pass key schedule 2521238384Sjkim mov $key, %rcx # pass key 2522238384Sjkim mov %edx, %r10d # pass rounds 2523238384Sjkim call _bsaes_key_convert 2524238384Sjkim pxor (%rsp), %xmm7 # fix up round 0 key 2525238384Sjkim movdqa %xmm6, (%rax) # save last round key 2526238384Sjkim movdqa %xmm7, (%rsp) 2527238384Sjkim 2528238384Sjkim xor %eax, %eax # if ($len%16) len-=16; 2529238384Sjkim and \$-16, $len 2530238384Sjkim test \$15, %ebx 2531238384Sjkim setnz %al 2532238384Sjkim shl \$4, %rax 2533238384Sjkim sub %rax, $len 2534238384Sjkim 2535238384Sjkim sub \$0x80, %rsp # place for tweak[8] 2536238384Sjkim movdqa 0x20(%rbp), @XMM[7] # initial tweak 2537238384Sjkim 2538238384Sjkim pxor $twtmp, $twtmp 2539238384Sjkim movdqa .Lxts_magic(%rip), $twmask 2540238384Sjkim pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2541238384Sjkim 2542238384Sjkim sub \$0x80, $len 2543238384Sjkim jc .Lxts_dec_short 2544238384Sjkim jmp .Lxts_dec_loop 2545238384Sjkim 2546238384Sjkim.align 16 2547238384Sjkim.Lxts_dec_loop: 2548238384Sjkim___ 2549238384Sjkim for ($i=0;$i<7;$i++) { 2550238384Sjkim $code.=<<___; 2551238384Sjkim pshufd \$0x13, $twtmp, $twres 2552238384Sjkim pxor $twtmp, $twtmp 2553238384Sjkim movdqa @XMM[7], @XMM[$i] 2554238384Sjkim movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2555238384Sjkim paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2556238384Sjkim pand $twmask, $twres # isolate carry and residue 2557238384Sjkim pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2558238384Sjkim pxor 
$twres, @XMM[7] 2559238384Sjkim___ 2560238384Sjkim $code.=<<___ if ($i>=1); 2561238384Sjkim movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2562238384Sjkim___ 2563238384Sjkim $code.=<<___ if ($i>=2); 2564238384Sjkim pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2565238384Sjkim___ 2566238384Sjkim } 2567238384Sjkim$code.=<<___; 2568238384Sjkim movdqu 0x60($inp), @XMM[8+6] 2569238384Sjkim pxor @XMM[8+5], @XMM[5] 2570238384Sjkim movdqu 0x70($inp), @XMM[8+7] 2571238384Sjkim lea 0x80($inp), $inp 2572238384Sjkim movdqa @XMM[7], 0x70(%rsp) 2573238384Sjkim pxor @XMM[8+6], @XMM[6] 2574238384Sjkim lea 0x80(%rsp), %rax # pass key schedule 2575238384Sjkim pxor @XMM[8+7], @XMM[7] 2576238384Sjkim mov %edx, %r10d # pass rounds 2577238384Sjkim 2578238384Sjkim call _bsaes_decrypt8 2579238384Sjkim 2580238384Sjkim pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2581238384Sjkim pxor 0x10(%rsp), @XMM[1] 2582238384Sjkim movdqu @XMM[0], 0x00($out) # write output 2583238384Sjkim pxor 0x20(%rsp), @XMM[6] 2584238384Sjkim movdqu @XMM[1], 0x10($out) 2585238384Sjkim pxor 0x30(%rsp), @XMM[4] 2586238384Sjkim movdqu @XMM[6], 0x20($out) 2587238384Sjkim pxor 0x40(%rsp), @XMM[2] 2588238384Sjkim movdqu @XMM[4], 0x30($out) 2589238384Sjkim pxor 0x50(%rsp), @XMM[7] 2590238384Sjkim movdqu @XMM[2], 0x40($out) 2591238384Sjkim pxor 0x60(%rsp), @XMM[3] 2592238384Sjkim movdqu @XMM[7], 0x50($out) 2593238384Sjkim pxor 0x70(%rsp), @XMM[5] 2594238384Sjkim movdqu @XMM[3], 0x60($out) 2595238384Sjkim movdqu @XMM[5], 0x70($out) 2596238384Sjkim lea 0x80($out), $out 2597238384Sjkim 2598238384Sjkim movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak 2599238384Sjkim pxor $twtmp, $twtmp 2600238384Sjkim movdqa .Lxts_magic(%rip), $twmask 2601238384Sjkim pcmpgtd @XMM[7], $twtmp 2602238384Sjkim pshufd \$0x13, $twtmp, $twres 2603238384Sjkim pxor $twtmp, $twtmp 2604238384Sjkim paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2605238384Sjkim pand $twmask, $twres # isolate carry and residue 2606238384Sjkim pcmpgtd @XMM[7], $twtmp # broadcast upper 
bits 2607238384Sjkim pxor $twres, @XMM[7] 2608238384Sjkim 2609238384Sjkim sub \$0x80,$len 2610238384Sjkim jnc .Lxts_dec_loop 2611238384Sjkim 2612238384Sjkim.Lxts_dec_short: 2613238384Sjkim add \$0x80, $len 2614238384Sjkim jz .Lxts_dec_done 2615238384Sjkim___ 2616238384Sjkim for ($i=0;$i<7;$i++) { 2617238384Sjkim $code.=<<___; 2618238384Sjkim pshufd \$0x13, $twtmp, $twres 2619238384Sjkim pxor $twtmp, $twtmp 2620238384Sjkim movdqa @XMM[7], @XMM[$i] 2621238384Sjkim movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2622238384Sjkim paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2623238384Sjkim pand $twmask, $twres # isolate carry and residue 2624238384Sjkim pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2625238384Sjkim pxor $twres, @XMM[7] 2626238384Sjkim___ 2627238384Sjkim $code.=<<___ if ($i>=1); 2628238384Sjkim movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2629238384Sjkim cmp \$`0x10*$i`,$len 2630238384Sjkim je .Lxts_dec_$i 2631238384Sjkim___ 2632238384Sjkim $code.=<<___ if ($i>=2); 2633238384Sjkim pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2634238384Sjkim___ 2635238384Sjkim } 2636238384Sjkim$code.=<<___; 2637238384Sjkim movdqu 0x60($inp), @XMM[8+6] 2638238384Sjkim pxor @XMM[8+5], @XMM[5] 2639238384Sjkim movdqa @XMM[7], 0x70(%rsp) 2640238384Sjkim lea 0x70($inp), $inp 2641238384Sjkim pxor @XMM[8+6], @XMM[6] 2642238384Sjkim lea 0x80(%rsp), %rax # pass key schedule 2643238384Sjkim mov %edx, %r10d # pass rounds 2644238384Sjkim 2645238384Sjkim call _bsaes_decrypt8 2646238384Sjkim 2647238384Sjkim pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2648238384Sjkim pxor 0x10(%rsp), @XMM[1] 2649238384Sjkim movdqu @XMM[0], 0x00($out) # write output 2650238384Sjkim pxor 0x20(%rsp), @XMM[6] 2651238384Sjkim movdqu @XMM[1], 0x10($out) 2652238384Sjkim pxor 0x30(%rsp), @XMM[4] 2653238384Sjkim movdqu @XMM[6], 0x20($out) 2654238384Sjkim pxor 0x40(%rsp), @XMM[2] 2655238384Sjkim movdqu @XMM[4], 0x30($out) 2656238384Sjkim pxor 0x50(%rsp), @XMM[7] 2657238384Sjkim movdqu @XMM[2], 0x40($out) 2658238384Sjkim pxor 
0x60(%rsp), @XMM[3] 2659238384Sjkim movdqu @XMM[7], 0x50($out) 2660238384Sjkim movdqu @XMM[3], 0x60($out) 2661238384Sjkim lea 0x70($out), $out 2662238384Sjkim 2663238384Sjkim movdqa 0x70(%rsp), @XMM[7] # next iteration tweak 2664238384Sjkim jmp .Lxts_dec_done 2665238384Sjkim.align 16 2666238384Sjkim.Lxts_dec_6: 2667238384Sjkim pxor @XMM[8+4], @XMM[4] 2668238384Sjkim lea 0x60($inp), $inp 2669238384Sjkim pxor @XMM[8+5], @XMM[5] 2670238384Sjkim lea 0x80(%rsp), %rax # pass key schedule 2671238384Sjkim mov %edx, %r10d # pass rounds 2672238384Sjkim 2673238384Sjkim call _bsaes_decrypt8 2674238384Sjkim 2675238384Sjkim pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2676238384Sjkim pxor 0x10(%rsp), @XMM[1] 2677238384Sjkim movdqu @XMM[0], 0x00($out) # write output 2678238384Sjkim pxor 0x20(%rsp), @XMM[6] 2679238384Sjkim movdqu @XMM[1], 0x10($out) 2680238384Sjkim pxor 0x30(%rsp), @XMM[4] 2681238384Sjkim movdqu @XMM[6], 0x20($out) 2682238384Sjkim pxor 0x40(%rsp), @XMM[2] 2683238384Sjkim movdqu @XMM[4], 0x30($out) 2684238384Sjkim pxor 0x50(%rsp), @XMM[7] 2685238384Sjkim movdqu @XMM[2], 0x40($out) 2686238384Sjkim movdqu @XMM[7], 0x50($out) 2687238384Sjkim lea 0x60($out), $out 2688238384Sjkim 2689238384Sjkim movdqa 0x60(%rsp), @XMM[7] # next iteration tweak 2690238384Sjkim jmp .Lxts_dec_done 2691238384Sjkim.align 16 2692238384Sjkim.Lxts_dec_5: 2693238384Sjkim pxor @XMM[8+3], @XMM[3] 2694238384Sjkim lea 0x50($inp), $inp 2695238384Sjkim pxor @XMM[8+4], @XMM[4] 2696238384Sjkim lea 0x80(%rsp), %rax # pass key schedule 2697238384Sjkim mov %edx, %r10d # pass rounds 2698238384Sjkim 2699238384Sjkim call _bsaes_decrypt8 2700238384Sjkim 2701238384Sjkim pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2702238384Sjkim pxor 0x10(%rsp), @XMM[1] 2703238384Sjkim movdqu @XMM[0], 0x00($out) # write output 2704238384Sjkim pxor 0x20(%rsp), @XMM[6] 2705238384Sjkim movdqu @XMM[1], 0x10($out) 2706238384Sjkim pxor 0x30(%rsp), @XMM[4] 2707238384Sjkim movdqu @XMM[6], 0x20($out) 2708238384Sjkim pxor 0x40(%rsp), @XMM[2] 
2709238384Sjkim movdqu @XMM[4], 0x30($out) 2710238384Sjkim movdqu @XMM[2], 0x40($out) 2711238384Sjkim lea 0x50($out), $out 2712238384Sjkim 2713238384Sjkim movdqa 0x50(%rsp), @XMM[7] # next iteration tweak 2714238384Sjkim jmp .Lxts_dec_done 2715238384Sjkim.align 16 2716238384Sjkim.Lxts_dec_4: 2717238384Sjkim pxor @XMM[8+2], @XMM[2] 2718238384Sjkim lea 0x40($inp), $inp 2719238384Sjkim pxor @XMM[8+3], @XMM[3] 2720238384Sjkim lea 0x80(%rsp), %rax # pass key schedule 2721238384Sjkim mov %edx, %r10d # pass rounds 2722238384Sjkim 2723238384Sjkim call _bsaes_decrypt8 2724238384Sjkim 2725238384Sjkim pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2726238384Sjkim pxor 0x10(%rsp), @XMM[1] 2727238384Sjkim movdqu @XMM[0], 0x00($out) # write output 2728238384Sjkim pxor 0x20(%rsp), @XMM[6] 2729238384Sjkim movdqu @XMM[1], 0x10($out) 2730238384Sjkim pxor 0x30(%rsp), @XMM[4] 2731238384Sjkim movdqu @XMM[6], 0x20($out) 2732238384Sjkim movdqu @XMM[4], 0x30($out) 2733238384Sjkim lea 0x40($out), $out 2734238384Sjkim 2735238384Sjkim movdqa 0x40(%rsp), @XMM[7] # next iteration tweak 2736238384Sjkim jmp .Lxts_dec_done 2737238384Sjkim.align 16 2738238384Sjkim.Lxts_dec_3: 2739238384Sjkim pxor @XMM[8+1], @XMM[1] 2740238384Sjkim lea 0x30($inp), $inp 2741238384Sjkim pxor @XMM[8+2], @XMM[2] 2742238384Sjkim lea 0x80(%rsp), %rax # pass key schedule 2743238384Sjkim mov %edx, %r10d # pass rounds 2744238384Sjkim 2745238384Sjkim call _bsaes_decrypt8 2746238384Sjkim 2747238384Sjkim pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2748238384Sjkim pxor 0x10(%rsp), @XMM[1] 2749238384Sjkim movdqu @XMM[0], 0x00($out) # write output 2750238384Sjkim pxor 0x20(%rsp), @XMM[6] 2751238384Sjkim movdqu @XMM[1], 0x10($out) 2752238384Sjkim movdqu @XMM[6], 0x20($out) 2753238384Sjkim lea 0x30($out), $out 2754238384Sjkim 2755238384Sjkim movdqa 0x30(%rsp), @XMM[7] # next iteration tweak 2756238384Sjkim jmp .Lxts_dec_done 2757238384Sjkim.align 16 2758238384Sjkim.Lxts_dec_2: 2759238384Sjkim pxor @XMM[8+0], @XMM[0] 2760238384Sjkim lea 
0x20($inp), $inp 2761238384Sjkim pxor @XMM[8+1], @XMM[1] 2762238384Sjkim lea 0x80(%rsp), %rax # pass key schedule 2763238384Sjkim mov %edx, %r10d # pass rounds 2764238384Sjkim 2765238384Sjkim call _bsaes_decrypt8 2766238384Sjkim 2767238384Sjkim pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2768238384Sjkim pxor 0x10(%rsp), @XMM[1] 2769238384Sjkim movdqu @XMM[0], 0x00($out) # write output 2770238384Sjkim movdqu @XMM[1], 0x10($out) 2771238384Sjkim lea 0x20($out), $out 2772238384Sjkim 2773238384Sjkim movdqa 0x20(%rsp), @XMM[7] # next iteration tweak 2774238384Sjkim jmp .Lxts_dec_done 2775238384Sjkim.align 16 2776238384Sjkim.Lxts_dec_1: 2777238384Sjkim pxor @XMM[0], @XMM[8] 2778238384Sjkim lea 0x10($inp), $inp 2779238384Sjkim movdqa @XMM[8], 0x20(%rbp) 2780238384Sjkim lea 0x20(%rbp), $arg1 2781238384Sjkim lea 0x20(%rbp), $arg2 2782238384Sjkim lea ($key), $arg3 2783238384Sjkim call asm_AES_decrypt # doesn't touch %xmm 2784238384Sjkim pxor 0x20(%rbp), @XMM[0] # ^= tweak[] 2785238384Sjkim #pxor @XMM[8], @XMM[0] 2786238384Sjkim #lea 0x80(%rsp), %rax # pass key schedule 2787238384Sjkim #mov %edx, %r10d # pass rounds 2788238384Sjkim #call _bsaes_decrypt8 2789238384Sjkim #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2790238384Sjkim movdqu @XMM[0], 0x00($out) # write output 2791238384Sjkim lea 0x10($out), $out 2792238384Sjkim 2793238384Sjkim movdqa 0x10(%rsp), @XMM[7] # next iteration tweak 2794238384Sjkim 2795238384Sjkim.Lxts_dec_done: 2796238384Sjkim and \$15, %ebx 2797238384Sjkim jz .Lxts_dec_ret 2798238384Sjkim 2799238384Sjkim pxor $twtmp, $twtmp 2800238384Sjkim movdqa .Lxts_magic(%rip), $twmask 2801238384Sjkim pcmpgtd @XMM[7], $twtmp 2802238384Sjkim pshufd \$0x13, $twtmp, $twres 2803238384Sjkim movdqa @XMM[7], @XMM[6] 2804238384Sjkim paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2805238384Sjkim pand $twmask, $twres # isolate carry and residue 2806238384Sjkim movdqu ($inp), @XMM[0] 2807238384Sjkim pxor $twres, @XMM[7] 2808238384Sjkim 2809238384Sjkim lea 0x20(%rbp), $arg1 2810238384Sjkim pxor 
@XMM[7], @XMM[0] 2811238384Sjkim lea 0x20(%rbp), $arg2 2812238384Sjkim movdqa @XMM[0], 0x20(%rbp) 2813238384Sjkim lea ($key), $arg3 2814238384Sjkim call asm_AES_decrypt # doesn't touch %xmm 2815238384Sjkim pxor 0x20(%rbp), @XMM[7] 2816238384Sjkim mov $out, %rdx 2817238384Sjkim movdqu @XMM[7], ($out) 2818238384Sjkim 2819238384Sjkim.Lxts_dec_steal: 2820238384Sjkim movzb 16($inp), %eax 2821238384Sjkim movzb (%rdx), %ecx 2822238384Sjkim lea 1($inp), $inp 2823238384Sjkim mov %al, (%rdx) 2824238384Sjkim mov %cl, 16(%rdx) 2825238384Sjkim lea 1(%rdx), %rdx 2826238384Sjkim sub \$1,%ebx 2827238384Sjkim jnz .Lxts_dec_steal 2828238384Sjkim 2829238384Sjkim movdqu ($out), @XMM[0] 2830238384Sjkim lea 0x20(%rbp), $arg1 2831238384Sjkim pxor @XMM[6], @XMM[0] 2832238384Sjkim lea 0x20(%rbp), $arg2 2833238384Sjkim movdqa @XMM[0], 0x20(%rbp) 2834238384Sjkim lea ($key), $arg3 2835238384Sjkim call asm_AES_decrypt # doesn't touch %xmm 2836238384Sjkim pxor 0x20(%rbp), @XMM[6] 2837238384Sjkim movdqu @XMM[6], ($out) 2838238384Sjkim 2839238384Sjkim.Lxts_dec_ret: 2840238384Sjkim lea (%rsp), %rax 2841238384Sjkim pxor %xmm0, %xmm0 2842238384Sjkim.Lxts_dec_bzero: # wipe key schedule [if any] 2843238384Sjkim movdqa %xmm0, 0x00(%rax) 2844238384Sjkim movdqa %xmm0, 0x10(%rax) 2845238384Sjkim lea 0x20(%rax), %rax 2846238384Sjkim cmp %rax, %rbp 2847238384Sjkim ja .Lxts_dec_bzero 2848238384Sjkim 2849238384Sjkim lea (%rbp),%rsp # restore %rsp 2850238384Sjkim___ 2851238384Sjkim$code.=<<___ if ($win64); 2852238384Sjkim movaps 0x40(%rbp), %xmm6 2853238384Sjkim movaps 0x50(%rbp), %xmm7 2854238384Sjkim movaps 0x60(%rbp), %xmm8 2855238384Sjkim movaps 0x70(%rbp), %xmm9 2856238384Sjkim movaps 0x80(%rbp), %xmm10 2857238384Sjkim movaps 0x90(%rbp), %xmm11 2858238384Sjkim movaps 0xa0(%rbp), %xmm12 2859238384Sjkim movaps 0xb0(%rbp), %xmm13 2860238384Sjkim movaps 0xc0(%rbp), %xmm14 2861238384Sjkim movaps 0xd0(%rbp), %xmm15 2862238384Sjkim lea 0xa0(%rbp), %rsp 2863238384Sjkim___ 2864238384Sjkim$code.=<<___; 
mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lxts_dec_epilogue:
	ret
.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
___
}

# Constant pool, emitted as one 64-byte-aligned object named _bsaes_const.
# Per the labels' own annotations it holds InvShiftRows/ShiftRows shuffle
# masks, bit-slice masks, a byte-swap mask, counter increment constants
# (.LADD1-.LADD8) and .Lxts_magic, which the XTS code loads as the mask
# that isolates "carry and residue" when doubling the tweak.  The
# remaining tables (.Lmasks, .LM0, .L63) are consumed outside this
# section of the file.
$code.=<<___;
.type	_bsaes_const,\@object
.align	64
_bsaes_const:
.LM0ISR:	# InvShiftRows constants
	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
.LISRM0:
	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
.LISR:
	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
.LBS0:		# bit-slice constants
	.quad	0x5555555555555555, 0x5555555555555555
.LBS1:
	.quad	0x3333333333333333, 0x3333333333333333
.LBS2:
	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.LSR:		# shiftrows constants
	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
.LM0SR:
	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
.LSWPUP:	# byte-swap upper dword
	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
.LSWPUPM0SR:
	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
.LADD1:		# counter increment constants
	.quad	0x0000000000000000, 0x0000000100000000
.LADD2:
	.quad	0x0000000000000000, 0x0000000200000000
.LADD3:
	.quad	0x0000000000000000, 0x0000000300000000
.LADD4:
	.quad	0x0000000000000000, 0x0000000400000000
.LADD5:
	.quad	0x0000000000000000, 0x0000000500000000
.LADD6:
	.quad	0x0000000000000000, 0x0000000600000000
.LADD7:
	.quad	0x0000000000000000, 0x0000000700000000
.LADD8:
	.quad	0x0000000000000000, 0x0000000800000000
.Lxts_magic:
	.long	0x87,0,1,0
.Lmasks:
	.quad	0x0101010101010101, 0x0101010101010101
	.quad	0x0202020202020202, 0x0202020202020202
	.quad	0x0404040404040404, 0x0404040404040404
	.quad	0x0808080808080808, 0x0808080808080808
.LM0:
	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
.L63:
	.quad	0x6363636363636363, 0x6363636363636363
.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
.align	64
.size	_bsaes_const,.-_bsaes_const
___

# Win64 structured-exception support.
#
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# se_handler is the single language-specific handler referenced by every
# .xdata record below.  HandlerData[0]/[1] are the RVAs of the owning
# routine's "body" and "epilogue" labels.  The handler rebuilds the
# caller's register state in *context according to where Rip falls:
#
#   Rip <  HandlerData[0]  - context->Rax is stored as the caller's Rsp
#                            (presumably the prologue has stashed the
#                            original %rsp in %rax by then; the prologues
#                            are outside this section - TODO confirm);
#   Rip >= HandlerData[1]  - context->Rsp is left as it is;
#   otherwise              - the frame is located through context->Rbp:
#                            ten %xmm registers are copied from
#                            0x40(%rbp) to context.Xmm6 onwards, then
#                            %rbp/%rbx/%r12-%r15 are reloaded from the
#                            area at 0xa0(%rbp), and Rsp becomes
#                            %rbp+0xa0+0x78 (mirrors the epilogues).
#
# Finally the patched context is copied to disp->ContextRecord,
# RtlVirtualUnwind is called, and ExceptionContinueSearch (1) is returned.
if ($win64) {
# Registers carrying the four handler arguments.  Only $context and
# $disp are interpolated into the assembly below.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	160($context),%rax	# pull context->Rbp

	lea	0x40(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0xa0(%rax),%rax		# adjust stack pointer

	mov	0x70(%rax),%rbp
	mov	0x68(%rax),%rbx
	mov	0x60(%rax),%r12
	mov	0x58(%rax),%r13
	mov	0x50(%rax),%r14
	mov	0x48(%rax),%r15
	lea	0x78(%rax),%rax		# adjust stack pointer
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	%rax,152($context)	# restore context->Rsp

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
___
# .pdata: one (begin RVA, end RVA, unwind-info RVA) triple per routine.
# The ECB entries are emitted only when the ECB routines are built ($ecb).
$code.=<<___ if ($ecb);
	.rva	.Lecb_enc_prologue
	.rva	.Lecb_enc_epilogue
	.rva	.Lecb_enc_info

	.rva	.Lecb_dec_prologue
	.rva	.Lecb_dec_epilogue
	.rva	.Lecb_dec_info
___
$code.=<<___;
	.rva	.Lcbc_dec_prologue
	.rva	.Lcbc_dec_epilogue
	.rva	.Lcbc_dec_info

	.rva	.Lctr_enc_prologue
	.rva	.Lctr_enc_epilogue
	.rva	.Lctr_enc_info

	.rva	.Lxts_enc_prologue
	.rva	.Lxts_enc_epilogue
	.rva	.Lxts_enc_info

	.rva	.Lxts_dec_prologue
	.rva	.Lxts_dec_epilogue
	.rva	.Lxts_dec_info

.section	.xdata
.align	8
___
# .xdata: one unwind-info record per routine.  Each record is a 4-byte
# header (first byte 9: version 1 with the exception-handler flag set,
# per the Win64 UNWIND_INFO layout; no unwind codes), the RVA of
# se_handler, and the two HandlerData[] RVAs (body label, epilogue label)
# that se_handler compares context->Rip against.
$code.=<<___ if ($ecb);
.Lecb_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
.Lecb_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
___
$code.=<<___;
.Lcbc_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
.Lctr_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
.Lxts_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
.Lxts_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
___
}

# Replace every `expr` in the generated text with the value of the Perl
# expression between the backticks (e.g. `0x10*$i` offsets, `1232/8`).
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

# Write errors on a buffered handle (and, if STDOUT is a pipe, a failing
# child process) are only reported by close(); die instead of silently
# leaving a truncated or missing assembly file behind.
close STDOUT or die "error closing STDOUT: $!";