#!/usr/bin/env perl

######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.

######################################################################
# September 2011.
#
# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
# doesn't handle partial vectors (doesn't have to if called from
# EVP only). "Drop-in" implies that this module doesn't share key
# schedule structure with the original, nor does it make assumptions
# about its alignment...
#
# Performance summary. The aes-586.pl column lists large-block CBC
# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
# byte processed with a 128-bit key, and the vpaes-x86.pl column -
# [also large-block CBC] encrypt/decrypt.
#
#		aes-586.pl		vpaes-x86.pl
#
# Core 2(**)	29.1/42.3/18.3		22.0/25.6(***)
# Nehalem	27.9/40.4/18.1		10.3/12.0
# Atom		102./119./60.1		64.5/85.3(***)
#
# (*)	"Hyper-threading" in this context refers to cache shared
#	among multiple cores rather than specifically to Intel HTT. As
#	the vast majority of contemporary cores share cache, the slower
#	code path is commonplace. In other words, the
#	"with-hyper-threading-off" results are presented mostly for
#	reference purposes.
#
# (**)	"Core 2" refers to the initial 65nm design, a.k.a. Conroe.
#
# (***)	The less impressive improvement on Core 2 and Atom is due to
#	slow pshufb, yet it's a respectable +32%/65% improvement on
#	Core 2 and +58%/40% on Atom (as implied, over the
#	"hyper-threading-safe" code path).
#
#						<appro@openssl.org>

# Make x86asm.pl loadable from this script's directory or from
# ../../perlasm relative to it.  $dir stays empty when $0 carries no
# directory component.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# First argument is handed to asm_init unchanged; $x86only is set when
# the last command-line argument is "386".
&asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");

$PREFIX="vpaes";

# Symbolic names for the general-purpose registers used throughout.
my  ($round, $base, $magic, $key, $const, $inp, $out)=
    ("eax",  "ebx", "ecx",  "edx","ebp",  "esi","edi");

	&rodataseg();
&static_label("_vpaes_consts");
&static_label("_vpaes_schedule_low_round");

##
##  Constant tables.
##
##  Every $k_* value below is a byte offset relative to
##  _vpaes_consts+0x30: the public entry points load the address of
##  _vpaes_consts into $const and then add 0x30, so the first table
##  (inv) sits at -0x30 and the input transform at offset 0.
##
&set_label("_vpaes_consts",64);
$k_inv=-0x30;		# inv, inva
	&data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
	&data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);

$k_s0F=-0x10;		# s0F
	&data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);

$k_ipt=0x00;		# input transform (lo, hi)
	&data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
	&data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);

$k_sb1=0x20;		# sb1u, sb1t
	&data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
	&data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
$k_sb2=0x40;		# sb2u, sb2t
	&data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
	&data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
$k_sbo=0x60;		# sbou, sbot
	&data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
	&data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);

$k_mc_forward=0x80;	# mc_forward
	&data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
	&data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
	&data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
	&data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);

$k_mc_backward=0xc0;	# mc_backward
	&data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
	&data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
	&data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
	&data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);

$k_sr=0x100;		# sr
	&data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
	&data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
	&data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
	&data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);

$k_rcon=0x140;		# rcon
	&data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);

$k_s63=0x150;		# s63: all equal to 0x63 transformed
	&data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);

$k_opt=0x160;		# output transform
	&data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
	&data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);

$k_deskew=0x180;	# deskew tables: inverts the sbox's "skew"
	&data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
	&data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
##
##  Decryption stuff
##  Key schedule constants
##
$k_dksd=0x1a0;		# decryption key schedule: invskew x*D
	&data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
	&data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
$k_dksb=0x1c0;		# decryption key schedule: invskew x*B
	&data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
	&data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
$k_dkse=0x1e0;		# decryption key schedule: invskew x*E + 0x63
	&data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
	&data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
$k_dks9=0x200;		# decryption key schedule: invskew x*9
	&data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
	&data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);

##
##  Decryption stuff
##  Round function constants
##
$k_dipt=0x220;		# decryption input transform
	&data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
	&data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);

$k_dsb9=0x240;		# decryption sbox output *9*u, *9*t
	&data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
	&data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
$k_dsbd=0x260;		# decryption sbox output *D*u, *D*t
	&data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
	&data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
$k_dsbb=0x280;		# decryption sbox output *B*u, *B*t
	&data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
	&data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
$k_dsbe=0x2a0;		# decryption sbox output *E*u, *E*t
	&data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
	&data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
$k_dsbo=0x2c0;		# decryption sbox final output
	&data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
	&data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
	&previous();

##
##  _vpaes_preheat
##
##  Loads the two constants that the encrypt/decrypt cores expect to
##  stay resident: %xmm7 = inv table, %xmm6 = 0x0F nibble mask.
##  Expects $const to already point at _vpaes_consts+0x30.
##
&function_begin_B("_vpaes_preheat");
	&movdqa	("xmm7",&QWP($k_inv,$const));
	&movdqa	("xmm6",&QWP($k_s0F,$const));
	&ret	();
&function_end_B("_vpaes_preheat");

##
##  _aes_encrypt_core
##
##  AES-encrypt %xmm0.
##
##  Inputs:
##     %xmm0 = input
##     %xmm6-%xmm7 as in _vpaes_preheat
##    (%edx) = scheduled keys
##
##  Output in %xmm0
##  Clobbers  %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
##
##  The loop condition (jnz at the bottom of enc_entry) consumes the
##  flags left by "sub $round,1" at the end of enc_loop; only SSE
##  moves/shuffles/xors sit in between.  On the first pass, entered
##  via jmp, the most recent flag-setting instruction is "add $key,16".
##
&function_begin_B("_vpaes_encrypt_core");
	&mov	($magic,16);
	&mov	($round,&DWP(240,$key));	# number of rounds
	&movdqa	("xmm1","xmm6");
	&movdqa	("xmm2",&QWP($k_ipt,$const));
	&pandn	("xmm1","xmm0");
	&movdqu	("xmm5",&QWP(0,$key));		# round 0 key
	&psrld	("xmm1",4);
	&pand	("xmm0","xmm6");
	&pshufb	("xmm2","xmm0");
	&movdqa	("xmm0",&QWP($k_ipt+16,$const));
	&pshufb	("xmm0","xmm1");
	&pxor	("xmm2","xmm5");
	&pxor	("xmm0","xmm2");
	&add	($key,16);
	# $base addresses mc_backward; mc_forward is at -0x40 and sr at
	# +0x40 relative to it (see the offsets in the constant tables).
	&lea	($base,&DWP($k_mc_backward,$const));
	&jmp	(&label("enc_entry"));


&set_label("enc_loop",16);
	# middle of middle round
	&movdqa	("xmm4",&QWP($k_sb1,$const));	# 4 : sb1u
	&pshufb	("xmm4","xmm2");		# 4 = sb1u
	&pxor	("xmm4","xmm5");		# 4 = sb1u + k
	&movdqa	("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
	&pshufb	("xmm0","xmm3");		# 0 = sb1t
	&pxor	("xmm0","xmm4");		# 0 = A
	&movdqa	("xmm5",&QWP($k_sb2,$const));	# 5 : sb2u
	&pshufb	("xmm5","xmm2");		# 5 = sb2u
	&movdqa	("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
	&movdqa	("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
	&pshufb	("xmm2","xmm3");		# 2 = sb2t
	&pxor	("xmm2","xmm5");		# 2 = 2A
	&movdqa	("xmm4",&QWP(0,$base,$magic));	# .Lk_mc_backward[]
	&movdqa	("xmm3","xmm0");		# 3 = A
	&pshufb	("xmm0","xmm1");		# 0 = B
	&add	($key,16);			# next key
	&pxor	("xmm0","xmm2");		# 0 = 2A+B
	&pshufb	("xmm3","xmm4");		# 3 = D
	&add	($magic,16);			# next mc
	&pxor	("xmm3","xmm0");		# 3 = 2A+B+D
	&pshufb	("xmm0","xmm1");		# 0 = 2B+C
	&and	($magic,0x30);			# ... mod 4
	&pxor	("xmm0","xmm3");		# 0 = 2A+3B+C+D
	&sub	($round,1);			# nr--

&set_label("enc_entry");
	# top of round
	&movdqa	("xmm1","xmm6");		# 1 : i
	&pandn	("xmm1","xmm0");		# 1 = i<<4
	&psrld	("xmm1",4);			# 1 = i
	&pand	("xmm0","xmm6");		# 0 = k
	&movdqa	("xmm5",&QWP($k_inv+16,$const));# 5 : a/k
	&pshufb	("xmm5","xmm0");		# 5 = a/k
	&pxor	("xmm0","xmm1");		# 0 = j
	&movdqa	("xmm3","xmm7");		# 3 : 1/i
	&pshufb	("xmm3","xmm1");		# 3 = 1/i
	&pxor	("xmm3","xmm5");		# 3 = iak = 1/i + a/k
	&movdqa	("xmm4","xmm7");		# 4 : 1/j
	&pshufb	("xmm4","xmm0");		# 4 = 1/j
	&pxor	("xmm4","xmm5");		# 4 = jak = 1/j + a/k
	&movdqa	("xmm2","xmm7");		# 2 : 1/iak
	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
	&pxor	("xmm2","xmm0");		# 2 = io
	&movdqa	("xmm3","xmm7");		# 3 : 1/jak
	&movdqu	("xmm5",&QWP(0,$key));		# 5 = current round key
	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
	&pxor	("xmm3","xmm1");		# 3 = jo
	&jnz	(&label("enc_loop"));

	# middle of last round
	&movdqa	("xmm4",&QWP($k_sbo,$const));	# 4 : sbou	.Lk_sbo
	&movdqa	("xmm0",&QWP($k_sbo+16,$const));# 0 : sbot	.Lk_sbo+16
	&pshufb	("xmm4","xmm2");		# 4 = sbou
	&pxor	("xmm4","xmm5");		# 4 = sbou + k
	&pshufb	("xmm0","xmm3");		# 0 = sbot
	&movdqa	("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
	&pxor	("xmm0","xmm4");		# 0 = A
	&pshufb	("xmm0","xmm1");
	&ret	();
&function_end_B("_vpaes_encrypt_core");

##
##  Decryption core
##
##  Same API as encryption core.
##
##  $base is pointed at the dsbd table, so the other decryption sbox
##  tables are reached with small displacements from it:
##  -0x20 = dsb9, 0 = dsbd, 0x20 = dsbb, 0x40 = dsbe, 0x60 = dsbo.
##  $magic ends up holding the address of the sr[] entry used for the
##  final permutation.  As in the encryption core, the jnz at the
##  bottom of dec_entry consumes flags set by the "sub $round,1" in
##  dec_loop (or by "add $key,16" on the first pass).
##
&function_begin_B("_vpaes_decrypt_core");
	&mov	($round,&DWP(240,$key));	# number of rounds
	&lea	($base,&DWP($k_dsbd,$const));
	&movdqa	("xmm1","xmm6");
	&movdqa	("xmm2",&QWP($k_dipt-$k_dsbd,$base));
	&pandn	("xmm1","xmm0");
	&mov	($magic,$round);
	&psrld	("xmm1",4);
	&movdqu	("xmm5",&QWP(0,$key));		# round 0 key
	&shl	($magic,4);
	&pand	("xmm0","xmm6");
	&pshufb	("xmm2","xmm0");
	&movdqa	("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
	&xor	($magic,0x30);
	&pshufb	("xmm0","xmm1");
	&and	($magic,0x30);
	&pxor	("xmm2","xmm5");
	&movdqa	("xmm5",&QWP($k_mc_forward+48,$const));
	&pxor	("xmm0","xmm2");
	&add	($key,16);
	&lea	($magic,&DWP($k_sr-$k_dsbd,$base,$magic));
	&jmp	(&label("dec_entry"));

&set_label("dec_loop",16);
##
##  Inverse mix columns
##
	&movdqa	("xmm4",&QWP(-0x20,$base));	# 4 : sb9u
	&pshufb	("xmm4","xmm2");		# 4 = sb9u
	&pxor	("xmm4","xmm0");		# 4 = sb9u + k (xmm0 holds round key)
	&movdqa	("xmm0",&QWP(-0x10,$base));	# 0 : sb9t
	&pshufb	("xmm0","xmm3");		# 0 = sb9t
	&pxor	("xmm0","xmm4");		# 0 = ch
	&add	($key,16);			# next round key

	&pshufb	("xmm0","xmm5");		# MC ch
	&movdqa	("xmm4",&QWP(0,$base));		# 4 : sbdu
	&pshufb	("xmm4","xmm2");		# 4 = sbdu
	&pxor	("xmm4","xmm0");		# 4 = ch
	&movdqa	("xmm0",&QWP(0x10,$base));	# 0 : sbdt
	&pshufb	("xmm0","xmm3");		# 0 = sbdt
	&pxor	("xmm0","xmm4");		# 0 = ch
	&sub	($round,1);			# nr--

	&pshufb	("xmm0","xmm5");		# MC ch
	&movdqa	("xmm4",&QWP(0x20,$base));	# 4 : sbbu
	&pshufb	("xmm4","xmm2");		# 4 = sbbu
	&pxor	("xmm4","xmm0");		# 4 = ch
	&movdqa	("xmm0",&QWP(0x30,$base));	# 0 : sbbt
	&pshufb	("xmm0","xmm3");		# 0 = sbbt
	&pxor	("xmm0","xmm4");		# 0 = ch

	&pshufb	("xmm0","xmm5");		# MC ch
	&movdqa	("xmm4",&QWP(0x40,$base));	# 4 : sbeu
	&pshufb	("xmm4","xmm2");		# 4 = sbeu
	&pxor	("xmm4","xmm0");		# 4 = ch
	&movdqa	("xmm0",&QWP(0x50,$base));	# 0 : sbet
	&pshufb	("xmm0","xmm3");		# 0 = sbet
	&pxor	("xmm0","xmm4");		# 0 = ch

	&palignr("xmm5","xmm5",12);		# rotate the mix-columns shuffle mask

&set_label("dec_entry");
	# top of round
	&movdqa	("xmm1","xmm6");		# 1 : i
	&pandn	("xmm1","xmm0");		# 1 = i<<4
	&psrld	("xmm1",4);			# 1 = i
	&pand	("xmm0","xmm6");		# 0 = k
	&movdqa	("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
	&pshufb	("xmm2","xmm0");		# 2 = a/k
	&pxor	("xmm0","xmm1");		# 0 = j
	&movdqa	("xmm3","xmm7");		# 3 : 1/i
	&pshufb	("xmm3","xmm1");		# 3 = 1/i
	&pxor	("xmm3","xmm2");		# 3 = iak = 1/i + a/k
	&movdqa	("xmm4","xmm7");		# 4 : 1/j
	&pshufb	("xmm4","xmm0");		# 4 = 1/j
	&pxor	("xmm4","xmm2");		# 4 = jak = 1/j + a/k
	&movdqa	("xmm2","xmm7");		# 2 : 1/iak
	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
	&pxor	("xmm2","xmm0");		# 2 = io
	&movdqa	("xmm3","xmm7");		# 3 : 1/jak
	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
	&pxor	("xmm3","xmm1");		# 3 = jo
	&movdqu	("xmm0",&QWP(0,$key));		# 0 = current round key
	&jnz	(&label("dec_loop"));

	# middle of last round
	&movdqa	("xmm4",&QWP(0x60,$base));	# 4 : sbou
	&pshufb	("xmm4","xmm2");		# 4 = sbou
	&pxor	("xmm4","xmm0");		# 4 = sbou + k
	&movdqa	("xmm0",&QWP(0x70,$base));	# 0 : sbot
	&movdqa	("xmm2",&QWP(0,$magic));	# 2 : sr[] entry selected on entry
	&pshufb	("xmm0","xmm3");		# 0 = sbot
	&pxor	("xmm0","xmm4");		# 0 = A
	&pshufb	("xmm0","xmm2");
	&ret	();
&function_end_B("_vpaes_decrypt_core");

########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
##
##  _vpaes_schedule_core
##
##  Register inputs, as set up by the set_encrypt_key/set_decrypt_key
##  entry points:
##    $inp   = user key
##    $round = key length in bits (compared against 192 below)
##    $key   = where the next round key is written
##    $out   = direction flag: zero when encrypting, non-zero when
##             decrypting
##    $magic = offset into the sr[] table (kept mod 0x40 in steps of 16)
##    $const = _vpaes_consts+0x30
##
##  The rcon vector (xmm8 in the original comments) is kept in a
##  16-byte stack slot, addressed as 4(%esp) here and as 8(%esp) from
##  _vpaes_schedule_round, which runs one call level deeper.  A second
##  slot, 20(%esp), temporarily holds %xmm7 in the 256-bit schedule.
##
&function_begin_B("_vpaes_schedule_core");
	&movdqu	("xmm0",&QWP(0,$inp));		# load key (unaligned)
	&movdqa	("xmm2",&QWP($k_rcon,$const));	# load rcon

	# input transform
	&movdqa	("xmm3","xmm0");
	&lea	($base,&DWP($k_ipt,$const));
	&movdqa	(&QWP(4,"esp"),"xmm2");		# xmm8
	&call	("_vpaes_schedule_transform");
	&movdqa	("xmm7","xmm0");

	&test	($out,$out);
	&jnz	(&label("schedule_am_decrypting"));

	# encrypting, output zeroth round key after transform
	&movdqu	(&QWP(0,$key),"xmm0");
	&jmp	(&label("schedule_go"));

&set_label("schedule_am_decrypting");
	# decrypting, output zeroth round key after shiftrows
	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));
	&pshufb	("xmm3","xmm1");
	&movdqu	(&QWP(0,$key),"xmm3");
	&xor	($magic,0x30);

&set_label("schedule_go");
	&cmp	($round,192);
	&ja	(&label("schedule_256"));
	&je	(&label("schedule_192"));
	# 128: fall through

##
##  .schedule_128
##
##  128-bit specific part of key schedule.
##
##  This schedule is really simple, because all its parts
##  are accomplished by the subroutines.
##
&set_label("schedule_128");
	&mov	($round,10);

&set_label("loop_schedule_128");
	&call	("_vpaes_schedule_round");
	&dec	($round);
	&jz	(&label("schedule_mangle_last"));
	&call	("_vpaes_schedule_mangle");	# write output
	&jmp	(&label("loop_schedule_128"));

##
##  .aes_schedule_192
##
##  192-bit specific part of key schedule.
##
##  The main body of this schedule is the same as the 128-bit
##  schedule, but with more smearing.  The long, high side is
##  stored in %xmm7 as before, and the short, low side is in
##  the high bits of %xmm6.
##
##  This schedule is somewhat nastier, however, because each
##  round produces 192 bits of key material, or 1.5 round keys.
##  Therefore, on each cycle we do 2 rounds and produce 3 round
##  keys.
##
&set_label("schedule_192",16);
	&movdqu	("xmm0",&QWP(8,$inp));		# load key part 2 (very unaligned)
	&call	("_vpaes_schedule_transform");	# input transform
	&movdqa	("xmm6","xmm0");		# save short part
	&pxor	("xmm4","xmm4");		# clear 4
	&movhlps("xmm6","xmm4");		# clobber low side with zeros
	&mov	($round,4);

&set_label("loop_schedule_192");
	&call	("_vpaes_schedule_round");
	&palignr("xmm0","xmm6",8);
	&call	("_vpaes_schedule_mangle");	# save key n
	&call	("_vpaes_schedule_192_smear");
	&call	("_vpaes_schedule_mangle");	# save key n+1
	&call	("_vpaes_schedule_round");
	&dec	($round);
	&jz	(&label("schedule_mangle_last"));
	&call	("_vpaes_schedule_mangle");	# save key n+2
	&call	("_vpaes_schedule_192_smear");
	&jmp	(&label("loop_schedule_192"));

##
##  .aes_schedule_256
##
##  256-bit specific part of key schedule.
##
##  The structure here is very similar to the 128-bit
##  schedule, but with an additional "low side" in
##  %xmm6.  The low side's rounds are the same as the
##  high side's, except no rcon and no rotation.
##
&set_label("schedule_256",16);
	&movdqu	("xmm0",&QWP(16,$inp));		# load key part 2 (unaligned)
	&call	("_vpaes_schedule_transform");	# input transform
	&mov	($round,7);

&set_label("loop_schedule_256");
	&call	("_vpaes_schedule_mangle");	# output low result
	&movdqa	("xmm6","xmm0");		# save cur_lo in xmm6

	# high round
	&call	("_vpaes_schedule_round");
	&dec	($round);
	&jz	(&label("schedule_mangle_last"));
	&call	("_vpaes_schedule_mangle");

	# low round. swap xmm7 and xmm6
	&pshufd	("xmm0","xmm0",0xFF);
	&movdqa	(&QWP(20,"esp"),"xmm7");	# park xmm7 in the second stack slot
	&movdqa	("xmm7","xmm6");
	&call	("_vpaes_schedule_low_round");
	&movdqa	("xmm7",&QWP(20,"esp"));	# and bring it back

	&jmp	(&label("loop_schedule_256"));

##
##  .aes_schedule_mangle_last
##
##  Mangler for last round of key schedule
##  Mangles %xmm0
##    when encrypting, outputs out(%xmm0) ^ 63
##    when decrypting, outputs unskew(%xmm0)
##
##  Always called right before return... jumps to cleanup and exits
##
&set_label("schedule_mangle_last",16);
	# schedule last round key from xmm0
	&lea	($base,&DWP($k_deskew,$const));
	&test	($out,$out);
	&jnz	(&label("schedule_mangle_last_dec"));

	# encrypting
	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));
	&pshufb	("xmm0","xmm1");		# output permute
	&lea	($base,&DWP($k_opt,$const));	# prepare to output transform
	&add	($key,32);			# net +16 after the -16 below

&set_label("schedule_mangle_last_dec");
	&add	($key,-16);
	&pxor	("xmm0",&QWP($k_s63,$const));
	&call	("_vpaes_schedule_transform");	# output transform
	&movdqu	(&QWP(0,$key),"xmm0");		# save last key

	# cleanup: wipe key material from all eight xmm registers
	&pxor	("xmm0","xmm0");
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&pxor	("xmm5","xmm5");
	&pxor	("xmm6","xmm6");
	&pxor	("xmm7","xmm7");
	&ret	();
&function_end_B("_vpaes_schedule_core");

##
##  .aes_schedule_192_smear
##
##  Smear the short, low side in the 192-bit key schedule.
##
##  Inputs:
##    %xmm7: high side, b  a  x  y
##    %xmm6:  low side, d  c  0  0
##
##  Outputs:
##    %xmm6: b+c+d  b+c  0  0
##    %xmm0: b+c+d  b+c  b  a
##
##  Clobbers %xmm1, which is zeroed locally to clear the low half of
##  %xmm6 (older comments referred to a zero held in %xmm13, a register
##  this 32-bit code does not use).
##
&function_begin_B("_vpaes_schedule_192_smear");
	&pshufd	("xmm0","xmm6",0x80);		# d c 0 0 -> c 0 0 0
	&pxor	("xmm6","xmm0");		# -> c+d c 0 0
	&pshufd	("xmm0","xmm7",0xFE);		# b a _ _ -> b b b a
	&pxor	("xmm6","xmm0");		# -> b+c+d b+c b a
	&movdqa	("xmm0","xmm6");
	&pxor	("xmm1","xmm1");
	&movhlps("xmm6","xmm1");		# clobber low side with zeros
	&ret	();
&function_end_B("_vpaes_schedule_192_smear");

##
##  .aes_schedule_round
##
##  Runs one main round of the key schedule on %xmm0, %xmm7
##
##  Specifically, runs subbytes on the high dword of %xmm0
##  then rotates it by one byte and xors into the low dword of
##  %xmm7.
##
##  Adds rcon from the low byte of the rcon vector, then rotates that
##  vector for the next rcon.  The vector is kept in the stack slot at
##  8(%esp) (marked "xmm8" in the comments below).
##
##  Smears the dwords of %xmm7 by xoring the low into the
##  second low, result into third, result into highest.
##
##  Returns results in %xmm7 = %xmm0.
##  Clobbers %xmm1-%xmm5.
##
##  _vpaes_schedule_low_round is a second entry point into the same
##  body that skips the rcon and rotation steps.
##
&function_begin_B("_vpaes_schedule_round");
	# extract rcon from xmm8
	&movdqa	("xmm2",&QWP(8,"esp"));		# xmm8
	&pxor	("xmm1","xmm1");
	&palignr("xmm1","xmm2",15);
	&palignr("xmm2","xmm2",15);
	&pxor	("xmm7","xmm1");

	# rotate
	&pshufd	("xmm0","xmm0",0xFF);
	&palignr("xmm0","xmm0",1);

	# fall through...
	&movdqa	(&QWP(8,"esp"),"xmm2");		# xmm8

	# low round: same as high round, but no rotation and no rcon.
&set_label("_vpaes_schedule_low_round");
	# smear xmm7
	&movdqa	("xmm1","xmm7");
	&pslldq	("xmm7",4);
	&pxor	("xmm7","xmm1");
	&movdqa	("xmm1","xmm7");
	&pslldq	("xmm7",8);
	&pxor	("xmm7","xmm1");
	&pxor	("xmm7",&QWP($k_s63,$const));

	# subbyte
	&movdqa	("xmm4",&QWP($k_s0F,$const));	# 4 : nibble mask
	&movdqa	("xmm5",&QWP($k_inv,$const));	# 5 : inv table (1/x lookups)
	&movdqa	("xmm1","xmm4");
	&pandn	("xmm1","xmm0");
	&psrld	("xmm1",4);			# 1 = i
	&pand	("xmm0","xmm4");		# 0 = k
	&movdqa	("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
	&pshufb	("xmm2","xmm0");		# 2 = a/k
	&pxor	("xmm0","xmm1");		# 0 = j
	&movdqa	("xmm3","xmm5");		# 3 : 1/i
	&pshufb	("xmm3","xmm1");		# 3 = 1/i
	&pxor	("xmm3","xmm2");		# 3 = iak = 1/i + a/k
	&movdqa	("xmm4","xmm5");		# 4 : 1/j
	&pshufb	("xmm4","xmm0");		# 4 = 1/j
	&pxor	("xmm4","xmm2");		# 4 = jak = 1/j + a/k
	&movdqa	("xmm2","xmm5");		# 2 : 1/iak
	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
	&pxor	("xmm2","xmm0");		# 2 = io
	&movdqa	("xmm3","xmm5");		# 3 : 1/jak
	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
	&pxor	("xmm3","xmm1");		# 3 = jo
	&movdqa	("xmm4",&QWP($k_sb1,$const));	# 4 : sb1u
	&pshufb	("xmm4","xmm2");		# 4 = sb1u
	&movdqa	("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
	&pshufb	("xmm0","xmm3");		# 0 = sb1t
	&pxor	("xmm0","xmm4");		# 0 = sbox output

	# add in smeared stuff
	&pxor	("xmm0","xmm7");
	&movdqa	("xmm7","xmm0");
	&ret	();
&function_end_B("_vpaes_schedule_round");

##
##  .aes_schedule_transform
##
##  Linear-transform %xmm0 according to tables at (%ebx)
##
##  The low-nibble table is read from 0(%ebx) and the high-nibble
##  table from 16(%ebx).
##
##  Output in %xmm0
##  Clobbers %xmm1, %xmm2
##
&function_begin_B("_vpaes_schedule_transform");
	&movdqa	("xmm2",&QWP($k_s0F,$const));
	&movdqa	("xmm1","xmm2");
	&pandn	("xmm1","xmm0");
	&psrld	("xmm1",4);			# 1 = high nibbles
	&pand	("xmm0","xmm2");		# 0 = low nibbles
	&movdqa	("xmm2",&QWP(0,$base));
	&pshufb	("xmm2","xmm0");
	&movdqa	("xmm0",&QWP(16,$base));
	&pshufb	("xmm0","xmm1");
	&pxor	("xmm0","xmm2");
	&ret	();
&function_end_B("_vpaes_schedule_transform");

##
##  .aes_schedule_mangle
##
##  Mangle xmm0 from (basis-transformed) standard version
##  to our version.
##
##  On encrypt,
##    xor with 0x63
##    multiply by circulant 0,1,1,1
##    apply shiftrows transform
##
##  On decrypt,
##    xor with 0x63
##    multiply by "inverse mixcolumns" circulant E,B,D,9
##    deskew
##    apply shiftrows transform
##
##
##  Writes out to (%edx), and increments or decrements it
##  Keeps track of round number mod 4 in %ecx
##  Preserves xmm0
##  Clobbers xmm1-xmm5
##  On the decrypt path %esi is also overwritten (it is used as the
##  base pointer for the dks* tables).
##
&function_begin_B("_vpaes_schedule_mangle");
	&movdqa	("xmm4","xmm0");	# save xmm0 for later
	&movdqa	("xmm5",&QWP($k_mc_forward,$const));
	&test	($out,$out);
	&jnz	(&label("schedule_mangle_dec"));

	# encrypting
	&add	($key,16);
	&pxor	("xmm4",&QWP($k_s63,$const));
	&pshufb	("xmm4","xmm5");
	&movdqa	("xmm3","xmm4");
	&pshufb	("xmm4","xmm5");
	&pxor	("xmm3","xmm4");
	&pshufb	("xmm4","xmm5");
	&pxor	("xmm3","xmm4");

	&jmp	(&label("schedule_mangle_both"));

&set_label("schedule_mangle_dec",16);
	# inverse mix columns
	&movdqa	("xmm2",&QWP($k_s0F,$const));
	&lea	($inp,&DWP($k_dksd,$const));	# dksd, dksb, dkse, dks9 follow at 0x20 steps
	&movdqa	("xmm1","xmm2");
	&pandn	("xmm1","xmm4");
	&psrld	("xmm1",4);			# 1 = hi
	&pand	("xmm4","xmm2");		# 4 = lo

	&movdqa	("xmm2",&QWP(0,$inp));
	&pshufb	("xmm2","xmm4");
	&movdqa	("xmm3",&QWP(0x10,$inp));
	&pshufb	("xmm3","xmm1");
	&pxor	("xmm3","xmm2");
	&pshufb	("xmm3","xmm5");

	&movdqa	("xmm2",&QWP(0x20,$inp));
	&pshufb	("xmm2","xmm4");
	&pxor	("xmm2","xmm3");
	&movdqa	("xmm3",&QWP(0x30,$inp));
	&pshufb	("xmm3","xmm1");
	&pxor	("xmm3","xmm2");
	&pshufb	("xmm3","xmm5");

	&movdqa	("xmm2",&QWP(0x40,$inp));
	&pshufb	("xmm2","xmm4");
	&pxor	("xmm2","xmm3");
	&movdqa	("xmm3",&QWP(0x50,$inp));
	&pshufb	("xmm3","xmm1");
	&pxor	("xmm3","xmm2");
	&pshufb	("xmm3","xmm5");

	&movdqa	("xmm2",&QWP(0x60,$inp));
	&pshufb	("xmm2","xmm4");
	&pxor	("xmm2","xmm3");
	&movdqa	("xmm3",&QWP(0x70,$inp));
	&pshufb	("xmm3","xmm1");
	&pxor	("xmm3","xmm2");

	&add	($key,-16);

&set_label("schedule_mangle_both");
	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));
	&pshufb	("xmm3","xmm1");
	&add	($magic,-16);
	&and	($magic,0x30);
	&movdqu	(&QWP(0,$key),"xmm3");
	&ret	();
&function_end_B("_vpaes_schedule_mangle");

#
# Interface to OpenSSL
#
# Every entry point below carves out a 16-byte-aligned scratch frame
# under the current stack pointer ("alloca") and stores the original
# %esp at 48(%esp), from where it is reloaded before returning.
# $const is loaded with the address of _vpaes_consts and then advanced
# by 0x30, which is the origin all $k_* table offsets are relative to.
#

#
# ${PREFIX}_set_encrypt_key(inp, bits, key)
#
# Stores bits/32+5 at offset 240 of key, runs the key schedule in the
# encrypt direction ($out = 0) and returns 0 in %eax.
#
&function_begin("${PREFIX}_set_encrypt_key");
	&mov	($inp,&wparam(0));		# inp
	&lea	($base,&DWP(-56,"esp"));
	&mov	($round,&wparam(1));		# bits
	&and	($base,-16);
	&mov	($key,&wparam(2));		# key
	&xchg	($base,"esp");			# alloca
	&mov	(&DWP(48,"esp"),$base);

	&mov	($base,$round);
	&shr	($base,5);
	&add	($base,5);
	&mov	(&DWP(240,$key),$base);		# AES_KEY->rounds = nbits/32+5;
	&mov	($magic,0x30);
	&mov	($out,0);

	&picsetup($const);
	&picsymbol($const, &label("_vpaes_consts"), $const);
	&lea	($const,&DWP(0x30,$const));

	&call	("_vpaes_schedule_core");

	&mov	("esp",&DWP(48,"esp"));
	&xor	("eax","eax");
&function_end("${PREFIX}_set_encrypt_key");

#
# ${PREFIX}_set_decrypt_key(inp, bits, key)
#
# Same as above but in the decrypt direction ($out = 1).  $key is
# advanced to 16+rounds*16 bytes past the start of the schedule before
# the core is called; the decrypt path of the mangle routine steps it
# downwards by 16 for every round key it stores.  Returns 0 in %eax.
#
&function_begin("${PREFIX}_set_decrypt_key");
	&mov	($inp,&wparam(0));		# inp
	&lea	($base,&DWP(-56,"esp"));
	&mov	($round,&wparam(1));		# bits
	&and	($base,-16);
	&mov	($key,&wparam(2));		# key
	&xchg	($base,"esp");			# alloca
	&mov	(&DWP(48,"esp"),$base);

	&mov	($base,$round);
	&shr	($base,5);
	&add	($base,5);
	&mov	(&DWP(240,$key),$base);		# AES_KEY->rounds = nbits/32+5;
	&shl	($base,4);
	&lea	($key,&DWP(16,$key,$base));

	&mov	($out,1);
	&mov	($magic,$round);
	&shr	($magic,1);
	&and	($magic,32);
	&xor	($magic,32);			# nbits==192?0:32;

	&picsetup($const);
	&picsymbol($const, &label("_vpaes_consts"), $const);
	&lea	($const,&DWP(0x30,$const));

	&call	("_vpaes_schedule_core");

	&mov	("esp",&DWP(48,"esp"));
	&xor	("eax","eax");
&function_end("${PREFIX}_set_decrypt_key");

#
# ${PREFIX}_encrypt(inp, out, key)
#
# Encrypts the 16 bytes at inp with the schedule at key and writes the
# 16-byte result to out.  Both buffers are accessed with unaligned
# loads/stores.
#
&function_begin("${PREFIX}_encrypt");
	&picsetup($const);
	&picsymbol($const, &label("_vpaes_consts"), $const);
	&lea	($const,&DWP(0x30,$const));

	&call	("_vpaes_preheat");
	&mov	($inp,&wparam(0));		# inp
	&lea	($base,&DWP(-56,"esp"));
	&mov	($out,&wparam(1));		# out
	&and	($base,-16);
	&mov	($key,&wparam(2));		# key
	&xchg	($base,"esp");			# alloca
	&mov	(&DWP(48,"esp"),$base);

	&movdqu	("xmm0",&QWP(0,$inp));
	&call	("_vpaes_encrypt_core");
	&movdqu	(&QWP(0,$out),"xmm0");

	&mov	("esp",&DWP(48,"esp"));
&function_end("${PREFIX}_encrypt");

#
# ${PREFIX}_decrypt(inp, out, key)
#
# Decrypts the 16 bytes at inp with the schedule at key and writes the
# 16-byte result to out.
#
&function_begin("${PREFIX}_decrypt");
	&picsetup($const);
	&picsymbol($const, &label("_vpaes_consts"), $const);
	&lea	($const,&DWP(0x30,$const));

	&call	("_vpaes_preheat");
	&mov	($inp,&wparam(0));		# inp
	&lea	($base,&DWP(-56,"esp"));
	&mov	($out,&wparam(1));		# out
	&and	($base,-16);
	&mov	($key,&wparam(2));		# key
	&xchg	($base,"esp");			# alloca
	&mov	(&DWP(48,"esp"),$base);

	&movdqu	("xmm0",&QWP(0,$inp));
	&call	("_vpaes_decrypt_core");
	&movdqu	(&QWP(0,$out),"xmm0");

	&mov	("esp",&DWP(48,"esp"));
&function_end("${PREFIX}_decrypt");

#
# ${PREFIX}_cbc_encrypt(inp, out, len, key, ivp, enc)
#
# CBC-processes whole 16-byte blocks only: if len < 16 nothing is done,
# and a trailing partial block is left untouched.  enc == 0 selects
# decryption, anything else encryption.  The chaining value after the
# last processed block is written back to ivp.
#
# Scratch frame layout:
#    0(%esp)  out - inp, so the output address is this value plus $inp
#    4(%esp)  key (the cores advance $key, so it is reloaded per block)
#    8(%esp)  ivp
#   16(%esp)  current IV    (decrypt only)
#   32(%esp)  next IV       (decrypt only; copy of the ciphertext block)
#   48(%esp)  caller's %esp
#
&function_begin("${PREFIX}_cbc_encrypt");
	&mov	($inp,&wparam(0));		# inp
	&mov	($out,&wparam(1));		# out
	&mov	($round,&wparam(2));		# len
	&mov	($key,&wparam(3));		# key
	&sub	($round,16);
	&jc	(&label("cbc_abort"));		# len < 16: nothing to do
	&lea	($base,&DWP(-56,"esp"));
	&mov	($const,&wparam(4));		# ivp
	&and	($base,-16);
	&mov	($magic,&wparam(5));		# enc
	&xchg	($base,"esp");			# alloca
	&movdqu	("xmm1",&QWP(0,$const));	# load IV
	&sub	($out,$inp);
	&mov	(&DWP(48,"esp"),$base);

	&mov	(&DWP(0,"esp"),$out);		# save out
	&mov	(&DWP(4,"esp"),$key);		# save key
	&mov	(&DWP(8,"esp"),$const);		# save ivp
	&mov	($out,$round);			# $out works as $len

	&picsetup($const);
	&picsymbol($const, &label("_vpaes_consts"), $const);
	&lea	($const,&DWP(0x30,$const));

	&call	("_vpaes_preheat");
	&cmp	($magic,0);
	&je	(&label("cbc_dec_loop"));
	&jmp	(&label("cbc_enc_loop"));

&set_label("cbc_enc_loop",16);
	&movdqu	("xmm0",&QWP(0,$inp));		# load input
	&pxor	("xmm0","xmm1");		# inp^=iv
	&call	("_vpaes_encrypt_core");
	&mov	($base,&DWP(0,"esp"));		# restore out
	&mov	($key,&DWP(4,"esp"));		# restore key
	&movdqa	("xmm1","xmm0");		# ciphertext is the next IV
	&movdqu	(&QWP(0,$base,$inp),"xmm0");	# write output
	&lea	($inp,&DWP(16,$inp));
	&sub	($out,16);
	&jnc	(&label("cbc_enc_loop"));
	&jmp	(&label("cbc_done"));

&set_label("cbc_dec_loop",16);
	&movdqu	("xmm0",&QWP(0,$inp));		# load input
	&movdqa	(&QWP(16,"esp"),"xmm1");	# save IV
	&movdqa	(&QWP(32,"esp"),"xmm0");	# save future IV
	&call	("_vpaes_decrypt_core");
	&mov	($base,&DWP(0,"esp"));		# restore out
	&mov	($key,&DWP(4,"esp"));		# restore key
	&pxor	("xmm0",&QWP(16,"esp"));	# out^=iv
	&movdqa	("xmm1",&QWP(32,"esp"));	# load next IV
	&movdqu	(&QWP(0,$base,$inp),"xmm0");	# write output
	&lea	($inp,&DWP(16,$inp));
	&sub	($out,16);
	&jnc	(&label("cbc_dec_loop"));

&set_label("cbc_done");
	&mov	($base,&DWP(8,"esp"));		# restore ivp
	&mov	("esp",&DWP(48,"esp"));
	&movdqu	(&QWP(0,$base),"xmm1");		# write IV
&set_label("cbc_abort");
&function_end("${PREFIX}_cbc_encrypt");

&asm_finish();