# vpaes-ppc.pl revision 289848
#!/usr/bin/env perl

######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.

# CBC encrypt/decrypt performance in cycles per byte processed with
# 128-bit key.
#
#		aes-ppc.pl		this
# G4e		35.5/52.1/(23.8)	11.9(*)/15.4
# POWER6	42.7/54.3/(28.2)	63.0/92.8(**)
# POWER7	32.3/42.9/(18.4)	18.5/23.3
#
# (*)	This is ~10% worse than reported in paper. The reason is
#	twofold. This module doesn't make any assumption about
#	key schedule (or data for that matter) alignment and handles
#	it in-line. Secondly it, being transliterated from
#	vpaes-x86_64.pl, relies on "nested inversion" better suited
#	for Intel CPUs.
# (**)	Inadequate POWER6 performance is due to astronomic AltiVec
#	latency, 9 cycles per simple logical operation.
# Command line: vpaes-ppc.pl <flavour> [<output file>]
# <flavour> selects the PPC ABI variant (e.g. linux32/linux64/aix64);
# it chooses pointer size, link-register save offset and the
# corresponding load/store/compare mnemonics used throughout.
$flavour = shift;

if ($flavour =~ /64/) {
	$SIZE_T	=8;			# size of a pointer/size_t
	$LRSAVE	=2*$SIZE_T;		# LR save slot offset in caller's frame
	$STU	="stdu";		# store-with-update (frame allocation)
	$POP	="ld";
	$PUSH	="std";
	$UCMP	="cmpld";		# unsigned compare, doubleword
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
	$UCMP	="cmplw";		# unsigned compare, word
} else { die "nonsense $flavour"; }

$sp="r1";
$FRAME=6*$SIZE_T+13*16;	# 13*16 is for v20-v31 offload

# Locate the ppc-xlate.pl assembler translator next to this script or
# in the sibling perlasm directory; it rewrites the portable mnemonics
# (and the ?inv/?rev/?asis endianness markers below) for the target.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Pipe all generated code through the translator.  NOTE: this must use
# low-precedence "or die" -- the original "|| die" bound to the string
# expression (which is always true), so a failed open was never caught.
open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";

$code.=<<___;
.machine	"any"

.text

.align	7		# totally strategic alignment
_vpaes_consts:
Lk_mc_forward:	# mc_forward
	.long	0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c	?inv
	.long	0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300	?inv
	.long	0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704	?inv
	.long	0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08	?inv
Lk_mc_backward:	# mc_backward
	.long	0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e	?inv
	.long	0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a	?inv
	.long	0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506	?inv
	.long	0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102	?inv
Lk_sr:		# sr
	.long	0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f	?inv
	.long	0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b	?inv
	.long	0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07	?inv
	.long	0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603	?inv

##
## "Hot" constants
##
Lk_inv:		# inv, inva
	.long	0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704	?rev
	.long	0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03	?rev
Lk_ipt:		# input transform (lo, hi)
	.long	0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca	?rev
	.long	0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd	?rev
Lk_sbo:		# sbou, sbot
	.long	0x00c7bd6f, 0x176dd2d0, 0x78a802c5,
0x7abfaa15 ?rev 92 .long 0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e ?rev 93Lk_sb1: # sb1u, sb1t 94 .long 0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b ?rev 95 .long 0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5 ?rev 96Lk_sb2: # sb2u, sb2t 97 .long 0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2 ?rev 98 .long 0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e ?rev 99 100## 101## Decryption stuff 102## 103Lk_dipt: # decryption input transform 104 .long 0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15 ?rev 105 .long 0x00650560, 0xe683e386, 0x94f191f4, 0x72177712 ?rev 106Lk_dsbo: # decryption sbox final output 107 .long 0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7 ?rev 108 .long 0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca ?rev 109Lk_dsb9: # decryption sbox output *9*u, *9*t 110 .long 0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca ?rev 111 .long 0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72 ?rev 112Lk_dsbd: # decryption sbox output *D*u, *D*t 113 .long 0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5 ?rev 114 .long 0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129 ?rev 115Lk_dsbb: # decryption sbox output *B*u, *B*t 116 .long 0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660 ?rev 117 .long 0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3 ?rev 118Lk_dsbe: # decryption sbox output *E*u, *E*t 119 .long 0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222 ?rev 120 .long 0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794 ?rev 121 122## 123## Key schedule constants 124## 125Lk_dksd: # decryption key schedule: invskew x*D 126 .long 0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007 ?rev 127 .long 0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f ?rev 128Lk_dksb: # decryption key schedule: invskew x*B 129 .long 0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603 ?rev 130 .long 0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9 ?rev 131Lk_dkse: # decryption key schedule: invskew x*E + 0x63 132 .long 0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553 ?rev 133 .long 0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd 
?rev 134Lk_dks9: # decryption key schedule: invskew x*9 135 .long 0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a ?rev 136 .long 0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b ?rev 137 138Lk_rcon: # rcon 139 .long 0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70 ?asis 140Lk_s63: 141 .long 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b ?asis 142 143Lk_opt: # output transform 144 .long 0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7 ?rev 145 .long 0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1 ?rev 146Lk_deskew: # deskew tables: inverts the sbox's "skew" 147 .long 0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d ?rev 148 .long 0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128 ?rev 149.align 5 150Lconsts: 151 mflr r0 152 bcl 20,31,\$+4 153 mflr r12 #vvvvv "distance between . and _vpaes_consts 154 addi r12,r12,-0x308 155 mtlr r0 156 blr 157 .long 0 158 .byte 0,12,0x14,0,0,0,0,0 159.asciz "Vector Permutation AES for AltiVec, Mike Hamburg (Stanford University)" 160.align 6 161___ 162 163my ($inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm) = map("v$_",(26..31)); 164{ 165my ($inp,$out,$key) = map("r$_",(3..5)); 166 167my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_",(10..15)); 168my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_",(16..19)); 169my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_",(16..23)); 170 171$code.=<<___; 172## 173## _aes_preheat 174## 175## Fills register %r10 -> .aes_consts (so you can -fPIC) 176## and %xmm9-%xmm15 as specified below. 
177## 178.align 4 179_vpaes_encrypt_preheat: 180 mflr r8 181 bl Lconsts 182 mtlr r8 183 li r11, 0xc0 # Lk_inv 184 li r10, 0xd0 185 li r9, 0xe0 # Lk_ipt 186 li r8, 0xf0 187 vxor v7, v7, v7 # 0x00..00 188 vspltisb v8,4 # 0x04..04 189 vspltisb v9,0x0f # 0x0f..0f 190 lvx $invlo, r12, r11 191 li r11, 0x100 192 lvx $invhi, r12, r10 193 li r10, 0x110 194 lvx $iptlo, r12, r9 195 li r9, 0x120 196 lvx $ipthi, r12, r8 197 li r8, 0x130 198 lvx $sbou, r12, r11 199 li r11, 0x140 200 lvx $sbot, r12, r10 201 li r10, 0x150 202 lvx $sb1u, r12, r9 203 lvx $sb1t, r12, r8 204 lvx $sb2u, r12, r11 205 lvx $sb2t, r12, r10 206 blr 207 .long 0 208 .byte 0,12,0x14,0,0,0,0,0 209 210## 211## _aes_encrypt_core 212## 213## AES-encrypt %xmm0. 214## 215## Inputs: 216## %xmm0 = input 217## %xmm9-%xmm15 as in _vpaes_preheat 218## (%rdx) = scheduled keys 219## 220## Output in %xmm0 221## Clobbers %xmm1-%xmm6, %r9, %r10, %r11, %rax 222## 223## 224.align 5 225_vpaes_encrypt_core: 226 lwz r8, 240($key) # pull rounds 227 li r9, 16 228 lvx v5, 0, $key # vmovdqu (%r9), %xmm5 # round0 key 229 li r11, 0x10 230 lvx v6, r9, $key 231 addi r9, r9, 16 232 ?vperm v5, v5, v6, $keyperm # align round key 233 addi r10, r11, 0x40 234 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 235 vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm1 236 vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm3, %xmm2 237 vxor v0, v0, v5 # vpxor %xmm5, %xmm1, %xmm0 238 vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0 239 mtctr r8 240 b Lenc_entry 241 242.align 4 243Lenc_loop: 244 # middle of middle round 245 vperm v4, $sb1t, v7, v2 # vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u 246 lvx v1, r12, r11 # vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] 247 addi r11, r11, 16 248 vperm v0, $sb1u, v7, v3 # vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t 249 vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k 250 andi. r11, r11, 0x30 # and \$0x30, %r11 # ... 
mod 4 251 vperm v5, $sb2t, v7, v2 # vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u 252 vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A 253 vperm v2, $sb2u, v7, v3 # vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t 254 lvx v4, r12, r10 # vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] 255 addi r10, r11, 0x40 256 vperm v3, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm3 # 0 = B 257 vxor v2, v2, v5 # vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A 258 vperm v0, v0, v7, v4 # vpshufb %xmm4, %xmm0, %xmm0 # 3 = D 259 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B 260 vperm v4, v3, v7, v1 # vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C 261 vxor v0, v0, v3 # vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D 262 vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D 263 264Lenc_entry: 265 # top of round 266 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i 267 vperm v5, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k 268 vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j 269 vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 270 vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 271 vand v0, v0, v9 272 vxor v3, v3, v5 # vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 273 vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 274 vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak 275 vmr v5, v6 276 lvx v6, r9, $key # vmovdqu (%r9), %xmm5 277 vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak 278 addi r9, r9, 16 279 vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io 280 ?vperm v5, v5, v6, $keyperm # align round key 281 vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo 282 bdnz Lenc_loop 283 284 # middle of last round 285 addi r10, r11, 0x80 286 # vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo 287 # vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 288 vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou 289 lvx v1, r12, r10 # vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] 290 vperm v0, $sbot, v7, v3 # vpshufb 
%xmm3, %xmm0, %xmm0 # 0 = sb1t 291 vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k 292 vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A 293 vperm v0, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm0 294 blr 295 .long 0 296 .byte 0,12,0x14,0,0,0,0,0 297 298.globl .vpaes_encrypt 299.align 5 300.vpaes_encrypt: 301 $STU $sp,-$FRAME($sp) 302 li r10,`15+6*$SIZE_T` 303 li r11,`31+6*$SIZE_T` 304 mflr r6 305 mfspr r7, 256 # save vrsave 306 stvx v20,r10,$sp 307 addi r10,r10,32 308 stvx v21,r11,$sp 309 addi r11,r11,32 310 stvx v22,r10,$sp 311 addi r10,r10,32 312 stvx v23,r11,$sp 313 addi r11,r11,32 314 stvx v24,r10,$sp 315 addi r10,r10,32 316 stvx v25,r11,$sp 317 addi r11,r11,32 318 stvx v26,r10,$sp 319 addi r10,r10,32 320 stvx v27,r11,$sp 321 addi r11,r11,32 322 stvx v28,r10,$sp 323 addi r10,r10,32 324 stvx v29,r11,$sp 325 addi r11,r11,32 326 stvx v30,r10,$sp 327 stvx v31,r11,$sp 328 stw r7,`$FRAME-4`($sp) # save vrsave 329 li r0, -1 330 $PUSH r6,`$FRAME+$LRSAVE`($sp) 331 mtspr 256, r0 # preserve all AltiVec registers 332 333 bl _vpaes_encrypt_preheat 334 335 ?lvsl $inpperm, 0, $inp # prepare for unaligned access 336 lvx v0, 0, $inp 337 addi $inp, $inp, 15 # 15 is not a typo 338 ?lvsr $outperm, 0, $out 339 ?lvsl $keyperm, 0, $key # prepare for unaligned access 340 vnor $outmask, v7, v7 # 0xff..ff 341 lvx $inptail, 0, $inp # redundant in aligned case 342 ?vperm $outmask, v7, $outmask, $outperm 343 lvx $outhead, 0, $out 344 ?vperm v0, v0, $inptail, $inpperm 345 346 bl _vpaes_encrypt_core 347 348 vperm v0, v0, v0, $outperm # rotate right/left 349 vsel v1, $outhead, v0, $outmask 350 vmr $outhead, v0 351 stvx v1, 0, $out 352 addi $out, $out, 15 # 15 is not a typo 353 ######## 354 355 lvx v1, 0, $out # redundant in aligned case 356 vsel v1, $outhead, v1, $outmask 357 stvx v1, 0, $out 358 359 li r10,`15+6*$SIZE_T` 360 li r11,`31+6*$SIZE_T` 361 mtlr r6 362 mtspr 256, r7 # restore vrsave 363 lvx v20,r10,$sp 364 addi r10,r10,32 365 lvx v21,r11,$sp 366 addi r11,r11,32 367 lvx 
v22,r10,$sp 368 addi r10,r10,32 369 lvx v23,r11,$sp 370 addi r11,r11,32 371 lvx v24,r10,$sp 372 addi r10,r10,32 373 lvx v25,r11,$sp 374 addi r11,r11,32 375 lvx v26,r10,$sp 376 addi r10,r10,32 377 lvx v27,r11,$sp 378 addi r11,r11,32 379 lvx v28,r10,$sp 380 addi r10,r10,32 381 lvx v29,r11,$sp 382 addi r11,r11,32 383 lvx v30,r10,$sp 384 lvx v31,r11,$sp 385 addi $sp,$sp,$FRAME 386 blr 387 .long 0 388 .byte 0,12,0x04,1,0x80,0,3,0 389 .long 0 390.size .vpaes_encrypt,.-.vpaes_encrypt 391 392.align 4 393_vpaes_decrypt_preheat: 394 mflr r8 395 bl Lconsts 396 mtlr r8 397 li r11, 0xc0 # Lk_inv 398 li r10, 0xd0 399 li r9, 0x160 # Ldipt 400 li r8, 0x170 401 vxor v7, v7, v7 # 0x00..00 402 vspltisb v8,4 # 0x04..04 403 vspltisb v9,0x0f # 0x0f..0f 404 lvx $invlo, r12, r11 405 li r11, 0x180 406 lvx $invhi, r12, r10 407 li r10, 0x190 408 lvx $iptlo, r12, r9 409 li r9, 0x1a0 410 lvx $ipthi, r12, r8 411 li r8, 0x1b0 412 lvx $sbou, r12, r11 413 li r11, 0x1c0 414 lvx $sbot, r12, r10 415 li r10, 0x1d0 416 lvx $sb9u, r12, r9 417 li r9, 0x1e0 418 lvx $sb9t, r12, r8 419 li r8, 0x1f0 420 lvx $sbdu, r12, r11 421 li r11, 0x200 422 lvx $sbdt, r12, r10 423 li r10, 0x210 424 lvx $sbbu, r12, r9 425 lvx $sbbt, r12, r8 426 lvx $sbeu, r12, r11 427 lvx $sbet, r12, r10 428 blr 429 .long 0 430 .byte 0,12,0x14,0,0,0,0,0 431 432## 433## Decryption core 434## 435## Same API as encryption core. 
436## 437.align 4 438_vpaes_decrypt_core: 439 lwz r8, 240($key) # pull rounds 440 li r9, 16 441 lvx v5, 0, $key # vmovdqu (%r9), %xmm4 # round0 key 442 li r11, 0x30 443 lvx v6, r9, $key 444 addi r9, r9, 16 445 ?vperm v5, v5, v6, $keyperm # align round key 446 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 447 vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2 448 vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm1, %xmm0 449 vxor v0, v0, v5 # vpxor %xmm4, %xmm2, %xmm2 450 vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0 451 mtctr r8 452 b Ldec_entry 453 454.align 4 455Ldec_loop: 456# 457# Inverse mix columns 458# 459 lvx v0, r12, r11 # v5 and v0 are flipped 460 # vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u 461 # vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t 462 vperm v4, $sb9u, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u 463 subi r11, r11, 16 464 vperm v1, $sb9t, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t 465 andi. r11, r11, 0x30 466 vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 467 # vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu 468 vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 469 # vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt 470 471 vperm v4, $sbdu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu 472 vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch 473 vperm v1, $sbdt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt 474 vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 475 # vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu 476 vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 477 # vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt 478 479 vperm v4, $sbbu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu 480 vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch 481 vperm v1, $sbbt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt 482 vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 483 # vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu 484 vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 485 # vmovdqa 0x50(%r10), %xmm1 # 0 : sbet 486 487 vperm v4, $sbeu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 
4 = sbeu 488 vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch 489 vperm v1, $sbet, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet 490 vxor v0, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 491 vxor v0, v0, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 492 493Ldec_entry: 494 # top of round 495 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i 496 vperm v2, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k 497 vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j 498 vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 499 vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 500 vand v0, v0, v9 501 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 502 vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 503 vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak 504 vmr v5, v6 505 lvx v6, r9, $key # vmovdqu (%r9), %xmm0 506 vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak 507 addi r9, r9, 16 508 vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io 509 ?vperm v5, v5, v6, $keyperm # align round key 510 vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo 511 bdnz Ldec_loop 512 513 # middle of last round 514 addi r10, r11, 0x80 515 # vmovdqa 0x60(%r10), %xmm4 # 3 : sbou 516 vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou 517 # vmovdqa 0x70(%r10), %xmm1 # 0 : sbot 518 lvx v2, r12, r10 # vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 519 vperm v1, $sbot, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t 520 vxor v4, v4, v5 # vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k 521 vxor v0, v1, v4 # vpxor %xmm4, %xmm1, %xmm0 # 0 = A 522 vperm v0, v0, v7, v2 # vpshufb %xmm2, %xmm0, %xmm0 523 blr 524 .long 0 525 .byte 0,12,0x14,0,0,0,0,0 526 527.globl .vpaes_decrypt 528.align 5 529.vpaes_decrypt: 530 $STU $sp,-$FRAME($sp) 531 li r10,`15+6*$SIZE_T` 532 li r11,`31+6*$SIZE_T` 533 mflr r6 534 mfspr r7, 256 # save vrsave 535 stvx v20,r10,$sp 536 addi r10,r10,32 537 
stvx v21,r11,$sp 538 addi r11,r11,32 539 stvx v22,r10,$sp 540 addi r10,r10,32 541 stvx v23,r11,$sp 542 addi r11,r11,32 543 stvx v24,r10,$sp 544 addi r10,r10,32 545 stvx v25,r11,$sp 546 addi r11,r11,32 547 stvx v26,r10,$sp 548 addi r10,r10,32 549 stvx v27,r11,$sp 550 addi r11,r11,32 551 stvx v28,r10,$sp 552 addi r10,r10,32 553 stvx v29,r11,$sp 554 addi r11,r11,32 555 stvx v30,r10,$sp 556 stvx v31,r11,$sp 557 stw r7,`$FRAME-4`($sp) # save vrsave 558 li r0, -1 559 $PUSH r6,`$FRAME+$LRSAVE`($sp) 560 mtspr 256, r0 # preserve all AltiVec registers 561 562 bl _vpaes_decrypt_preheat 563 564 ?lvsl $inpperm, 0, $inp # prepare for unaligned access 565 lvx v0, 0, $inp 566 addi $inp, $inp, 15 # 15 is not a typo 567 ?lvsr $outperm, 0, $out 568 ?lvsl $keyperm, 0, $key 569 vnor $outmask, v7, v7 # 0xff..ff 570 lvx $inptail, 0, $inp # redundant in aligned case 571 ?vperm $outmask, v7, $outmask, $outperm 572 lvx $outhead, 0, $out 573 ?vperm v0, v0, $inptail, $inpperm 574 575 bl _vpaes_decrypt_core 576 577 vperm v0, v0, v0, $outperm # rotate right/left 578 vsel v1, $outhead, v0, $outmask 579 vmr $outhead, v0 580 stvx v1, 0, $out 581 addi $out, $out, 15 # 15 is not a typo 582 ######## 583 584 lvx v1, 0, $out # redundant in aligned case 585 vsel v1, $outhead, v1, $outmask 586 stvx v1, 0, $out 587 588 li r10,`15+6*$SIZE_T` 589 li r11,`31+6*$SIZE_T` 590 mtlr r6 591 mtspr 256, r7 # restore vrsave 592 lvx v20,r10,$sp 593 addi r10,r10,32 594 lvx v21,r11,$sp 595 addi r11,r11,32 596 lvx v22,r10,$sp 597 addi r10,r10,32 598 lvx v23,r11,$sp 599 addi r11,r11,32 600 lvx v24,r10,$sp 601 addi r10,r10,32 602 lvx v25,r11,$sp 603 addi r11,r11,32 604 lvx v26,r10,$sp 605 addi r10,r10,32 606 lvx v27,r11,$sp 607 addi r11,r11,32 608 lvx v28,r10,$sp 609 addi r10,r10,32 610 lvx v29,r11,$sp 611 addi r11,r11,32 612 lvx v30,r10,$sp 613 lvx v31,r11,$sp 614 addi $sp,$sp,$FRAME 615 blr 616 .long 0 617 .byte 0,12,0x04,1,0x80,0,3,0 618 .long 0 619.size .vpaes_decrypt,.-.vpaes_decrypt 620 621.globl .vpaes_cbc_encrypt 
622.align 5 623.vpaes_cbc_encrypt: 624 ${UCMP}i r5,16 625 bltlr- 626 627 $STU $sp,-`($FRAME+2*$SIZE_T)`($sp) 628 mflr r0 629 li r10,`15+6*$SIZE_T` 630 li r11,`31+6*$SIZE_T` 631 mfspr r12, 256 632 stvx v20,r10,$sp 633 addi r10,r10,32 634 stvx v21,r11,$sp 635 addi r11,r11,32 636 stvx v22,r10,$sp 637 addi r10,r10,32 638 stvx v23,r11,$sp 639 addi r11,r11,32 640 stvx v24,r10,$sp 641 addi r10,r10,32 642 stvx v25,r11,$sp 643 addi r11,r11,32 644 stvx v26,r10,$sp 645 addi r10,r10,32 646 stvx v27,r11,$sp 647 addi r11,r11,32 648 stvx v28,r10,$sp 649 addi r10,r10,32 650 stvx v29,r11,$sp 651 addi r11,r11,32 652 stvx v30,r10,$sp 653 stvx v31,r11,$sp 654 stw r12,`$FRAME-4`($sp) # save vrsave 655 $PUSH r30,`$FRAME+$SIZE_T*0`($sp) 656 $PUSH r31,`$FRAME+$SIZE_T*1`($sp) 657 li r9, -16 658 $PUSH r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp) 659 660 and r30, r5, r9 # copy length&-16 661 mr r5, r6 # copy pointer to key 662 mr r31, r7 # copy pointer to iv 663 blt Lcbc_abort 664 cmpwi r8, 0 # test direction 665 li r6, -1 666 mr r7, r12 # copy vrsave 667 mtspr 256, r6 # preserve all AltiVec registers 668 669 lvx v24, 0, r31 # load [potentially unaligned] iv 670 li r9, 15 671 ?lvsl $inpperm, 0, r31 672 lvx v25, r9, r31 673 ?vperm v24, v24, v25, $inpperm 674 675 neg r8, $inp # prepare for unaligned access 676 vxor v7, v7, v7 677 ?lvsl $keyperm, 0, $key 678 ?lvsr $outperm, 0, $out 679 ?lvsr $inpperm, 0, r8 # -$inp 680 vnor $outmask, v7, v7 # 0xff..ff 681 lvx $inptail, 0, $inp 682 ?vperm $outmask, v7, $outmask, $outperm 683 addi $inp, $inp, 15 # 15 is not a typo 684 lvx $outhead, 0, $out 685 686 beq Lcbc_decrypt 687 688 bl _vpaes_encrypt_preheat 689 li r0, 16 690 691Lcbc_enc_loop: 692 vmr v0, $inptail 693 lvx $inptail, 0, $inp 694 addi $inp, $inp, 16 695 ?vperm v0, v0, $inptail, $inpperm 696 vxor v0, v0, v24 # ^= iv 697 698 bl _vpaes_encrypt_core 699 700 vmr v24, v0 # put aside iv 701 sub. 
r30, r30, r0 # len -= 16 702 vperm v0, v0, v0, $outperm # rotate right/left 703 vsel v1, $outhead, v0, $outmask 704 vmr $outhead, v0 705 stvx v1, 0, $out 706 addi $out, $out, 16 707 bne Lcbc_enc_loop 708 709 b Lcbc_done 710 711.align 5 712Lcbc_decrypt: 713 bl _vpaes_decrypt_preheat 714 li r0, 16 715 716Lcbc_dec_loop: 717 vmr v0, $inptail 718 lvx $inptail, 0, $inp 719 addi $inp, $inp, 16 720 ?vperm v0, v0, $inptail, $inpperm 721 vmr v25, v0 # put aside input 722 723 bl _vpaes_decrypt_core 724 725 vxor v0, v0, v24 # ^= iv 726 vmr v24, v25 727 sub. r30, r30, r0 # len -= 16 728 vperm v0, v0, v0, $outperm # rotate right/left 729 vsel v1, $outhead, v0, $outmask 730 vmr $outhead, v0 731 stvx v1, 0, $out 732 addi $out, $out, 16 733 bne Lcbc_dec_loop 734 735Lcbc_done: 736 addi $out, $out, -1 737 lvx v1, 0, $out # redundant in aligned case 738 vsel v1, $outhead, v1, $outmask 739 stvx v1, 0, $out 740 741 neg r8, r31 # write [potentially unaligned] iv 742 ?lvsl $outperm, 0, r8 743 li r6, 15 744 vnor $outmask, v7, v7 # 0xff..ff 745 ?vperm $outmask, v7, $outmask, $outperm 746 lvx $outhead, 0, r31 747 vperm v24, v24, v24, $outperm # rotate right/left 748 vsel v0, $outhead, v24, $outmask 749 lvx v1, r6, r31 750 stvx v0, 0, r31 751 vsel v1, v24, v1, $outmask 752 stvx v1, r6, r31 753 754 mtspr 256, r7 # restore vrsave 755 li r10,`15+6*$SIZE_T` 756 li r11,`31+6*$SIZE_T` 757 lvx v20,r10,$sp 758 addi r10,r10,32 759 lvx v21,r11,$sp 760 addi r11,r11,32 761 lvx v22,r10,$sp 762 addi r10,r10,32 763 lvx v23,r11,$sp 764 addi r11,r11,32 765 lvx v24,r10,$sp 766 addi r10,r10,32 767 lvx v25,r11,$sp 768 addi r11,r11,32 769 lvx v26,r10,$sp 770 addi r10,r10,32 771 lvx v27,r11,$sp 772 addi r11,r11,32 773 lvx v28,r10,$sp 774 addi r10,r10,32 775 lvx v29,r11,$sp 776 addi r11,r11,32 777 lvx v30,r10,$sp 778 lvx v31,r11,$sp 779Lcbc_abort: 780 $POP r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp) 781 $POP r30,`$FRAME+$SIZE_T*0`($sp) 782 $POP r31,`$FRAME+$SIZE_T*1`($sp) 783 mtlr r0 784 addi $sp,$sp,`$FRAME+$SIZE_T*2` 785 
blr 786 .long 0 787 .byte 0,12,0x04,1,0x80,2,6,0 788 .long 0 789.size .vpaes_cbc_encrypt,.-.vpaes_cbc_encrypt 790___ 791} 792{ 793my ($inp,$bits,$out)=map("r$_",(3..5)); 794my $dir="cr1"; 795my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_",(10..13,24)); 796 797$code.=<<___; 798######################################################## 799## ## 800## AES key schedule ## 801## ## 802######################################################## 803.align 4 804_vpaes_key_preheat: 805 mflr r8 806 bl Lconsts 807 mtlr r8 808 li r11, 0xc0 # Lk_inv 809 li r10, 0xd0 810 li r9, 0xe0 # L_ipt 811 li r8, 0xf0 812 813 vspltisb v8,4 # 0x04..04 814 vxor v9,v9,v9 # 0x00..00 815 lvx $invlo, r12, r11 # Lk_inv 816 li r11, 0x120 817 lvx $invhi, r12, r10 818 li r10, 0x130 819 lvx $iptlo, r12, r9 # Lk_ipt 820 li r9, 0x220 821 lvx $ipthi, r12, r8 822 li r8, 0x230 823 824 lvx v14, r12, r11 # Lk_sb1 825 li r11, 0x240 826 lvx v15, r12, r10 827 li r10, 0x250 828 829 lvx v16, r12, r9 # Lk_dksd 830 li r9, 0x260 831 lvx v17, r12, r8 832 li r8, 0x270 833 lvx v18, r12, r11 # Lk_dksb 834 li r11, 0x280 835 lvx v19, r12, r10 836 li r10, 0x290 837 lvx v20, r12, r9 # Lk_dkse 838 li r9, 0x2a0 839 lvx v21, r12, r8 840 li r8, 0x2b0 841 lvx v22, r12, r11 # Lk_dks9 842 lvx v23, r12, r10 843 844 lvx v24, r12, r9 # Lk_rcon 845 lvx v25, 0, r12 # Lk_mc_forward[0] 846 lvx v26, r12, r8 # Lks63 847 blr 848 .long 0 849 .byte 0,12,0x14,0,0,0,0,0 850 851.align 4 852_vpaes_schedule_core: 853 mflr r7 854 855 bl _vpaes_key_preheat # load the tables 856 857 #lvx v0, 0, $inp # vmovdqu (%rdi), %xmm0 # load key (unaligned) 858 neg r8, $inp # prepare for unaligned access 859 lvx v0, 0, $inp 860 addi $inp, $inp, 15 # 15 is not typo 861 ?lvsr $inpperm, 0, r8 # -$inp 862 lvx v6, 0, $inp # v6 serves as inptail 863 addi $inp, $inp, 8 864 ?vperm v0, v0, v6, $inpperm 865 866 # input transform 867 vmr v3, v0 # vmovdqa %xmm0, %xmm3 868 bl _vpaes_schedule_transform 869 vmr v7, v0 # vmovdqa %xmm0, %xmm7 870 871 bne $dir, 
Lschedule_am_decrypting 872 873 # encrypting, output zeroth round key after transform 874 li r8, 0x30 # mov \$0x30,%r8d 875 addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10 876 877 ?lvsr $outperm, 0, $out # prepare for unaligned access 878 vnor $outmask, v9, v9 # 0xff..ff 879 lvx $outhead, 0, $out 880 ?vperm $outmask, v9, $outmask, $outperm 881 882 #stvx v0, 0, $out # vmovdqu %xmm0, (%rdx) 883 vperm v1, v0, v0, $outperm # rotate right/left 884 vsel v2, $outhead, v1, $outmask 885 vmr $outhead, v1 886 stvx v2, 0, $out 887 b Lschedule_go 888 889Lschedule_am_decrypting: 890 srwi r8, $bits, 1 # shr \$1,%r8d 891 andi. r8, r8, 32 # and \$32,%r8d 892 xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32 893 addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10 894 # decrypting, output zeroth round key after shiftrows 895 lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1 896 vperm v4, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3 897 898 neg r0, $out # prepare for unaligned access 899 ?lvsl $outperm, 0, r0 900 addi $out, $out, 15 # 15 is not typo 901 vnor $outmask, v9, v9 # 0xff..ff 902 lvx $outhead, 0, $out 903 ?vperm $outmask, $outmask, v9, $outperm 904 905 #stvx v4, 0, $out # vmovdqu %xmm3, (%rdx) 906 vperm v4, v4, v4, $outperm # rotate right/left 907 vsel v2, $outhead, v4, $outmask 908 vmr $outhead, v4 909 stvx v2, 0, $out 910 xori r8, r8, 0x30 # xor \$0x30, %r8 911 912Lschedule_go: 913 cmplwi $bits, 192 # cmp \$192, %esi 914 bgt Lschedule_256 915 beq Lschedule_192 916 # 128: fall though 917 918## 919## .schedule_128 920## 921## 128-bit specific part of key schedule. 922## 923## This schedule is really simple, because all its parts 924## are accomplished by the subroutines. 925## 926Lschedule_128: 927 li r0, 10 # mov \$10, %esi 928 mtctr r0 929 930Loop_schedule_128: 931 bl _vpaes_schedule_round 932 bdz Lschedule_mangle_last # dec %esi 933 bl _vpaes_schedule_mangle # write output 934 b Loop_schedule_128 935 936## 937## .aes_schedule_192 938## 939## 192-bit specific part of key schedule. 
940## 941## The main body of this schedule is the same as the 128-bit 942## schedule, but with more smearing. The long, high side is 943## stored in %xmm7 as before, and the short, low side is in 944## the high bits of %xmm6. 945## 946## This schedule is somewhat nastier, however, because each 947## round produces 192 bits of key material, or 1.5 round keys. 948## Therefore, on each cycle we do 2 rounds and produce 3 round 949## keys. 950## 951.align 4 952Lschedule_192: 953 li r0, 4 # mov \$4, %esi 954 lvx v0, 0, $inp 955 ?vperm v0, v6, v0, $inpperm 956 ?vsldoi v0, v3, v0, 8 # vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) 957 bl _vpaes_schedule_transform # input transform 958 ?vsldoi v6, v0, v9, 8 959 ?vsldoi v6, v9, v6, 8 # clobber "low" side with zeros 960 mtctr r0 961 962Loop_schedule_192: 963 bl _vpaes_schedule_round 964 ?vsldoi v0, v6, v0, 8 # vpalignr \$8,%xmm6,%xmm0,%xmm0 965 bl _vpaes_schedule_mangle # save key n 966 bl _vpaes_schedule_192_smear 967 bl _vpaes_schedule_mangle # save key n+1 968 bl _vpaes_schedule_round 969 bdz Lschedule_mangle_last # dec %esi 970 bl _vpaes_schedule_mangle # save key n+2 971 bl _vpaes_schedule_192_smear 972 b Loop_schedule_192 973 974## 975## .aes_schedule_256 976## 977## 256-bit specific part of key schedule. 978## 979## The structure here is very similar to the 128-bit 980## schedule, but with an additional "low side" in 981## %xmm6. The low side's rounds are the same as the 982## high side's, except no rcon and no rotation. 
983## 984.align 4 985Lschedule_256: 986 li r0, 7 # mov \$7, %esi 987 addi $inp, $inp, 8 988 lvx v0, 0, $inp # vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) 989 ?vperm v0, v6, v0, $inpperm 990 bl _vpaes_schedule_transform # input transform 991 mtctr r0 992 993Loop_schedule_256: 994 bl _vpaes_schedule_mangle # output low result 995 vmr v6, v0 # vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 996 997 # high round 998 bl _vpaes_schedule_round 999 bdz Lschedule_mangle_last # dec %esi 1000 bl _vpaes_schedule_mangle 1001 1002 # low round. swap xmm7 and xmm6 1003 ?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0 1004 vmr v5, v7 # vmovdqa %xmm7, %xmm5 1005 vmr v7, v6 # vmovdqa %xmm6, %xmm7 1006 bl _vpaes_schedule_low_round 1007 vmr v7, v5 # vmovdqa %xmm5, %xmm7 1008 1009 b Loop_schedule_256 1010## 1011## .aes_schedule_mangle_last 1012## 1013## Mangler for last round of key schedule 1014## Mangles %xmm0 1015## when encrypting, outputs out(%xmm0) ^ 63 1016## when decrypting, outputs unskew(%xmm0) 1017## 1018## Always called right before return... 
jumps to cleanup and exits 1019## 1020.align 4 1021Lschedule_mangle_last: 1022 # schedule last round key from xmm0 1023 li r11, 0x2e0 # lea .Lk_deskew(%rip),%r11 1024 li r9, 0x2f0 1025 bne $dir, Lschedule_mangle_last_dec 1026 1027 # encrypting 1028 lvx v1, r8, r10 # vmovdqa (%r8,%r10),%xmm1 1029 li r11, 0x2c0 # lea .Lk_opt(%rip), %r11 # prepare to output transform 1030 li r9, 0x2d0 # prepare to output transform 1031 vperm v0, v0, v0, v1 # vpshufb %xmm1, %xmm0, %xmm0 # output permute 1032 1033 lvx $iptlo, r11, r12 # reload $ipt 1034 lvx $ipthi, r9, r12 1035 addi $out, $out, 16 # add \$16, %rdx 1036 vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0 1037 bl _vpaes_schedule_transform # output transform 1038 1039 #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key 1040 vperm v0, v0, v0, $outperm # rotate right/left 1041 vsel v2, $outhead, v0, $outmask 1042 vmr $outhead, v0 1043 stvx v2, 0, $out 1044 1045 addi $out, $out, 15 # 15 is not typo 1046 lvx v1, 0, $out # redundant in aligned case 1047 vsel v1, $outhead, v1, $outmask 1048 stvx v1, 0, $out 1049 b Lschedule_mangle_done 1050 1051.align 4 1052Lschedule_mangle_last_dec: 1053 lvx $iptlo, r11, r12 # reload $ipt 1054 lvx $ipthi, r9, r12 1055 addi $out, $out, -16 # add \$-16, %rdx 1056 vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0 1057 bl _vpaes_schedule_transform # output transform 1058 1059 #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key 1060 vperm v0, v0, v0, $outperm # rotate right/left 1061 vsel v2, $outhead, v0, $outmask 1062 vmr $outhead, v0 1063 stvx v2, 0, $out 1064 1065 addi $out, $out, -15 # -15 is not typo 1066 lvx v1, 0, $out # redundant in aligned case 1067 vsel v1, $outhead, v1, $outmask 1068 stvx v1, 0, $out 1069 1070Lschedule_mangle_done: 1071 mtlr r7 1072 # cleanup 1073 vxor v0, v0, v0 # vpxor %xmm0, %xmm0, %xmm0 1074 vxor v1, v1, v1 # vpxor %xmm1, %xmm1, %xmm1 1075 vxor v2, v2, v2 # vpxor %xmm2, %xmm2, %xmm2 1076 vxor v3, v3, v3 # vpxor %xmm3, %xmm3, %xmm3 1077 vxor v4, v4, 
v4 # vpxor %xmm4, %xmm4, %xmm4 1078 vxor v5, v5, v5 # vpxor %xmm5, %xmm5, %xmm5 1079 vxor v6, v6, v6 # vpxor %xmm6, %xmm6, %xmm6 1080 vxor v7, v7, v7 # vpxor %xmm7, %xmm7, %xmm7 1081 1082 blr 1083 .long 0 1084 .byte 0,12,0x14,0,0,0,0,0 1085 1086## 1087## .aes_schedule_192_smear 1088## 1089## Smear the short, low side in the 192-bit key schedule. 1090## 1091## Inputs: 1092## %xmm7: high side, b a x y 1093## %xmm6: low side, d c 0 0 1094## %xmm13: 0 1095## 1096## Outputs: 1097## %xmm6: b+c+d b+c 0 0 1098## %xmm0: b+c+d b+c b a 1099## 1100.align 4 1101_vpaes_schedule_192_smear: 1102 ?vspltw v0, v7, 3 1103 ?vsldoi v1, v9, v6, 12 # vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 1104 ?vsldoi v0, v7, v0, 8 # vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a 1105 vxor v6, v6, v1 # vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 1106 vxor v6, v6, v0 # vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a 1107 vmr v0, v6 1108 ?vsldoi v6, v6, v9, 8 1109 ?vsldoi v6, v9, v6, 8 # clobber low side with zeros 1110 blr 1111 .long 0 1112 .byte 0,12,0x14,0,0,0,0,0 1113 1114## 1115## .aes_schedule_round 1116## 1117## Runs one main round of the key schedule on %xmm0, %xmm7 1118## 1119## Specifically, runs subbytes on the high dword of %xmm0 1120## then rotates it by one byte and xors into the low dword of 1121## %xmm7. 1122## 1123## Adds rcon from low byte of %xmm8, then rotates %xmm8 for 1124## next rcon. 1125## 1126## Smears the dwords of %xmm7 by xoring the low into the 1127## second low, result into third, result into highest. 1128## 1129## Returns results in %xmm7 = %xmm0. 1130## Clobbers %xmm1-%xmm4, %r11. 
1131## 1132.align 4 1133_vpaes_schedule_round: 1134 # extract rcon from xmm8 1135 #vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4 1136 ?vsldoi v1, $rcon, v9, 15 # vpalignr \$15, %xmm8, %xmm4, %xmm1 1137 ?vsldoi $rcon, $rcon, $rcon, 15 # vpalignr \$15, %xmm8, %xmm8, %xmm8 1138 vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7 1139 1140 # rotate 1141 ?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0 1142 ?vsldoi v0, v0, v0, 1 # vpalignr \$1, %xmm0, %xmm0, %xmm0 1143 1144 # fall through... 1145 1146 # low round: same as high round, but no rotation and no rcon. 1147_vpaes_schedule_low_round: 1148 # smear xmm7 1149 ?vsldoi v1, v9, v7, 12 # vpslldq \$4, %xmm7, %xmm1 1150 vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7 1151 vspltisb v1, 0x0f # 0x0f..0f 1152 ?vsldoi v4, v9, v7, 8 # vpslldq \$8, %xmm7, %xmm4 1153 1154 # subbytes 1155 vand v1, v1, v0 # vpand %xmm9, %xmm0, %xmm1 # 0 = k 1156 vsrb v0, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i 1157 vxor v7, v7, v4 # vpxor %xmm4, %xmm7, %xmm7 1158 vperm v2, $invhi, v9, v1 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k 1159 vxor v1, v1, v0 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j 1160 vperm v3, $invlo, v9, v0 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 1161 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 1162 vperm v4, $invlo, v9, v1 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 1163 vxor v7, v7, v26 # vpxor .Lk_s63(%rip), %xmm7, %xmm7 1164 vperm v3, $invlo, v9, v3 # vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak 1165 vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 1166 vperm v2, $invlo, v9, v4 # vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak 1167 vxor v3, v3, v1 # vpxor %xmm1, %xmm3, %xmm3 # 2 = io 1168 vxor v2, v2, v0 # vpxor %xmm0, %xmm2, %xmm2 # 3 = jo 1169 vperm v4, v15, v9, v3 # vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou 1170 vperm v1, v14, v9, v2 # vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t 1171 vxor v1, v1, v4 # vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output 1172 1173 # add in smeared stuff 1174 vxor v0, v1, v7 # vpxor %xmm7, %xmm1, %xmm0 
1175 vxor v7, v1, v7 # vmovdqa %xmm0, %xmm7 1176 blr 1177 .long 0 1178 .byte 0,12,0x14,0,0,0,0,0 1179 1180## 1181## .aes_schedule_transform 1182## 1183## Linear-transform %xmm0 according to tables at (%r11) 1184## 1185## Requires that %xmm9 = 0x0F0F... as in preheat 1186## Output in %xmm0 1187## Clobbers %xmm2 1188## 1189.align 4 1190_vpaes_schedule_transform: 1191 #vand v1, v0, v9 # vpand %xmm9, %xmm0, %xmm1 1192 vsrb v2, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 1193 # vmovdqa (%r11), %xmm2 # lo 1194 vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2 1195 # vmovdqa 16(%r11), %xmm1 # hi 1196 vperm v2, $ipthi, $ipthi, v2 # vpshufb %xmm0, %xmm1, %xmm0 1197 vxor v0, v0, v2 # vpxor %xmm2, %xmm0, %xmm0 1198 blr 1199 .long 0 1200 .byte 0,12,0x14,0,0,0,0,0 1201 1202## 1203## .aes_schedule_mangle 1204## 1205## Mangle xmm0 from (basis-transformed) standard version 1206## to our version. 1207## 1208## On encrypt, 1209## xor with 0x63 1210## multiply by circulant 0,1,1,1 1211## apply shiftrows transform 1212## 1213## On decrypt, 1214## xor with 0x63 1215## multiply by "inverse mixcolumns" circulant E,B,D,9 1216## deskew 1217## apply shiftrows transform 1218## 1219## 1220## Writes out to (%rdx), and increments or decrements it 1221## Keeps track of round number mod 4 in %r8 1222## Preserves xmm0 1223## Clobbers xmm1-xmm5 1224## 1225.align 4 1226_vpaes_schedule_mangle: 1227 #vmr v4, v0 # vmovdqa %xmm0, %xmm4 # save xmm0 for later 1228 # vmovdqa .Lk_mc_forward(%rip),%xmm5 1229 bne $dir, Lschedule_mangle_dec 1230 1231 # encrypting 1232 vxor v4, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm4 1233 addi $out, $out, 16 # add \$16, %rdx 1234 vperm v4, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm4 1235 vperm v1, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm1 1236 vperm v3, v1, v1, v25 # vpshufb %xmm5, %xmm1, %xmm3 1237 vxor v4, v4, v1 # vpxor %xmm1, %xmm4, %xmm4 1238 lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1 1239 vxor v3, v3, v4 # vpxor %xmm4, %xmm3, %xmm3 1240 1241 vperm v3, v3, v3, v1 # 
vpshufb %xmm1, %xmm3, %xmm3 1242 addi r8, r8, -16 # add \$-16, %r8 1243 andi. r8, r8, 0x30 # and \$0x30, %r8 1244 1245 #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx) 1246 vperm v1, v3, v3, $outperm # rotate right/left 1247 vsel v2, $outhead, v1, $outmask 1248 vmr $outhead, v1 1249 stvx v2, 0, $out 1250 blr 1251 1252.align 4 1253Lschedule_mangle_dec: 1254 # inverse mix columns 1255 # lea .Lk_dksd(%rip),%r11 1256 vsrb v1, v0, v8 # vpsrlb \$4, %xmm4, %xmm1 # 1 = hi 1257 #and v4, v0, v9 # vpand %xmm9, %xmm4, %xmm4 # 4 = lo 1258 1259 # vmovdqa 0x00(%r11), %xmm2 1260 vperm v2, v16, v16, v0 # vpshufb %xmm4, %xmm2, %xmm2 1261 # vmovdqa 0x10(%r11), %xmm3 1262 vperm v3, v17, v17, v1 # vpshufb %xmm1, %xmm3, %xmm3 1263 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 1264 vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3 1265 1266 # vmovdqa 0x20(%r11), %xmm2 1267 vperm v2, v18, v18, v0 # vpshufb %xmm4, %xmm2, %xmm2 1268 vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2 1269 # vmovdqa 0x30(%r11), %xmm3 1270 vperm v3, v19, v19, v1 # vpshufb %xmm1, %xmm3, %xmm3 1271 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 1272 vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3 1273 1274 # vmovdqa 0x40(%r11), %xmm2 1275 vperm v2, v20, v20, v0 # vpshufb %xmm4, %xmm2, %xmm2 1276 vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2 1277 # vmovdqa 0x50(%r11), %xmm3 1278 vperm v3, v21, v21, v1 # vpshufb %xmm1, %xmm3, %xmm3 1279 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 1280 1281 # vmovdqa 0x60(%r11), %xmm2 1282 vperm v2, v22, v22, v0 # vpshufb %xmm4, %xmm2, %xmm2 1283 vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3 1284 # vmovdqa 0x70(%r11), %xmm4 1285 vperm v4, v23, v23, v1 # vpshufb %xmm1, %xmm4, %xmm4 1286 lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1 1287 vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2 1288 vxor v3, v4, v2 # vpxor %xmm2, %xmm4, %xmm3 1289 1290 addi $out, $out, -16 # add \$-16, %rdx 1291 1292 vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3 1293 addi r8, r8, -16 # add \$-16, %r8 1294 andi. 
r8, r8, 0x30 # and \$0x30, %r8 1295 1296 #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx) 1297 vperm v1, v3, v3, $outperm # rotate right/left 1298 vsel v2, $outhead, v1, $outmask 1299 vmr $outhead, v1 1300 stvx v2, 0, $out 1301 blr 1302 .long 0 1303 .byte 0,12,0x14,0,0,0,0,0 1304 1305.globl .vpaes_set_encrypt_key 1306.align 5 1307.vpaes_set_encrypt_key: 1308 $STU $sp,-$FRAME($sp) 1309 li r10,`15+6*$SIZE_T` 1310 li r11,`31+6*$SIZE_T` 1311 mflr r0 1312 mfspr r6, 256 # save vrsave 1313 stvx v20,r10,$sp 1314 addi r10,r10,32 1315 stvx v21,r11,$sp 1316 addi r11,r11,32 1317 stvx v22,r10,$sp 1318 addi r10,r10,32 1319 stvx v23,r11,$sp 1320 addi r11,r11,32 1321 stvx v24,r10,$sp 1322 addi r10,r10,32 1323 stvx v25,r11,$sp 1324 addi r11,r11,32 1325 stvx v26,r10,$sp 1326 addi r10,r10,32 1327 stvx v27,r11,$sp 1328 addi r11,r11,32 1329 stvx v28,r10,$sp 1330 addi r10,r10,32 1331 stvx v29,r11,$sp 1332 addi r11,r11,32 1333 stvx v30,r10,$sp 1334 stvx v31,r11,$sp 1335 stw r6,`$FRAME-4`($sp) # save vrsave 1336 li r7, -1 1337 $PUSH r0, `$FRAME+$LRSAVE`($sp) 1338 mtspr 256, r7 # preserve all AltiVec registers 1339 1340 srwi r9, $bits, 5 # shr \$5,%eax 1341 addi r9, r9, 6 # add \$5,%eax 1342 stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; 1343 1344 cmplw $dir, $bits, $bits # set encrypt direction 1345 li r8, 0x30 # mov \$0x30,%r8d 1346 bl _vpaes_schedule_core 1347 1348 $POP r0, `$FRAME+$LRSAVE`($sp) 1349 li r10,`15+6*$SIZE_T` 1350 li r11,`31+6*$SIZE_T` 1351 mtspr 256, r6 # restore vrsave 1352 mtlr r0 1353 xor r3, r3, r3 1354 lvx v20,r10,$sp 1355 addi r10,r10,32 1356 lvx v21,r11,$sp 1357 addi r11,r11,32 1358 lvx v22,r10,$sp 1359 addi r10,r10,32 1360 lvx v23,r11,$sp 1361 addi r11,r11,32 1362 lvx v24,r10,$sp 1363 addi r10,r10,32 1364 lvx v25,r11,$sp 1365 addi r11,r11,32 1366 lvx v26,r10,$sp 1367 addi r10,r10,32 1368 lvx v27,r11,$sp 1369 addi r11,r11,32 1370 lvx v28,r10,$sp 1371 addi r10,r10,32 1372 lvx v29,r11,$sp 1373 addi r11,r11,32 1374 lvx v30,r10,$sp 1375 lvx v31,r11,$sp 
1376 addi $sp,$sp,$FRAME 1377 blr 1378 .long 0 1379 .byte 0,12,0x04,1,0x80,0,3,0 1380 .long 0 1381.size .vpaes_set_encrypt_key,.-.vpaes_set_encrypt_key 1382 1383.globl .vpaes_set_decrypt_key 1384.align 4 1385.vpaes_set_decrypt_key: 1386 $STU $sp,-$FRAME($sp) 1387 li r10,`15+6*$SIZE_T` 1388 li r11,`31+6*$SIZE_T` 1389 mflr r0 1390 mfspr r6, 256 # save vrsave 1391 stvx v20,r10,$sp 1392 addi r10,r10,32 1393 stvx v21,r11,$sp 1394 addi r11,r11,32 1395 stvx v22,r10,$sp 1396 addi r10,r10,32 1397 stvx v23,r11,$sp 1398 addi r11,r11,32 1399 stvx v24,r10,$sp 1400 addi r10,r10,32 1401 stvx v25,r11,$sp 1402 addi r11,r11,32 1403 stvx v26,r10,$sp 1404 addi r10,r10,32 1405 stvx v27,r11,$sp 1406 addi r11,r11,32 1407 stvx v28,r10,$sp 1408 addi r10,r10,32 1409 stvx v29,r11,$sp 1410 addi r11,r11,32 1411 stvx v30,r10,$sp 1412 stvx v31,r11,$sp 1413 stw r6,`$FRAME-4`($sp) # save vrsave 1414 li r7, -1 1415 $PUSH r0, `$FRAME+$LRSAVE`($sp) 1416 mtspr 256, r7 # preserve all AltiVec registers 1417 1418 srwi r9, $bits, 5 # shr \$5,%eax 1419 addi r9, r9, 6 # add \$5,%eax 1420 stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; 1421 1422 slwi r9, r9, 4 # shl \$4,%eax 1423 add $out, $out, r9 # lea (%rdx,%rax),%rdx 1424 1425 cmplwi $dir, $bits, 0 # set decrypt direction 1426 srwi r8, $bits, 1 # shr \$1,%r8d 1427 andi. 
	r8, r8, 32		# and	\$32,%r8d
	xori	r8, r8, 32	# xor	\$32,%r8d	# nbits==192?0:32
	bl	_vpaes_schedule_core

	$POP	r0, `$FRAME+$LRSAVE`($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtspr	256, r6		# restore vrsave
	mtlr	r0
	xor	r3, r3, r3
	lvx	v20,r10,$sp
	addi	r10,r10,32
	lvx	v21,r11,$sp
	addi	r11,r11,32
	lvx	v22,r10,$sp
	addi	r10,r10,32
	lvx	v23,r11,$sp
	addi	r11,r11,32
	lvx	v24,r10,$sp
	addi	r10,r10,32
	lvx	v25,r11,$sp
	addi	r11,r11,32
	lvx	v26,r10,$sp
	addi	r10,r10,32
	lvx	v27,r11,$sp
	addi	r11,r11,32
	lvx	v28,r10,$sp
	addi	r10,r10,32
	lvx	v29,r11,$sp
	addi	r11,r11,32
	lvx	v30,r10,$sp
	lvx	v31,r11,$sp
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,0x04,1,0x80,0,3,0
	.long	0
.size	.vpaes_set_decrypt_key,.-.vpaes_set_decrypt_key
___
}

# Post-process the accumulated assembly in $code line by line and print
# the final, flavour-specific output (which the "open STDOUT" pipe at the
# top of the file feeds through ppc-xlate.pl).
#
# $consts is true while we are still inside the leading constants table;
# it is cleared once the "Lconsts:" label is seen, so the ".long ... ?tag"
# conversion below is only applied to that table.
my $consts=1;
foreach  (split("\n",$code)) {
	# Expand `...` spans by eval-ing them as Perl, e.g. `$FRAME+$LRSAVE`
	# becomes a literal number in the emitted assembly.
	s/\`([^\`]*)\`/eval $1/geo;

	# constants table endian-specific conversion
	if ($consts && m/\.long\s+(.+)\s+(\?[a-z]*)$/o) {
	    my $conv=$2;	# trailing ?inv/?rev tag selects the LE fixup
	    my @bytes=();

	    # convert to endian-agnostic format: split each .long operand
	    # into its four big-endian bytes so the table can be emitted
	    # with .byte directives regardless of target endianness
	    foreach (split(/,\s+/,$1)) {
		my $l = /^0/?oct:int;	# values may be hex/octal (oct) or decimal (int)
		push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
	    }

	    # little-endian conversion
	    if ($flavour =~ /le$/o) {
		SWITCH: for($conv)  {
		    # ?inv: permutation indices — complement each nibble-index
		    # byte with 0xf for the reversed byte order
		    /\?inv/ && do   { @bytes=map($_^0xf,@bytes); last; };
		    # ?rev: plain data — reverse the 16 bytes of the vector
		    /\?rev/ && do   { @bytes=reverse(@bytes);    last; }; 
		}
	    }

	    #emit
	    print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
	    next;
	}
	$consts=0 if (m/Lconsts:/o);	# end of table

	# instructions prefixed with '?' are endian-specific and need
	# to be adjusted accordingly...
	# (the chained "or" stops at the first substitution that matches)
	if ($flavour =~ /le$/o) {	# little-endian
	    s/\?lvsr/lvsl/o or				# swap lvsr<->lvsl
	    s/\?lvsl/lvsr/o or
	    s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or	# swap vperm's two source registers
	    s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or	# swap sources, mirror shift as 16-n
	    s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;	# mirror word index as 3-n
	} else {			# big-endian
	    s/\?([a-z]+)/$1/o;		# just strip the '?' marker
	}

	print $_,"\n";
}

close STDOUT;