// vpaes-armv8.S revision 1.3
#include "arm_asm.h"
.text

.type	_vpaes_consts,%object
.align	7	// totally strategic alignment
_vpaes_consts:
.Lk_mc_forward:	// mc_forward
.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
.quad	0x080B0A0904070605, 0x000302010C0F0E0D
.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
.quad	0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward:	// mc_backward
.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
.quad	0x020100030E0D0C0F, 0x0A09080B06050407
.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
.quad	0x0A09080B06050407, 0x020100030E0D0C0F
.Lk_sr:	// sr
.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad	0x030E09040F0A0500, 0x0B06010C07020D08
.quad	0x0F060D040B020900, 0x070E050C030A0108
.quad	0x0B0E0104070A0D00, 0x0306090C0F020508

//
// "Hot" constants
//
.Lk_inv:	// inv, inva
.quad	0x0E05060F0D080180, 0x040703090A0B0C02
.quad	0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_ipt:	// input transform (lo, hi)
.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_sbo:	// sbou, sbot
.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_sb1:	// sb1u, sb1t
.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.Lk_sb2:	// sb2u, sb2t
.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD

//
// Decryption stuff
//
.Lk_dipt:	// decryption input transform
.quad	0x0F505B040B545F00, 0x154A411E114E451A
.quad	0x86E383E660056500, 0x12771772F491F194
.Lk_dsbo:	// decryption sbox final output
.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
.Lk_dsb9:	// decryption sbox output *9*u, *9*t
.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
.Lk_dsbd:	// decryption sbox output *D*u, *D*t
.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
.Lk_dsbb:	// decryption sbox output *B*u, *B*t
.quad	0xD022649296B44200, 0x602646F6B0F2D404
.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
.Lk_dsbe:	// decryption sbox output *E*u, *E*t
.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32

//
// Key schedule constants
//
.Lk_dksd:	// decryption key schedule: invskew x*D
.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb:	// decryption key schedule: invskew x*B
.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse:	// decryption key schedule: invskew x*E + 0x63
.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9:	// decryption key schedule: invskew x*9
.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE

.Lk_rcon:	// rcon
.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81

.Lk_opt:	// output transform
.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew:	// deskew tables: inverts the sbox's "skew"
.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77

.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.align	2
.size	_vpaes_consts,.-_vpaes_consts
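
##
##  Illustrative sketch of how the paired tables above are consumed
##  (Tlo/Thi are placeholder names, not symbols in this file): tbl is
##  the NEON analogue of pshufb, so each 16-byte row is a lookup on a
##  4-bit nibble.  For a GF(2)-linear byte map L we have
##  L(x) = L(x & 0x0F) ^ L(x & 0xF0), hence one 256-entry lookup
##  splits into two 16-entry ones, roughly
##
##      out[i] = Tlo[x[i] & 0x0F] ^ Thi[x[i] >> 4]
##
##  which is how the (lo, hi) pairs such as .Lk_ipt and .Lk_opt are
##  applied by the cores below.
##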
.align	6
##
##  _aes_preheat
##
##  Fills register %r10 -> .aes_consts (so you can -fPIC)
##  and %xmm9-%xmm15 as specified below.
##
.type	_vpaes_encrypt_preheat,%function
.align	4
_vpaes_encrypt_preheat:
	adr	x10, .Lk_inv
	movi	v17.16b, #0x0f
	ld1	{v18.2d,v19.2d}, [x10],#32		// .Lk_inv
	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64	// .Lk_ipt, .Lk_sbo
	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10]	// .Lk_sb1, .Lk_sb2
	ret
.size	_vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat

##
##  _aes_encrypt_core
##
##  AES-encrypt %xmm0.
##
##  Inputs:
##     %xmm0 = input
##     %xmm9-%xmm15 as in _vpaes_preheat
##    (%rdx) = scheduled keys
##
##  Output in %xmm0
##  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
##  Preserves %xmm6 - %xmm8 so you get some local vectors
##
##
.type	_vpaes_encrypt_core,%function
.align	4
_vpaes_encrypt_core:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds
	adr	x11, .Lk_mc_forward+16
						// vmovdqa	.Lk_ipt(%rip), %xmm2	# iptlo
	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9), %xmm5		# round0 key
	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9, %xmm0, %xmm1
	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4, %xmm0, %xmm0
	tbl	v1.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1, %xmm2, %xmm1
						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
	tbl	v2.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0, %xmm3, %xmm2
	eor	v0.16b, v1.16b, v16.16b		// vpxor	%xmm5, %xmm1, %xmm0
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2, %xmm0, %xmm0
	b	.Lenc_entry

.align	4
.Lenc_loop:
	// middle of middle round
	add	x10, x11, #0x40
	tbl	v4.16b, {v25.16b}, v2.16b	// vpshufb	%xmm2, %xmm13, %xmm4	# 4 = sb1u
	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	tbl	v0.16b, {v24.16b}, v3.16b	// vpshufb	%xmm3, %xmm12, %xmm0	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5, %xmm4, %xmm4	# 4 = sb1u + k
	tbl	v5.16b, {v27.16b}, v2.16b	// vpshufb	%xmm2, %xmm15, %xmm5	# 4 = sb2u
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4, %xmm0, %xmm0	# 0 = A
	tbl	v2.16b, {v26.16b}, v3.16b	// vpshufb	%xmm3, %xmm14, %xmm2	# 2 = sb2t
	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1, %xmm0, %xmm3	# 0 = B
	eor	v2.16b, v2.16b, v5.16b		// vpxor	%xmm5, %xmm2, %xmm2	# 2 = 2A
	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4, %xmm0, %xmm0	# 3 = D
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2, %xmm3, %xmm3	# 0 = 2A+B
	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1, %xmm3, %xmm4	# 0 = 2B+C
	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3, %xmm0, %xmm0	# 3 = 2A+B+D
	and	x11, x11, #~(1<<6)		// and	$0x30, %r11		# ... mod 4
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4, %xmm0, %xmm0	# 0 = 2A+3B+C+D
	sub	w8, w8, #1			// nr--

.Lenc_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm0, %xmm9, %xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4, %xmm0, %xmm0	# 1 = i
	tbl	v5.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1, %xmm11, %xmm5	# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0, %xmm1, %xmm1	# 0 = j
	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, %xmm10, %xmm3	# 3 = 1/i
	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1, %xmm10, %xmm4	# 4 = 1/j
	eor	v3.16b, v3.16b, v5.16b		// vpxor	%xmm5, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	eor	v4.16b, v4.16b, v5.16b		// vpxor	%xmm5, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3, %xmm10, %xmm2	# 2 = 1/iak
	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4, %xmm10, %xmm3	# 3 = 1/jak
	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1, %xmm2, %xmm2	# 2 = io
	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0, %xmm3, %xmm3	# 3 = jo
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9), %xmm5
	cbnz	w8, .Lenc_loop

	// middle of last round
	add	x10, x11, #0x80
						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	tbl	v4.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2, %xmm4, %xmm4	# 4 = sbou
	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	tbl	v0.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3, %xmm0, %xmm0	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5, %xmm4, %xmm4	# 4 = sb1u + k
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4, %xmm0, %xmm0	# 0 = A
	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1, %xmm0, %xmm0
	ret
.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
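
##
##  Sketch of the algebra in .Lenc_loop above, restating the per-line
##  # comments: sb1 yields A = SubBytes(state) ^ round key, sb2 yields
##  the doubled value 2A, and the .Lk_mc_forward/.Lk_mc_backward
##  permutations rotate bytes within columns, so the xor chain
##  accumulates, per output byte,
##
##      out = 2A + 3B + C + D    (B, C, D = byte rotations of A)
##
##  i.e. the {02,03,01,01} MixColumns circulant over GF(2^8).
##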
.globl	vpaes_encrypt
.type	vpaes_encrypt,%function
.align	4
vpaes_encrypt:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ld1	{v7.16b}, [x0]
	bl	_vpaes_encrypt_preheat
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [x1]

	ldp	x29,x30,[sp],#16
	ret
.size	vpaes_encrypt,.-vpaes_encrypt

.type	_vpaes_encrypt_2x,%function
.align	4
_vpaes_encrypt_2x:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds
	adr	x11, .Lk_mc_forward+16
						// vmovdqa	.Lk_ipt(%rip), %xmm2	# iptlo
	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9), %xmm5		# round0 key
	and	v1.16b, v14.16b, v17.16b	// vpand	%xmm9, %xmm0, %xmm1
	ushr	v0.16b, v14.16b, #4		// vpsrlb	$4, %xmm0, %xmm0
	and	v9.16b, v15.16b, v17.16b
	ushr	v8.16b, v15.16b, #4
	tbl	v1.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1, %xmm2, %xmm1
	tbl	v9.16b, {v20.16b}, v9.16b
						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
	tbl	v2.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0, %xmm3, %xmm2
	tbl	v10.16b, {v21.16b}, v8.16b
	eor	v0.16b, v1.16b, v16.16b		// vpxor	%xmm5, %xmm1, %xmm0
	eor	v8.16b, v9.16b, v16.16b
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2, %xmm0, %xmm0
	eor	v8.16b, v8.16b, v10.16b
	b	.Lenc_2x_entry

.align	4
.Lenc_2x_loop:
	// middle of middle round
	add	x10, x11, #0x40
	tbl	v4.16b, {v25.16b}, v2.16b	// vpshufb	%xmm2, %xmm13, %xmm4	# 4 = sb1u
	tbl	v12.16b, {v25.16b}, v10.16b
	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	tbl	v0.16b, {v24.16b}, v3.16b	// vpshufb	%xmm3, %xmm12, %xmm0	# 0 = sb1t
	tbl	v8.16b, {v24.16b}, v11.16b
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5, %xmm4, %xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	tbl	v5.16b, {v27.16b}, v2.16b	// vpshufb	%xmm2, %xmm15, %xmm5	# 4 = sb2u
	tbl	v13.16b, {v27.16b}, v10.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4, %xmm0, %xmm0	# 0 = A
	eor	v8.16b, v8.16b, v12.16b
	tbl	v2.16b, {v26.16b}, v3.16b	// vpshufb	%xmm3, %xmm14, %xmm2	# 2 = sb2t
	tbl	v10.16b, {v26.16b}, v11.16b
	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1, %xmm0, %xmm3	# 0 = B
	tbl	v11.16b, {v8.16b}, v1.16b
	eor	v2.16b, v2.16b, v5.16b		// vpxor	%xmm5, %xmm2, %xmm2	# 2 = 2A
	eor	v10.16b, v10.16b, v13.16b
	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4, %xmm0, %xmm0	# 3 = D
	tbl	v8.16b, {v8.16b}, v4.16b
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2, %xmm3, %xmm3	# 0 = 2A+B
	eor	v11.16b, v11.16b, v10.16b
	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1, %xmm3, %xmm4	# 0 = 2B+C
	tbl	v12.16b, {v11.16b},v1.16b
	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3, %xmm0, %xmm0	# 3 = 2A+B+D
	eor	v8.16b, v8.16b, v11.16b
	and	x11, x11, #~(1<<6)		// and	$0x30, %r11		# ... mod 4
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4, %xmm0, %xmm0	# 0 = 2A+3B+C+D
	eor	v8.16b, v8.16b, v12.16b
	sub	w8, w8, #1			// nr--

.Lenc_2x_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm0, %xmm9, %xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4, %xmm0, %xmm0	# 1 = i
	and	v9.16b, v8.16b, v17.16b
	ushr	v8.16b, v8.16b, #4
	tbl	v5.16b, {v19.16b},v1.16b	// vpshufb	%xmm1, %xmm11, %xmm5	# 2 = a/k
	tbl	v13.16b, {v19.16b},v9.16b
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0, %xmm1, %xmm1	# 0 = j
	eor	v9.16b, v9.16b, v8.16b
	tbl	v3.16b, {v18.16b},v0.16b	// vpshufb	%xmm0, %xmm10, %xmm3	# 3 = 1/i
	tbl	v11.16b, {v18.16b},v8.16b
	tbl	v4.16b, {v18.16b},v1.16b	// vpshufb	%xmm1, %xmm10, %xmm4	# 4 = 1/j
	tbl	v12.16b, {v18.16b},v9.16b
	eor	v3.16b, v3.16b, v5.16b		// vpxor	%xmm5, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	eor	v11.16b, v11.16b, v13.16b
	eor	v4.16b, v4.16b, v5.16b		// vpxor	%xmm5, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	eor	v12.16b, v12.16b, v13.16b
	tbl	v2.16b, {v18.16b},v3.16b	// vpshufb	%xmm3, %xmm10, %xmm2	# 2 = 1/iak
	tbl	v10.16b, {v18.16b},v11.16b
	tbl	v3.16b, {v18.16b},v4.16b	// vpshufb	%xmm4, %xmm10, %xmm3	# 3 = 1/jak
	tbl	v11.16b, {v18.16b},v12.16b
	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1, %xmm2, %xmm2	# 2 = io
	eor	v10.16b, v10.16b, v9.16b
	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0, %xmm3, %xmm3	# 3 = jo
	eor	v11.16b, v11.16b, v8.16b
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9), %xmm5
	cbnz	w8, .Lenc_2x_loop

	// middle of last round
	add	x10, x11, #0x80
						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	tbl	v4.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2, %xmm4, %xmm4	# 4 = sbou
	tbl	v12.16b, {v22.16b}, v10.16b
	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	tbl	v0.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3, %xmm0, %xmm0	# 0 = sb1t
	tbl	v8.16b, {v23.16b}, v11.16b
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5, %xmm4, %xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4, %xmm0, %xmm0	# 0 = A
	eor	v8.16b, v8.16b, v12.16b
	tbl	v0.16b, {v0.16b},v1.16b		// vpshufb	%xmm1, %xmm0, %xmm0
	tbl	v1.16b, {v8.16b},v1.16b
	ret
.size	_vpaes_encrypt_2x,.-_vpaes_encrypt_2x

.type	_vpaes_decrypt_preheat,%function
.align	4
_vpaes_decrypt_preheat:
	adr	x10, .Lk_inv
	movi	v17.16b, #0x0f
	adr	x11, .Lk_dipt
	ld1	{v18.2d,v19.2d}, [x10],#32	// .Lk_inv
	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64	// .Lk_dipt, .Lk_dsbo
	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64	// .Lk_dsb9, .Lk_dsbd
	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x11]	// .Lk_dsbb, .Lk_dsbe
	ret
.size	_vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat
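
##
##  Shape of .Ldec_loop below (an informal restatement of the per-line
##  # comments): InvMixColumns multiplies each state column by the
##  {0E,0B,0D,09} circulant.  The four table pairs supply 9x, Dx, Bx
##  and Ex the inverse-sbox output, and the loop folds them in
##  Horner style with the mc_forward rotation MC:
##
##      ch = sb9;  ch = MC(ch) ^ sbd;  ch = MC(ch) ^ sbb;  ch = MC(ch) ^ sbe
##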
##
##  Decryption core
##
##  Same API as encryption core.
##
.type	_vpaes_decrypt_core,%function
.align	4
_vpaes_decrypt_core:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds

						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
	lsl	x11, x8, #4			// mov	%rax, %r11;	shl	$4, %r11
	eor	x11, x11, #0x30			// xor	$0x30, %r11
	adr	x10, .Lk_sr
	and	x11, x11, #0x30			// and	$0x30, %r11
	add	x11, x11, x10
	adr	x10, .Lk_mc_forward+48

	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9), %xmm4		# round0 key
	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9, %xmm0, %xmm1
	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4, %xmm0, %xmm0
	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1, %xmm2, %xmm2
	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
						// vmovdqa	.Lk_dipt+16(%rip), %xmm1	# ipthi
	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0, %xmm1, %xmm0
	eor	v2.16b, v2.16b, v16.16b		// vpxor	%xmm4, %xmm2, %xmm2
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2, %xmm0, %xmm0
	b	.Ldec_entry

.align	4
.Ldec_loop:
//
//  Inverse mix columns
//
						// vmovdqa	-0x20(%r10),%xmm4	# 4 : sb9u
						// vmovdqa	-0x10(%r10),%xmm1	# 0 : sb9t
	tbl	v4.16b, {v24.16b}, v2.16b	// vpshufb	%xmm2, %xmm4, %xmm4	# 4 = sb9u
	tbl	v1.16b, {v25.16b}, v3.16b	// vpshufb	%xmm3, %xmm1, %xmm1	# 0 = sb9t
	eor	v0.16b, v4.16b, v16.16b		// vpxor	%xmm4, %xmm0, %xmm0
						// vmovdqa	0x00(%r10),%xmm4	# 4 : sbdu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1, %xmm0, %xmm0	# 0 = ch
						// vmovdqa	0x10(%r10),%xmm1	# 0 : sbdt

	tbl	v4.16b, {v26.16b}, v2.16b	// vpshufb	%xmm2, %xmm4, %xmm4	# 4 = sbdu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5, %xmm0, %xmm0	# MC ch
	tbl	v1.16b, {v27.16b}, v3.16b	// vpshufb	%xmm3, %xmm1, %xmm1	# 0 = sbdt
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4, %xmm0, %xmm0	# 4 = ch
						// vmovdqa	0x20(%r10), %xmm4	# 4 : sbbu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1, %xmm0, %xmm0	# 0 = ch
						// vmovdqa	0x30(%r10), %xmm1	# 0 : sbbt

	tbl	v4.16b, {v28.16b}, v2.16b	// vpshufb	%xmm2, %xmm4, %xmm4	# 4 = sbbu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5, %xmm0, %xmm0	# MC ch
	tbl	v1.16b, {v29.16b}, v3.16b	// vpshufb	%xmm3, %xmm1, %xmm1	# 0 = sbbt
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4, %xmm0, %xmm0	# 4 = ch
						// vmovdqa	0x40(%r10), %xmm4	# 4 : sbeu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1, %xmm0, %xmm0	# 0 = ch
						// vmovdqa	0x50(%r10), %xmm1	# 0 : sbet

	tbl	v4.16b, {v30.16b}, v2.16b	// vpshufb	%xmm2, %xmm4, %xmm4	# 4 = sbeu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5, %xmm0, %xmm0	# MC ch
	tbl	v1.16b, {v31.16b}, v3.16b	// vpshufb	%xmm3, %xmm1, %xmm1	# 0 = sbet
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4, %xmm0, %xmm0	# 4 = ch
	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr	$12, %xmm5, %xmm5, %xmm5
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1, %xmm0, %xmm0	# 0 = ch
	sub	w8, w8, #1			// sub	$1,%rax			# nr--

.Ldec_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9, %xmm0, %xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4, %xmm0, %xmm0	# 1 = i
	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1, %xmm11, %xmm2	# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0, %xmm1, %xmm1	# 0 = j
	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, %xmm10, %xmm3	# 3 = 1/i
	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1, %xmm10, %xmm4	# 4 = 1/j
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3, %xmm10, %xmm2	# 2 = 1/iak
	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4, %xmm10, %xmm3	# 3 = 1/jak
	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1, %xmm2, %xmm2	# 2 = io
	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0, %xmm3, %xmm3	# 3 = jo
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9), %xmm0
	cbnz	w8, .Ldec_loop

	// middle of last round
						// vmovdqa	0x60(%r10), %xmm4	# 3 : sbou
	tbl	v4.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2, %xmm4, %xmm4	# 4 = sbou
						// vmovdqa	0x70(%r10), %xmm1	# 0 : sbot
	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11), %xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	tbl	v1.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3, %xmm1, %xmm1	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm0, %xmm4, %xmm4	# 4 = sb1u + k
	eor	v0.16b, v1.16b, v4.16b		// vpxor	%xmm4, %xmm1, %xmm0	# 0 = A
	tbl	v0.16b, {v0.16b}, v2.16b	// vpshufb	%xmm2, %xmm0, %xmm0
	ret
.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core

.globl	vpaes_decrypt
.type	vpaes_decrypt,%function
.align	4
vpaes_decrypt:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ld1	{v7.16b}, [x0]
	bl	_vpaes_decrypt_preheat
	bl	_vpaes_decrypt_core
	st1	{v0.16b}, [x1]

	ldp	x29,x30,[sp],#16
	ret
.size	vpaes_decrypt,.-vpaes_decrypt

// v14-v15 input, v0-v1 output
.type	_vpaes_decrypt_2x,%function
.align	4
_vpaes_decrypt_2x:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds

						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
	lsl	x11, x8, #4			// mov	%rax, %r11;	shl	$4, %r11
	eor	x11, x11, #0x30			// xor	$0x30, %r11
	adr	x10, .Lk_sr
	and	x11, x11, #0x30			// and	$0x30, %r11
	add	x11, x11, x10
	adr	x10, .Lk_mc_forward+48

	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9), %xmm4		# round0 key
	and	v1.16b, v14.16b, v17.16b	// vpand	%xmm9, %xmm0, %xmm1
	ushr	v0.16b, v14.16b, #4		// vpsrlb	$4, %xmm0, %xmm0
	and	v9.16b, v15.16b, v17.16b
	ushr	v8.16b, v15.16b, #4
	tbl	v2.16b, {v20.16b},v1.16b	// vpshufb	%xmm1, %xmm2, %xmm2
	tbl	v10.16b, {v20.16b},v9.16b
	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
						// vmovdqa	.Lk_dipt+16(%rip), %xmm1	# ipthi
	tbl	v0.16b, {v21.16b},v0.16b	// vpshufb	%xmm0, %xmm1, %xmm0
	tbl	v8.16b, {v21.16b},v8.16b
	eor	v2.16b, v2.16b, v16.16b		// vpxor	%xmm4, %xmm2, %xmm2
	eor	v10.16b, v10.16b, v16.16b
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2, %xmm0, %xmm0
	eor	v8.16b, v8.16b, v10.16b
	b	.Ldec_2x_entry

.align	4
.Ldec_2x_loop:
//
//  Inverse mix columns
//
						// vmovdqa	-0x20(%r10),%xmm4	# 4 : sb9u
						// vmovdqa	-0x10(%r10),%xmm1	# 0 : sb9t
	tbl	v4.16b, {v24.16b}, v2.16b	// vpshufb	%xmm2, %xmm4, %xmm4	# 4 = sb9u
	tbl	v12.16b, {v24.16b}, v10.16b
	tbl	v1.16b, {v25.16b}, v3.16b	// vpshufb	%xmm3, %xmm1, %xmm1	# 0 = sb9t
	tbl	v9.16b, {v25.16b}, v11.16b
	eor	v0.16b, v4.16b, v16.16b		// vpxor	%xmm4, %xmm0, %xmm0
	eor	v8.16b, v12.16b, v16.16b
						// vmovdqa	0x00(%r10),%xmm4	# 4 : sbdu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1, %xmm0, %xmm0	# 0 = ch
	eor	v8.16b, v8.16b, v9.16b		// vpxor	%xmm1, %xmm0, %xmm0	# 0 = ch
						// vmovdqa	0x10(%r10),%xmm1	# 0 : sbdt

	tbl	v4.16b, {v26.16b}, v2.16b	// vpshufb	%xmm2, %xmm4, %xmm4	# 4 = sbdu
	tbl	v12.16b, {v26.16b}, v10.16b
	tbl	v0.16b, {v0.16b},v5.16b		// vpshufb	%xmm5, %xmm0, %xmm0	# MC ch
	tbl	v8.16b, {v8.16b},v5.16b
	tbl	v1.16b, {v27.16b}, v3.16b	// vpshufb	%xmm3, %xmm1, %xmm1	# 0 = sbdt
	tbl	v9.16b, {v27.16b}, v11.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4, %xmm0, %xmm0	# 4 = ch
	eor	v8.16b, v8.16b, v12.16b
						// vmovdqa	0x20(%r10), %xmm4	# 4 : sbbu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1, %xmm0, %xmm0	# 0 = ch
	eor	v8.16b, v8.16b, v9.16b
						// vmovdqa	0x30(%r10), %xmm1	# 0 : sbbt

	tbl	v4.16b, {v28.16b}, v2.16b	// vpshufb	%xmm2, %xmm4, %xmm4	# 4 = sbbu
	tbl	v12.16b, {v28.16b}, v10.16b
	tbl	v0.16b, {v0.16b},v5.16b		// vpshufb	%xmm5, %xmm0, %xmm0	# MC ch
	tbl	v8.16b, {v8.16b},v5.16b
	tbl	v1.16b, {v29.16b}, v3.16b	// vpshufb	%xmm3, %xmm1, %xmm1	# 0 = sbbt
	tbl	v9.16b, {v29.16b}, v11.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4, %xmm0, %xmm0	# 4 = ch
	eor	v8.16b, v8.16b, v12.16b
						// vmovdqa	0x40(%r10), %xmm4	# 4 : sbeu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1, %xmm0, %xmm0	# 0 = ch
	eor	v8.16b, v8.16b, v9.16b
						// vmovdqa	0x50(%r10), %xmm1	# 0 : sbet

	tbl	v4.16b, {v30.16b}, v2.16b	// vpshufb	%xmm2, %xmm4, %xmm4	# 4 = sbeu
	tbl	v12.16b, {v30.16b}, v10.16b
	tbl	v0.16b, {v0.16b},v5.16b		// vpshufb	%xmm5, %xmm0, %xmm0	# MC ch
	tbl	v8.16b, {v8.16b},v5.16b
	tbl	v1.16b, {v31.16b}, v3.16b	// vpshufb	%xmm3, %xmm1, %xmm1	# 0 = sbet
	tbl	v9.16b, {v31.16b}, v11.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4, %xmm0, %xmm0	# 4 = ch
	eor	v8.16b, v8.16b, v12.16b
	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr	$12, %xmm5, %xmm5, %xmm5
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1, %xmm0, %xmm0	# 0 = ch
	eor	v8.16b, v8.16b, v9.16b
	sub	w8, w8, #1			// sub	$1,%rax			# nr--

.Ldec_2x_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9, %xmm0, %xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4, %xmm0, %xmm0	# 1 = i
	and	v9.16b, v8.16b, v17.16b
	ushr	v8.16b, v8.16b, #4
	tbl	v2.16b, {v19.16b},v1.16b	// vpshufb	%xmm1, %xmm11, %xmm2	# 2 = a/k
	tbl	v10.16b, {v19.16b},v9.16b
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0, %xmm1, %xmm1	# 0 = j
	eor	v9.16b, v9.16b, v8.16b
	tbl	v3.16b, {v18.16b},v0.16b	// vpshufb	%xmm0, %xmm10, %xmm3	# 3 = 1/i
	tbl	v11.16b, {v18.16b},v8.16b
	tbl	v4.16b, {v18.16b},v1.16b	// vpshufb	%xmm1, %xmm10, %xmm4	# 4 = 1/j
	tbl	v12.16b, {v18.16b},v9.16b
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	eor	v11.16b, v11.16b, v10.16b
	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	eor	v12.16b, v12.16b, v10.16b
	tbl	v2.16b, {v18.16b},v3.16b	// vpshufb	%xmm3, %xmm10, %xmm2	# 2 = 1/iak
	tbl	v10.16b, {v18.16b},v11.16b
	tbl	v3.16b, {v18.16b},v4.16b	// vpshufb	%xmm4, %xmm10, %xmm3	# 3 = 1/jak
	tbl	v11.16b, {v18.16b},v12.16b
	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1, %xmm2, %xmm2	# 2 = io
	eor	v10.16b, v10.16b, v9.16b
	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0, %xmm3, %xmm3	# 3 = jo
	eor	v11.16b, v11.16b, v8.16b
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9), %xmm0
	cbnz	w8, .Ldec_2x_loop

	// middle of last round
						// vmovdqa	0x60(%r10), %xmm4	# 3 : sbou
	tbl	v4.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2, %xmm4, %xmm4	# 4 = sbou
	tbl	v12.16b, {v22.16b}, v10.16b
						// vmovdqa	0x70(%r10), %xmm1	# 0 : sbot
	tbl	v1.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3, %xmm1, %xmm1	# 0 = sb1t
	tbl	v9.16b, {v23.16b}, v11.16b
	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11), %xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm0, %xmm4, %xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	eor	v0.16b, v1.16b, v4.16b		// vpxor	%xmm4, %xmm1, %xmm0	# 0 = A
	eor	v8.16b, v9.16b, v12.16b
	tbl	v0.16b, {v0.16b},v2.16b		// vpshufb	%xmm2, %xmm0, %xmm0
	tbl	v1.16b, {v8.16b},v2.16b
	ret
.size	_vpaes_decrypt_2x,.-_vpaes_decrypt_2x
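
##
##  Note on the _2x helpers above: they duplicate the 1x dataflow for a
##  second block kept in v8-v13 (inputs in v14-v15, outputs in v0-v1)
##  to gain instruction-level parallelism.  Since v8-v15 are
##  callee-saved under AAPCS64, callers that reach the 2x paths spill
##  d8-d15; that is what the "ABI spec says so" stores below are for.
##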
	tbl	v1.16b, {v8.16b},v2.16b
	ret
.size	_vpaes_decrypt_2x,.-_vpaes_decrypt_2x
########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
.type	_vpaes_key_preheat,%function
.align	4
_vpaes_key_preheat:
	adr	x10, .Lk_inv
	movi	v16.16b, #0x5b			// .Lk_s63
	adr	x11, .Lk_sb1
	movi	v17.16b, #0x0f			// .Lk_s0F
	ld1	{v18.2d,v19.2d,v20.2d,v21.2d}, [x10]	// .Lk_inv, .Lk_ipt
	adr	x10, .Lk_dksd
	ld1	{v22.2d,v23.2d}, [x11]		// .Lk_sb1
	adr	x11, .Lk_mc_forward
	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64	// .Lk_dksd, .Lk_dksb
	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64	// .Lk_dkse, .Lk_dks9
	ld1	{v8.2d}, [x10]			// .Lk_rcon
	ld1	{v9.2d}, [x11]			// .Lk_mc_forward[0]
	ret
.size	_vpaes_key_preheat,.-_vpaes_key_preheat

.type	_vpaes_schedule_core,%function
.align	4
_vpaes_schedule_core:
	stp	x29, x30, [sp,#-16]!
	add	x29,sp,#0

	bl	_vpaes_key_preheat		// load the tables

	ld1	{v0.16b}, [x0],#16		// vmovdqu	(%rdi), %xmm0		# load key (unaligned)

	// input transform
	mov	v3.16b, v0.16b			// vmovdqa	%xmm0, %xmm3
	bl	_vpaes_schedule_transform
	mov	v7.16b, v0.16b			// vmovdqa	%xmm0, %xmm7

	adr	x10, .Lk_sr			// lea	.Lk_sr(%rip),%r10
	add	x8, x8, x10
	cbnz	w3, .Lschedule_am_decrypting

	// encrypting, output zeroth round key after transform
	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0, (%rdx)
	b	.Lschedule_go

.Lschedule_am_decrypting:
	// decrypting, output zeroth round key after shiftrows
	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10), %xmm1
	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1, %xmm3, %xmm3
	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3, (%rdx)
	eor	x8, x8, #0x30			// xor	$0x30, %r8

.Lschedule_go:
	cmp	w1, #192			// cmp	$192, %esi
	b.hi	.Lschedule_256
	b.eq	.Lschedule_192
	// 128: fall through

##
##  .schedule_128
##
##  128-bit specific part of key schedule.
##
##  This schedule is really simple, because all its parts
##  are accomplished by the subroutines.
##
.Lschedule_128:
	mov	x0, #10				// mov	$10, %esi

.Loop_schedule_128:
	sub	x0, x0, #1			// dec	%esi
	bl	_vpaes_schedule_round
	cbz	x0, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle		// write output
	b	.Loop_schedule_128

##
##  .aes_schedule_192
##
##  192-bit specific part of key schedule.
##
##  The main body of this schedule is the same as the 128-bit
##  schedule, but with more smearing.  The long, high side is
##  stored in %xmm7 as before, and the short, low side is in
##  the high bits of %xmm6.
##
##  This schedule is somewhat nastier, however, because each
##  round produces 192 bits of key material, or 1.5 round keys.
##  Therefore, on each cycle we do 2 rounds and produce 3 round
##  keys.
##
.align	4
.Lschedule_192:
	sub	x0, x0, #8
	ld1	{v0.16b}, [x0]			// vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
	bl	_vpaes_schedule_transform	// input transform
	mov	v6.16b, v0.16b			// vmovdqa	%xmm0, %xmm6		# save short part
	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4, %xmm4, %xmm4	# clear 4
	ins	v6.d[0], v4.d[0]		// vmovhlps	%xmm4, %xmm6, %xmm6	# clobber low side with zeros
	mov	x0, #4				// mov	$4, %esi

.Loop_schedule_192:
	sub	x0, x0, #1			// dec	%esi
	bl	_vpaes_schedule_round
	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr	$8,%xmm6,%xmm0,%xmm0
	bl	_vpaes_schedule_mangle		// save key n
	bl	_vpaes_schedule_192_smear
	bl	_vpaes_schedule_mangle		// save key n+1
	bl	_vpaes_schedule_round
	cbz	x0, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle		// save key n+2
	bl	_vpaes_schedule_192_smear
	b	.Loop_schedule_192

##
##  .aes_schedule_256
##
##  256-bit specific part of key schedule.
##
##  The structure here is very similar to the 128-bit
##  schedule, but with an additional "low side" in
##  %xmm6.  The low side's rounds are the same as the
##  high side's, except no rcon and no rotation.
##
.align	4
.Lschedule_256:
	ld1	{v0.16b}, [x0]			// vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
	bl	_vpaes_schedule_transform	// input transform
	mov	x0, #7				// mov	$7, %esi

.Loop_schedule_256:
	sub	x0, x0, #1			// dec	%esi
	bl	_vpaes_schedule_mangle		// output low result
	mov	v6.16b, v0.16b			// vmovdqa	%xmm0, %xmm6	# save cur_lo in xmm6

	// high round
	bl	_vpaes_schedule_round
	cbz	x0, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle

	// low round. swap xmm7 and xmm6
	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF, %xmm0, %xmm0
	movi	v4.16b, #0
	mov	v5.16b, v7.16b			// vmovdqa	%xmm7, %xmm5
	mov	v7.16b, v6.16b			// vmovdqa	%xmm6, %xmm7
	bl	_vpaes_schedule_low_round
	mov	v7.16b, v5.16b			// vmovdqa	%xmm5, %xmm7

	b	.Loop_schedule_256

##
##  .aes_schedule_mangle_last
##
##  Mangler for last round of key schedule
##  Mangles %xmm0
##    when encrypting, outputs out(%xmm0) ^ 63
##    when decrypting, outputs unskew(%xmm0)
##
##  Always called right before return... jumps to cleanup and exits
##
.align	4
.Lschedule_mangle_last:
	// schedule last round key from xmm0
	adr	x11, .Lk_deskew			// lea	.Lk_deskew(%rip),%r11	# prepare to deskew
	cbnz	w3, .Lschedule_mangle_last_dec

	// encrypting
	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),%xmm1
	adr	x11, .Lk_opt			// lea	.Lk_opt(%rip), %r11	# prepare to output transform
	add	x2, x2, #32			// add	$32, %rdx
	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1, %xmm0, %xmm0	# output permute

.Lschedule_mangle_last_dec:
	ld1	{v20.2d,v21.2d}, [x11]		// reload constants
	sub	x2, x2, #16			// add	$-16, %rdx
	eor	v0.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip), %xmm0, %xmm0
	bl	_vpaes_schedule_transform	// output transform
	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0, (%rdx)		# save last key

	// cleanup
	eor	v0.16b, v0.16b, v0.16b		// vpxor	%xmm0, %xmm0, %xmm0
	eor	v1.16b, v1.16b, v1.16b		// vpxor	%xmm1, %xmm1, %xmm1
	eor	v2.16b, v2.16b, v2.16b		// vpxor	%xmm2, %xmm2, %xmm2
	eor	v3.16b, v3.16b, v3.16b		// vpxor	%xmm3, %xmm3, %xmm3
	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4, %xmm4, %xmm4
	eor	v5.16b, v5.16b, v5.16b		// vpxor	%xmm5, %xmm5, %xmm5
	eor	v6.16b, v6.16b, v6.16b		// vpxor	%xmm6, %xmm6, %xmm6
	eor	v7.16b, v7.16b, v7.16b		// vpxor	%xmm7, %xmm7, %xmm7
	ldp	x29, x30, [sp],#16
	ret
.size	_vpaes_schedule_core,.-_vpaes_schedule_core

##
##  .aes_schedule_192_smear
##
##  Smear the short, low side in the 192-bit key schedule.
##
##  Inputs:
##    %xmm7: high side, b  a  x  y
##    %xmm6:  low side, d  c  0  0
##    %xmm13: 0
##
##  Outputs:
##    %xmm6: b+c+d  b+c  0  0
##    %xmm0: b+c+d  b+c  b  a
##
.type	_vpaes_schedule_192_smear,%function
.align	4
_vpaes_schedule_192_smear:
	movi	v1.16b, #0
	dup	v0.4s, v7.s[3]
	ins	v1.s[3], v6.s[2]		// vpshufd	$0x80, %xmm6, %xmm1	# d c 0 0 -> c 0 0 0
	ins	v0.s[0], v7.s[2]		// vpshufd	$0xFE, %xmm7, %xmm0	# b a _ _ -> b b b a
	eor	v6.16b, v6.16b, v1.16b		// vpxor	%xmm1, %xmm6, %xmm6	# -> c+d c 0 0
	eor	v1.16b, v1.16b, v1.16b		// vpxor	%xmm1, %xmm1, %xmm1
	eor	v6.16b, v6.16b, v0.16b		// vpxor	%xmm0, %xmm6, %xmm6	# -> b+c+d b+c b a
	mov	v0.16b, v6.16b			// vmovdqa	%xmm6, %xmm0
	ins	v6.d[0], v1.d[0]		// vmovhlps	%xmm1, %xmm6, %xmm6	# clobber low side with zeros
	ret
.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear

##
##  .aes_schedule_round
##
##  Runs one main round of the key schedule on %xmm0, %xmm7
##
##  Specifically, runs subbytes on the high dword of %xmm0
##  then rotates it by one byte and xors into the low dword of
##  %xmm7.
##
##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
##  next rcon.
##
##  Smears the dwords of %xmm7 by xoring the low into the
##  second low, result into third, result into highest.
##
##  Returns results in %xmm7 = %xmm0.
##  Clobbers %xmm1-%xmm4, %r11.
##
.type	_vpaes_schedule_round,%function
.align	4
_vpaes_schedule_round:
	// extract rcon from xmm8
	movi	v4.16b, #0			// vpxor	%xmm4, %xmm4, %xmm4
	ext	v1.16b, v8.16b, v4.16b, #15	// vpalignr	$15, %xmm8, %xmm4, %xmm1
	ext	v8.16b, v8.16b, v8.16b, #15	// vpalignr	$15, %xmm8, %xmm8, %xmm8
	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1, %xmm7, %xmm7

	// rotate
	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF, %xmm0, %xmm0
	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr	$1, %xmm0, %xmm0, %xmm0

	// fall through...

	// low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
	// smear xmm7
	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq	$4, %xmm7, %xmm1
	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1, %xmm7, %xmm7
	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq	$8, %xmm7, %xmm4

	// subbytes
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9, %xmm0, %xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4, %xmm0, %xmm0	# 1 = i
	eor	v7.16b, v7.16b, v4.16b		// vpxor	%xmm4, %xmm7, %xmm7
	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1, %xmm11, %xmm2	# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0, %xmm1, %xmm1	# 0 = j
	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, %xmm10, %xmm3	# 3 = 1/i
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1, %xmm10, %xmm4	# 4 = 1/j
	eor	v7.16b, v7.16b, v16.16b		// vpxor	.Lk_s63(%rip), %xmm7, %xmm7
	tbl	v3.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3, %xmm10, %xmm3	# 2 = 1/iak
	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	tbl	v2.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4, %xmm10, %xmm2	# 3 = 1/jak
	eor	v3.16b, v3.16b, v1.16b		// vpxor	%xmm1, %xmm3, %xmm3	# 2 = io
	eor	v2.16b, v2.16b, v0.16b		// vpxor	%xmm0, %xmm2, %xmm2	# 3 = jo
	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3, %xmm13, %xmm4	# 4 = sbou
	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2, %xmm12, %xmm1	# 0 = sb1t
	eor	v1.16b, v1.16b, v4.16b		// vpxor	%xmm4, %xmm1, %xmm1	# 0 = sbox output

	// add in smeared stuff
	eor	v0.16b, v1.16b, v7.16b		// vpxor	%xmm7, %xmm1, %xmm0
	eor	v7.16b, v1.16b, v7.16b		// vmovdqa	%xmm0, %xmm7
	ret
.size	_vpaes_schedule_round,.-_vpaes_schedule_round

##
##  .aes_schedule_transform
##
##  Linear-transform %xmm0 according to tables at (%r11)
##
##  Requires that %xmm9 = 0x0F0F... as in preheat
##  Output in %xmm0
##  Clobbers %xmm1, %xmm2
##
.type	_vpaes_schedule_transform,%function
.align	4
_vpaes_schedule_transform:
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9, %xmm0, %xmm1
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4, %xmm0, %xmm0
						// vmovdqa	(%r11), %xmm2	# lo
	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1, %xmm2, %xmm2
						// vmovdqa	16(%r11), %xmm1	# hi
	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0, %xmm1, %xmm0
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2, %xmm0, %xmm0
	ret
.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
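
##
##  In C-like terms, _vpaes_schedule_transform computes per byte
##  (illustrative names only):
##
##      out = lo[x & 0x0F] ^ hi[x >> 4]
##
##  where (lo, hi) = v20/v21, i.e. .Lk_ipt on input, or .Lk_opt /
##  .Lk_deskew after .Lschedule_mangle_last reloads them.
##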

##
##  .aes_schedule_mangle
##
##  Mangle xmm0 from (basis-transformed) standard version
##  to our version.
##
##  On encrypt,
##    xor with 0x63
##    multiply by circulant 0,1,1,1
##    apply shiftrows transform
##
##  On decrypt,
##    xor with 0x63
##    multiply by "inverse mixcolumns" circulant E,B,D,9
##    deskew
##    apply shiftrows transform
##
##
##  Writes out to (%rdx), and increments or decrements it
##  Keeps track of round number mod 4 in %r8
##  Preserves xmm0
##  Clobbers xmm1-xmm5
##
.type	_vpaes_schedule_mangle,%function
.align	4
_vpaes_schedule_mangle:
	mov	v4.16b, v0.16b			// vmovdqa	%xmm0, %xmm4	# save xmm0 for later
						// vmovdqa	.Lk_mc_forward(%rip),%xmm5
	cbnz	w3, .Lschedule_mangle_dec

	// encrypting
	eor	v4.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip), %xmm0, %xmm4
	add	x2, x2, #16			// add	$16, %rdx
	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5, %xmm4, %xmm4
	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5, %xmm4, %xmm1
	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb	%xmm5, %xmm1, %xmm3
	eor	v4.16b, v4.16b, v1.16b		// vpxor	%xmm1, %xmm4, %xmm4
	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10), %xmm1
	eor	v3.16b, v3.16b, v4.16b		// vpxor	%xmm4, %xmm3, %xmm3

	b	.Lschedule_mangle_both
.align	4
.Lschedule_mangle_dec:
	// inverse mix columns
						// lea	.Lk_dksd(%rip),%r11
	ushr	v1.16b, v4.16b, #4		// vpsrlb	$4, %xmm4, %xmm1	# 1 = hi
	and	v4.16b, v4.16b, v17.16b		// vpand	%xmm9, %xmm4, %xmm4	# 4 = lo

						// vmovdqa	0x00(%r11), %xmm2
	tbl	v2.16b, {v24.16b}, v4.16b	// vpshufb	%xmm4, %xmm2, %xmm2
						// vmovdqa	0x10(%r11), %xmm3
	tbl	v3.16b, {v25.16b}, v1.16b	// vpshufb	%xmm1, %xmm3, %xmm3
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2, %xmm3, %xmm3
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5, %xmm3, %xmm3

						// vmovdqa	0x20(%r11), %xmm2
	tbl	v2.16b, {v26.16b}, v4.16b	// vpshufb	%xmm4, %xmm2, %xmm2
	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3, %xmm2, %xmm2
						// vmovdqa	0x30(%r11), %xmm3
	tbl	v3.16b, {v27.16b}, v1.16b	// vpshufb	%xmm1, %xmm3, %xmm3
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2, %xmm3, %xmm3
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5, %xmm3, %xmm3

						// vmovdqa	0x40(%r11), %xmm2
	tbl	v2.16b, {v28.16b}, v4.16b	// vpshufb	%xmm4, %xmm2, %xmm2
	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3, %xmm2, %xmm2
						// vmovdqa	0x50(%r11), %xmm3
	tbl	v3.16b, {v29.16b}, v1.16b	// vpshufb	%xmm1, %xmm3, %xmm3
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2, %xmm3, %xmm3

						// vmovdqa	0x60(%r11), %xmm2
	tbl	v2.16b, {v30.16b}, v4.16b	// vpshufb	%xmm4, %xmm2, %xmm2
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5, %xmm3, %xmm3
						// vmovdqa	0x70(%r11), %xmm4
	tbl	v4.16b, {v31.16b}, v1.16b	// vpshufb	%xmm1, %xmm4, %xmm4
	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10), %xmm1
	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3, %xmm2, %xmm2
	eor	v3.16b, v4.16b, v2.16b		// vpxor	%xmm2, %xmm4, %xmm3

	sub	x2, x2, #16			// add	$-16, %rdx

.Lschedule_mangle_both:
	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1, %xmm3, %xmm3
	add	x8, x8, #64-16			// add	$-16, %r8
	and	x8, x8, #~(1<<6)		// and	$0x30, %r8
	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3, (%rdx)
	ret
.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
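
##
##  Round-count convention used by the key setters below: rounds is
##  stored as nbits/32 + 5, i.e. 9/11/13 for 128/192/256-bit keys.
##  That is the number of middle rounds the cores run; adding the
##  final round gives the usual 10/12/14 total, with 11/13/15 round
##  keys (including round 0) in the schedule.
##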

.globl	vpaes_set_encrypt_key
.type	vpaes_set_encrypt_key,%function
.align	4
vpaes_set_encrypt_key:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so

	lsr	w9, w1, #5		// shr	$5,%eax
	add	w9, w9, #5		// $5,%eax
	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;

	mov	w3, #0			// mov	$0,%ecx
	mov	x8, #0x30		// mov	$0x30,%r8d
	bl	_vpaes_schedule_core
	eor	x0, x0, x0

	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	ret
.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key

.globl	vpaes_set_decrypt_key
.type	vpaes_set_decrypt_key,%function
.align	4
vpaes_set_decrypt_key:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so

	lsr	w9, w1, #5		// shr	$5,%eax
	add	w9, w9, #5		// $5,%eax
	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
	lsl	w9, w9, #4		// shl	$4,%eax
	add	x2, x2, #16		// lea	16(%rdx,%rax),%rdx
	add	x2, x2, x9

	mov	w3, #1			// mov	$1,%ecx
	lsr	w8, w1, #1		// shr	$1,%r8d
	and	x8, x8, #32		// and	$32,%r8d
	eor	x8, x8, #32		// xor	$32,%r8d	# nbits==192?0:32
	bl	_vpaes_schedule_core

	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	ret
.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
.globl	vpaes_cbc_encrypt
.type	vpaes_cbc_encrypt,%function
.align	4
vpaes_cbc_encrypt:
	cbz	x2, .Lcbc_abort
	cmp	w5, #0			// check direction
	b.eq	vpaes_cbc_decrypt

	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	mov	x17, x2			// reassign
	mov	x2,  x3			// reassign

	ld1	{v0.16b}, [x4]		// load ivec
	bl	_vpaes_encrypt_preheat
	b	.Lcbc_enc_loop

.align	4
.Lcbc_enc_loop:
	ld1	{v7.16b}, [x0],#16	// load input
	eor	v7.16b, v7.16b, v0.16b	// xor with ivec
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [x1],#16	// save output
	subs	x17, x17, #16
	b.hi	.Lcbc_enc_loop

	st1	{v0.16b}, [x4]		// write ivec

	ldp	x29,x30,[sp],#16
.Lcbc_abort:
	ret
.size	vpaes_cbc_encrypt,.-vpaes_cbc_encrypt

.type	vpaes_cbc_decrypt,%function
.align	4
vpaes_cbc_decrypt:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	mov	x17, x2			// reassign
	mov	x2,  x3			// reassign
	ld1	{v6.16b}, [x4]		// load ivec
	bl	_vpaes_decrypt_preheat
	tst	x17, #16
	b.eq	.Lcbc_dec_loop2x

	ld1	{v7.16b}, [x0], #16	// load input
	bl	_vpaes_decrypt_core
	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
	orr	v6.16b, v7.16b, v7.16b	// next ivec value
	st1	{v0.16b}, [x1], #16
	subs	x17, x17, #16
	b.ls	.Lcbc_dec_done

.align	4
.Lcbc_dec_loop2x:
	ld1	{v14.16b,v15.16b}, [x0], #32
	bl	_vpaes_decrypt_2x
	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
	eor	v1.16b, v1.16b, v14.16b
	orr	v6.16b, v15.16b, v15.16b
	st1	{v0.16b,v1.16b}, [x1], #32
	subs	x17, x17, #32
	b.hi	.Lcbc_dec_loop2x

.Lcbc_dec_done:
	st1	{v6.16b}, [x4]

	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	ret
.size	vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
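
##
##  The ECB wrappers below follow the same pattern as CBC decrypt
##  above: a leading "tst x17, #16" peels one block through the 1x
##  core when the byte length is an odd multiple of 16, so the main
##  loop can always consume 32-byte pairs through the 2x core.
##  (Register roles, inferred from the code: x0 = in, x1 = out,
##  x2 = length, x3 = key.)
##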
.globl	vpaes_ecb_encrypt
.type	vpaes_ecb_encrypt,%function
.align	4
vpaes_ecb_encrypt:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	mov	x17, x2
	mov	x2,  x3
	bl	_vpaes_encrypt_preheat
	tst	x17, #16
	b.eq	.Lecb_enc_loop

	ld1	{v7.16b}, [x0],#16
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [x1],#16
	subs	x17, x17, #16
	b.ls	.Lecb_enc_done

.align	4
.Lecb_enc_loop:
	ld1	{v14.16b,v15.16b}, [x0], #32
	bl	_vpaes_encrypt_2x
	st1	{v0.16b,v1.16b}, [x1], #32
	subs	x17, x17, #32
	b.hi	.Lecb_enc_loop

.Lecb_enc_done:
	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	ret
.size	vpaes_ecb_encrypt,.-vpaes_ecb_encrypt

.globl	vpaes_ecb_decrypt
.type	vpaes_ecb_decrypt,%function
.align	4
vpaes_ecb_decrypt:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	mov	x17, x2
	mov	x2,  x3
	bl	_vpaes_decrypt_preheat
	tst	x17, #16
	b.eq	.Lecb_dec_loop

	ld1	{v7.16b}, [x0],#16
	bl	_vpaes_decrypt_core
	st1	{v0.16b}, [x1],#16
	subs	x17, x17, #16
	b.ls	.Lecb_dec_done

.align	4
.Lecb_dec_loop:
	ld1	{v14.16b,v15.16b}, [x0], #32
	bl	_vpaes_decrypt_2x
	st1	{v0.16b,v1.16b}, [x1], #32
	subs	x17, x17, #32
	b.hi	.Lecb_dec_loop

.Lecb_dec_done:
	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	ret
.size	vpaes_ecb_decrypt,.-vpaes_ecb_decrypt
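
##
##  Usage sketch from C (assuming OpenSSL-style prototypes for these
##  .globl symbols; the declarations are the caller's, not this file's):
##
##      int  vpaes_set_encrypt_key(const unsigned char *userKey,
##                                 int bits, AES_KEY *key);
##      void vpaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
##                             size_t length, const AES_KEY *key,
##                             unsigned char *ivec, int enc);
##
##      AES_KEY ks;
##      vpaes_set_encrypt_key(key128, 128, &ks);
##      vpaes_cbc_encrypt(in, out, 64, &ks, iv, 1);	// enc=1: encrypt
##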