1/* $NetBSD: aes_sse2_subr.c,v 1.4 2020/09/08 22:48:24 riastradh Exp $ */ 2 3/*- 4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 * POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29#include <sys/cdefs.h> 30__KERNEL_RCSID(1, "$NetBSD: aes_sse2_subr.c,v 1.4 2020/09/08 22:48:24 riastradh Exp $"); 31 32#ifdef _KERNEL 33#include <sys/systm.h> 34#include <lib/libkern/libkern.h> 35#else 36#include <err.h> 37#include <assert.h> 38#include <inttypes.h> 39#include <stdio.h> 40#include <string.h> 41#define KASSERT assert 42#define panic(fmt, args...) err(1, fmt, ##args) 43#endif 44 45#include <crypto/aes/aes.h> 46#include <crypto/aes/arch/x86/aes_sse2.h> 47 48#include "aes_sse2_impl.h" 49 50void 51aes_sse2_setkey(uint64_t rk[static 30], const void *key, uint32_t nrounds) 52{ 53 size_t key_len; 54 55 switch (nrounds) { 56 case 10: 57 key_len = 16; 58 break; 59 case 12: 60 key_len = 24; 61 break; 62 case 14: 63 key_len = 32; 64 break; 65 default: 66 panic("invalid AES nrounds: %u", nrounds); 67 } 68 69 aes_sse2_keysched(rk, key, key_len); 70} 71 72void 73aes_sse2_enc(const struct aesenc *enc, const uint8_t in[static 16], 74 uint8_t out[static 16], uint32_t nrounds) 75{ 76 uint64_t sk_exp[120]; 77 __m128i q[4]; 78 79 /* Expand round keys for bitslicing. */ 80 aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); 81 82 /* Load input block interleaved with garbage blocks. */ 83 q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(in)); 84 q[1] = q[2] = q[3] = _mm_setzero_si128(); 85 86 /* Transform to bitslice, decrypt, transform from bitslice. */ 87 aes_sse2_ortho(q); 88 aes_sse2_bitslice_encrypt(nrounds, sk_exp, q); 89 aes_sse2_ortho(q); 90 91 /* Store output block. */ 92 _mm_storeu_epi8(out, aes_sse2_interleave_out(q[0])); 93 94 /* Paranoia: Zero temporary buffers. */ 95 explicit_memset(sk_exp, 0, sizeof sk_exp); 96 explicit_memset(q, 0, sizeof q); 97} 98 99void 100aes_sse2_dec(const struct aesdec *dec, const uint8_t in[static 16], 101 uint8_t out[static 16], uint32_t nrounds) 102{ 103 uint64_t sk_exp[120]; 104 __m128i q[4]; 105 106 /* Expand round keys for bitslicing. */ 107 aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64); 108 109 /* Load input block interleaved with garbage blocks. */ 110 q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(in)); 111 q[1] = q[2] = q[3] = _mm_setzero_si128(); 112 113 /* Transform to bitslice, decrypt, transform from bitslice. */ 114 aes_sse2_ortho(q); 115 aes_sse2_bitslice_decrypt(nrounds, sk_exp, q); 116 aes_sse2_ortho(q); 117 118 /* Store output block. */ 119 _mm_storeu_epi8(out, aes_sse2_interleave_out(q[0])); 120 121 /* Paranoia: Zero temporary buffers. */ 122 explicit_memset(sk_exp, 0, sizeof sk_exp); 123 explicit_memset(q, 0, sizeof q); 124} 125 126void 127aes_sse2_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16], 128 uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], 129 uint32_t nrounds) 130{ 131 uint64_t sk_exp[120]; 132 __m128i q[4]; 133 __m128i cv; 134 135 KASSERT(nbytes); 136 KASSERT(nbytes % 16 == 0); 137 138 /* Expand round keys for bitslicing. */ 139 aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); 140 141 /* Load the IV. */ 142 cv = _mm_loadu_epi8(iv); 143 144 for (; nbytes; nbytes -= 16, in += 16, out += 16) { 145 /* Load input block and apply CV. */ 146 q[0] = aes_sse2_interleave_in(cv ^ _mm_loadu_epi8(in)); 147 148 /* Transform to bitslice, encrypt, transform from bitslice. */ 149 aes_sse2_ortho(q); 150 aes_sse2_bitslice_encrypt(nrounds, sk_exp, q); 151 aes_sse2_ortho(q); 152 153 /* Remember ciphertext as CV and store output block. */ 154 cv = aes_sse2_interleave_out(q[0]); 155 _mm_storeu_epi8(out, cv); 156 } 157 158 /* Store updated IV. */ 159 _mm_storeu_epi8(iv, cv); 160 161 /* Paranoia: Zero temporary buffers. */ 162 explicit_memset(sk_exp, 0, sizeof sk_exp); 163 explicit_memset(q, 0, sizeof q); 164} 165 166void 167aes_sse2_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16], 168 uint8_t out[static 16], size_t nbytes, uint8_t ivp[static 16], 169 uint32_t nrounds) 170{ 171 uint64_t sk_exp[120]; 172 __m128i q[4]; 173 __m128i cv, iv, w; 174 175 KASSERT(nbytes); 176 KASSERT(nbytes % 16 == 0); 177 178 /* Expand round keys for bitslicing. */ 179 aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64); 180 181 /* Load the IV. */ 182 iv = _mm_loadu_epi8(ivp); 183 184 /* Load the last cipher block. */ 185 cv = _mm_loadu_epi8(in + nbytes - 16); 186 187 /* Store the updated IV. */ 188 _mm_storeu_epi8(ivp, cv); 189 190 /* Process the last blocks if not an even multiple of four. */ 191 if (nbytes % (4*16)) { 192 unsigned n = (nbytes/16) % 4; 193 194 KASSERT(n > 0); 195 KASSERT(n < 4); 196 197 q[1] = q[2] = q[3] = _mm_setzero_si128(); 198 q[n - 1] = aes_sse2_interleave_in(cv); 199 switch (nbytes % 64) { 200 case 48: 201 w = _mm_loadu_epi8(in + nbytes - 32); 202 q[1] = aes_sse2_interleave_in(w); 203 w = _mm_loadu_epi8(in + nbytes - 48); 204 q[0] = aes_sse2_interleave_in(w); 205 break; 206 case 32: 207 w = _mm_loadu_epi8(in + nbytes - 32); 208 q[0] = aes_sse2_interleave_in(w); 209 break; 210 case 16: 211 break; 212 } 213 214 /* Decrypt. */ 215 aes_sse2_ortho(q); 216 aes_sse2_bitslice_decrypt(nrounds, sk_exp, q); 217 aes_sse2_ortho(q); 218 219 do { 220 n--; 221 w = aes_sse2_interleave_out(q[n]); 222 if ((nbytes -= 16) == 0) 223 goto out; 224 cv = _mm_loadu_epi8(in + nbytes - 16); 225 _mm_storeu_epi8(out + nbytes, w ^ cv); 226 } while (n); 227 } 228 229 for (;;) { 230 KASSERT(nbytes >= 64); 231 nbytes -= 64; 232 233 /* 234 * 1. Set up upper cipher block from cv. 235 * 2. Load lower cipher block into cv and set it up. 236 * 3. Decrypt. 237 */ 238 q[3] = aes_sse2_interleave_in(cv); 239 240 w = _mm_loadu_epi8(in + nbytes + 4*8); 241 q[2] = aes_sse2_interleave_in(w); 242 243 w = _mm_loadu_epi8(in + nbytes + 4*4); 244 q[1] = aes_sse2_interleave_in(w); 245 246 w = _mm_loadu_epi8(in + nbytes + 4*0); 247 q[0] = aes_sse2_interleave_in(w); 248 249 aes_sse2_ortho(q); 250 aes_sse2_bitslice_decrypt(nrounds, sk_exp, q); 251 aes_sse2_ortho(q); 252 253 /* Store the upper output block. */ 254 w = aes_sse2_interleave_out(q[3]); 255 cv = _mm_loadu_epi8(in + nbytes + 4*8); 256 _mm_storeu_epi8(out + nbytes + 4*12, w ^ cv); 257 258 /* Store the middle output blocks. */ 259 w = aes_sse2_interleave_out(q[2]); 260 cv = _mm_loadu_epi8(in + nbytes + 4*4); 261 _mm_storeu_epi8(out + nbytes + 4*8, w ^ cv); 262 263 w = aes_sse2_interleave_out(q[1]); 264 cv = _mm_loadu_epi8(in + nbytes + 4*0); 265 _mm_storeu_epi8(out + nbytes + 4*4, w ^ cv); 266 267 /* 268 * Get the first output block, but don't load the CV 269 * yet -- it might be the previous ciphertext block, or 270 * it might be the IV. 271 */ 272 w = aes_sse2_interleave_out(q[0]); 273 274 /* Stop if we've reached the first output block. */ 275 if (nbytes == 0) 276 goto out; 277 278 /* 279 * Load the preceding cipher block, and apply it as the 280 * chaining value to this one. 281 */ 282 cv = _mm_loadu_epi8(in + nbytes - 16); 283 _mm_storeu_epi8(out + nbytes, w ^ cv); 284 } 285 286out: /* Store the first output block. */ 287 _mm_storeu_epi8(out, w ^ iv); 288 289 /* Paranoia: Zero temporary buffers. */ 290 explicit_memset(sk_exp, 0, sizeof sk_exp); 291 explicit_memset(q, 0, sizeof q); 292} 293 294static inline __m128i 295aes_sse2_xts_update(__m128i t) 296{ 297 const __m128i one = _mm_set_epi64x(1, 1); 298 __m128i s, m, c; 299 300 s = _mm_srli_epi64(t, 63); /* 1 if high bit set else 0 */ 301 m = _mm_sub_epi64(s, one); /* 0 if high bit set else -1 */ 302 m = _mm_shuffle_epi32(m, 0x4e); /* swap halves */ 303 c = _mm_set_epi64x(1, 0x87); /* carry */ 304 305 return _mm_slli_epi64(t, 1) ^ (c & ~m); 306} 307 308static int 309aes_sse2_xts_update_selftest(void) 310{ 311 static const struct { 312 uint32_t in[4], out[4]; 313 } cases[] = { 314 [0] = { {1}, {2} }, 315 [1] = { {0x80000000U,0,0,0}, {0,1,0,0} }, 316 [2] = { {0,0x80000000U,0,0}, {0,0,1,0} }, 317 [3] = { {0,0,0x80000000U,0}, {0,0,0,1} }, 318 [4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} }, 319 [5] = { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} }, 320 }; 321 unsigned i; 322 uint32_t t[4]; 323 int result = 0; 324 325 for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) { 326 t[0] = cases[i].in[0]; 327 t[1] = cases[i].in[1]; 328 t[2] = cases[i].in[2]; 329 t[3] = cases[i].in[3]; 330 _mm_storeu_epi8(t, aes_sse2_xts_update(_mm_loadu_epi8(t))); 331 if (t[0] != cases[i].out[0] || 332 t[1] != cases[i].out[1] || 333 t[2] != cases[i].out[2] || 334 t[3] != cases[i].out[3]) { 335 printf("%s %u:" 336 " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n", 337 __func__, i, t[0], t[1], t[2], t[3]); 338 result = -1; 339 } 340 } 341 342 return result; 343} 344 345void 346aes_sse2_xts_enc(const struct aesenc *enc, const uint8_t in[static 16], 347 uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], 348 uint32_t nrounds) 349{ 350 uint64_t sk_exp[120]; 351 __m128i q[4]; 352 __m128i w; 353 __m128i t[5]; 354 unsigned i; 355 356 KASSERT(nbytes); 357 KASSERT(nbytes % 16 == 0); 358 359 /* Expand round keys for bitslicing. */ 360 aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); 361 362 /* Load tweak. */ 363 t[0] = _mm_loadu_epi8(tweak); 364 365 /* Handle the first block separately if odd number. */ 366 if (nbytes % (4*16)) { 367 /* Load up the tweaked inputs. */ 368 for (i = 0; i < (nbytes/16) % 4; i++) { 369 w = _mm_loadu_epi8(in + 16*i) ^ t[i]; 370 q[i] = aes_sse2_interleave_in(w); 371 t[i + 1] = aes_sse2_xts_update(t[i]); 372 } 373 for (; i < 4; i++) 374 q[i] = _mm_setzero_si128(); 375 376 /* Encrypt up to four blocks. */ 377 aes_sse2_ortho(q); 378 aes_sse2_bitslice_encrypt(nrounds, sk_exp, q); 379 aes_sse2_ortho(q); 380 381 /* Store the tweaked outputs. */ 382 for (i = 0; i < (nbytes/16) % 4; i++) { 383 w = aes_sse2_interleave_out(q[i]); 384 _mm_storeu_epi8(out + 16*i, w ^ t[i]); 385 } 386 387 /* Advance to the next block. */ 388 t[0] = t[i]; 389 in += nbytes % (4*16); 390 out += nbytes % (4*16); 391 nbytes -= nbytes % (4*16); 392 if (nbytes == 0) 393 goto out; 394 } 395 396 do { 397 KASSERT(nbytes % 64 == 0); 398 KASSERT(nbytes >= 64); 399 400 /* Load up the tweaked inputs. */ 401 for (i = 0; i < 4; i++) { 402 w = _mm_loadu_epi8(in + 16*i) ^ t[i]; 403 q[i] = aes_sse2_interleave_in(w); 404 t[i + 1] = aes_sse2_xts_update(t[i]); 405 } 406 407 /* Encrypt four blocks. */ 408 aes_sse2_ortho(q); 409 aes_sse2_bitslice_encrypt(nrounds, sk_exp, q); 410 aes_sse2_ortho(q); 411 412 /* Store the tweaked outputs. */ 413 for (i = 0; i < 4; i++) { 414 w = aes_sse2_interleave_out(q[i]); 415 _mm_storeu_epi8(out + 16*i, w ^ t[i]); 416 } 417 418 /* Advance to the next block. */ 419 t[0] = t[4]; 420 in += 64; 421 out += 64; 422 nbytes -= 64; 423 } while (nbytes); 424 425out: /* Store the updated tweak. */ 426 _mm_storeu_epi8(tweak, t[0]); 427 428 /* Paranoia: Zero temporary buffers. */ 429 explicit_memset(sk_exp, 0, sizeof sk_exp); 430 explicit_memset(q, 0, sizeof q); 431 explicit_memset(t, 0, sizeof t); 432} 433 434void 435aes_sse2_xts_dec(const struct aesdec *dec, const uint8_t in[static 16], 436 uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], 437 uint32_t nrounds) 438{ 439 uint64_t sk_exp[120]; 440 __m128i q[4]; 441 __m128i w; 442 __m128i t[5]; 443 unsigned i; 444 445 KASSERT(nbytes); 446 KASSERT(nbytes % 16 == 0); 447 448 /* Expand round keys for bitslicing. */ 449 aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64); 450 451 /* Load tweak. */ 452 t[0] = _mm_loadu_epi8(tweak); 453 454 /* Handle the first block separately if odd number. */ 455 if (nbytes % (4*16)) { 456 /* Load up the tweaked inputs. */ 457 for (i = 0; i < (nbytes/16) % 4; i++) { 458 w = _mm_loadu_epi8(in + 16*i) ^ t[i]; 459 q[i] = aes_sse2_interleave_in(w); 460 t[i + 1] = aes_sse2_xts_update(t[i]); 461 } 462 for (; i < 4; i++) 463 q[i] = _mm_setzero_si128(); 464 465 /* Decrypt up to four blocks. */ 466 aes_sse2_ortho(q); 467 aes_sse2_bitslice_decrypt(nrounds, sk_exp, q); 468 aes_sse2_ortho(q); 469 470 /* Store the tweaked outputs. */ 471 for (i = 0; i < (nbytes/16) % 4; i++) { 472 w = aes_sse2_interleave_out(q[i]); 473 _mm_storeu_epi8(out + 16*i, w ^ t[i]); 474 } 475 476 /* Advance to the next block. */ 477 t[0] = t[i]; 478 in += nbytes % (4*16); 479 out += nbytes % (4*16); 480 nbytes -= nbytes % (4*16); 481 if (nbytes == 0) 482 goto out; 483 } 484 485 do { 486 KASSERT(nbytes % 64 == 0); 487 KASSERT(nbytes >= 64); 488 489 /* Load up the tweaked inputs. */ 490 for (i = 0; i < 4; i++) { 491 w = _mm_loadu_epi8(in + 16*i) ^ t[i]; 492 q[i] = aes_sse2_interleave_in(w); 493 t[i + 1] = aes_sse2_xts_update(t[i]); 494 } 495 496 /* Decrypt four blocks. */ 497 aes_sse2_ortho(q); 498 aes_sse2_bitslice_decrypt(nrounds, sk_exp, q); 499 aes_sse2_ortho(q); 500 501 /* Store the tweaked outputs. */ 502 for (i = 0; i < 4; i++) { 503 w = aes_sse2_interleave_out(q[i]); 504 _mm_storeu_epi8(out + 16*i, w ^ t[i]); 505 } 506 507 /* Advance to the next block. */ 508 t[0] = t[4]; 509 in += 64; 510 out += 64; 511 nbytes -= 64; 512 } while (nbytes); 513 514out: /* Store the updated tweak. */ 515 _mm_storeu_epi8(tweak, t[0]); 516 517 /* Paranoia: Zero temporary buffers. */ 518 explicit_memset(sk_exp, 0, sizeof sk_exp); 519 explicit_memset(q, 0, sizeof q); 520 explicit_memset(t, 0, sizeof t); 521} 522 523void 524aes_sse2_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16], 525 size_t nbytes, uint8_t auth[static 16], uint32_t nrounds) 526{ 527 uint64_t sk_exp[120]; 528 __m128i q[4]; 529 530 KASSERT(nbytes); 531 KASSERT(nbytes % 16 == 0); 532 533 /* Expand round keys for bitslicing. */ 534 aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); 535 536 /* Load initial authenticator. */ 537 q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(auth)); 538 539 for (; nbytes; nbytes -= 16, in += 16) { 540 q[0] ^= aes_sse2_interleave_in(_mm_loadu_epi8(in)); 541 aes_sse2_ortho(q); 542 aes_sse2_bitslice_encrypt(nrounds, sk_exp, q); 543 aes_sse2_ortho(q); 544 } 545 546 /* Store updated authenticator. */ 547 _mm_storeu_epi8(auth, aes_sse2_interleave_out(q[0])); 548 549 /* Paranoia: Zero temporary buffers. */ 550 explicit_memset(sk_exp, 0, sizeof sk_exp); 551 explicit_memset(q, 0, sizeof q); 552} 553 554void 555aes_sse2_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16], 556 uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32], 557 uint32_t nrounds) 558{ 559 uint64_t sk_exp[120]; 560 __m128i q[4]; 561 __m128i ctr; 562 uint32_t c0, c1, c2, c3; 563 564 KASSERT(nbytes); 565 KASSERT(nbytes % 16 == 0); 566 567 /* Expand round keys for bitslicing. */ 568 aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); 569 570 /* Set first block to authenticator. */ 571 q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(authctr)); 572 573 /* Load initial counter block, big-endian so we can increment it. */ 574 c0 = le32dec(authctr + 16 + 4*0); 575 c1 = le32dec(authctr + 16 + 4*1); 576 c2 = le32dec(authctr + 16 + 4*2); 577 c3 = be32dec(authctr + 16 + 4*3); 578 579 /* Set other blocks to garbage -- can't take advantage. */ 580 q[2] = q[3] = _mm_setzero_si128(); 581 582 for (; nbytes; nbytes -= 16, in += 16, out += 16) { 583 /* Update authenticator. */ 584 q[0] ^= aes_sse2_interleave_in(_mm_loadu_epi8(in)); 585 586 /* Increment 32-bit counter. */ 587 ctr = _mm_set_epi32(bswap32(++c3), c2, c1, c0); 588 q[1] = aes_sse2_interleave_in(ctr); 589 590 /* Encrypt authenticator and counter. */ 591 aes_sse2_ortho(q); 592 aes_sse2_bitslice_encrypt(nrounds, sk_exp, q); 593 aes_sse2_ortho(q); 594 595 /* Encrypt with CTR output. */ 596 _mm_storeu_epi8(out, 597 _mm_loadu_epi8(in) ^ aes_sse2_interleave_out(q[1])); 598 } 599 600 /* Update authenticator. */ 601 _mm_storeu_epi8(authctr, aes_sse2_interleave_out(q[0])); 602 603 /* Update counter. */ 604 be32enc(authctr + 16 + 4*3, c3); 605 606 /* Paranoia: Zero temporary buffers. */ 607 explicit_memset(sk_exp, 0, sizeof sk_exp); 608 explicit_memset(q, 0, sizeof q); 609} 610 611void 612aes_sse2_ccm_dec1(const struct aesenc *enc, const uint8_t in[static 16], 613 uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32], 614 uint32_t nrounds) 615{ 616 uint64_t sk_exp[120]; 617 __m128i q[4]; 618 __m128i ctr, block; 619 uint32_t c0, c1, c2, c3; 620 621 KASSERT(nbytes); 622 KASSERT(nbytes % 16 == 0); 623 624 /* Expand round keys for bitslicing. */ 625 aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); 626 627 /* Load initial counter block, big-endian so we can increment it. */ 628 c0 = le32dec(authctr + 16 + 4*0); 629 c1 = le32dec(authctr + 16 + 4*1); 630 c2 = le32dec(authctr + 16 + 4*2); 631 c3 = be32dec(authctr + 16 + 4*3); 632 633 /* Increment 32-bit counter. */ 634 ctr = _mm_set_epi32(bswap32(++c3), c2, c1, c0); 635 q[0] = aes_sse2_interleave_in(ctr); 636 637 /* 638 * Set the other blocks to garbage -- we don't have any 639 * plaintext to authenticate yet. 640 */ 641 q[1] = q[2] = q[3] = _mm_setzero_si128(); 642 643 /* Encrypt first CTR. */ 644 aes_sse2_ortho(q); 645 aes_sse2_bitslice_encrypt(nrounds, sk_exp, q); 646 aes_sse2_ortho(q); 647 648 /* Load the initial authenticator. */ 649 q[1] = aes_sse2_interleave_in(_mm_loadu_epi8(authctr)); 650 651 for (;; in += 16, out += 16) { 652 /* Decrypt the block. */ 653 block = _mm_loadu_epi8(in) ^ aes_sse2_interleave_out(q[0]); 654 655 /* Update authenticator. */ 656 q[1] ^= aes_sse2_interleave_in(block); 657 658 /* Store plaintext. */ 659 _mm_storeu_epi8(out, block); 660 661 /* If this is the last block, stop. */ 662 if ((nbytes -= 16) == 0) 663 break; 664 665 /* Increment 32-bit counter. */ 666 ctr = _mm_set_epi32(bswap32(++c3), c2, c1, c0); 667 q[0] = aes_sse2_interleave_in(ctr); 668 669 /* Authenticate previous plaintext, encrypt next CTR. */ 670 aes_sse2_ortho(q); 671 aes_sse2_bitslice_encrypt(nrounds, sk_exp, q); 672 aes_sse2_ortho(q); 673 } 674 675 /* 676 * Authenticate last plaintext. We're only doing this for the 677 * authenticator, not for the counter, so don't bother to 678 * initialize q[0], q[2], q[3]. (Even for the sake of 679 * sanitizers, they're already initialized to something by 680 * now.) 681 */ 682 aes_sse2_ortho(q); 683 aes_sse2_bitslice_encrypt(nrounds, sk_exp, q); 684 aes_sse2_ortho(q); 685 686 /* Update authenticator. */ 687 _mm_storeu_epi8(authctr, aes_sse2_interleave_out(q[1])); 688 689 /* Update counter. */ 690 be32enc(authctr + 16 + 4*3, c3); 691 692 /* Paranoia: Zero temporary buffers. */ 693 explicit_memset(sk_exp, 0, sizeof sk_exp); 694 explicit_memset(q, 0, sizeof q); 695} 696 697int 698aes_sse2_selftest(void) 699{ 700 701 if (aes_sse2_xts_update_selftest()) 702 return -1; 703 704 /* XXX test aes_sse2_bitslice_decrypt */ 705 /* XXX test aes_sse2_bitslice_encrypt */ 706 /* XXX test aes_sse2_keysched */ 707 /* XXX test aes_sse2_ortho */ 708 /* XXX test aes_sse2_skey_expand */ 709 710 return 0; 711} 712