/* chacha_sse2.c, repository revision 1.1 */
1/* $NetBSD: chacha_sse2.c,v 1.1 2020/07/25 22:49:20 riastradh Exp $ */ 2 3/*- 4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 * POSSIBILITY OF SUCH DAMAGE. 
27 */ 28 29#include <sys/types.h> 30#include <sys/endian.h> 31 32#include "immintrin.h" 33 34#include "chacha_sse2.h" 35 36static inline __m128i 37rol32(__m128i x, uint8_t n) 38{ 39 40 return _mm_slli_epi32(x, n) | _mm_srli_epi32(x, 32 - n); 41} 42 43static inline void 44chacha_permute(__m128i *p0, __m128i *p1, __m128i *p2, __m128i *p3, 45 unsigned nr) 46{ 47 __m128i r0, r1, r2, r3; 48 __m128i c0, c1, c2, c3; 49 50 r0 = *p0; 51 r1 = *p1; 52 r2 = *p2; 53 r3 = *p3; 54 55 for (; nr > 0; nr -= 2) { 56 r0 = _mm_add_epi32(r0, r1); r3 ^= r0; r3 = rol32(r3, 16); 57 r2 = _mm_add_epi32(r2, r3); r1 ^= r2; r1 = rol32(r1, 12); 58 r0 = _mm_add_epi32(r0, r1); r3 ^= r0; r3 = rol32(r3, 8); 59 r2 = _mm_add_epi32(r2, r3); r1 ^= r2; r1 = rol32(r1, 7); 60 61 c0 = r0; 62 c1 = _mm_shuffle_epi32(r1, 0x39); 63 c2 = _mm_shuffle_epi32(r2, 0x4e); 64 c3 = _mm_shuffle_epi32(r3, 0x93); 65 66 c0 = _mm_add_epi32(c0, c1); c3 ^= c0; c3 = rol32(c3, 16); 67 c2 = _mm_add_epi32(c2, c3); c1 ^= c2; c1 = rol32(c1, 12); 68 c0 = _mm_add_epi32(c0, c1); c3 ^= c0; c3 = rol32(c3, 8); 69 c2 = _mm_add_epi32(c2, c3); c1 ^= c2; c1 = rol32(c1, 7); 70 71 r0 = c0; 72 r1 = _mm_shuffle_epi32(c1, 0x93); 73 r2 = _mm_shuffle_epi32(c2, 0x4e); 74 r3 = _mm_shuffle_epi32(c3, 0x39); 75 } 76 77 *p0 = r0; 78 *p1 = r1; 79 *p2 = r2; 80 *p3 = r3; 81} 82 83void 84chacha_core_sse2(uint8_t out[restrict static 64], 85 const uint8_t in[static 16], 86 const uint8_t k[static 32], 87 const uint8_t c[static 16], 88 unsigned nr) 89{ 90 __m128i in0, in1, in2, in3; 91 __m128i r0, r1, r2, r3; 92 93 r0 = in0 = _mm_loadu_si128((const __m128i *)c); 94 r1 = in1 = _mm_loadu_si128((const __m128i *)k); 95 r2 = in2 = _mm_loadu_si128((const __m128i *)k + 1); 96 r3 = in3 = _mm_loadu_si128((const __m128i *)in); 97 98 chacha_permute(&r0, &r1, &r2, &r3, nr); 99 100 _mm_storeu_si128((__m128i *)out + 0, _mm_add_epi32(r0, in0)); 101 _mm_storeu_si128((__m128i *)out + 1, _mm_add_epi32(r1, in1)); 102 _mm_storeu_si128((__m128i *)out + 2, _mm_add_epi32(r2, in2)); 103 
_mm_storeu_si128((__m128i *)out + 3, _mm_add_epi32(r3, in3)); 104} 105 106void 107hchacha_sse2(uint8_t out[restrict static 32], 108 const uint8_t in[static 16], 109 const uint8_t k[static 32], 110 const uint8_t c[static 16], 111 unsigned nr) 112{ 113 __m128i r0, r1, r2, r3; 114 115 r0 = _mm_loadu_si128((const __m128i *)c); 116 r1 = _mm_loadu_si128((const __m128i *)k); 117 r2 = _mm_loadu_si128((const __m128i *)k + 1); 118 r3 = _mm_loadu_si128((const __m128i *)in); 119 120 chacha_permute(&r0, &r1, &r2, &r3, nr); 121 122 _mm_storeu_si128((__m128i *)out + 0, r0); 123 _mm_storeu_si128((__m128i *)out + 1, r3); 124} 125 126#define CHACHA_QUARTERROUND(a, b, c, d) do \ 127{ \ 128 (a) = _mm_add_epi32((a), (b)); (d) ^= a; (d) = rol32((d), 16); \ 129 (c) = _mm_add_epi32((c), (d)); (b) ^= c; (b) = rol32((b), 12); \ 130 (a) = _mm_add_epi32((a), (b)); (d) ^= a; (d) = rol32((d), 8); \ 131 (c) = _mm_add_epi32((c), (d)); (b) ^= c; (b) = rol32((b), 7); \ 132} while (/*CONSTCOND*/0) 133 134static inline __m128i 135load1_epi32(const void *p) 136{ 137 return (__m128i)_mm_load1_ps(p); 138} 139 140static inline __m128i 141loadu_epi32(const void *p) 142{ 143 return _mm_loadu_si128(p); 144} 145 146static inline void 147storeu_epi32(void *p, __m128i v) 148{ 149 return _mm_storeu_si128(p, v); 150} 151 152static inline __m128i 153unpack0_epi32(__m128i a, __m128i b, __m128i c, __m128i d) 154{ 155 __m128 lo = (__m128)_mm_unpacklo_epi32(a, b); /* (a[0], b[0], ...) */ 156 __m128 hi = (__m128)_mm_unpacklo_epi32(c, d); /* (c[0], d[0], ...) 
*/ 157 158 /* (lo[0]=a[0], lo[1]=b[0], hi[0]=c[0], hi[1]=d[0]) */ 159 return (__m128i)_mm_movelh_ps(lo, hi); 160} 161 162static inline __m128i 163unpack1_epi32(__m128i a, __m128i b, __m128i c, __m128i d) 164{ 165 __m128 lo = (__m128)_mm_unpacklo_epi32(a, b); /* (..., a[1], b[1]) */ 166 __m128 hi = (__m128)_mm_unpacklo_epi32(c, d); /* (..., c[1], d[1]) */ 167 168 /* (lo[2]=a[1], lo[3]=b[1], hi[2]=c[1], hi[3]=d[1]) */ 169 return (__m128i)_mm_movehl_ps(hi, lo); 170} 171 172static inline __m128i 173unpack2_epi32(__m128i a, __m128i b, __m128i c, __m128i d) 174{ 175 __m128 lo = (__m128)_mm_unpackhi_epi32(a, b); /* (a[2], b[2], ...) */ 176 __m128 hi = (__m128)_mm_unpackhi_epi32(c, d); /* (c[2], d[2], ...) */ 177 178 /* (lo[0]=a[2], lo[1]=b[2], hi[0]=c[2], hi[1]=d[2]) */ 179 return (__m128i)_mm_movelh_ps(lo, hi); 180} 181 182static inline __m128i 183unpack3_epi32(__m128i a, __m128i b, __m128i c, __m128i d) 184{ 185 __m128 lo = (__m128)_mm_unpackhi_epi32(a, b); /* (..., a[3], b[3]) */ 186 __m128 hi = (__m128)_mm_unpackhi_epi32(c, d); /* (..., c[3], d[3]) */ 187 188 /* (lo[2]=a[3], lo[3]=b[3], hi[2]=c[3], hi[3]=d[3]) */ 189 return (__m128i)_mm_movehl_ps(hi, lo); 190} 191 192void 193chacha_stream_sse2(uint8_t *restrict s, size_t n, 194 uint32_t blkno, 195 const uint8_t nonce[static 12], 196 const uint8_t k[static 32], 197 unsigned nr) 198{ 199 __m128i x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; 200 __m128i y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15; 201 __m128i z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15; 202 unsigned r; 203 204 if (n < 256) 205 goto out; 206 207 x0 = load1_epi32(chacha_const32 + 0); 208 x1 = load1_epi32(chacha_const32 + 4); 209 x2 = load1_epi32(chacha_const32 + 8); 210 x3 = load1_epi32(chacha_const32 + 12); 211 x4 = load1_epi32(k + 0); 212 x5 = load1_epi32(k + 4); 213 x6 = load1_epi32(k + 8); 214 x7 = load1_epi32(k + 12); 215 x8 = load1_epi32(k + 16); 216 x9 = load1_epi32(k + 20); 217 x10 = load1_epi32(k + 24); 218 x11 = 
load1_epi32(k + 28); 219 /* x12 set in the loop */ 220 x13 = load1_epi32(nonce + 0); 221 x14 = load1_epi32(nonce + 4); 222 x15 = load1_epi32(nonce + 8); 223 224 for (; n >= 256; s += 256, n -= 256, blkno += 4) { 225 x12 = _mm_add_epi32(_mm_set1_epi32(blkno), 226 _mm_set_epi32(3,2,1,0)); 227 y0 = x0; 228 y1 = x1; 229 y2 = x2; 230 y3 = x3; 231 y4 = x4; 232 y5 = x5; 233 y6 = x6; 234 y7 = x7; 235 y8 = x8; 236 y9 = x9; 237 y10 = x10; 238 y11 = x11; 239 y12 = x12; 240 y13 = x13; 241 y14 = x14; 242 y15 = x15; 243 for (r = nr; r > 0; r -= 2) { 244 CHACHA_QUARTERROUND( y0, y4, y8,y12); 245 CHACHA_QUARTERROUND( y1, y5, y9,y13); 246 CHACHA_QUARTERROUND( y2, y6,y10,y14); 247 CHACHA_QUARTERROUND( y3, y7,y11,y15); 248 CHACHA_QUARTERROUND( y0, y5,y10,y15); 249 CHACHA_QUARTERROUND( y1, y6,y11,y12); 250 CHACHA_QUARTERROUND( y2, y7, y8,y13); 251 CHACHA_QUARTERROUND( y3, y4, y9,y14); 252 } 253 y0 = _mm_add_epi32(y0, x0); 254 y1 = _mm_add_epi32(y1, x1); 255 y2 = _mm_add_epi32(y2, x2); 256 y3 = _mm_add_epi32(y3, x3); 257 y4 = _mm_add_epi32(y4, x4); 258 y5 = _mm_add_epi32(y5, x5); 259 y6 = _mm_add_epi32(y6, x6); 260 y7 = _mm_add_epi32(y7, x7); 261 y8 = _mm_add_epi32(y8, x8); 262 y9 = _mm_add_epi32(y9, x9); 263 y10 = _mm_add_epi32(y10, x10); 264 y11 = _mm_add_epi32(y11, x11); 265 y12 = _mm_add_epi32(y12, x12); 266 y13 = _mm_add_epi32(y13, x13); 267 y14 = _mm_add_epi32(y14, x14); 268 y15 = _mm_add_epi32(y15, x15); 269 270 z0 = unpack0_epi32(y0, y1, y2, y3); 271 z1 = unpack0_epi32(y4, y5, y6, y7); 272 z2 = unpack0_epi32(y8, y9, y10, y11); 273 z3 = unpack0_epi32(y12, y13, y14, y15); 274 z4 = unpack1_epi32(y0, y1, y2, y3); 275 z5 = unpack1_epi32(y4, y5, y6, y7); 276 z6 = unpack1_epi32(y8, y9, y10, y11); 277 z7 = unpack1_epi32(y12, y13, y14, y15); 278 z8 = unpack2_epi32(y0, y1, y2, y3); 279 z9 = unpack2_epi32(y4, y5, y6, y7); 280 z10 = unpack2_epi32(y8, y9, y10, y11); 281 z11 = unpack2_epi32(y12, y13, y14, y15); 282 z12 = unpack3_epi32(y0, y1, y2, y3); 283 z13 = unpack3_epi32(y4, y5, y6, y7); 
284 z14 = unpack3_epi32(y8, y9, y10, y11); 285 z15 = unpack3_epi32(y12, y13, y14, y15); 286 287 storeu_epi32(s + 16*0, z0); 288 storeu_epi32(s + 16*1, z1); 289 storeu_epi32(s + 16*2, z2); 290 storeu_epi32(s + 16*3, z3); 291 storeu_epi32(s + 16*4, z4); 292 storeu_epi32(s + 16*5, z5); 293 storeu_epi32(s + 16*6, z6); 294 storeu_epi32(s + 16*7, z7); 295 storeu_epi32(s + 16*8, z8); 296 storeu_epi32(s + 16*9, z9); 297 storeu_epi32(s + 16*10, z10); 298 storeu_epi32(s + 16*11, z11); 299 storeu_epi32(s + 16*12, z12); 300 storeu_epi32(s + 16*13, z13); 301 storeu_epi32(s + 16*14, z14); 302 storeu_epi32(s + 16*15, z15); 303 } 304 305out: if (n) { 306 const __m128i blkno_inc = _mm_set_epi32(0,0,0,1); 307 __m128i in0, in1, in2, in3; 308 __m128i r0, r1, r2, r3; 309 310 in0 = _mm_loadu_si128((const __m128i *)chacha_const32); 311 in1 = _mm_loadu_si128((const __m128i *)k); 312 in2 = _mm_loadu_si128((const __m128i *)k + 1); 313 in3 = _mm_set_epi32(le32dec(nonce + 8), le32dec(nonce + 4), 314 le32dec(nonce), blkno); 315 316 for (; n >= 64; s += 64, n -= 64) { 317 r0 = in0; 318 r1 = in1; 319 r2 = in2; 320 r3 = in3; 321 chacha_permute(&r0, &r1, &r2, &r3, nr); 322 r0 = _mm_add_epi32(r0, in0); 323 r1 = _mm_add_epi32(r1, in1); 324 r2 = _mm_add_epi32(r2, in2); 325 r3 = _mm_add_epi32(r3, in3); 326 _mm_storeu_si128((__m128i *)s + 0, r0); 327 _mm_storeu_si128((__m128i *)s + 1, r1); 328 _mm_storeu_si128((__m128i *)s + 2, r2); 329 _mm_storeu_si128((__m128i *)s + 3, r3); 330 in3 = _mm_add_epi32(in3, blkno_inc); 331 } 332 333 if (n) { 334 uint8_t buf[64]; 335 unsigned i; 336 337 r0 = in0; 338 r1 = in1; 339 r2 = in2; 340 r3 = in3; 341 chacha_permute(&r0, &r1, &r2, &r3, nr); 342 r0 = _mm_add_epi32(r0, in0); 343 r1 = _mm_add_epi32(r1, in1); 344 r2 = _mm_add_epi32(r2, in2); 345 r3 = _mm_add_epi32(r3, in3); 346 _mm_storeu_si128((__m128i *)buf + 0, r0); 347 _mm_storeu_si128((__m128i *)buf + 1, r1); 348 _mm_storeu_si128((__m128i *)buf + 2, r2); 349 _mm_storeu_si128((__m128i *)buf + 3, r3); 350 351 for (i 
= 0; i < n - n%4; i += 4) 352 le32enc(s + i, le32dec(buf + i)); 353 for (; i < n; i++) 354 s[i] = buf[i]; 355 } 356 } 357} 358 359void 360chacha_stream_xor_sse2(uint8_t *s, const uint8_t *p, size_t n, 361 uint32_t blkno, 362 const uint8_t nonce[static 12], 363 const uint8_t k[static 32], 364 unsigned nr) 365{ 366 __m128i x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; 367 __m128i y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15; 368 __m128i z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15; 369 unsigned r; 370 371 if (n < 256) 372 goto out; 373 374 x0 = load1_epi32(chacha_const32 + 0); 375 x1 = load1_epi32(chacha_const32 + 4); 376 x2 = load1_epi32(chacha_const32 + 8); 377 x3 = load1_epi32(chacha_const32 + 12); 378 x4 = load1_epi32(k + 0); 379 x5 = load1_epi32(k + 4); 380 x6 = load1_epi32(k + 8); 381 x7 = load1_epi32(k + 12); 382 x8 = load1_epi32(k + 16); 383 x9 = load1_epi32(k + 20); 384 x10 = load1_epi32(k + 24); 385 x11 = load1_epi32(k + 28); 386 /* x12 set in the loop */ 387 x13 = load1_epi32(nonce + 0); 388 x14 = load1_epi32(nonce + 4); 389 x15 = load1_epi32(nonce + 8); 390 391 for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4) { 392 x12 = _mm_add_epi32(_mm_set1_epi32(blkno), 393 _mm_set_epi32(3,2,1,0)); 394 y0 = x0; 395 y1 = x1; 396 y2 = x2; 397 y3 = x3; 398 y4 = x4; 399 y5 = x5; 400 y6 = x6; 401 y7 = x7; 402 y8 = x8; 403 y9 = x9; 404 y10 = x10; 405 y11 = x11; 406 y12 = x12; 407 y13 = x13; 408 y14 = x14; 409 y15 = x15; 410 for (r = nr; r > 0; r -= 2) { 411 CHACHA_QUARTERROUND( y0, y4, y8,y12); 412 CHACHA_QUARTERROUND( y1, y5, y9,y13); 413 CHACHA_QUARTERROUND( y2, y6,y10,y14); 414 CHACHA_QUARTERROUND( y3, y7,y11,y15); 415 CHACHA_QUARTERROUND( y0, y5,y10,y15); 416 CHACHA_QUARTERROUND( y1, y6,y11,y12); 417 CHACHA_QUARTERROUND( y2, y7, y8,y13); 418 CHACHA_QUARTERROUND( y3, y4, y9,y14); 419 } 420 y0 = _mm_add_epi32(y0, x0); 421 y1 = _mm_add_epi32(y1, x1); 422 y2 = _mm_add_epi32(y2, x2); 423 y3 = _mm_add_epi32(y3, x3); 424 y4 = 
_mm_add_epi32(y4, x4); 425 y5 = _mm_add_epi32(y5, x5); 426 y6 = _mm_add_epi32(y6, x6); 427 y7 = _mm_add_epi32(y7, x7); 428 y8 = _mm_add_epi32(y8, x8); 429 y9 = _mm_add_epi32(y9, x9); 430 y10 = _mm_add_epi32(y10, x10); 431 y11 = _mm_add_epi32(y11, x11); 432 y12 = _mm_add_epi32(y12, x12); 433 y13 = _mm_add_epi32(y13, x13); 434 y14 = _mm_add_epi32(y14, x14); 435 y15 = _mm_add_epi32(y15, x15); 436 437 z0 = unpack0_epi32(y0, y1, y2, y3); 438 z1 = unpack0_epi32(y4, y5, y6, y7); 439 z2 = unpack0_epi32(y8, y9, y10, y11); 440 z3 = unpack0_epi32(y12, y13, y14, y15); 441 z4 = unpack1_epi32(y0, y1, y2, y3); 442 z5 = unpack1_epi32(y4, y5, y6, y7); 443 z6 = unpack1_epi32(y8, y9, y10, y11); 444 z7 = unpack1_epi32(y12, y13, y14, y15); 445 z8 = unpack2_epi32(y0, y1, y2, y3); 446 z9 = unpack2_epi32(y4, y5, y6, y7); 447 z10 = unpack2_epi32(y8, y9, y10, y11); 448 z11 = unpack2_epi32(y12, y13, y14, y15); 449 z12 = unpack3_epi32(y0, y1, y2, y3); 450 z13 = unpack3_epi32(y4, y5, y6, y7); 451 z14 = unpack3_epi32(y8, y9, y10, y11); 452 z15 = unpack3_epi32(y12, y13, y14, y15); 453 454 storeu_epi32(s + 16*0, loadu_epi32(p + 16*0) ^ z0); 455 storeu_epi32(s + 16*1, loadu_epi32(p + 16*1) ^ z1); 456 storeu_epi32(s + 16*2, loadu_epi32(p + 16*2) ^ z2); 457 storeu_epi32(s + 16*3, loadu_epi32(p + 16*3) ^ z3); 458 storeu_epi32(s + 16*4, loadu_epi32(p + 16*4) ^ z4); 459 storeu_epi32(s + 16*5, loadu_epi32(p + 16*5) ^ z5); 460 storeu_epi32(s + 16*6, loadu_epi32(p + 16*6) ^ z6); 461 storeu_epi32(s + 16*7, loadu_epi32(p + 16*7) ^ z7); 462 storeu_epi32(s + 16*8, loadu_epi32(p + 16*8) ^ z8); 463 storeu_epi32(s + 16*9, loadu_epi32(p + 16*9) ^ z9); 464 storeu_epi32(s + 16*10, loadu_epi32(p + 16*10) ^ z10); 465 storeu_epi32(s + 16*11, loadu_epi32(p + 16*11) ^ z11); 466 storeu_epi32(s + 16*12, loadu_epi32(p + 16*12) ^ z12); 467 storeu_epi32(s + 16*13, loadu_epi32(p + 16*13) ^ z13); 468 storeu_epi32(s + 16*14, loadu_epi32(p + 16*14) ^ z14); 469 storeu_epi32(s + 16*15, loadu_epi32(p + 16*15) ^ z15); 470 } 471 
472out: if (n) { 473 const __m128i blkno_inc = _mm_set_epi32(0,0,0,1); 474 __m128i in0, in1, in2, in3; 475 __m128i r0, r1, r2, r3; 476 477 in0 = _mm_loadu_si128((const __m128i *)chacha_const32); 478 in1 = _mm_loadu_si128((const __m128i *)k); 479 in2 = _mm_loadu_si128((const __m128i *)k + 1); 480 in3 = _mm_set_epi32(le32dec(nonce + 8), le32dec(nonce + 4), 481 le32dec(nonce), blkno); 482 483 for (; n >= 64; s += 64, p += 64, n -= 64) { 484 r0 = in0; 485 r1 = in1; 486 r2 = in2; 487 r3 = in3; 488 chacha_permute(&r0, &r1, &r2, &r3, nr); 489 r0 = _mm_add_epi32(r0, in0); 490 r1 = _mm_add_epi32(r1, in1); 491 r2 = _mm_add_epi32(r2, in2); 492 r3 = _mm_add_epi32(r3, in3); 493 r0 ^= _mm_loadu_si128((const __m128i *)p + 0); 494 r1 ^= _mm_loadu_si128((const __m128i *)p + 1); 495 r2 ^= _mm_loadu_si128((const __m128i *)p + 2); 496 r3 ^= _mm_loadu_si128((const __m128i *)p + 3); 497 _mm_storeu_si128((__m128i *)s + 0, r0); 498 _mm_storeu_si128((__m128i *)s + 1, r1); 499 _mm_storeu_si128((__m128i *)s + 2, r2); 500 _mm_storeu_si128((__m128i *)s + 3, r3); 501 in3 = _mm_add_epi32(in3, blkno_inc); 502 } 503 504 if (n) { 505 uint8_t buf[64]; 506 unsigned i; 507 508 r0 = in0; 509 r1 = in1; 510 r2 = in2; 511 r3 = in3; 512 chacha_permute(&r0, &r1, &r2, &r3, nr); 513 r0 = _mm_add_epi32(r0, in0); 514 r1 = _mm_add_epi32(r1, in1); 515 r2 = _mm_add_epi32(r2, in2); 516 r3 = _mm_add_epi32(r3, in3); 517 _mm_storeu_si128((__m128i *)buf + 0, r0); 518 _mm_storeu_si128((__m128i *)buf + 1, r1); 519 _mm_storeu_si128((__m128i *)buf + 2, r2); 520 _mm_storeu_si128((__m128i *)buf + 3, r3); 521 522 for (i = 0; i < n - n%4; i += 4) 523 le32enc(s + i, 524 le32dec(p + i) ^ le32dec(buf + i)); 525 for (; i < n; i++) 526 s[i] = p[i] ^ buf[i]; 527 } 528 } 529} 530 531void 532xchacha_stream_sse2(uint8_t *restrict s, size_t nbytes, 533 uint32_t blkno, 534 const uint8_t nonce[static 24], 535 const uint8_t k[static 32], 536 unsigned nr) 537{ 538 uint8_t subkey[32]; 539 uint8_t subnonce[12]; 540 541 hchacha_sse2(subkey, 
nonce/*[0:16)*/, k, chacha_const32, nr); 542 memset(subnonce, 0, 4); 543 memcpy(subnonce + 4, nonce + 16, 8); 544 chacha_stream_sse2(s, nbytes, blkno, subnonce, subkey, nr); 545} 546 547void 548xchacha_stream_xor_sse2(uint8_t *restrict c, const uint8_t *p, size_t nbytes, 549 uint32_t blkno, 550 const uint8_t nonce[static 24], 551 const uint8_t k[static 32], 552 unsigned nr) 553{ 554 uint8_t subkey[32]; 555 uint8_t subnonce[12]; 556 557 hchacha_sse2(subkey, nonce/*[0:16)*/, k, chacha_const32, nr); 558 memset(subnonce, 0, 4); 559 memcpy(subnonce + 4, nonce + 16, 8); 560 chacha_stream_xor_sse2(c, p, nbytes, blkno, subnonce, subkey, nr); 561} 562