/*	$NetBSD: chacha_sse2.c,v 1.3 2023/08/07 01:07:36 rin Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/types.h>
#include <sys/endian.h>

#include <crypto/arch/x86/immintrin.h>

#include "chacha_sse2.h"

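/*
 * rol32(x, n)
 *
 *	Rotate each 32-bit lane of x left by n bits.  SSE2 has no
 *	vector rotate instruction, so this is emulated with two shifts
 *	combined by a vector OR; the `|' operator on __m128i is a
 *	GCC/Clang vector extension (a portable spelling would be
 *	_mm_or_si128).
 */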
static inline __m128i
rol32(__m128i x, uint8_t n)
{

	return _mm_slli_epi32(x, n) | _mm_srli_epi32(x, 32 - n);
}

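/*
 * chacha_permute(p0, p1, p2, p3, nr)
 *
 *	Run nr rounds (nr/2 double rounds) of the ChaCha permutation
 *	over the 4x4 matrix of 32-bit words held row-wise in *p0..*p3.
 *	Each iteration does a column round on r0..r3, then rotates the
 *	lanes of r1, r2, r3 by one, two, and three positions
 *	(_mm_shuffle_epi32 with 0x39, 0x4e, 0x93) so the diagonals line
 *	up as columns, does the diagonal round, and rotates the lanes
 *	back.
 */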
static inline void
chacha_permute(__m128i *p0, __m128i *p1, __m128i *p2, __m128i *p3,
    unsigned nr)
{
	__m128i r0, r1, r2, r3;
	__m128i c0, c1, c2, c3;

	r0 = *p0;
	r1 = *p1;
	r2 = *p2;
	r3 = *p3;

	for (; nr > 0; nr -= 2) {
		r0 = _mm_add_epi32(r0, r1); r3 ^= r0; r3 = rol32(r3, 16);
		r2 = _mm_add_epi32(r2, r3); r1 ^= r2; r1 = rol32(r1, 12);
		r0 = _mm_add_epi32(r0, r1); r3 ^= r0; r3 = rol32(r3, 8);
		r2 = _mm_add_epi32(r2, r3); r1 ^= r2; r1 = rol32(r1, 7);

		c0 = r0;
		c1 = _mm_shuffle_epi32(r1, 0x39);
		c2 = _mm_shuffle_epi32(r2, 0x4e);
		c3 = _mm_shuffle_epi32(r3, 0x93);

		c0 = _mm_add_epi32(c0, c1); c3 ^= c0; c3 = rol32(c3, 16);
		c2 = _mm_add_epi32(c2, c3); c1 ^= c2; c1 = rol32(c1, 12);
		c0 = _mm_add_epi32(c0, c1); c3 ^= c0; c3 = rol32(c3, 8);
		c2 = _mm_add_epi32(c2, c3); c1 ^= c2; c1 = rol32(c1, 7);

		r0 = c0;
		r1 = _mm_shuffle_epi32(c1, 0x93);
		r2 = _mm_shuffle_epi32(c2, 0x4e);
		r3 = _mm_shuffle_epi32(c3, 0x39);
	}

	*p0 = r0;
	*p1 = r1;
	*p2 = r2;
	*p3 = r3;
}

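/*
 * chacha_core_sse2(out, in, k, c, nr)
 *
 *	Compute one 64-byte ChaCha block: load the constant row c, the
 *	two key rows k, and the input row in (counter and nonce), run
 *	the permutation, and add the initial state back in (the
 *	feedforward) before storing the result to out.
 */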
void
chacha_core_sse2(uint8_t out[restrict static 64],
    const uint8_t in[static 16],
    const uint8_t k[static 32],
    const uint8_t c[static 16],
    unsigned nr)
{
	__m128i in0, in1, in2, in3;
	__m128i r0, r1, r2, r3;

	r0 = in0 = _mm_loadu_si128((const __m128i *)c);
	r1 = in1 = _mm_loadu_si128((const __m128i *)k);
	r2 = in2 = _mm_loadu_si128((const __m128i *)k + 1);
	r3 = in3 = _mm_loadu_si128((const __m128i *)in);

	chacha_permute(&r0, &r1, &r2, &r3, nr);

	_mm_storeu_si128((__m128i *)out + 0, _mm_add_epi32(r0, in0));
	_mm_storeu_si128((__m128i *)out + 1, _mm_add_epi32(r1, in1));
	_mm_storeu_si128((__m128i *)out + 2, _mm_add_epi32(r2, in2));
	_mm_storeu_si128((__m128i *)out + 3, _mm_add_epi32(r3, in3));
}

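/*
 * hchacha_sse2(out, in, k, c, nr)
 *
 *	HChaCha: same permutation as the core, but the output is only
 *	the first and last rows of the permuted state, with no
 *	feedforward addition.  Used by XChaCha below to derive a
 *	subkey from the first 16 bytes of a 24-byte nonce.
 */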
void
hchacha_sse2(uint8_t out[restrict static 32],
    const uint8_t in[static 16],
    const uint8_t k[static 32],
    const uint8_t c[static 16],
    unsigned nr)
{
	__m128i r0, r1, r2, r3;

	r0 = _mm_loadu_si128((const __m128i *)c);
	r1 = _mm_loadu_si128((const __m128i *)k);
	r2 = _mm_loadu_si128((const __m128i *)k + 1);
	r3 = _mm_loadu_si128((const __m128i *)in);

	chacha_permute(&r0, &r1, &r2, &r3, nr);

	_mm_storeu_si128((__m128i *)out + 0, r0);
	_mm_storeu_si128((__m128i *)out + 1, r3);
}

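/*
 * CHACHA_QUARTERROUND(a, b, c, d)
 *
 *	One ChaCha quarter-round on four vector registers.  In the
 *	stream routines below, each lane of a register carries the
 *	corresponding word of an independent block, so one invocation
 *	advances four blocks at once.
 */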
#define	CHACHA_QUARTERROUND(a, b, c, d) do				      \
{									      \
	(a) = _mm_add_epi32((a), (b)); (d) ^= a; (d) = rol32((d), 16);	      \
	(c) = _mm_add_epi32((c), (d)); (b) ^= c; (b) = rol32((b), 12);	      \
	(a) = _mm_add_epi32((a), (b)); (d) ^= a; (d) = rol32((d), 8);	      \
	(c) = _mm_add_epi32((c), (d)); (b) ^= c; (b) = rol32((b), 7);	      \
} while (/*CONSTCOND*/0)

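/*
 * Thin wrappers over the unaligned load/store and broadcast-load
 * intrinsics.  load1_epi32 broadcasts a single 32-bit word to all four
 * lanes via _mm_load1_ps, since SSE2 has no integer broadcast load;
 * the __m128/__m128i casts are GCC/Clang vector casts (portably,
 * _mm_castps_si128 and _mm_castsi128_ps).
 */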
static inline __m128i
load1_epi32(const void *p)
{
	return (__m128i)_mm_load1_ps(p);
}

static inline __m128i
loadu_epi32(const void *p)
{
	return _mm_loadu_si128(p);
}

static inline void
storeu_epi32(void *p, __m128i v)
{
	_mm_storeu_si128(p, v);
}

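/*
 * unpack{0,1,2,3}_epi32(a, b, c, d)
 *
 *	Together these implement a 4x4 transpose of 32-bit lanes:
 *	unpackN returns (a[N], b[N], c[N], d[N]).  The stream routines
 *	use them to convert four interleaved blocks (one state word per
 *	register, one block per lane) back into four contiguous 64-byte
 *	output blocks.
 */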
static inline __m128i
unpack0_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
{
	__m128 lo = (__m128)_mm_unpacklo_epi32(a, b); /* (a[0], b[0], ...) */
	__m128 hi = (__m128)_mm_unpacklo_epi32(c, d); /* (c[0], d[0], ...) */

	/* (lo[0]=a[0], lo[1]=b[0], hi[0]=c[0], hi[1]=d[0]) */
	return (__m128i)_mm_movelh_ps(lo, hi);
}

static inline __m128i
unpack1_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
{
	__m128 lo = (__m128)_mm_unpacklo_epi32(a, b); /* (..., a[1], b[1]) */
	__m128 hi = (__m128)_mm_unpacklo_epi32(c, d); /* (..., c[1], d[1]) */

	/* (lo[2]=a[1], lo[3]=b[1], hi[2]=c[1], hi[3]=d[1]) */
	return (__m128i)_mm_movehl_ps(hi, lo);
}

static inline __m128i
unpack2_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
{
	__m128 lo = (__m128)_mm_unpackhi_epi32(a, b); /* (a[2], b[2], ...) */
	__m128 hi = (__m128)_mm_unpackhi_epi32(c, d); /* (c[2], d[2], ...) */

	/* (lo[0]=a[2], lo[1]=b[2], hi[0]=c[2], hi[1]=d[2]) */
	return (__m128i)_mm_movelh_ps(lo, hi);
}

static inline __m128i
unpack3_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
{
	__m128 lo = (__m128)_mm_unpackhi_epi32(a, b); /* (..., a[3], b[3]) */
	__m128 hi = (__m128)_mm_unpackhi_epi32(c, d); /* (..., c[3], d[3]) */

	/* (lo[2]=a[3], lo[3]=b[3], hi[2]=c[3], hi[3]=d[3]) */
	return (__m128i)_mm_movehl_ps(hi, lo);
}

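/*
 * chacha_stream_sse2(s, n, blkno, nonce, k, nr)
 *
 *	Generate n bytes of ChaCha keystream into s, starting at block
 *	number blkno.  While at least 256 bytes remain, four blocks are
 *	computed in parallel: each xN/yN register holds word N of the
 *	state for four consecutive blocks, with per-block counters
 *	blkno..blkno+3 in x12.  The tail (and any request shorter than
 *	256 bytes) falls through to a one-block-at-a-time path below.
 */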
void
chacha_stream_sse2(uint8_t *restrict s, size_t n,
    uint32_t blkno,
    const uint8_t nonce[static 12],
    const uint8_t k[static 32],
    unsigned nr)
{
	__m128i x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
	__m128i y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15;
	__m128i z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15;
	unsigned r;

	if (n < 256)
		goto out;

	x0 = load1_epi32(chacha_const32 + 0);
	x1 = load1_epi32(chacha_const32 + 4);
	x2 = load1_epi32(chacha_const32 + 8);
	x3 = load1_epi32(chacha_const32 + 12);
	x4 = load1_epi32(k + 0);
	x5 = load1_epi32(k + 4);
	x6 = load1_epi32(k + 8);
	x7 = load1_epi32(k + 12);
	x8 = load1_epi32(k + 16);
	x9 = load1_epi32(k + 20);
	x10 = load1_epi32(k + 24);
	x11 = load1_epi32(k + 28);
	/* x12 set in the loop */
	x13 = load1_epi32(nonce + 0);
	x14 = load1_epi32(nonce + 4);
	x15 = load1_epi32(nonce + 8);

	for (; n >= 256; s += 256, n -= 256, blkno += 4) {
		x12 = _mm_add_epi32(_mm_set1_epi32(blkno),
		    _mm_set_epi32(3,2,1,0));
		y0 = x0;
		y1 = x1;
		y2 = x2;
		y3 = x3;
		y4 = x4;
		y5 = x5;
		y6 = x6;
		y7 = x7;
		y8 = x8;
		y9 = x9;
		y10 = x10;
		y11 = x11;
		y12 = x12;
		y13 = x13;
		y14 = x14;
		y15 = x15;
		for (r = nr; r > 0; r -= 2) {
			CHACHA_QUARTERROUND( y0, y4, y8,y12);
			CHACHA_QUARTERROUND( y1, y5, y9,y13);
			CHACHA_QUARTERROUND( y2, y6,y10,y14);
			CHACHA_QUARTERROUND( y3, y7,y11,y15);
			CHACHA_QUARTERROUND( y0, y5,y10,y15);
			CHACHA_QUARTERROUND( y1, y6,y11,y12);
			CHACHA_QUARTERROUND( y2, y7, y8,y13);
			CHACHA_QUARTERROUND( y3, y4, y9,y14);
		}
		y0 = _mm_add_epi32(y0, x0);
		y1 = _mm_add_epi32(y1, x1);
		y2 = _mm_add_epi32(y2, x2);
		y3 = _mm_add_epi32(y3, x3);
		y4 = _mm_add_epi32(y4, x4);
		y5 = _mm_add_epi32(y5, x5);
		y6 = _mm_add_epi32(y6, x6);
		y7 = _mm_add_epi32(y7, x7);
		y8 = _mm_add_epi32(y8, x8);
		y9 = _mm_add_epi32(y9, x9);
		y10 = _mm_add_epi32(y10, x10);
		y11 = _mm_add_epi32(y11, x11);
		y12 = _mm_add_epi32(y12, x12);
		y13 = _mm_add_epi32(y13, x13);
		y14 = _mm_add_epi32(y14, x14);
		y15 = _mm_add_epi32(y15, x15);

		z0 = unpack0_epi32(y0, y1, y2, y3);
		z1 = unpack0_epi32(y4, y5, y6, y7);
		z2 = unpack0_epi32(y8, y9, y10, y11);
		z3 = unpack0_epi32(y12, y13, y14, y15);
		z4 = unpack1_epi32(y0, y1, y2, y3);
		z5 = unpack1_epi32(y4, y5, y6, y7);
		z6 = unpack1_epi32(y8, y9, y10, y11);
		z7 = unpack1_epi32(y12, y13, y14, y15);
		z8 = unpack2_epi32(y0, y1, y2, y3);
		z9 = unpack2_epi32(y4, y5, y6, y7);
		z10 = unpack2_epi32(y8, y9, y10, y11);
		z11 = unpack2_epi32(y12, y13, y14, y15);
		z12 = unpack3_epi32(y0, y1, y2, y3);
		z13 = unpack3_epi32(y4, y5, y6, y7);
		z14 = unpack3_epi32(y8, y9, y10, y11);
		z15 = unpack3_epi32(y12, y13, y14, y15);

		storeu_epi32(s + 16*0, z0);
		storeu_epi32(s + 16*1, z1);
		storeu_epi32(s + 16*2, z2);
		storeu_epi32(s + 16*3, z3);
		storeu_epi32(s + 16*4, z4);
		storeu_epi32(s + 16*5, z5);
		storeu_epi32(s + 16*6, z6);
		storeu_epi32(s + 16*7, z7);
		storeu_epi32(s + 16*8, z8);
		storeu_epi32(s + 16*9, z9);
		storeu_epi32(s + 16*10, z10);
		storeu_epi32(s + 16*11, z11);
		storeu_epi32(s + 16*12, z12);
		storeu_epi32(s + 16*13, z13);
		storeu_epi32(s + 16*14, z14);
		storeu_epi32(s + 16*15, z15);
	}

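	/*
	 * One block at a time for the remainder: keep the whole state
	 * row-wise in in0..in3, permute a copy, add the feedforward,
	 * and bump the 32-bit counter in the low lane of in3.  A final
	 * partial block is staged through a 64-byte stack buffer.
	 */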
out:	if (n) {
		const __m128i blkno_inc = _mm_set_epi32(0,0,0,1);
		__m128i in0, in1, in2, in3;
		__m128i r0, r1, r2, r3;

		in0 = _mm_loadu_si128((const __m128i *)chacha_const32);
		in1 = _mm_loadu_si128((const __m128i *)k);
		in2 = _mm_loadu_si128((const __m128i *)k + 1);
		in3 = _mm_set_epi32(le32dec(nonce + 8), le32dec(nonce + 4),
		    le32dec(nonce), blkno);

		for (; n; s += 64, n -= 64) {
			r0 = in0;
			r1 = in1;
			r2 = in2;
			r3 = in3;
			chacha_permute(&r0, &r1, &r2, &r3, nr);
			r0 = _mm_add_epi32(r0, in0);
			r1 = _mm_add_epi32(r1, in1);
			r2 = _mm_add_epi32(r2, in2);
			r3 = _mm_add_epi32(r3, in3);

			if (n < 64) {
				uint8_t buf[64] __aligned(16);

				_mm_storeu_si128((__m128i *)buf + 0, r0);
				_mm_storeu_si128((__m128i *)buf + 1, r1);
				_mm_storeu_si128((__m128i *)buf + 2, r2);
				_mm_storeu_si128((__m128i *)buf + 3, r3);
				memcpy(s, buf, n);

				break;
			}

			_mm_storeu_si128((__m128i *)s + 0, r0);
			_mm_storeu_si128((__m128i *)s + 1, r1);
			_mm_storeu_si128((__m128i *)s + 2, r2);
			_mm_storeu_si128((__m128i *)s + 3, r3);
			in3 = _mm_add_epi32(in3, blkno_inc);
		}
	}
}

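/*
 * chacha_stream_xor_sse2(s, p, n, blkno, nonce, k, nr)
 *
 *	Same structure as chacha_stream_sse2, but XOR the keystream
 *	into the input p and write the result to s (encryption and
 *	decryption are the same operation).  A final partial block is
 *	XORed word-by-word and then byte-by-byte out of a stack buffer.
 */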
void
chacha_stream_xor_sse2(uint8_t *s, const uint8_t *p, size_t n,
    uint32_t blkno,
    const uint8_t nonce[static 12],
    const uint8_t k[static 32],
    unsigned nr)
{
	__m128i x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
	__m128i y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15;
	__m128i z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15;
	unsigned r;

	if (n < 256)
		goto out;

	x0 = load1_epi32(chacha_const32 + 0);
	x1 = load1_epi32(chacha_const32 + 4);
	x2 = load1_epi32(chacha_const32 + 8);
	x3 = load1_epi32(chacha_const32 + 12);
	x4 = load1_epi32(k + 0);
	x5 = load1_epi32(k + 4);
	x6 = load1_epi32(k + 8);
	x7 = load1_epi32(k + 12);
	x8 = load1_epi32(k + 16);
	x9 = load1_epi32(k + 20);
	x10 = load1_epi32(k + 24);
	x11 = load1_epi32(k + 28);
	/* x12 set in the loop */
	x13 = load1_epi32(nonce + 0);
	x14 = load1_epi32(nonce + 4);
	x15 = load1_epi32(nonce + 8);

	for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4) {
		x12 = _mm_add_epi32(_mm_set1_epi32(blkno),
		    _mm_set_epi32(3,2,1,0));
		y0 = x0;
		y1 = x1;
		y2 = x2;
		y3 = x3;
		y4 = x4;
		y5 = x5;
		y6 = x6;
		y7 = x7;
		y8 = x8;
		y9 = x9;
		y10 = x10;
		y11 = x11;
		y12 = x12;
		y13 = x13;
		y14 = x14;
		y15 = x15;
		for (r = nr; r > 0; r -= 2) {
			CHACHA_QUARTERROUND( y0, y4, y8,y12);
			CHACHA_QUARTERROUND( y1, y5, y9,y13);
			CHACHA_QUARTERROUND( y2, y6,y10,y14);
			CHACHA_QUARTERROUND( y3, y7,y11,y15);
			CHACHA_QUARTERROUND( y0, y5,y10,y15);
			CHACHA_QUARTERROUND( y1, y6,y11,y12);
			CHACHA_QUARTERROUND( y2, y7, y8,y13);
			CHACHA_QUARTERROUND( y3, y4, y9,y14);
		}
		y0 = _mm_add_epi32(y0, x0);
		y1 = _mm_add_epi32(y1, x1);
		y2 = _mm_add_epi32(y2, x2);
		y3 = _mm_add_epi32(y3, x3);
		y4 = _mm_add_epi32(y4, x4);
		y5 = _mm_add_epi32(y5, x5);
		y6 = _mm_add_epi32(y6, x6);
		y7 = _mm_add_epi32(y7, x7);
		y8 = _mm_add_epi32(y8, x8);
		y9 = _mm_add_epi32(y9, x9);
		y10 = _mm_add_epi32(y10, x10);
		y11 = _mm_add_epi32(y11, x11);
		y12 = _mm_add_epi32(y12, x12);
		y13 = _mm_add_epi32(y13, x13);
		y14 = _mm_add_epi32(y14, x14);
		y15 = _mm_add_epi32(y15, x15);

		z0 = unpack0_epi32(y0, y1, y2, y3);
		z1 = unpack0_epi32(y4, y5, y6, y7);
		z2 = unpack0_epi32(y8, y9, y10, y11);
		z3 = unpack0_epi32(y12, y13, y14, y15);
		z4 = unpack1_epi32(y0, y1, y2, y3);
		z5 = unpack1_epi32(y4, y5, y6, y7);
		z6 = unpack1_epi32(y8, y9, y10, y11);
		z7 = unpack1_epi32(y12, y13, y14, y15);
		z8 = unpack2_epi32(y0, y1, y2, y3);
		z9 = unpack2_epi32(y4, y5, y6, y7);
		z10 = unpack2_epi32(y8, y9, y10, y11);
		z11 = unpack2_epi32(y12, y13, y14, y15);
		z12 = unpack3_epi32(y0, y1, y2, y3);
		z13 = unpack3_epi32(y4, y5, y6, y7);
		z14 = unpack3_epi32(y8, y9, y10, y11);
		z15 = unpack3_epi32(y12, y13, y14, y15);

		storeu_epi32(s + 16*0, loadu_epi32(p + 16*0) ^ z0);
		storeu_epi32(s + 16*1, loadu_epi32(p + 16*1) ^ z1);
		storeu_epi32(s + 16*2, loadu_epi32(p + 16*2) ^ z2);
		storeu_epi32(s + 16*3, loadu_epi32(p + 16*3) ^ z3);
		storeu_epi32(s + 16*4, loadu_epi32(p + 16*4) ^ z4);
		storeu_epi32(s + 16*5, loadu_epi32(p + 16*5) ^ z5);
		storeu_epi32(s + 16*6, loadu_epi32(p + 16*6) ^ z6);
		storeu_epi32(s + 16*7, loadu_epi32(p + 16*7) ^ z7);
		storeu_epi32(s + 16*8, loadu_epi32(p + 16*8) ^ z8);
		storeu_epi32(s + 16*9, loadu_epi32(p + 16*9) ^ z9);
		storeu_epi32(s + 16*10, loadu_epi32(p + 16*10) ^ z10);
		storeu_epi32(s + 16*11, loadu_epi32(p + 16*11) ^ z11);
		storeu_epi32(s + 16*12, loadu_epi32(p + 16*12) ^ z12);
		storeu_epi32(s + 16*13, loadu_epi32(p + 16*13) ^ z13);
		storeu_epi32(s + 16*14, loadu_epi32(p + 16*14) ^ z14);
		storeu_epi32(s + 16*15, loadu_epi32(p + 16*15) ^ z15);
	}

out:	if (n) {
		const __m128i blkno_inc = _mm_set_epi32(0,0,0,1);
		__m128i in0, in1, in2, in3;
		__m128i r0, r1, r2, r3;

		in0 = _mm_loadu_si128((const __m128i *)chacha_const32);
		in1 = _mm_loadu_si128((const __m128i *)k);
		in2 = _mm_loadu_si128((const __m128i *)k + 1);
		in3 = _mm_set_epi32(le32dec(nonce + 8), le32dec(nonce + 4),
		    le32dec(nonce), blkno);

		for (; n; s += 64, p += 64, n -= 64) {
			r0 = in0;
			r1 = in1;
			r2 = in2;
			r3 = in3;
			chacha_permute(&r0, &r1, &r2, &r3, nr);
			r0 = _mm_add_epi32(r0, in0);
			r1 = _mm_add_epi32(r1, in1);
			r2 = _mm_add_epi32(r2, in2);
			r3 = _mm_add_epi32(r3, in3);

			if (n < 64) {
				uint8_t buf[64] __aligned(16);
				unsigned i;

				_mm_storeu_si128((__m128i *)buf + 0, r0);
				_mm_storeu_si128((__m128i *)buf + 1, r1);
				_mm_storeu_si128((__m128i *)buf + 2, r2);
				_mm_storeu_si128((__m128i *)buf + 3, r3);

				for (i = 0; i < n - n%4; i += 4)
					le32enc(s + i,
					    le32dec(p + i) ^ le32dec(buf + i));
				for (; i < n; i++)
					s[i] = p[i] ^ buf[i];

				break;
			}

			r0 ^= _mm_loadu_si128((const __m128i *)p + 0);
			r1 ^= _mm_loadu_si128((const __m128i *)p + 1);
			r2 ^= _mm_loadu_si128((const __m128i *)p + 2);
			r3 ^= _mm_loadu_si128((const __m128i *)p + 3);
			_mm_storeu_si128((__m128i *)s + 0, r0);
			_mm_storeu_si128((__m128i *)s + 1, r1);
			_mm_storeu_si128((__m128i *)s + 2, r2);
			_mm_storeu_si128((__m128i *)s + 3, r3);
			in3 = _mm_add_epi32(in3, blkno_inc);
		}
	}
}

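/*
 * XChaCha: use HChaCha on the first 16 bytes of the 24-byte nonce to
 * derive a 32-byte subkey, then run ordinary ChaCha with that subkey
 * and a 12-byte subnonce whose first 4 bytes are zero and whose last
 * 8 bytes are the remaining nonce bytes.
 */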
void
xchacha_stream_sse2(uint8_t *restrict s, size_t nbytes,
    uint32_t blkno,
    const uint8_t nonce[static 24],
    const uint8_t k[static 32],
    unsigned nr)
{
	uint8_t subkey[32];
	uint8_t subnonce[12];

	hchacha_sse2(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
	memset(subnonce, 0, 4);
	memcpy(subnonce + 4, nonce + 16, 8);
	chacha_stream_sse2(s, nbytes, blkno, subnonce, subkey, nr);
}

void
xchacha_stream_xor_sse2(uint8_t *restrict c, const uint8_t *p, size_t nbytes,
    uint32_t blkno,
    const uint8_t nonce[static 24],
    const uint8_t k[static 32],
    unsigned nr)
{
	uint8_t subkey[32];
	uint8_t subnonce[12];

	hchacha_sse2(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
	memset(subnonce, 0, 4);
	memcpy(subnonce + 4, nonce + 16, 8);
	chacha_stream_xor_sse2(c, p, nbytes, blkno, subnonce, subkey, nr);
}
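
/*
 * Example usage (illustrative sketch only; the buffer, key, and nonce
 * names below are placeholders, not part of this file):
 *
 *	uint8_t key[32], nonce[12], msg[256], ct[256];
 *	...fill key, nonce, and msg...
 *	chacha_stream_xor_sse2(ct, msg, sizeof(msg), 0, nonce, key, 20);
 *
 * nr = 20 selects the standard ChaCha20 round count; decryption is the
 * same call with the ciphertext and plaintext buffers swapped.
 */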