/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define BR_ENABLE_INTRINSICS   1
#include "inner.h"

#if BR_SSE2

/*
 * This file contains a ChaCha20 implementation that leverages SSE2
 * opcodes for better performance.
 */
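
/*
 * ChaCha20 works over a 4x4 matrix of 32-bit words. In this
 * implementation, each matrix row lives in one SSE2 register
 * (__m128i), so a quarter-round can be applied to all four columns
 * (or, after a word rotation, to all four diagonals) in parallel.
 */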

/* see bearssl_block.h */
br_chacha20_run
br_chacha20_sse2_get(void)
{
	/*
	 * If using 64-bit mode, then SSE2 opcodes should be automatically
	 * available, since they are part of the ABI.
	 *
	 * In 32-bit mode, we use CPUID to detect the SSE2 feature.
	 */

#if BR_amd64
	return &br_chacha20_sse2_run;
#else

	/*
	 * SSE2 support is indicated by bit 26 in EDX (CPUID leaf 1).
	 */
	if (br_cpuid(0, 0, 0, 0x04000000)) {
		return &br_chacha20_sse2_run;
	} else {
		return 0;
	}
#endif
}
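
/*
 * A minimal usage sketch (assuming the portable br_chacha20_ct_run()
 * as fallback): the caller obtains the SSE2 implementation at runtime
 * and reverts to the generic code when it is not available.
 *
 *    br_chacha20_run ichacha;
 *
 *    ichacha = br_chacha20_sse2_get();
 *    if (ichacha == 0) {
 *        ichacha = &br_chacha20_ct_run;
 *    }
 *    cc = ichacha(key, iv, cc, data, len);
 */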

BR_TARGETS_X86_UP

/* see bearssl_block.h */
BR_TARGET("sse2")
uint32_t
br_chacha20_sse2_run(const void *key,
	const void *iv, uint32_t cc, void *data, size_t len)
{
	unsigned char *buf;
	uint32_t ivtmp[4];
	__m128i kw0, kw1;
	__m128i iw, cw;
	__m128i one;

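	/*
	 * The four constant words below are the ASCII encoding of
	 * "expa", "nd 3", "2-by" and "te k" (i.e. "expand 32-byte k"),
	 * as specified for ChaCha20.
	 */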
	static const uint32_t CW[] = {
		0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
	};

	buf = data;
	kw0 = _mm_loadu_si128(key);
	kw1 = _mm_loadu_si128((const void *)((const unsigned char *)key + 16));
	ivtmp[0] = cc;
	memcpy(ivtmp + 1, iv, 12);
	iw = _mm_loadu_si128((const void *)ivtmp);
	cw = _mm_loadu_si128((const void *)CW);
	one = _mm_set_epi32(0, 0, 0, 1);

	while (len > 0) {
		/*
		 * sj contains state words 4*j to 4*j+3.
		 */
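		/*
		 * Initial state: s0 holds the constants, s1 and s2 hold
		 * the 256-bit key, and s3 holds the 32-bit block counter
		 * followed by the 96-bit IV.
		 */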
		__m128i s0, s1, s2, s3;
		int i;

		s0 = cw;
		s1 = kw0;
		s2 = kw1;
		s3 = iw;
		for (i = 0; i < 10; i ++) {
			/*
			 * The even round applies the quarter-round
			 * directly on the state words (all four columns
			 * in parallel).
			 */
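			/*
			 * Quarter-round, with a = s0, b = s1, c = s2
			 * and d = s3 (each holding one word per column):
			 *
			 *    a += b; d ^= a; d <<<= 16;
			 *    c += d; b ^= c; b <<<= 12;
			 *    a += b; d ^= a; d <<<=  8;
			 *    c += d; b ^= c; b <<<=  7;
			 *
			 * Rotations are emulated with shifts and an OR,
			 * since SSE2 has no 32-bit rotate instruction.
			 */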
			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 16),
				_mm_srli_epi32(s3, 16));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 12),
				_mm_srli_epi32(s1, 20));

			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 8),
				_mm_srli_epi32(s3, 24));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 7),
				_mm_srli_epi32(s1, 25));

			/*
			 * For the odd round, we rotate the words within
			 * s1, s2 and s3 so that the same column-wise
			 * computations now apply to the diagonals of the
			 * state.
			 */
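			/*
			 * _mm_shuffle_epi32() constants (two bits per
			 * destination word): 0x39 moves word j+1 into
			 * slot j (rotate by one word), 0x4E swaps the
			 * two halves (rotate by two), and 0x93 rotates
			 * by three.
			 */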
			s1 = _mm_shuffle_epi32(s1, 0x39);
			s2 = _mm_shuffle_epi32(s2, 0x4E);
			s3 = _mm_shuffle_epi32(s3, 0x93);

			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 16),
				_mm_srli_epi32(s3, 16));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 12),
				_mm_srli_epi32(s1, 20));

			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 8),
				_mm_srli_epi32(s3, 24));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 7),
				_mm_srli_epi32(s1, 25));

			/*
			 * After the odd round, we rotate back the values
			 * to undo the rotate at the start of the odd round.
			 */
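			/*
			 * 0x93 is the inverse permutation of 0x39 (and
			 * conversely), while 0x4E is its own inverse, so
			 * the words return to their column positions.
			 */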
			s1 = _mm_shuffle_epi32(s1, 0x93);
			s2 = _mm_shuffle_epi32(s2, 0x4E);
			s3 = _mm_shuffle_epi32(s3, 0x39);
		}

		/*
		 * Addition with the initial state.
		 */
		s0 = _mm_add_epi32(s0, cw);
		s1 = _mm_add_epi32(s1, kw0);
		s2 = _mm_add_epi32(s2, kw1);
		s3 = _mm_add_epi32(s3, iw);

		/*
		 * Increment block counter.
		 */
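		/*
		 * Only the low word of 'one' is non-zero, so this
		 * increments the 32-bit block counter in the low lane
		 * of iw and leaves the IV words untouched.
		 */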
		iw = _mm_add_epi32(iw, one);

		/*
		 * XOR the final state (keystream block) with the data.
		 * A trailing partial block goes through a stack buffer.
		 */
		if (len < 64) {
			unsigned char tmp[64];
			size_t u;

			_mm_storeu_si128((void *)(tmp +  0), s0);
			_mm_storeu_si128((void *)(tmp + 16), s1);
			_mm_storeu_si128((void *)(tmp + 32), s2);
			_mm_storeu_si128((void *)(tmp + 48), s3);
			for (u = 0; u < len; u ++) {
				buf[u] ^= tmp[u];
			}
			break;
		} else {
			__m128i b0, b1, b2, b3;

			b0 = _mm_loadu_si128((const void *)(buf +  0));
			b1 = _mm_loadu_si128((const void *)(buf + 16));
			b2 = _mm_loadu_si128((const void *)(buf + 32));
			b3 = _mm_loadu_si128((const void *)(buf + 48));
			b0 = _mm_xor_si128(b0, s0);
			b1 = _mm_xor_si128(b1, s1);
			b2 = _mm_xor_si128(b2, s2);
			b3 = _mm_xor_si128(b3, s3);
			_mm_storeu_si128((void *)(buf +  0), b0);
			_mm_storeu_si128((void *)(buf + 16), b1);
			_mm_storeu_si128((void *)(buf + 32), b2);
			_mm_storeu_si128((void *)(buf + 48), b3);
			buf += 64;
			len -= 64;
		}
	}

	/*
	 * _mm_extract_epi32() requires SSE4.1. We prefer to stick to
	 * raw SSE2, thus we use _mm_extract_epi16().
	 */
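	/*
	 * The two 16-bit extractions rebuild the low 32-bit lane of iw,
	 * which is the updated block counter returned to the caller.
	 */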
	return (uint32_t)_mm_extract_epi16(iw, 0)
		| ((uint32_t)_mm_extract_epi16(iw, 1) << 16);
}

BR_TARGETS_X86_DOWN

#else

/* see bearssl_block.h */
br_chacha20_run
br_chacha20_sse2_get(void)
{
	return 0;
}

#endif