1/*
2   BLAKE2 reference source code package - optimized C implementations
3
4   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
5
6   To the extent possible under law, the author(s) have dedicated all copyright
7   and related and neighboring rights to this software to the public domain
8   worldwide. This software is distributed without any warranty.
9
10   You should have received a copy of the CC0 Public Domain Dedication along with
11   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
12*/
13#pragma once
14#ifndef __BLAKE2B_LOAD_SSE41_H__
15#define __BLAKE2B_LOAD_SSE41_H__
16
/* LOAD_MSG_0_1: b0 = { m0.lo, m1.lo }, b1 = { m2.lo, m3.lo }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_0_1(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m0, m1); \
    (b1) = _mm_unpacklo_epi64(m2, m3); \
  } while (0)
23
24
/* LOAD_MSG_0_2: b0 = { m0.hi, m1.hi }, b1 = { m2.hi, m3.hi }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_0_2(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m0, m1); \
    (b1) = _mm_unpackhi_epi64(m2, m3); \
  } while (0)
31
32
/* LOAD_MSG_0_3: b0 = { m4.lo, m5.lo }, b1 = { m6.lo, m7.lo }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_0_3(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m4, m5); \
    (b1) = _mm_unpacklo_epi64(m6, m7); \
  } while (0)
39
40
/* LOAD_MSG_0_4: b0 = { m4.hi, m5.hi }, b1 = { m6.hi, m7.hi }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_0_4(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m4, m5); \
    (b1) = _mm_unpackhi_epi64(m6, m7); \
  } while (0)
47
48
/* LOAD_MSG_1_1: b0 = { m7.lo, m2.lo }, b1 = { m4.hi, m6.hi }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_1_1(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m7, m2); \
    (b1) = _mm_unpackhi_epi64(m4, m6); \
  } while (0)
55
56
/* LOAD_MSG_1_2: b0 = { m5.lo, m4.lo }, b1 = { m7.hi, m3.lo }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   The 8-byte alignr takes the high half of its second operand followed by
   the low half of its first.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_1_2(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m5, m4); \
    (b1) = _mm_alignr_epi8(m3, m7, 8); \
  } while (0)
63
64
/* LOAD_MSG_1_3: b0 = { m0.hi, m0.lo }, b1 = { m5.hi, m2.hi }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   The (1,0,3,2) shuffle exchanges the two 64-bit halves of m0.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_1_3(b0, b1) \
  do { \
    (b0) = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
    (b1) = _mm_unpackhi_epi64(m5, m2); \
  } while (0)
71
72
/* LOAD_MSG_1_4: b0 = { m6.lo, m1.lo }, b1 = { m3.hi, m1.hi }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_1_4(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m6, m1); \
    (b1) = _mm_unpackhi_epi64(m3, m1); \
  } while (0)
79
80
/* LOAD_MSG_2_1: b0 = { m5.hi, m6.lo }, b1 = { m2.hi, m7.hi }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   The 8-byte alignr takes the high half of its second operand followed by
   the low half of its first.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_2_1(b0, b1) \
  do { \
    (b0) = _mm_alignr_epi8(m6, m5, 8); \
    (b1) = _mm_unpackhi_epi64(m2, m7); \
  } while (0)
87
88
/* LOAD_MSG_2_2: b0 = { m4.lo, m0.lo }, b1 = { m1.lo, m6.hi }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   Blend mask 0xF0 keeps the low half of the first operand and takes the
   high half of the second.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_2_2(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m4, m0); \
    (b1) = _mm_blend_epi16(m1, m6, 0xF0); \
  } while (0)
95
96
/* LOAD_MSG_2_3: b0 = { m5.lo, m1.hi }, b1 = { m3.hi, m4.hi }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   Blend mask 0xF0 keeps the low half of the first operand and takes the
   high half of the second.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_2_3(b0, b1) \
  do { \
    (b0) = _mm_blend_epi16(m5, m1, 0xF0); \
    (b1) = _mm_unpackhi_epi64(m3, m4); \
  } while (0)
103
104
/* LOAD_MSG_2_4: b0 = { m7.lo, m3.lo }, b1 = { m0.hi, m2.lo }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   The 8-byte alignr takes the high half of its second operand followed by
   the low half of its first.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_2_4(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m7, m3); \
    (b1) = _mm_alignr_epi8(m2, m0, 8); \
  } while (0)
111
112
/* LOAD_MSG_3_1: b0 = { m3.hi, m1.hi }, b1 = { m6.hi, m5.hi }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_3_1(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m3, m1); \
    (b1) = _mm_unpackhi_epi64(m6, m5); \
  } while (0)
119
120
/* LOAD_MSG_3_2: b0 = { m4.hi, m0.hi }, b1 = { m6.lo, m7.lo }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_3_2(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m4, m0); \
    (b1) = _mm_unpacklo_epi64(m6, m7); \
  } while (0)
127
128
/* LOAD_MSG_3_3: b0 = { m1.lo, m2.hi }, b1 = { m2.lo, m7.hi }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   Blend mask 0xF0 keeps the low half of the first operand and takes the
   high half of the second.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_3_3(b0, b1) \
  do { \
    (b0) = _mm_blend_epi16(m1, m2, 0xF0); \
    (b1) = _mm_blend_epi16(m2, m7, 0xF0); \
  } while (0)
135
136
/* LOAD_MSG_3_4: b0 = { m3.lo, m5.lo }, b1 = { m0.lo, m4.lo }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_3_4(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m3, m5); \
    (b1) = _mm_unpacklo_epi64(m0, m4); \
  } while (0)
143
144
/* LOAD_MSG_4_1: b0 = { m4.hi, m2.hi }, b1 = { m1.lo, m5.lo }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_4_1(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m4, m2); \
    (b1) = _mm_unpacklo_epi64(m1, m5); \
  } while (0)
151
152
/* LOAD_MSG_4_2: b0 = { m0.lo, m3.hi }, b1 = { m2.lo, m7.hi }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   Blend mask 0xF0 keeps the low half of the first operand and takes the
   high half of the second.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_4_2(b0, b1) \
  do { \
    (b0) = _mm_blend_epi16(m0, m3, 0xF0); \
    (b1) = _mm_blend_epi16(m2, m7, 0xF0); \
  } while (0)
159
160
/* LOAD_MSG_4_3: b0 = { m7.lo, m5.hi }, b1 = { m3.lo, m1.hi }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   Blend mask 0xF0 keeps the low half of the first operand and takes the
   high half of the second.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_4_3(b0, b1) \
  do { \
    (b0) = _mm_blend_epi16(m7, m5, 0xF0); \
    (b1) = _mm_blend_epi16(m3, m1, 0xF0); \
  } while (0)
167
168
/* LOAD_MSG_4_4: b0 = { m0.hi, m6.lo }, b1 = { m4.lo, m6.hi }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   The 8-byte alignr takes the high half of its second operand followed by
   the low half of its first; blend mask 0xF0 keeps the low half of the
   first operand and takes the high half of the second.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_4_4(b0, b1) \
  do { \
    (b0) = _mm_alignr_epi8(m6, m0, 8); \
    (b1) = _mm_blend_epi16(m4, m6, 0xF0); \
  } while (0)
175
176
/* LOAD_MSG_5_1: b0 = { m1.lo, m3.lo }, b1 = { m0.lo, m4.lo }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_5_1(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m1, m3); \
    (b1) = _mm_unpacklo_epi64(m0, m4); \
  } while (0)
183
184
/* LOAD_MSG_5_2: b0 = { m6.lo, m5.lo }, b1 = { m5.hi, m1.hi }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_5_2(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m6, m5); \
    (b1) = _mm_unpackhi_epi64(m5, m1); \
  } while (0)
191
192
/* LOAD_MSG_5_3: b0 = { m2.lo, m3.hi }, b1 = { m7.hi, m0.hi }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   Blend mask 0xF0 keeps the low half of the first operand and takes the
   high half of the second.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_5_3(b0, b1) \
  do { \
    (b0) = _mm_blend_epi16(m2, m3, 0xF0); \
    (b1) = _mm_unpackhi_epi64(m7, m0); \
  } while (0)
199
200
/* LOAD_MSG_5_4: b0 = { m6.hi, m2.hi }, b1 = { m7.lo, m4.hi }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   Blend mask 0xF0 keeps the low half of the first operand and takes the
   high half of the second.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_5_4(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m6, m2); \
    (b1) = _mm_blend_epi16(m7, m4, 0xF0); \
  } while (0)
207
208
/* LOAD_MSG_6_1: b0 = { m6.lo, m0.hi }, b1 = { m7.lo, m2.lo }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   Blend mask 0xF0 keeps the low half of the first operand and takes the
   high half of the second.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_6_1(b0, b1) \
  do { \
    (b0) = _mm_blend_epi16(m6, m0, 0xF0); \
    (b1) = _mm_unpacklo_epi64(m7, m2); \
  } while (0)
215
216
/* LOAD_MSG_6_2: b0 = { m2.hi, m7.hi }, b1 = { m6.hi, m5.lo }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   The 8-byte alignr takes the high half of its second operand followed by
   the low half of its first.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_6_2(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m2, m7); \
    (b1) = _mm_alignr_epi8(m5, m6, 8); \
  } while (0)
223
224
/* LOAD_MSG_6_3: b0 = { m0.lo, m3.lo }, b1 = { m4.hi, m4.lo }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   The (1,0,3,2) shuffle exchanges the two 64-bit halves of m4.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_6_3(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m0, m3); \
    (b1) = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \
  } while (0)
231
232
/* LOAD_MSG_6_4: b0 = { m3.hi, m1.hi }, b1 = { m1.lo, m5.hi }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   Blend mask 0xF0 keeps the low half of the first operand and takes the
   high half of the second.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_6_4(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m3, m1); \
    (b1) = _mm_blend_epi16(m1, m5, 0xF0); \
  } while (0)
239
240
/* LOAD_MSG_7_1: b0 = { m6.hi, m3.hi }, b1 = { m6.lo, m1.hi }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   Blend mask 0xF0 keeps the low half of the first operand and takes the
   high half of the second.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_7_1(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m6, m3); \
    (b1) = _mm_blend_epi16(m6, m1, 0xF0); \
  } while (0)
247
248
/* LOAD_MSG_7_2: b0 = { m5.hi, m7.lo }, b1 = { m0.hi, m4.hi }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   The 8-byte alignr takes the high half of its second operand followed by
   the low half of its first.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_7_2(b0, b1) \
  do { \
    (b0) = _mm_alignr_epi8(m7, m5, 8); \
    (b1) = _mm_unpackhi_epi64(m0, m4); \
  } while (0)
255
256
/* LOAD_MSG_7_3: b0 = { m2.hi, m7.hi }, b1 = { m4.lo, m1.lo }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_7_3(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m2, m7); \
    (b1) = _mm_unpacklo_epi64(m4, m1); \
  } while (0)
263
264
/* LOAD_MSG_7_4: b0 = { m0.lo, m2.lo }, b1 = { m3.lo, m5.lo }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_7_4(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m0, m2); \
    (b1) = _mm_unpacklo_epi64(m3, m5); \
  } while (0)
271
272
/* LOAD_MSG_8_1: b0 = { m3.lo, m7.lo }, b1 = { m5.hi, m0.lo }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   The 8-byte alignr takes the high half of its second operand followed by
   the low half of its first.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_8_1(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m3, m7); \
    (b1) = _mm_alignr_epi8(m0, m5, 8); \
  } while (0)
279
280
/* LOAD_MSG_8_2: b0 = { m7.hi, m4.hi }, b1 = { m1.hi, m4.lo }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   The 8-byte alignr takes the high half of its second operand followed by
   the low half of its first.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_8_2(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m7, m4); \
    (b1) = _mm_alignr_epi8(m4, m1, 8); \
  } while (0)
287
288
/* LOAD_MSG_8_3: b0 = m6 (copied unchanged), b1 = { m0.hi, m5.lo }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   The 8-byte alignr takes the high half of its second operand followed by
   the low half of its first.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_8_3(b0, b1) \
  do { \
    (b0) = m6; \
    (b1) = _mm_alignr_epi8(m5, m0, 8); \
  } while (0)
295
296
/* LOAD_MSG_8_4: b0 = { m1.lo, m3.hi }, b1 = m2 (copied unchanged).
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   Blend mask 0xF0 keeps the low half of the first operand and takes the
   high half of the second.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_8_4(b0, b1) \
  do { \
    (b0) = _mm_blend_epi16(m1, m3, 0xF0); \
    (b1) = m2; \
  } while (0)
303
304
/* LOAD_MSG_9_1: b0 = { m5.lo, m4.lo }, b1 = { m3.hi, m0.hi }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_9_1(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m5, m4); \
    (b1) = _mm_unpackhi_epi64(m3, m0); \
  } while (0)
311
312
/* LOAD_MSG_9_2: b0 = { m1.lo, m2.lo }, b1 = { m3.lo, m2.hi }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   Blend mask 0xF0 keeps the low half of the first operand and takes the
   high half of the second.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_9_2(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m1, m2); \
    (b1) = _mm_blend_epi16(m3, m2, 0xF0); \
  } while (0)
319
320
/* LOAD_MSG_9_3: b0 = { m7.hi, m4.hi }, b1 = { m1.hi, m6.hi }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_9_3(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m7, m4); \
    (b1) = _mm_unpackhi_epi64(m1, m6); \
  } while (0)
327
328
/* LOAD_MSG_9_4: b0 = { m5.hi, m7.lo }, b1 = { m6.lo, m0.lo }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   The 8-byte alignr takes the high half of its second operand followed by
   the low half of its first.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_9_4(b0, b1) \
  do { \
    (b0) = _mm_alignr_epi8(m7, m5, 8); \
    (b1) = _mm_unpacklo_epi64(m6, m0); \
  } while (0)
335
336
/* LOAD_MSG_10_1: b0 = { m0.lo, m1.lo }, b1 = { m2.lo, m3.lo }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   Expands to the same two assignments as LOAD_MSG_0_1.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_10_1(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m0, m1); \
    (b1) = _mm_unpacklo_epi64(m2, m3); \
  } while (0)
343
344
/* LOAD_MSG_10_2: b0 = { m0.hi, m1.hi }, b1 = { m2.hi, m3.hi }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   Expands to the same two assignments as LOAD_MSG_0_2.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_10_2(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m0, m1); \
    (b1) = _mm_unpackhi_epi64(m2, m3); \
  } while (0)
351
352
/* LOAD_MSG_10_3: b0 = { m4.lo, m5.lo }, b1 = { m6.lo, m7.lo }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   Expands to the same two assignments as LOAD_MSG_0_3.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_10_3(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m4, m5); \
    (b1) = _mm_unpacklo_epi64(m6, m7); \
  } while (0)
359
360
/* LOAD_MSG_10_4: b0 = { m4.hi, m5.hi }, b1 = { m6.hi, m7.hi }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   Expands to the same two assignments as LOAD_MSG_0_4.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_10_4(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m4, m5); \
    (b1) = _mm_unpackhi_epi64(m6, m7); \
  } while (0)
367
368
/* LOAD_MSG_11_1: b0 = { m7.lo, m2.lo }, b1 = { m4.hi, m6.hi }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   Expands to the same two assignments as LOAD_MSG_1_1.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_11_1(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m7, m2); \
    (b1) = _mm_unpackhi_epi64(m4, m6); \
  } while (0)
375
376
/* LOAD_MSG_11_2: b0 = { m5.lo, m4.lo }, b1 = { m7.hi, m3.lo }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   The 8-byte alignr takes the high half of its second operand followed by
   the low half of its first.
   Expands to the same two assignments as LOAD_MSG_1_2.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_11_2(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m5, m4); \
    (b1) = _mm_alignr_epi8(m3, m7, 8); \
  } while (0)
383
384
/* LOAD_MSG_11_3: b0 = { m0.hi, m0.lo }, b1 = { m5.hi, m2.hi }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   The (1,0,3,2) shuffle exchanges the two 64-bit halves of m0.
   Expands to the same two assignments as LOAD_MSG_1_3.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_11_3(b0, b1) \
  do { \
    (b0) = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
    (b1) = _mm_unpackhi_epi64(m5, m2); \
  } while (0)
391
392
/* LOAD_MSG_11_4: b0 = { m6.lo, m1.lo }, b1 = { m3.hi, m1.hi }.
   Lanes are the 64-bit halves of each 128-bit value, written { low, high }.
   Expands to the same two assignments as LOAD_MSG_1_4.
   The mN operands are not parameters; they are taken from the expansion site. */
#define LOAD_MSG_11_4(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m6, m1); \
    (b1) = _mm_unpackhi_epi64(m3, m1); \
  } while (0)
399
400
401#endif
402
403