1#ifndef blake2b_load_avx2_H
2#define blake2b_load_avx2_H
3
/*
 * BLAKE2B_LOAD_MSG_0_1(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m0[0], m1[0], m2[2], m3[2] }.
 * NOTE(review): the name suggests round 0, load 1 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_0_1(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m0, m1); \
        t1 = _mm256_unpacklo_epi64(m2, m3); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
10
/*
 * BLAKE2B_LOAD_MSG_0_2(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m0[1], m1[1], m2[3], m3[3] }.
 * NOTE(review): the name suggests round 0, load 2 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_0_2(b0) \
    do { \
        t0 = _mm256_unpackhi_epi64(m0, m1); \
        t1 = _mm256_unpackhi_epi64(m2, m3); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
17
/*
 * BLAKE2B_LOAD_MSG_0_3(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m4[0], m5[0], m6[2], m7[2] }.
 * NOTE(review): the name suggests round 0, load 3 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_0_3(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m4, m5); \
        t1 = _mm256_unpacklo_epi64(m6, m7); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
24
/*
 * BLAKE2B_LOAD_MSG_0_4(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m4[1], m5[1], m6[3], m7[3] }.
 * NOTE(review): the name suggests round 0, load 4 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_0_4(b0) \
    do { \
        t0 = _mm256_unpackhi_epi64(m4, m5); \
        t1 = _mm256_unpackhi_epi64(m6, m7); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
31
/*
 * BLAKE2B_LOAD_MSG_1_1(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m7[0], m2[0], m4[3], m6[3] }.
 * NOTE(review): the name suggests round 1, load 1 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_1_1(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m7, m2); \
        t1 = _mm256_unpackhi_epi64(m4, m6); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
38
/*
 * BLAKE2B_LOAD_MSG_1_2(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m5[0], m4[0], m7[3], m3[2] }.
 * NOTE(review): the name suggests round 1, load 2 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_1_2(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m5, m4); \
        t1 = _mm256_alignr_epi8(m3, m7, 8); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
45
/*
 * BLAKE2B_LOAD_MSG_1_3(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * The shuffle swaps the two 64-bit halves inside each 128-bit lane of m0.
 * As 64-bit elements, lowest first: b0 = { m0[1], m0[0], m5[3], m2[3] }.
 * NOTE(review): the name suggests round 1, load 3 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_1_3(b0) \
    do { \
        t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
        t1 = _mm256_unpackhi_epi64(m5, m2); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
52
/*
 * BLAKE2B_LOAD_MSG_1_4(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m6[0], m1[0], m3[3], m1[3] }.
 * NOTE(review): the name suggests round 1, load 4 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_1_4(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m6, m1); \
        t1 = _mm256_unpackhi_epi64(m3, m1); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
59
/*
 * BLAKE2B_LOAD_MSG_2_1(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m5[1], m6[0], m2[3], m7[3] }.
 * NOTE(review): the name suggests round 2, load 1 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_2_1(b0) \
    do { \
        t0 = _mm256_alignr_epi8(m6, m5, 8); \
        t1 = _mm256_unpackhi_epi64(m2, m7); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
66
/*
 * BLAKE2B_LOAD_MSG_2_2(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m4[0], m0[0], m1[2], m6[3] }.
 * NOTE(review): the name suggests round 2, load 2 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_2_2(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m4, m0); \
        t1 = _mm256_blend_epi32(m6, m1, 0x33); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
73
/*
 * BLAKE2B_LOAD_MSG_2_3(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m5[0], m1[1], m3[3], m4[3] }.
 * NOTE(review): the name suggests round 2, load 3 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_2_3(b0) \
    do { \
        t0 = _mm256_blend_epi32(m1, m5, 0x33); \
        t1 = _mm256_unpackhi_epi64(m3, m4); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
80
/*
 * BLAKE2B_LOAD_MSG_2_4(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m7[0], m3[0], m0[3], m2[2] }.
 * NOTE(review): the name suggests round 2, load 4 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_2_4(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m7, m3); \
        t1 = _mm256_alignr_epi8(m2, m0, 8); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
87
/*
 * BLAKE2B_LOAD_MSG_3_1(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m3[1], m1[1], m6[3], m5[3] }.
 * NOTE(review): the name suggests round 3, load 1 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_3_1(b0) \
    do { \
        t0 = _mm256_unpackhi_epi64(m3, m1); \
        t1 = _mm256_unpackhi_epi64(m6, m5); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
94
/*
 * BLAKE2B_LOAD_MSG_3_2(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m4[1], m0[1], m6[2], m7[2] }.
 * NOTE(review): the name suggests round 3, load 2 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_3_2(b0) \
    do { \
        t0 = _mm256_unpackhi_epi64(m4, m0); \
        t1 = _mm256_unpacklo_epi64(m6, m7); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
101
/*
 * BLAKE2B_LOAD_MSG_3_3(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m1[0], m2[1], m2[2], m7[3] }.
 * NOTE(review): the name suggests round 3, load 3 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_3_3(b0) \
    do { \
        t0 = _mm256_blend_epi32(m2, m1, 0x33); \
        t1 = _mm256_blend_epi32(m7, m2, 0x33); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
108
/*
 * BLAKE2B_LOAD_MSG_3_4(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m3[0], m5[0], m0[2], m4[2] }.
 * NOTE(review): the name suggests round 3, load 4 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_3_4(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m3, m5); \
        t1 = _mm256_unpacklo_epi64(m0, m4); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
115
/*
 * BLAKE2B_LOAD_MSG_4_1(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m4[1], m2[1], m1[2], m5[2] }.
 * NOTE(review): the name suggests round 4, load 1 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_4_1(b0) \
    do { \
        t0 = _mm256_unpackhi_epi64(m4, m2); \
        t1 = _mm256_unpacklo_epi64(m1, m5); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
122
/*
 * BLAKE2B_LOAD_MSG_4_2(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m0[0], m3[1], m2[2], m7[3] }.
 * NOTE(review): the name suggests round 4, load 2 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_4_2(b0) \
    do { \
        t0 = _mm256_blend_epi32(m3, m0, 0x33); \
        t1 = _mm256_blend_epi32(m7, m2, 0x33); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
129
/*
 * BLAKE2B_LOAD_MSG_4_3(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m7[0], m5[1], m3[2], m1[3] }.
 * NOTE(review): the name suggests round 4, load 3 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_4_3(b0) \
    do { \
        t0 = _mm256_blend_epi32(m5, m7, 0x33); \
        t1 = _mm256_blend_epi32(m1, m3, 0x33); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
136
/*
 * BLAKE2B_LOAD_MSG_4_4(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m0[1], m6[0], m4[2], m6[3] }.
 * NOTE(review): the name suggests round 4, load 4 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_4_4(b0) \
    do { \
        t0 = _mm256_alignr_epi8(m6, m0, 8); \
        t1 = _mm256_blend_epi32(m6, m4, 0x33); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
143
/*
 * BLAKE2B_LOAD_MSG_5_1(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m1[0], m3[0], m0[2], m4[2] }.
 * NOTE(review): the name suggests round 5, load 1 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_5_1(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m1, m3); \
        t1 = _mm256_unpacklo_epi64(m0, m4); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
150
/*
 * BLAKE2B_LOAD_MSG_5_2(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m6[0], m5[0], m5[3], m1[3] }.
 * NOTE(review): the name suggests round 5, load 2 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_5_2(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m6, m5); \
        t1 = _mm256_unpackhi_epi64(m5, m1); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
157
/*
 * BLAKE2B_LOAD_MSG_5_3(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m2[0], m3[1], m7[3], m0[3] }.
 * NOTE(review): the name suggests round 5, load 3 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_5_3(b0) \
    do { \
        t0 = _mm256_blend_epi32(m3, m2, 0x33); \
        t1 = _mm256_unpackhi_epi64(m7, m0); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
164
/*
 * BLAKE2B_LOAD_MSG_5_4(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m6[1], m2[1], m7[2], m4[3] }.
 * NOTE(review): the name suggests round 5, load 4 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_5_4(b0) \
    do { \
        t0 = _mm256_unpackhi_epi64(m6, m2); \
        t1 = _mm256_blend_epi32(m4, m7, 0x33); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
171
/*
 * BLAKE2B_LOAD_MSG_6_1(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m6[0], m0[1], m7[2], m2[2] }.
 * NOTE(review): the name suggests round 6, load 1 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_6_1(b0) \
    do { \
        t0 = _mm256_blend_epi32(m0, m6, 0x33); \
        t1 = _mm256_unpacklo_epi64(m7, m2); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
178
/*
 * BLAKE2B_LOAD_MSG_6_2(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m2[1], m7[1], m6[3], m5[2] }.
 * NOTE(review): the name suggests round 6, load 2 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_6_2(b0) \
    do { \
        t0 = _mm256_unpackhi_epi64(m2, m7); \
        t1 = _mm256_alignr_epi8(m5, m6, 8); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
185
/*
 * BLAKE2B_LOAD_MSG_6_3(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * The shuffle swaps the two 64-bit halves inside each 128-bit lane of m4.
 * As 64-bit elements, lowest first: b0 = { m0[0], m3[0], m4[3], m4[2] }.
 * NOTE(review): the name suggests round 6, load 3 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_6_3(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m0, m3); \
        t1 = _mm256_shuffle_epi32(m4, _MM_SHUFFLE(1, 0, 3, 2)); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
192
/*
 * BLAKE2B_LOAD_MSG_6_4(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m3[1], m1[1], m1[2], m5[3] }.
 * NOTE(review): the name suggests round 6, load 4 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_6_4(b0) \
    do { \
        t0 = _mm256_unpackhi_epi64(m3, m1); \
        t1 = _mm256_blend_epi32(m5, m1, 0x33); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
199
/*
 * BLAKE2B_LOAD_MSG_7_1(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m6[1], m3[1], m6[2], m1[3] }.
 * NOTE(review): the name suggests round 7, load 1 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_7_1(b0) \
    do { \
        t0 = _mm256_unpackhi_epi64(m6, m3); \
        t1 = _mm256_blend_epi32(m1, m6, 0x33); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
206
/*
 * BLAKE2B_LOAD_MSG_7_2(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m5[1], m7[0], m0[3], m4[3] }.
 * NOTE(review): the name suggests round 7, load 2 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_7_2(b0) \
    do { \
        t0 = _mm256_alignr_epi8(m7, m5, 8); \
        t1 = _mm256_unpackhi_epi64(m0, m4); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
213
/*
 * BLAKE2B_LOAD_MSG_7_3(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m2[1], m7[1], m4[2], m1[2] }.
 * NOTE(review): the name suggests round 7, load 3 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_7_3(b0) \
    do { \
        t0 = _mm256_unpackhi_epi64(m2, m7); \
        t1 = _mm256_unpacklo_epi64(m4, m1); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
220
/*
 * BLAKE2B_LOAD_MSG_7_4(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m0[0], m2[0], m3[2], m5[2] }.
 * NOTE(review): the name suggests round 7, load 4 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_7_4(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m0, m2); \
        t1 = _mm256_unpacklo_epi64(m3, m5); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
227
/*
 * BLAKE2B_LOAD_MSG_8_1(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m3[0], m7[0], m5[3], m0[2] }.
 * NOTE(review): the name suggests round 8, load 1 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_8_1(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m3, m7); \
        t1 = _mm256_alignr_epi8(m0, m5, 8); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
234
/*
 * BLAKE2B_LOAD_MSG_8_2(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m7[1], m4[1], m1[3], m4[2] }.
 * NOTE(review): the name suggests round 8, load 2 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_8_2(b0) \
    do { \
        t0 = _mm256_unpackhi_epi64(m7, m4); \
        t1 = _mm256_alignr_epi8(m4, m1, 8); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
241
/*
 * BLAKE2B_LOAD_MSG_8_3(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * t0 is a plain copy of m6, so the low half of b0 is m6 unchanged.
 * As 64-bit elements, lowest first: b0 = { m6[0], m6[1], m0[3], m5[2] }.
 * NOTE(review): the name suggests round 8, load 3 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_8_3(b0) \
    do { \
        t0 = m6; \
        t1 = _mm256_alignr_epi8(m5, m0, 8); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
248
/*
 * BLAKE2B_LOAD_MSG_8_4(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * t1 is a plain copy of m2, so the high half of b0 is m2 unchanged.
 * As 64-bit elements, lowest first: b0 = { m1[0], m3[1], m2[2], m2[3] }.
 * NOTE(review): the name suggests round 8, load 4 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_8_4(b0) \
    do { \
        t0 = _mm256_blend_epi32(m3, m1, 0x33); \
        t1 = m2; \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
255
/*
 * BLAKE2B_LOAD_MSG_9_1(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m5[0], m4[0], m3[3], m0[3] }.
 * NOTE(review): the name suggests round 9, load 1 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_9_1(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m5, m4); \
        t1 = _mm256_unpackhi_epi64(m3, m0); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
262
/*
 * BLAKE2B_LOAD_MSG_9_2(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m1[0], m2[0], m3[2], m2[3] }.
 * NOTE(review): the name suggests round 9, load 2 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_9_2(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m1, m2); \
        t1 = _mm256_blend_epi32(m2, m3, 0x33); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
269
/*
 * BLAKE2B_LOAD_MSG_9_3(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m7[1], m4[1], m1[3], m6[3] }.
 * NOTE(review): the name suggests round 9, load 3 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_9_3(b0) \
    do { \
        t0 = _mm256_unpackhi_epi64(m7, m4); \
        t1 = _mm256_unpackhi_epi64(m1, m6); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
276
/*
 * BLAKE2B_LOAD_MSG_9_4(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m5[1], m7[0], m6[2], m0[2] }.
 * NOTE(review): the name suggests round 9, load 4 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_9_4(b0) \
    do { \
        t0 = _mm256_alignr_epi8(m7, m5, 8); \
        t1 = _mm256_unpacklo_epi64(m6, m0); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
283
/*
 * BLAKE2B_LOAD_MSG_10_1(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m0[0], m1[0], m2[2], m3[2] }.
 * The expansion is the same as that of BLAKE2B_LOAD_MSG_0_1.
 * NOTE(review): the name suggests round 10, load 1 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_10_1(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m0, m1); \
        t1 = _mm256_unpacklo_epi64(m2, m3); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
290
/*
 * BLAKE2B_LOAD_MSG_10_2(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m0[1], m1[1], m2[3], m3[3] }.
 * The expansion is the same as that of BLAKE2B_LOAD_MSG_0_2.
 * NOTE(review): the name suggests round 10, load 2 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_10_2(b0) \
    do { \
        t0 = _mm256_unpackhi_epi64(m0, m1); \
        t1 = _mm256_unpackhi_epi64(m2, m3); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
297
/*
 * BLAKE2B_LOAD_MSG_10_3(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m4[0], m5[0], m6[2], m7[2] }.
 * The expansion is the same as that of BLAKE2B_LOAD_MSG_0_3.
 * NOTE(review): the name suggests round 10, load 3 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_10_3(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m4, m5); \
        t1 = _mm256_unpacklo_epi64(m6, m7); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
304
/*
 * BLAKE2B_LOAD_MSG_10_4(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m4[1], m5[1], m6[3], m7[3] }.
 * The expansion is the same as that of BLAKE2B_LOAD_MSG_0_4.
 * NOTE(review): the name suggests round 10, load 4 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_10_4(b0) \
    do { \
        t0 = _mm256_unpackhi_epi64(m4, m5); \
        t1 = _mm256_unpackhi_epi64(m6, m7); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
311
/*
 * BLAKE2B_LOAD_MSG_11_1(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m7[0], m2[0], m4[3], m6[3] }.
 * The expansion is the same as that of BLAKE2B_LOAD_MSG_1_1.
 * NOTE(review): the name suggests round 11, load 1 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_11_1(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m7, m2); \
        t1 = _mm256_unpackhi_epi64(m4, m6); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
318
/*
 * BLAKE2B_LOAD_MSG_11_2(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m5[0], m4[0], m7[3], m3[2] }.
 * The expansion is the same as that of BLAKE2B_LOAD_MSG_1_2.
 * NOTE(review): the name suggests round 11, load 2 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_11_2(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m5, m4); \
        t1 = _mm256_alignr_epi8(m3, m7, 8); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
325
/*
 * BLAKE2B_LOAD_MSG_11_3(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * The shuffle swaps the two 64-bit halves inside each 128-bit lane of m0.
 * As 64-bit elements, lowest first: b0 = { m0[1], m0[0], m5[3], m2[3] }.
 * The expansion is the same as that of BLAKE2B_LOAD_MSG_1_3.
 * NOTE(review): the name suggests round 11, load 3 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_11_3(b0) \
    do { \
        t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
        t1 = _mm256_unpackhi_epi64(m5, m2); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
332
/*
 * BLAKE2B_LOAD_MSG_11_4(b0): build a 256-bit vector in b0.
 * Operates on the mN inputs named below and overwrites the scratch
 * variables t0 and t1; all of them are taken from the expansion site.
 * As 64-bit elements, lowest first: b0 = { m6[0], m1[0], m3[3], m1[3] }.
 * The expansion is the same as that of BLAKE2B_LOAD_MSG_1_4.
 * NOTE(review): the name suggests round 11, load 4 of the BLAKE2b message
 * schedule -- confirm against the including file.
 */
#define BLAKE2B_LOAD_MSG_11_4(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m6, m1); \
        t1 = _mm256_unpackhi_epi64(m3, m1); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)
339
340#endif
341