if (bytes >= 256) {
    __m128i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14,
        y15;
    __m128i z0, z1, z2, z3, z4, z5, z6, z7, z8, z9, z10, z11, z12, z13, z14,
        z15;
    __m128i orig0, orig1, orig2, orig3, orig4, orig5, orig6, orig7, orig8,
        orig9, orig10, orig11, orig12, orig13, orig14, orig15;

    uint32_t in8;
    uint32_t in9;
    int      i;

    /* element broadcast immediates for _mm_shuffle_epi32 are, in order:
       0x00, 0x55, 0xaa, 0xff */
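    /* Each zN register below ends up holding state word N replicated in all
     * four 32-bit lanes, so the rounds compute four consecutive 64-byte
     * blocks in parallel. x[] is stored in a permuted order (the diagonal
     * words 0, 5, 10, 15 sit in x[0..3]), which is why the shuffle
     * immediates do not map index-for-index. */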
    z0  = _mm_loadu_si128((__m128i *) (x + 0));
    z5  = _mm_shuffle_epi32(z0, 0x55);
    z10 = _mm_shuffle_epi32(z0, 0xaa);
    z15 = _mm_shuffle_epi32(z0, 0xff);
    z0  = _mm_shuffle_epi32(z0, 0x00);
    z1  = _mm_loadu_si128((__m128i *) (x + 4));
    z6  = _mm_shuffle_epi32(z1, 0xaa);
    z11 = _mm_shuffle_epi32(z1, 0xff);
    z12 = _mm_shuffle_epi32(z1, 0x00);
    z1  = _mm_shuffle_epi32(z1, 0x55);
    z2  = _mm_loadu_si128((__m128i *) (x + 8));
    z7  = _mm_shuffle_epi32(z2, 0xff);
    z13 = _mm_shuffle_epi32(z2, 0x55);
    z2  = _mm_shuffle_epi32(z2, 0xaa);
    /* no z8 yet -> low word of the 64-bit block counter, filled in later */
    z3  = _mm_loadu_si128((__m128i *) (x + 12));
    z4  = _mm_shuffle_epi32(z3, 0x00);
    z14 = _mm_shuffle_epi32(z3, 0xaa);
    z3  = _mm_shuffle_epi32(z3, 0xff);
    /* no z9 yet -> high word of the 64-bit block counter, filled in later */
    orig0  = z0;
    orig1  = z1;
    orig2  = z2;
    orig3  = z3;
    orig4  = z4;
    orig5  = z5;
    orig6  = z6;
    orig7  = z7;
    orig10 = z10;
    orig11 = z11;
    orig12 = z12;
    orig13 = z13;
    orig14 = z14;
    orig15 = z15;
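    /* The origN copies hold the broadcast initial state; the ONEQUAD macros
     * below add them back to the round output (the Salsa20 feed-forward)
     * before the keystream is XORed with the message. */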

    while (bytes >= 256) {
        /* vector implementation for z8 and z9 */
        /* not sure if it helps for only 4 blocks */
        const __m128i addv8 = _mm_set_epi64x(1, 0);
        const __m128i addv9 = _mm_set_epi64x(3, 2);
        __m128i       t8, t9;
        uint64_t      in89;

        in8  = x[8];
        in9  = x[13];
        in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32);
        t8   = _mm_set1_epi64x(in89);
        t9   = _mm_set1_epi64x(in89);

        z8 = _mm_add_epi64(addv8, t8);
        z9 = _mm_add_epi64(addv9, t9);

        t8 = _mm_unpacklo_epi32(z8, z9);
        t9 = _mm_unpackhi_epi32(z8, z9);

        z8 = _mm_unpacklo_epi32(t8, t9);
        z9 = _mm_unpackhi_epi32(t8, t9);

        orig8 = z8;
        orig9 = z9;

        in89 += 4;

        x[8]  = in89 & 0xFFFFFFFF;
        x[13] = (in89 >> 32) & 0xFFFFFFFF;
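
        /* z8/z9 now hold, lane by lane, the low and high counter words for
         * blocks n, n+1, n+2, n+3: _mm_add_epi64 produces the four 64-bit
         * counter values and the unpack sequence splits them back into a
         * low-word vector (z8) and a high-word vector (z9). x[8]/x[13] are
         * advanced by 4 so the next outer iteration starts after these
         * blocks. */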

        z5  = orig5;
        z10 = orig10;
        z15 = orig15;
        z14 = orig14;
        z3  = orig3;
        z6  = orig6;
        z11 = orig11;
        z1  = orig1;

        z7  = orig7;
        z13 = orig13;
        z2  = orig2;
        z9  = orig9;
        z0  = orig0;
        z12 = orig12;
        z4  = orig4;
        z8  = orig8;

        for (i = 0; i < ROUNDS; i += 2) {
            /* the inner loop is a direct translation (regexp search/replace)
             * from the amd64-xmm6 ASM */
            __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13,
                r14, r15;

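            /* Each 7-instruction group below is one Salsa20 quarter-round
             * step, b ^= ROTL32(a + d, k), applied to four blocks at once.
             * SSE2 has no 32-bit rotate, so the rotate is emulated with a
             * left shift by k and a right shift by 32 - k, XORed in
             * separately. The rotation counts are 7, 9, 13 and 18. */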
            y4 = z12;
            y4 = _mm_add_epi32(y4, z0);
            r4 = y4;
            y4 = _mm_slli_epi32(y4, 7);
            z4 = _mm_xor_si128(z4, y4);
            r4 = _mm_srli_epi32(r4, 25);
            z4 = _mm_xor_si128(z4, r4);

            y9 = z1;
            y9 = _mm_add_epi32(y9, z5);
            r9 = y9;
            y9 = _mm_slli_epi32(y9, 7);
            z9 = _mm_xor_si128(z9, y9);
            r9 = _mm_srli_epi32(r9, 25);
            z9 = _mm_xor_si128(z9, r9);

            y8 = z0;
            y8 = _mm_add_epi32(y8, z4);
            r8 = y8;
            y8 = _mm_slli_epi32(y8, 9);
            z8 = _mm_xor_si128(z8, y8);
            r8 = _mm_srli_epi32(r8, 23);
            z8 = _mm_xor_si128(z8, r8);

            y13 = z5;
            y13 = _mm_add_epi32(y13, z9);
            r13 = y13;
            y13 = _mm_slli_epi32(y13, 9);
            z13 = _mm_xor_si128(z13, y13);
            r13 = _mm_srli_epi32(r13, 23);
            z13 = _mm_xor_si128(z13, r13);

            y12 = z4;
            y12 = _mm_add_epi32(y12, z8);
            r12 = y12;
            y12 = _mm_slli_epi32(y12, 13);
            z12 = _mm_xor_si128(z12, y12);
            r12 = _mm_srli_epi32(r12, 19);
            z12 = _mm_xor_si128(z12, r12);

            y1 = z9;
            y1 = _mm_add_epi32(y1, z13);
            r1 = y1;
            y1 = _mm_slli_epi32(y1, 13);
            z1 = _mm_xor_si128(z1, y1);
            r1 = _mm_srli_epi32(r1, 19);
            z1 = _mm_xor_si128(z1, r1);

            y0 = z8;
            y0 = _mm_add_epi32(y0, z12);
            r0 = y0;
            y0 = _mm_slli_epi32(y0, 18);
            z0 = _mm_xor_si128(z0, y0);
            r0 = _mm_srli_epi32(r0, 14);
            z0 = _mm_xor_si128(z0, r0);

            y5 = z13;
            y5 = _mm_add_epi32(y5, z1);
            r5 = y5;
            y5 = _mm_slli_epi32(y5, 18);
            z5 = _mm_xor_si128(z5, y5);
            r5 = _mm_srli_epi32(r5, 14);
            z5 = _mm_xor_si128(z5, r5);

            y14 = z6;
            y14 = _mm_add_epi32(y14, z10);
            r14 = y14;
            y14 = _mm_slli_epi32(y14, 7);
            z14 = _mm_xor_si128(z14, y14);
            r14 = _mm_srli_epi32(r14, 25);
            z14 = _mm_xor_si128(z14, r14);

            y3 = z11;
            y3 = _mm_add_epi32(y3, z15);
            r3 = y3;
            y3 = _mm_slli_epi32(y3, 7);
            z3 = _mm_xor_si128(z3, y3);
            r3 = _mm_srli_epi32(r3, 25);
            z3 = _mm_xor_si128(z3, r3);

            y2 = z10;
            y2 = _mm_add_epi32(y2, z14);
            r2 = y2;
            y2 = _mm_slli_epi32(y2, 9);
            z2 = _mm_xor_si128(z2, y2);
            r2 = _mm_srli_epi32(r2, 23);
            z2 = _mm_xor_si128(z2, r2);

            y7 = z15;
            y7 = _mm_add_epi32(y7, z3);
            r7 = y7;
            y7 = _mm_slli_epi32(y7, 9);
            z7 = _mm_xor_si128(z7, y7);
            r7 = _mm_srli_epi32(r7, 23);
            z7 = _mm_xor_si128(z7, r7);

            y6 = z14;
            y6 = _mm_add_epi32(y6, z2);
            r6 = y6;
            y6 = _mm_slli_epi32(y6, 13);
            z6 = _mm_xor_si128(z6, y6);
            r6 = _mm_srli_epi32(r6, 19);
            z6 = _mm_xor_si128(z6, r6);

            y11 = z3;
            y11 = _mm_add_epi32(y11, z7);
            r11 = y11;
            y11 = _mm_slli_epi32(y11, 13);
            z11 = _mm_xor_si128(z11, y11);
            r11 = _mm_srli_epi32(r11, 19);
            z11 = _mm_xor_si128(z11, r11);

            y10 = z2;
            y10 = _mm_add_epi32(y10, z6);
            r10 = y10;
            y10 = _mm_slli_epi32(y10, 18);
            z10 = _mm_xor_si128(z10, y10);
            r10 = _mm_srli_epi32(r10, 14);
            z10 = _mm_xor_si128(z10, r10);

            y1 = z3;
            y1 = _mm_add_epi32(y1, z0);
            r1 = y1;
            y1 = _mm_slli_epi32(y1, 7);
            z1 = _mm_xor_si128(z1, y1);
            r1 = _mm_srli_epi32(r1, 25);
            z1 = _mm_xor_si128(z1, r1);

            y15 = z7;
            y15 = _mm_add_epi32(y15, z11);
            r15 = y15;
            y15 = _mm_slli_epi32(y15, 18);
            z15 = _mm_xor_si128(z15, y15);
            r15 = _mm_srli_epi32(r15, 14);
            z15 = _mm_xor_si128(z15, r15);

            y6 = z4;
            y6 = _mm_add_epi32(y6, z5);
            r6 = y6;
            y6 = _mm_slli_epi32(y6, 7);
            z6 = _mm_xor_si128(z6, y6);
            r6 = _mm_srli_epi32(r6, 25);
            z6 = _mm_xor_si128(z6, r6);

            y2 = z0;
            y2 = _mm_add_epi32(y2, z1);
            r2 = y2;
            y2 = _mm_slli_epi32(y2, 9);
            z2 = _mm_xor_si128(z2, y2);
            r2 = _mm_srli_epi32(r2, 23);
            z2 = _mm_xor_si128(z2, r2);

            y7 = z5;
            y7 = _mm_add_epi32(y7, z6);
            r7 = y7;
            y7 = _mm_slli_epi32(y7, 9);
            z7 = _mm_xor_si128(z7, y7);
            r7 = _mm_srli_epi32(r7, 23);
            z7 = _mm_xor_si128(z7, r7);

            y3 = z1;
            y3 = _mm_add_epi32(y3, z2);
            r3 = y3;
            y3 = _mm_slli_epi32(y3, 13);
            z3 = _mm_xor_si128(z3, y3);
            r3 = _mm_srli_epi32(r3, 19);
            z3 = _mm_xor_si128(z3, r3);

            y4 = z6;
            y4 = _mm_add_epi32(y4, z7);
            r4 = y4;
            y4 = _mm_slli_epi32(y4, 13);
            z4 = _mm_xor_si128(z4, y4);
            r4 = _mm_srli_epi32(r4, 19);
            z4 = _mm_xor_si128(z4, r4);

            y0 = z2;
            y0 = _mm_add_epi32(y0, z3);
            r0 = y0;
            y0 = _mm_slli_epi32(y0, 18);
            z0 = _mm_xor_si128(z0, y0);
            r0 = _mm_srli_epi32(r0, 14);
            z0 = _mm_xor_si128(z0, r0);

            y5 = z7;
            y5 = _mm_add_epi32(y5, z4);
            r5 = y5;
            y5 = _mm_slli_epi32(y5, 18);
            z5 = _mm_xor_si128(z5, y5);
            r5 = _mm_srli_epi32(r5, 14);
            z5 = _mm_xor_si128(z5, r5);

            y11 = z9;
            y11 = _mm_add_epi32(y11, z10);
            r11 = y11;
            y11 = _mm_slli_epi32(y11, 7);
            z11 = _mm_xor_si128(z11, y11);
            r11 = _mm_srli_epi32(r11, 25);
            z11 = _mm_xor_si128(z11, r11);

            y12 = z14;
            y12 = _mm_add_epi32(y12, z15);
            r12 = y12;
            y12 = _mm_slli_epi32(y12, 7);
            z12 = _mm_xor_si128(z12, y12);
            r12 = _mm_srli_epi32(r12, 25);
            z12 = _mm_xor_si128(z12, r12);

            y8 = z10;
            y8 = _mm_add_epi32(y8, z11);
            r8 = y8;
            y8 = _mm_slli_epi32(y8, 9);
            z8 = _mm_xor_si128(z8, y8);
            r8 = _mm_srli_epi32(r8, 23);
            z8 = _mm_xor_si128(z8, r8);

            y13 = z15;
            y13 = _mm_add_epi32(y13, z12);
            r13 = y13;
            y13 = _mm_slli_epi32(y13, 9);
            z13 = _mm_xor_si128(z13, y13);
            r13 = _mm_srli_epi32(r13, 23);
            z13 = _mm_xor_si128(z13, r13);

            y9 = z11;
            y9 = _mm_add_epi32(y9, z8);
            r9 = y9;
            y9 = _mm_slli_epi32(y9, 13);
            z9 = _mm_xor_si128(z9, y9);
            r9 = _mm_srli_epi32(r9, 19);
            z9 = _mm_xor_si128(z9, r9);

            y14 = z12;
            y14 = _mm_add_epi32(y14, z13);
            r14 = y14;
            y14 = _mm_slli_epi32(y14, 13);
            z14 = _mm_xor_si128(z14, y14);
            r14 = _mm_srli_epi32(r14, 19);
            z14 = _mm_xor_si128(z14, r14);

            y10 = z8;
            y10 = _mm_add_epi32(y10, z9);
            r10 = y10;
            y10 = _mm_slli_epi32(y10, 18);
            z10 = _mm_xor_si128(z10, y10);
            r10 = _mm_srli_epi32(r10, 14);
            z10 = _mm_xor_si128(z10, r10);

            y15 = z13;
            y15 = _mm_add_epi32(y15, z14);
            r15 = y15;
            y15 = _mm_slli_epi32(y15, 18);
            z15 = _mm_xor_si128(z15, y15);
            r15 = _mm_srli_epi32(r15, 14);
            z15 = _mm_xor_si128(z15, r15);
        }

/* store data; this macro replicates the original amd64-xmm6 code */
#define ONEQUAD_SHUFFLE(A, B, C, D)        \
    z##A  = _mm_add_epi32(z##A, orig##A);  \
    z##B  = _mm_add_epi32(z##B, orig##B);  \
    z##C  = _mm_add_epi32(z##C, orig##C);  \
    z##D  = _mm_add_epi32(z##D, orig##D);  \
    in##A = _mm_cvtsi128_si32(z##A);       \
    in##B = _mm_cvtsi128_si32(z##B);       \
    in##C = _mm_cvtsi128_si32(z##C);       \
    in##D = _mm_cvtsi128_si32(z##D);       \
    z##A  = _mm_shuffle_epi32(z##A, 0x39); \
    z##B  = _mm_shuffle_epi32(z##B, 0x39); \
    z##C  = _mm_shuffle_epi32(z##C, 0x39); \
    z##D  = _mm_shuffle_epi32(z##D, 0x39); \
                                           \
    in##A ^= *(uint32_t *) (m + 0);        \
    in##B ^= *(uint32_t *) (m + 4);        \
    in##C ^= *(uint32_t *) (m + 8);        \
    in##D ^= *(uint32_t *) (m + 12);       \
                                           \
    *(uint32_t *) (c + 0)  = in##A;        \
    *(uint32_t *) (c + 4)  = in##B;        \
    *(uint32_t *) (c + 8)  = in##C;        \
    *(uint32_t *) (c + 12) = in##D;        \
                                           \
    in##A = _mm_cvtsi128_si32(z##A);       \
    in##B = _mm_cvtsi128_si32(z##B);       \
    in##C = _mm_cvtsi128_si32(z##C);       \
    in##D = _mm_cvtsi128_si32(z##D);       \
    z##A  = _mm_shuffle_epi32(z##A, 0x39); \
    z##B  = _mm_shuffle_epi32(z##B, 0x39); \
    z##C  = _mm_shuffle_epi32(z##C, 0x39); \
    z##D  = _mm_shuffle_epi32(z##D, 0x39); \
                                           \
    in##A ^= *(uint32_t *) (m + 64);       \
    in##B ^= *(uint32_t *) (m + 68);       \
    in##C ^= *(uint32_t *) (m + 72);       \
    in##D ^= *(uint32_t *) (m + 76);       \
    *(uint32_t *) (c + 64) = in##A;        \
    *(uint32_t *) (c + 68) = in##B;        \
    *(uint32_t *) (c + 72) = in##C;        \
    *(uint32_t *) (c + 76) = in##D;        \
                                           \
    in##A = _mm_cvtsi128_si32(z##A);       \
    in##B = _mm_cvtsi128_si32(z##B);       \
    in##C = _mm_cvtsi128_si32(z##C);       \
    in##D = _mm_cvtsi128_si32(z##D);       \
    z##A  = _mm_shuffle_epi32(z##A, 0x39); \
    z##B  = _mm_shuffle_epi32(z##B, 0x39); \
    z##C  = _mm_shuffle_epi32(z##C, 0x39); \
    z##D  = _mm_shuffle_epi32(z##D, 0x39); \
                                           \
    in##A ^= *(uint32_t *) (m + 128);      \
    in##B ^= *(uint32_t *) (m + 132);      \
    in##C ^= *(uint32_t *) (m + 136);      \
    in##D ^= *(uint32_t *) (m + 140);      \
    *(uint32_t *) (c + 128) = in##A;       \
    *(uint32_t *) (c + 132) = in##B;       \
    *(uint32_t *) (c + 136) = in##C;       \
    *(uint32_t *) (c + 140) = in##D;       \
                                           \
    in##A = _mm_cvtsi128_si32(z##A);       \
    in##B = _mm_cvtsi128_si32(z##B);       \
    in##C = _mm_cvtsi128_si32(z##C);       \
    in##D = _mm_cvtsi128_si32(z##D);       \
                                           \
    in##A ^= *(uint32_t *) (m + 192);      \
    in##B ^= *(uint32_t *) (m + 196);      \
    in##C ^= *(uint32_t *) (m + 200);      \
    in##D ^= *(uint32_t *) (m + 204);      \
    *(uint32_t *) (c + 192) = in##A;       \
    *(uint32_t *) (c + 196) = in##B;       \
    *(uint32_t *) (c + 200) = in##C;       \
    *(uint32_t *) (c + 204) = in##D
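
/* In ONEQUAD_SHUFFLE, _mm_cvtsi128_si32 reads lane 0 (the word belonging to
 * the first block) and _mm_shuffle_epi32(..., 0x39) rotates the lanes right
 * by one element, so successive extractions walk through blocks 0..3. The
 * +64/+128/+192 offsets are the same word position in the next three
 * 64-byte blocks. */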

/* store data; this macro replaces the shuffle+mov sequence with a direct
 * extract; not much difference */
#define ONEQUAD_EXTRACT(A, B, C, D)       \
    z##A  = _mm_add_epi32(z##A, orig##A); \
    z##B  = _mm_add_epi32(z##B, orig##B); \
    z##C  = _mm_add_epi32(z##C, orig##C); \
    z##D  = _mm_add_epi32(z##D, orig##D); \
    in##A = _mm_cvtsi128_si32(z##A);      \
    in##B = _mm_cvtsi128_si32(z##B);      \
    in##C = _mm_cvtsi128_si32(z##C);      \
    in##D = _mm_cvtsi128_si32(z##D);      \
    in##A ^= *(uint32_t *) (m + 0);       \
    in##B ^= *(uint32_t *) (m + 4);       \
    in##C ^= *(uint32_t *) (m + 8);       \
    in##D ^= *(uint32_t *) (m + 12);      \
    *(uint32_t *) (c + 0)  = in##A;       \
    *(uint32_t *) (c + 4)  = in##B;       \
    *(uint32_t *) (c + 8)  = in##C;       \
    *(uint32_t *) (c + 12) = in##D;       \
                                          \
    in##A = _mm_extract_epi32(z##A, 1);   \
    in##B = _mm_extract_epi32(z##B, 1);   \
    in##C = _mm_extract_epi32(z##C, 1);   \
    in##D = _mm_extract_epi32(z##D, 1);   \
                                          \
    in##A ^= *(uint32_t *) (m + 64);      \
    in##B ^= *(uint32_t *) (m + 68);      \
    in##C ^= *(uint32_t *) (m + 72);      \
    in##D ^= *(uint32_t *) (m + 76);      \
    *(uint32_t *) (c + 64) = in##A;       \
    *(uint32_t *) (c + 68) = in##B;       \
    *(uint32_t *) (c + 72) = in##C;       \
    *(uint32_t *) (c + 76) = in##D;       \
                                          \
    in##A = _mm_extract_epi32(z##A, 2);   \
    in##B = _mm_extract_epi32(z##B, 2);   \
    in##C = _mm_extract_epi32(z##C, 2);   \
    in##D = _mm_extract_epi32(z##D, 2);   \
                                          \
    in##A ^= *(uint32_t *) (m + 128);     \
    in##B ^= *(uint32_t *) (m + 132);     \
    in##C ^= *(uint32_t *) (m + 136);     \
    in##D ^= *(uint32_t *) (m + 140);     \
    *(uint32_t *) (c + 128) = in##A;      \
    *(uint32_t *) (c + 132) = in##B;      \
    *(uint32_t *) (c + 136) = in##C;      \
    *(uint32_t *) (c + 140) = in##D;      \
                                          \
    in##A = _mm_extract_epi32(z##A, 3);   \
    in##B = _mm_extract_epi32(z##B, 3);   \
    in##C = _mm_extract_epi32(z##C, 3);   \
    in##D = _mm_extract_epi32(z##D, 3);   \
                                          \
    in##A ^= *(uint32_t *) (m + 192);     \
    in##B ^= *(uint32_t *) (m + 196);     \
    in##C ^= *(uint32_t *) (m + 200);     \
    in##D ^= *(uint32_t *) (m + 204);     \
    *(uint32_t *) (c + 192) = in##A;      \
    *(uint32_t *) (c + 196) = in##B;      \
    *(uint32_t *) (c + 200) = in##C;      \
    *(uint32_t *) (c + 204) = in##D
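
/* Note: _mm_extract_epi32 is an SSE4.1 intrinsic, so ONEQUAD_EXTRACT is only
 * usable when the file is built for SSE4.1 or later; it is not the variant
 * selected below. */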

/* store data; this macro first transposes the data in registers, then stores
 * it to memory. Much faster with icc. */
#define ONEQUAD_TRANSPOSE(A, B, C, D)                                   \
    z##A = _mm_add_epi32(z##A, orig##A);                                \
    z##B = _mm_add_epi32(z##B, orig##B);                                \
    z##C = _mm_add_epi32(z##C, orig##C);                                \
    z##D = _mm_add_epi32(z##D, orig##D);                                \
    y##A = _mm_unpacklo_epi32(z##A, z##B);                              \
    y##B = _mm_unpacklo_epi32(z##C, z##D);                              \
    y##C = _mm_unpackhi_epi32(z##A, z##B);                              \
    y##D = _mm_unpackhi_epi32(z##C, z##D);                              \
    z##A = _mm_unpacklo_epi64(y##A, y##B);                              \
    z##B = _mm_unpackhi_epi64(y##A, y##B);                              \
    z##C = _mm_unpacklo_epi64(y##C, y##D);                              \
    z##D = _mm_unpackhi_epi64(y##C, y##D);                              \
    y##A = _mm_xor_si128(z##A, _mm_loadu_si128((__m128i *) (m + 0)));   \
    _mm_storeu_si128((__m128i *) (c + 0), y##A);                        \
    y##B = _mm_xor_si128(z##B, _mm_loadu_si128((__m128i *) (m + 64)));  \
    _mm_storeu_si128((__m128i *) (c + 64), y##B);                       \
    y##C = _mm_xor_si128(z##C, _mm_loadu_si128((__m128i *) (m + 128))); \
    _mm_storeu_si128((__m128i *) (c + 128), y##C);                      \
    y##D = _mm_xor_si128(z##D, _mm_loadu_si128((__m128i *) (m + 192))); \
    _mm_storeu_si128((__m128i *) (c + 192), y##D)
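
/* Before ONEQUAD_TRANSPOSE runs, z##A holds word A of blocks 0..3, one block
 * per lane. The unpacklo/unpackhi sequence transposes this 4x4 block of
 * words so that z##A ends up with words A, B, C, D of block 0 (16 contiguous
 * output bytes), z##B the same words of block 1, and so on, allowing full
 * 128-bit loads and stores against m and c. */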

#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)

        ONEQUAD(0, 1, 2, 3);
        m += 16;
        c += 16;
        ONEQUAD(4, 5, 6, 7);
        m += 16;
        c += 16;
        ONEQUAD(8, 9, 10, 11);
        m += 16;
        c += 16;
        ONEQUAD(12, 13, 14, 15);
        m -= 48;
        c -= 48;
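
        /* Each ONEQUAD handles one 16-byte row of all four blocks (offsets
         * +0, +64, +128, +192 from the current pointers), so m and c advance
         * by 16 between quads and are rewound by 48 afterwards; the full 256
         * bytes are then skipped at the end of the loop iteration. */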

#undef ONEQUAD
#undef ONEQUAD_TRANSPOSE
#undef ONEQUAD_EXTRACT
#undef ONEQUAD_SHUFFLE

        bytes -= 256;
        c += 256;
        m += 256;
    }
}