avxintrin.h revision 234353
1/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
20 *
21 *===-----------------------------------------------------------------------===
22 */
23
24#ifndef __IMMINTRIN_H
25#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
26#endif
27
28typedef double __v4df __attribute__ ((__vector_size__ (32)));
29typedef float __v8sf __attribute__ ((__vector_size__ (32)));
30typedef long long __v4di __attribute__ ((__vector_size__ (32)));
31typedef int __v8si __attribute__ ((__vector_size__ (32)));
32typedef short __v16hi __attribute__ ((__vector_size__ (32)));
33typedef char __v32qi __attribute__ ((__vector_size__ (32)));
34
35typedef float __m256 __attribute__ ((__vector_size__ (32)));
36typedef double __m256d __attribute__((__vector_size__(32)));
37typedef long long __m256i __attribute__((__vector_size__(32)));
38
39/* Arithmetic */
40static __inline __m256d __attribute__((__always_inline__, __nodebug__))
41_mm256_add_pd(__m256d a, __m256d b)
42{
43  return a+b;
44}
45
46static __inline __m256 __attribute__((__always_inline__, __nodebug__))
47_mm256_add_ps(__m256 a, __m256 b)
48{
49  return a+b;
50}
51
52static __inline __m256d __attribute__((__always_inline__, __nodebug__))
53_mm256_sub_pd(__m256d a, __m256d b)
54{
55  return a-b;
56}
57
58static __inline __m256 __attribute__((__always_inline__, __nodebug__))
59_mm256_sub_ps(__m256 a, __m256 b)
60{
61  return a-b;
62}
63
64static __inline __m256d __attribute__((__always_inline__, __nodebug__))
65_mm256_addsub_pd(__m256d a, __m256d b)
66{
67  return (__m256d)__builtin_ia32_addsubpd256((__v4df)a, (__v4df)b);
68}
69
70static __inline __m256 __attribute__((__always_inline__, __nodebug__))
71_mm256_addsub_ps(__m256 a, __m256 b)
72{
73  return (__m256)__builtin_ia32_addsubps256((__v8sf)a, (__v8sf)b);
74}
75
76static __inline __m256d __attribute__((__always_inline__, __nodebug__))
77_mm256_div_pd(__m256d a, __m256d b)
78{
79  return a / b;
80}
81
82static __inline __m256 __attribute__((__always_inline__, __nodebug__))
83_mm256_div_ps(__m256 a, __m256 b)
84{
85  return a / b;
86}
87
88static __inline __m256d __attribute__((__always_inline__, __nodebug__))
89_mm256_max_pd(__m256d a, __m256d b)
90{
91  return (__m256d)__builtin_ia32_maxpd256((__v4df)a, (__v4df)b);
92}
93
94static __inline __m256 __attribute__((__always_inline__, __nodebug__))
95_mm256_max_ps(__m256 a, __m256 b)
96{
97  return (__m256)__builtin_ia32_maxps256((__v8sf)a, (__v8sf)b);
98}
99
100static __inline __m256d __attribute__((__always_inline__, __nodebug__))
101_mm256_min_pd(__m256d a, __m256d b)
102{
103  return (__m256d)__builtin_ia32_minpd256((__v4df)a, (__v4df)b);
104}
105
106static __inline __m256 __attribute__((__always_inline__, __nodebug__))
107_mm256_min_ps(__m256 a, __m256 b)
108{
109  return (__m256)__builtin_ia32_minps256((__v8sf)a, (__v8sf)b);
110}
111
112static __inline __m256d __attribute__((__always_inline__, __nodebug__))
113_mm256_mul_pd(__m256d a, __m256d b)
114{
115  return a * b;
116}
117
118static __inline __m256 __attribute__((__always_inline__, __nodebug__))
119_mm256_mul_ps(__m256 a, __m256 b)
120{
121  return a * b;
122}
123
124static __inline __m256d __attribute__((__always_inline__, __nodebug__))
125_mm256_sqrt_pd(__m256d a)
126{
127  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)a);
128}
129
130static __inline __m256 __attribute__((__always_inline__, __nodebug__))
131_mm256_sqrt_ps(__m256 a)
132{
133  return (__m256)__builtin_ia32_sqrtps256((__v8sf)a);
134}
135
136static __inline __m256 __attribute__((__always_inline__, __nodebug__))
137_mm256_rsqrt_ps(__m256 a)
138{
139  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a);
140}
141
142static __inline __m256 __attribute__((__always_inline__, __nodebug__))
143_mm256_rcp_ps(__m256 a)
144{
145  return (__m256)__builtin_ia32_rcpps256((__v8sf)a);
146}
147
148#define _mm256_round_pd(V, M) __extension__ ({ \
149    __m256d __V = (V); \
150    (__m256d)__builtin_ia32_roundpd256((__v4df)__V, (M)); })
151
152#define _mm256_round_ps(V, M) __extension__ ({ \
153  __m256 __V = (V); \
154  (__m256)__builtin_ia32_roundps256((__v8sf)__V, (M)); })
155
156#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
157#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
158#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
159#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
160
161/* Logical */
162static __inline __m256d __attribute__((__always_inline__, __nodebug__))
163_mm256_and_pd(__m256d a, __m256d b)
164{
165  return (__m256d)((__v4di)a & (__v4di)b);
166}
167
168static __inline __m256 __attribute__((__always_inline__, __nodebug__))
169_mm256_and_ps(__m256 a, __m256 b)
170{
171  return (__m256)((__v8si)a & (__v8si)b);
172}
173
174static __inline __m256d __attribute__((__always_inline__, __nodebug__))
175_mm256_andnot_pd(__m256d a, __m256d b)
176{
177  return (__m256d)(~(__v4di)a & (__v4di)b);
178}
179
180static __inline __m256 __attribute__((__always_inline__, __nodebug__))
181_mm256_andnot_ps(__m256 a, __m256 b)
182{
183  return (__m256)(~(__v8si)a & (__v8si)b);
184}
185
186static __inline __m256d __attribute__((__always_inline__, __nodebug__))
187_mm256_or_pd(__m256d a, __m256d b)
188{
189  return (__m256d)((__v4di)a | (__v4di)b);
190}
191
192static __inline __m256 __attribute__((__always_inline__, __nodebug__))
193_mm256_or_ps(__m256 a, __m256 b)
194{
195  return (__m256)((__v8si)a | (__v8si)b);
196}
197
198static __inline __m256d __attribute__((__always_inline__, __nodebug__))
199_mm256_xor_pd(__m256d a, __m256d b)
200{
201  return (__m256d)((__v4di)a ^ (__v4di)b);
202}
203
204static __inline __m256 __attribute__((__always_inline__, __nodebug__))
205_mm256_xor_ps(__m256 a, __m256 b)
206{
207  return (__m256)((__v8si)a ^ (__v8si)b);
208}
209
210/* Horizontal arithmetic */
211static __inline __m256d __attribute__((__always_inline__, __nodebug__))
212_mm256_hadd_pd(__m256d a, __m256d b)
213{
214  return (__m256d)__builtin_ia32_haddpd256((__v4df)a, (__v4df)b);
215}
216
217static __inline __m256 __attribute__((__always_inline__, __nodebug__))
218_mm256_hadd_ps(__m256 a, __m256 b)
219{
220  return (__m256)__builtin_ia32_haddps256((__v8sf)a, (__v8sf)b);
221}
222
223static __inline __m256d __attribute__((__always_inline__, __nodebug__))
224_mm256_hsub_pd(__m256d a, __m256d b)
225{
226  return (__m256d)__builtin_ia32_hsubpd256((__v4df)a, (__v4df)b);
227}
228
229static __inline __m256 __attribute__((__always_inline__, __nodebug__))
230_mm256_hsub_ps(__m256 a, __m256 b)
231{
232  return (__m256)__builtin_ia32_hsubps256((__v8sf)a, (__v8sf)b);
233}
234
235/* Vector permutations */
236static __inline __m128d __attribute__((__always_inline__, __nodebug__))
237_mm_permutevar_pd(__m128d a, __m128i c)
238{
239  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)a, (__v2di)c);
240}
241
242static __inline __m256d __attribute__((__always_inline__, __nodebug__))
243_mm256_permutevar_pd(__m256d a, __m256i c)
244{
245  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)a, (__v4di)c);
246}
247
248static __inline __m128 __attribute__((__always_inline__, __nodebug__))
249_mm_permutevar_ps(__m128 a, __m128i c)
250{
251  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c);
252}
253
254static __inline __m256 __attribute__((__always_inline__, __nodebug__))
255_mm256_permutevar_ps(__m256 a, __m256i c)
256{
257  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a,
258						  (__v8si)c);
259}
260
261#define _mm_permute_pd(A, C) __extension__ ({ \
262  __m128d __A = (A); \
263  (__m128d)__builtin_shufflevector((__v2df)__A, (__v2df) _mm_setzero_pd(), \
264                                   (C) & 0x1, ((C) & 0x2) >> 1); })
265
266#define _mm256_permute_pd(A, C) __extension__ ({ \
267  __m256d __A = (A); \
268  (__m256d)__builtin_shufflevector((__v4df)__A, (__v4df) _mm256_setzero_pd(), \
269                                   (C) & 0x1, ((C) & 0x2) >> 1, \
270                                   2 + (((C) & 0x4) >> 2), \
271                                   2 + (((C) & 0x8) >> 3)); })
272
273#define _mm_permute_ps(A, C) __extension__ ({ \
274  __m128 __A = (A); \
275  (__m128)__builtin_shufflevector((__v4sf)__A, (__v4sf) _mm_setzero_ps(), \
276                                   (C) & 0x3, ((C) & 0xc) >> 2, \
277                                   ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); })
278
279#define _mm256_permute_ps(A, C) __extension__ ({ \
280  __m256 __A = (A); \
281  (__m256)__builtin_shufflevector((__v8sf)__A, (__v8sf) _mm256_setzero_ps(), \
282                                  (C) & 0x3, ((C) & 0xc) >> 2, \
283                                  ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6, \
284                                  4 + (((C) & 0x03) >> 0), \
285                                  4 + (((C) & 0x0c) >> 2), \
286                                  4 + (((C) & 0x30) >> 4), \
287                                  4 + (((C) & 0xc0) >> 6)); })
288
289#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
290  __m256d __V1 = (V1); \
291  __m256d __V2 = (V2); \
292  (__m256d)__builtin_shufflevector((__v4df)__V1, (__v4df)__V2, \
293                                   ((M) & 0x3) * 2, \
294                                   ((M) & 0x3) * 2 + 1, \
295                                   (((M) & 0x30) >> 4) * 2, \
296                                   (((M) & 0x30) >> 4) * 2 + 1); })
297
298#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
299  __m256 __V1 = (V1); \
300  __m256 __V2 = (V2); \
301  (__m256)__builtin_shufflevector((__v8sf)__V1, (__v8sf)__V2, \
302                                  ((M) & 0x3) * 4, \
303                                  ((M) & 0x3) * 4 + 1, \
304                                  ((M) & 0x3) * 4 + 2, \
305                                  ((M) & 0x3) * 4 + 3, \
306                                  (((M) & 0x30) >> 4) * 4, \
307                                  (((M) & 0x30) >> 4) * 4 + 1, \
308                                  (((M) & 0x30) >> 4) * 4 + 2, \
309                                  (((M) & 0x30) >> 4) * 4 + 3); })
310
311#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
312  __m256i __V1 = (V1); \
313  __m256i __V2 = (V2); \
314  (__m256i)__builtin_shufflevector((__v8si)__V1, (__v8si)__V2, \
315                                   ((M) & 0x3) * 4, \
316                                   ((M) & 0x3) * 4 + 1, \
317                                   ((M) & 0x3) * 4 + 2, \
318                                   ((M) & 0x3) * 4 + 3, \
319                                   (((M) & 0x30) >> 4) * 4, \
320                                   (((M) & 0x30) >> 4) * 4 + 1, \
321                                   (((M) & 0x30) >> 4) * 4 + 2, \
322                                   (((M) & 0x30) >> 4) * 4 + 3); })
323
324/* Vector Blend */
325#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
326  __m256d __V1 = (V1); \
327  __m256d __V2 = (V2); \
328  (__m256d)__builtin_ia32_blendpd256((__v4df)__V1, (__v4df)__V2, (M)); })
329
330#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
331  __m256 __V1 = (V1); \
332  __m256 __V2 = (V2); \
333  (__m256)__builtin_ia32_blendps256((__v8sf)__V1, (__v8sf)__V2, (M)); })
334
335static __inline __m256d __attribute__((__always_inline__, __nodebug__))
336_mm256_blendv_pd(__m256d a, __m256d b, __m256d c)
337{
338  return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c);
339}
340
341static __inline __m256 __attribute__((__always_inline__, __nodebug__))
342_mm256_blendv_ps(__m256 a, __m256 b, __m256 c)
343{
344  return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c);
345}
346
347/* Vector Dot Product */
348#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
349  __m256 __V1 = (V1); \
350  __m256 __V2 = (V2); \
351  (__m256)__builtin_ia32_dpps256((__v8sf)__V1, (__v8sf)__V2, (M)); })
352
353/* Vector shuffle */
354#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
355        __m256 __a = (a); \
356        __m256 __b = (b); \
357        (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \
358        (mask) & 0x3,                ((mask) & 0xc) >> 2, \
359        (((mask) & 0x30) >> 4) + 8,  (((mask) & 0xc0) >> 6) + 8, \
360        ((mask) & 0x3) + 4,          (((mask) & 0xc) >> 2) + 4, \
361        (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12); })
362
363#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
364        __m256d __a = (a); \
365        __m256d __b = (b); \
366        (__m256d)__builtin_shufflevector((__v4df)__a, (__v4df)__b, \
367        (mask) & 0x1, \
368        (((mask) & 0x2) >> 1) + 4, \
369        (((mask) & 0x4) >> 2) + 2, \
370        (((mask) & 0x8) >> 3) + 6); })
371
372/* Compare */
373#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
374#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
375#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
376#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
377#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
378#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
379#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
380#define _CMP_ORD_Q    0x07 /* Ordered (nonsignaling)   */
381#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
382#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
383#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
384#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
385#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
386#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
387#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
388#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
389#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
390#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
391#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
392#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
393#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
394#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
395#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
396#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
397#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
398#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign)  */
399#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
400#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
401#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
402#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
403#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
404#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
405
406#define _mm_cmp_pd(a, b, c) __extension__ ({ \
407  __m128d __a = (a); \
408  __m128d __b = (b); \
409  (__m128d)__builtin_ia32_cmppd((__v2df)__a, (__v2df)__b, (c)); })
410
411#define _mm_cmp_ps(a, b, c) __extension__ ({ \
412  __m128 __a = (a); \
413  __m128 __b = (b); \
414  (__m128)__builtin_ia32_cmpps((__v4sf)__a, (__v4sf)__b, (c)); })
415
416#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
417  __m256d __a = (a); \
418  __m256d __b = (b); \
419  (__m256d)__builtin_ia32_cmppd256((__v4df)__a, (__v4df)__b, (c)); })
420
421#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
422  __m256 __a = (a); \
423  __m256 __b = (b); \
424  (__m256)__builtin_ia32_cmpps256((__v8sf)__a, (__v8sf)__b, (c)); })
425
426#define _mm_cmp_sd(a, b, c) __extension__ ({ \
427  __m128d __a = (a); \
428  __m128d __b = (b); \
429  (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); })
430
431#define _mm_cmp_ss(a, b, c) __extension__ ({ \
432  __m128 __a = (a); \
433  __m128 __b = (b); \
434  (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); })
435
436/* Vector extract */
437#define _mm256_extractf128_pd(A, O) __extension__ ({ \
438  __m256d __A = (A); \
439  (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)__A, (O)); })
440
441#define _mm256_extractf128_ps(A, O) __extension__ ({ \
442  __m256 __A = (A); \
443  (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)__A, (O)); })
444
445#define _mm256_extractf128_si256(A, O) __extension__ ({ \
446  __m256i __A = (A); \
447  (__m128i)__builtin_ia32_vextractf128_si256((__v8si)__A, (O)); })
448
449static __inline int __attribute__((__always_inline__, __nodebug__))
450_mm256_extract_epi32(__m256i a, int const imm)
451{
452  __v8si b = (__v8si)a;
453  return b[imm];
454}
455
456static __inline int __attribute__((__always_inline__, __nodebug__))
457_mm256_extract_epi16(__m256i a, int const imm)
458{
459  __v16hi b = (__v16hi)a;
460  return b[imm];
461}
462
463static __inline int __attribute__((__always_inline__, __nodebug__))
464_mm256_extract_epi8(__m256i a, int const imm)
465{
466  __v32qi b = (__v32qi)a;
467  return b[imm];
468}
469
470#ifdef __x86_64__
471static __inline long long  __attribute__((__always_inline__, __nodebug__))
472_mm256_extract_epi64(__m256i a, const int imm)
473{
474  __v4di b = (__v4di)a;
475  return b[imm];
476}
477#endif
478
479/* Vector insert */
480#define _mm256_insertf128_pd(V1, V2, O) __extension__ ({ \
481  __m256d __V1 = (V1); \
482  __m128d __V2 = (V2); \
483  (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)__V1, (__v2df)__V2, (O)); })
484
485#define _mm256_insertf128_ps(V1, V2, O) __extension__ ({ \
486  __m256 __V1 = (V1); \
487  __m128 __V2 = (V2); \
488  (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)__V1, (__v4sf)__V2, (O)); })
489
490#define _mm256_insertf128_si256(V1, V2, O) __extension__ ({ \
491  __m256i __V1 = (V1); \
492  __m128i __V2 = (V2); \
493  (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)__V1, (__v4si)__V2, (O)); })
494
495static __inline __m256i __attribute__((__always_inline__, __nodebug__))
496_mm256_insert_epi32(__m256i a, int b, int const imm)
497{
498  __v8si c = (__v8si)a;
499  c[imm & 7] = b;
500  return (__m256i)c;
501}
502
503static __inline __m256i __attribute__((__always_inline__, __nodebug__))
504_mm256_insert_epi16(__m256i a, int b, int const imm)
505{
506  __v16hi c = (__v16hi)a;
507  c[imm & 15] = b;
508  return (__m256i)c;
509}
510
511static __inline __m256i __attribute__((__always_inline__, __nodebug__))
512_mm256_insert_epi8(__m256i a, int b, int const imm)
513{
514  __v32qi c = (__v32qi)a;
515  c[imm & 31] = b;
516  return (__m256i)c;
517}
518
519#ifdef __x86_64__
520static __inline __m256i __attribute__((__always_inline__, __nodebug__))
521_mm256_insert_epi64(__m256i a, int b, int const imm)
522{
523  __v4di c = (__v4di)a;
524  c[imm & 3] = b;
525  return (__m256i)c;
526}
527#endif
528
529/* Conversion */
530static __inline __m256d __attribute__((__always_inline__, __nodebug__))
531_mm256_cvtepi32_pd(__m128i a)
532{
533  return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) a);
534}
535
536static __inline __m256 __attribute__((__always_inline__, __nodebug__))
537_mm256_cvtepi32_ps(__m256i a)
538{
539  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) a);
540}
541
542static __inline __m128 __attribute__((__always_inline__, __nodebug__))
543_mm256_cvtpd_ps(__m256d a)
544{
545  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) a);
546}
547
548static __inline __m256i __attribute__((__always_inline__, __nodebug__))
549_mm256_cvtps_epi32(__m256 a)
550{
551  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) a);
552}
553
554static __inline __m256d __attribute__((__always_inline__, __nodebug__))
555_mm256_cvtps_pd(__m128 a)
556{
557  return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) a);
558}
559
560static __inline __m128i __attribute__((__always_inline__, __nodebug__))
561_mm256_cvttpd_epi32(__m256d a)
562{
563  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) a);
564}
565
566static __inline __m128i __attribute__((__always_inline__, __nodebug__))
567_mm256_cvtpd_epi32(__m256d a)
568{
569  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) a);
570}
571
572static __inline __m256i __attribute__((__always_inline__, __nodebug__))
573_mm256_cvttps_epi32(__m256 a)
574{
575  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) a);
576}
577
578/* Vector replicate */
579static __inline __m256 __attribute__((__always_inline__, __nodebug__))
580_mm256_movehdup_ps(__m256 a)
581{
582  return __builtin_shufflevector(a, a, 1, 1, 3, 3, 5, 5, 7, 7);
583}
584
585static __inline __m256 __attribute__((__always_inline__, __nodebug__))
586_mm256_moveldup_ps(__m256 a)
587{
588  return __builtin_shufflevector(a, a, 0, 0, 2, 2, 4, 4, 6, 6);
589}
590
591static __inline __m256d __attribute__((__always_inline__, __nodebug__))
592_mm256_movedup_pd(__m256d a)
593{
594  return __builtin_shufflevector(a, a, 0, 0, 2, 2);
595}
596
597/* Unpack and Interleave */
598static __inline __m256d __attribute__((__always_inline__, __nodebug__))
599_mm256_unpackhi_pd(__m256d a, __m256d b)
600{
601  return __builtin_shufflevector(a, b, 1, 5, 1+2, 5+2);
602}
603
604static __inline __m256d __attribute__((__always_inline__, __nodebug__))
605_mm256_unpacklo_pd(__m256d a, __m256d b)
606{
607  return __builtin_shufflevector(a, b, 0, 4, 0+2, 4+2);
608}
609
610static __inline __m256 __attribute__((__always_inline__, __nodebug__))
611_mm256_unpackhi_ps(__m256 a, __m256 b)
612{
613  return __builtin_shufflevector(a, b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
614}
615
616static __inline __m256 __attribute__((__always_inline__, __nodebug__))
617_mm256_unpacklo_ps(__m256 a, __m256 b)
618{
619  return __builtin_shufflevector(a, b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
620}
621
622/* Bit Test */
623static __inline int __attribute__((__always_inline__, __nodebug__))
624_mm_testz_pd(__m128d a, __m128d b)
625{
626  return __builtin_ia32_vtestzpd((__v2df)a, (__v2df)b);
627}
628
629static __inline int __attribute__((__always_inline__, __nodebug__))
630_mm_testc_pd(__m128d a, __m128d b)
631{
632  return __builtin_ia32_vtestcpd((__v2df)a, (__v2df)b);
633}
634
635static __inline int __attribute__((__always_inline__, __nodebug__))
636_mm_testnzc_pd(__m128d a, __m128d b)
637{
638  return __builtin_ia32_vtestnzcpd((__v2df)a, (__v2df)b);
639}
640
641static __inline int __attribute__((__always_inline__, __nodebug__))
642_mm_testz_ps(__m128 a, __m128 b)
643{
644  return __builtin_ia32_vtestzps((__v4sf)a, (__v4sf)b);
645}
646
647static __inline int __attribute__((__always_inline__, __nodebug__))
648_mm_testc_ps(__m128 a, __m128 b)
649{
650  return __builtin_ia32_vtestcps((__v4sf)a, (__v4sf)b);
651}
652
653static __inline int __attribute__((__always_inline__, __nodebug__))
654_mm_testnzc_ps(__m128 a, __m128 b)
655{
656  return __builtin_ia32_vtestnzcps((__v4sf)a, (__v4sf)b);
657}
658
659static __inline int __attribute__((__always_inline__, __nodebug__))
660_mm256_testz_pd(__m256d a, __m256d b)
661{
662  return __builtin_ia32_vtestzpd256((__v4df)a, (__v4df)b);
663}
664
665static __inline int __attribute__((__always_inline__, __nodebug__))
666_mm256_testc_pd(__m256d a, __m256d b)
667{
668  return __builtin_ia32_vtestcpd256((__v4df)a, (__v4df)b);
669}
670
671static __inline int __attribute__((__always_inline__, __nodebug__))
672_mm256_testnzc_pd(__m256d a, __m256d b)
673{
674  return __builtin_ia32_vtestnzcpd256((__v4df)a, (__v4df)b);
675}
676
677static __inline int __attribute__((__always_inline__, __nodebug__))
678_mm256_testz_ps(__m256 a, __m256 b)
679{
680  return __builtin_ia32_vtestzps256((__v8sf)a, (__v8sf)b);
681}
682
683static __inline int __attribute__((__always_inline__, __nodebug__))
684_mm256_testc_ps(__m256 a, __m256 b)
685{
686  return __builtin_ia32_vtestcps256((__v8sf)a, (__v8sf)b);
687}
688
689static __inline int __attribute__((__always_inline__, __nodebug__))
690_mm256_testnzc_ps(__m256 a, __m256 b)
691{
692  return __builtin_ia32_vtestnzcps256((__v8sf)a, (__v8sf)b);
693}
694
695static __inline int __attribute__((__always_inline__, __nodebug__))
696_mm256_testz_si256(__m256i a, __m256i b)
697{
698  return __builtin_ia32_ptestz256((__v4di)a, (__v4di)b);
699}
700
701static __inline int __attribute__((__always_inline__, __nodebug__))
702_mm256_testc_si256(__m256i a, __m256i b)
703{
704  return __builtin_ia32_ptestc256((__v4di)a, (__v4di)b);
705}
706
707static __inline int __attribute__((__always_inline__, __nodebug__))
708_mm256_testnzc_si256(__m256i a, __m256i b)
709{
710  return __builtin_ia32_ptestnzc256((__v4di)a, (__v4di)b);
711}
712
713/* Vector extract sign mask */
714static __inline int __attribute__((__always_inline__, __nodebug__))
715_mm256_movemask_pd(__m256d a)
716{
717  return __builtin_ia32_movmskpd256((__v4df)a);
718}
719
720static __inline int __attribute__((__always_inline__, __nodebug__))
721_mm256_movemask_ps(__m256 a)
722{
723  return __builtin_ia32_movmskps256((__v8sf)a);
724}
725
726/* Vector zero */
727static __inline void __attribute__((__always_inline__, __nodebug__))
728_mm256_zeroall(void)
729{
730  __builtin_ia32_vzeroall();
731}
732
733static __inline void __attribute__((__always_inline__, __nodebug__))
734_mm256_zeroupper(void)
735{
736  __builtin_ia32_vzeroupper();
737}
738
739/* Vector load with broadcast */
740static __inline __m128 __attribute__((__always_inline__, __nodebug__))
741_mm_broadcast_ss(float const *a)
742{
743  return (__m128)__builtin_ia32_vbroadcastss(a);
744}
745
746static __inline __m256d __attribute__((__always_inline__, __nodebug__))
747_mm256_broadcast_sd(double const *a)
748{
749  return (__m256d)__builtin_ia32_vbroadcastsd256(a);
750}
751
752static __inline __m256 __attribute__((__always_inline__, __nodebug__))
753_mm256_broadcast_ss(float const *a)
754{
755  return (__m256)__builtin_ia32_vbroadcastss256(a);
756}
757
758static __inline __m256d __attribute__((__always_inline__, __nodebug__))
759_mm256_broadcast_pd(__m128d const *a)
760{
761  return (__m256d)__builtin_ia32_vbroadcastf128_pd256(a);
762}
763
764static __inline __m256 __attribute__((__always_inline__, __nodebug__))
765_mm256_broadcast_ps(__m128 const *a)
766{
767  return (__m256)__builtin_ia32_vbroadcastf128_ps256(a);
768}
769
770/* SIMD load ops */
771static __inline __m256d __attribute__((__always_inline__, __nodebug__))
772_mm256_load_pd(double const *p)
773{
774  return *(__m256d *)p;
775}
776
777static __inline __m256 __attribute__((__always_inline__, __nodebug__))
778_mm256_load_ps(float const *p)
779{
780  return *(__m256 *)p;
781}
782
783static __inline __m256d __attribute__((__always_inline__, __nodebug__))
784_mm256_loadu_pd(double const *p)
785{
786  struct __loadu_pd {
787    __m256d v;
788  } __attribute__((packed, may_alias));
789  return ((struct __loadu_pd*)p)->v;
790}
791
792static __inline __m256 __attribute__((__always_inline__, __nodebug__))
793_mm256_loadu_ps(float const *p)
794{
795  struct __loadu_ps {
796    __m256 v;
797  } __attribute__((packed, may_alias));
798  return ((struct __loadu_ps*)p)->v;
799}
800
801static __inline __m256i __attribute__((__always_inline__, __nodebug__))
802_mm256_load_si256(__m256i const *p)
803{
804  return *p;
805}
806
807static __inline __m256i __attribute__((__always_inline__, __nodebug__))
808_mm256_loadu_si256(__m256i const *p)
809{
810  struct __loadu_si256 {
811    __m256i v;
812  } __attribute__((packed, may_alias));
813  return ((struct __loadu_si256*)p)->v;
814}
815
816static __inline __m256i __attribute__((__always_inline__, __nodebug__))
817_mm256_lddqu_si256(__m256i const *p)
818{
819  return (__m256i)__builtin_ia32_lddqu256((char const *)p);
820}
821
822/* SIMD store ops */
823static __inline void __attribute__((__always_inline__, __nodebug__))
824_mm256_store_pd(double *p, __m256d a)
825{
826  *(__m256d *)p = a;
827}
828
829static __inline void __attribute__((__always_inline__, __nodebug__))
830_mm256_store_ps(float *p, __m256 a)
831{
832  *(__m256 *)p = a;
833}
834
835static __inline void __attribute__((__always_inline__, __nodebug__))
836_mm256_storeu_pd(double *p, __m256d a)
837{
838  __builtin_ia32_storeupd256(p, (__v4df)a);
839}
840
841static __inline void __attribute__((__always_inline__, __nodebug__))
842_mm256_storeu_ps(float *p, __m256 a)
843{
844  __builtin_ia32_storeups256(p, (__v8sf)a);
845}
846
847static __inline void __attribute__((__always_inline__, __nodebug__))
848_mm256_store_si256(__m256i *p, __m256i a)
849{
850  *p = a;
851}
852
853static __inline void __attribute__((__always_inline__, __nodebug__))
854_mm256_storeu_si256(__m256i *p, __m256i a)
855{
856  __builtin_ia32_storedqu256((char *)p, (__v32qi)a);
857}
858
859/* Conditional load ops */
860static __inline __m128d __attribute__((__always_inline__, __nodebug__))
861_mm_maskload_pd(double const *p, __m128d m)
862{
863  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)p, (__v2df)m);
864}
865
866static __inline __m256d __attribute__((__always_inline__, __nodebug__))
867_mm256_maskload_pd(double const *p, __m256d m)
868{
869  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)p, (__v4df)m);
870}
871
872static __inline __m128 __attribute__((__always_inline__, __nodebug__))
873_mm_maskload_ps(float const *p, __m128 m)
874{
875  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)p, (__v4sf)m);
876}
877
878static __inline __m256 __attribute__((__always_inline__, __nodebug__))
879_mm256_maskload_ps(float const *p, __m256 m)
880{
881  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)p, (__v8sf)m);
882}
883
884/* Conditional store ops */
885static __inline void __attribute__((__always_inline__, __nodebug__))
886_mm256_maskstore_ps(float *p, __m256 m, __m256 a)
887{
888  __builtin_ia32_maskstoreps256((__v8sf *)p, (__v8sf)m, (__v8sf)a);
889}
890
891static __inline void __attribute__((__always_inline__, __nodebug__))
892_mm_maskstore_pd(double *p, __m128d m, __m128d a)
893{
894  __builtin_ia32_maskstorepd((__v2df *)p, (__v2df)m, (__v2df)a);
895}
896
897static __inline void __attribute__((__always_inline__, __nodebug__))
898_mm256_maskstore_pd(double *p, __m256d m, __m256d a)
899{
900  __builtin_ia32_maskstorepd256((__v4df *)p, (__v4df)m, (__v4df)a);
901}
902
903static __inline void __attribute__((__always_inline__, __nodebug__))
904_mm_maskstore_ps(float *p, __m128 m, __m128 a)
905{
906  __builtin_ia32_maskstoreps((__v4sf *)p, (__v4sf)m, (__v4sf)a);
907}
908
909/* Cacheability support ops */
910static __inline void __attribute__((__always_inline__, __nodebug__))
911_mm256_stream_si256(__m256i *a, __m256i b)
912{
913  __builtin_ia32_movntdq256((__v4di *)a, (__v4di)b);
914}
915
916static __inline void __attribute__((__always_inline__, __nodebug__))
917_mm256_stream_pd(double *a, __m256d b)
918{
919  __builtin_ia32_movntpd256(a, (__v4df)b);
920}
921
922static __inline void __attribute__((__always_inline__, __nodebug__))
923_mm256_stream_ps(float *p, __m256 a)
924{
925  __builtin_ia32_movntps256(p, (__v8sf)a);
926}
927
928/* Create vectors */
929static __inline __m256d __attribute__((__always_inline__, __nodebug__))
930_mm256_set_pd(double a, double b, double c, double d)
931{
932  return (__m256d){ d, c, b, a };
933}
934
935static __inline __m256 __attribute__((__always_inline__, __nodebug__))
936_mm256_set_ps(float a, float b, float c, float d,
937	            float e, float f, float g, float h)
938{
939  return (__m256){ h, g, f, e, d, c, b, a };
940}
941
942static __inline __m256i __attribute__((__always_inline__, __nodebug__))
943_mm256_set_epi32(int i0, int i1, int i2, int i3,
944		             int i4, int i5, int i6, int i7)
945{
946  return (__m256i)(__v8si){ i7, i6, i5, i4, i3, i2, i1, i0 };
947}
948
949static __inline __m256i __attribute__((__always_inline__, __nodebug__))
950_mm256_set_epi16(short w15, short w14, short w13, short w12,
951		             short w11, short w10, short w09, short w08,
952		             short w07, short w06, short w05, short w04,
953		             short w03, short w02, short w01, short w00)
954{
955  return (__m256i)(__v16hi){ w00, w01, w02, w03, w04, w05, w06, w07,
956                             w08, w09, w10, w11, w12, w13, w14, w15 };
957}
958
959static __inline __m256i __attribute__((__always_inline__, __nodebug__))
960_mm256_set_epi8(char b31, char b30, char b29, char b28,
961		            char b27, char b26, char b25, char b24,
962		            char b23, char b22, char b21, char b20,
963		            char b19, char b18, char b17, char b16,
964		            char b15, char b14, char b13, char b12,
965		            char b11, char b10, char b09, char b08,
966		            char b07, char b06, char b05, char b04,
967		            char b03, char b02, char b01, char b00)
968{
969  return (__m256i)(__v32qi){
970    b00, b01, b02, b03, b04, b05, b06, b07,
971    b08, b09, b10, b11, b12, b13, b14, b15,
972    b16, b17, b18, b19, b20, b21, b22, b23,
973    b24, b25, b26, b27, b28, b29, b30, b31
974  };
975}
976
977static __inline __m256i __attribute__((__always_inline__, __nodebug__))
978_mm256_set_epi64x(long long a, long long b, long long c, long long d)
979{
980  return (__m256i)(__v4di){ d, c, b, a };
981}
982
983/* Create vectors with elements in reverse order */
984static __inline __m256d __attribute__((__always_inline__, __nodebug__))
985_mm256_setr_pd(double a, double b, double c, double d)
986{
987  return (__m256d){ a, b, c, d };
988}
989
990static __inline __m256 __attribute__((__always_inline__, __nodebug__))
991_mm256_setr_ps(float a, float b, float c, float d,
992		           float e, float f, float g, float h)
993{
994  return (__m256){ a, b, c, d, e, f, g, h };
995}
996
997static __inline __m256i __attribute__((__always_inline__, __nodebug__))
998_mm256_setr_epi32(int i0, int i1, int i2, int i3,
999		              int i4, int i5, int i6, int i7)
1000{
1001  return (__m256i)(__v8si){ i0, i1, i2, i3, i4, i5, i6, i7 };
1002}
1003
1004static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1005_mm256_setr_epi16(short w15, short w14, short w13, short w12,
1006		   short w11, short w10, short w09, short w08,
1007		   short w07, short w06, short w05, short w04,
1008		   short w03, short w02, short w01, short w00)
1009{
1010  return (__m256i)(__v16hi){ w15, w14, w13, w12, w11, w10, w09, w08,
1011			                       w07, w06, w05, w04, w03, w02, w01, w00 };
1012}
1013
1014static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1015_mm256_setr_epi8(char b31, char b30, char b29, char b28,
1016		             char b27, char b26, char b25, char b24,
1017		             char b23, char b22, char b21, char b20,
1018		             char b19, char b18, char b17, char b16,
1019		             char b15, char b14, char b13, char b12,
1020		             char b11, char b10, char b09, char b08,
1021		             char b07, char b06, char b05, char b04,
1022		             char b03, char b02, char b01, char b00)
1023{
1024  return (__m256i)(__v32qi){
1025    b31, b30, b29, b28, b27, b26, b25, b24,
1026		b23, b22, b21, b20, b19, b18, b17, b16,
1027		b15, b14, b13, b12, b11, b10, b09, b08,
1028		b07, b06, b05, b04, b03, b02, b01, b00 };
1029}
1030
1031static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1032_mm256_setr_epi64x(long long a, long long b, long long c, long long d)
1033{
1034  return (__m256i)(__v4di){ a, b, c, d };
1035}
1036
1037/* Create vectors with repeated elements */
1038static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1039_mm256_set1_pd(double w)
1040{
1041  return (__m256d){ w, w, w, w };
1042}
1043
1044static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1045_mm256_set1_ps(float w)
1046{
1047  return (__m256){ w, w, w, w, w, w, w, w };
1048}
1049
1050static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1051_mm256_set1_epi32(int i)
1052{
1053  return (__m256i)(__v8si){ i, i, i, i, i, i, i, i };
1054}
1055
1056static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1057_mm256_set1_epi16(short w)
1058{
1059  return (__m256i)(__v16hi){ w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w };
1060}
1061
1062static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1063_mm256_set1_epi8(char b)
1064{
1065  return (__m256i)(__v32qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,
1066                             b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
1067}
1068
1069static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1070_mm256_set1_epi64x(long long q)
1071{
1072  return (__m256i)(__v4di){ q, q, q, q };
1073}
1074
1075/* Create zeroed vectors */
1076static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1077_mm256_setzero_pd(void)
1078{
1079  return (__m256d){ 0, 0, 0, 0 };
1080}
1081
1082static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1083_mm256_setzero_ps(void)
1084{
1085  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
1086}
1087
1088static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1089_mm256_setzero_si256(void)
1090{
1091  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
1092}
1093
1094/* Cast between vector types */
1095static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1096_mm256_castpd_ps(__m256d in)
1097{
1098  return (__m256)in;
1099}
1100
1101static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1102_mm256_castpd_si256(__m256d in)
1103{
1104  return (__m256i)in;
1105}
1106
1107static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1108_mm256_castps_pd(__m256 in)
1109{
1110  return (__m256d)in;
1111}
1112
1113static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1114_mm256_castps_si256(__m256 in)
1115{
1116  return (__m256i)in;
1117}
1118
1119static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1120_mm256_castsi256_ps(__m256i in)
1121{
1122  return (__m256)in;
1123}
1124
1125static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1126_mm256_castsi256_pd(__m256i in)
1127{
1128  return (__m256d)in;
1129}
1130
1131static __inline __m128d __attribute__((__always_inline__, __nodebug__))
1132_mm256_castpd256_pd128(__m256d in)
1133{
1134  return __builtin_shufflevector(in, in, 0, 1);
1135}
1136
1137static __inline __m128 __attribute__((__always_inline__, __nodebug__))
1138_mm256_castps256_ps128(__m256 in)
1139{
1140  return __builtin_shufflevector(in, in, 0, 1, 2, 3);
1141}
1142
1143static __inline __m128i __attribute__((__always_inline__, __nodebug__))
1144_mm256_castsi256_si128(__m256i in)
1145{
1146  return __builtin_shufflevector(in, in, 0, 1);
1147}
1148
1149static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1150_mm256_castpd128_pd256(__m128d in)
1151{
1152  __m128d zero = _mm_setzero_pd();
1153  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
1154}
1155
1156static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1157_mm256_castps128_ps256(__m128 in)
1158{
1159  __m128 zero = _mm_setzero_ps();
1160  return __builtin_shufflevector(in, zero, 0, 1, 2, 3, 4, 4, 4, 4);
1161}
1162
1163static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1164_mm256_castsi128_si256(__m128i in)
1165{
1166  __m128i zero = _mm_setzero_si128();
1167  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
1168}
1169
1170/* SIMD load ops (unaligned) */
1171static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1172_mm256_loadu2_m128(float const *addr_hi, float const *addr_lo)
1173{
1174  struct __loadu_ps {
1175    __m128 v;
1176  } __attribute__((__packed__, __may_alias__));
1177
1178  __m256 v256 = _mm256_castps128_ps256(((struct __loadu_ps*)addr_lo)->v);
1179  return _mm256_insertf128_ps(v256, ((struct __loadu_ps*)addr_hi)->v, 1);
1180}
1181
1182static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1183_mm256_loadu2_m128d(double const *addr_hi, double const *addr_lo)
1184{
1185  struct __loadu_pd {
1186    __m128d v;
1187  } __attribute__((__packed__, __may_alias__));
1188
1189  __m256d v256 = _mm256_castpd128_pd256(((struct __loadu_pd*)addr_lo)->v);
1190  return _mm256_insertf128_pd(v256, ((struct __loadu_pd*)addr_hi)->v, 1);
1191}
1192
1193static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1194_mm256_loadu2_m128i(__m128i const *addr_hi, __m128i const *addr_lo)
1195{
1196  struct __loadu_si128 {
1197    __m128i v;
1198  } __attribute__((packed, may_alias));
1199  __m256i v256 = _mm256_castsi128_si256(((struct __loadu_si128*)addr_lo)->v);
1200  return _mm256_insertf128_si256(v256, ((struct __loadu_si128*)addr_hi)->v, 1);
1201}
1202
1203/* SIMD store ops (unaligned) */
1204static __inline void __attribute__((__always_inline__, __nodebug__))
1205_mm256_storeu2_m128(float *addr_hi, float *addr_lo, __m256 a)
1206{
1207  __m128 v128;
1208
1209  v128 = _mm256_castps256_ps128(a);
1210  __builtin_ia32_storeups(addr_lo, v128);
1211  v128 = _mm256_extractf128_ps(a, 1);
1212  __builtin_ia32_storeups(addr_hi, v128);
1213}
1214
1215static __inline void __attribute__((__always_inline__, __nodebug__))
1216_mm256_storeu2_m128d(double *addr_hi, double *addr_lo, __m256d a)
1217{
1218  __m128d v128;
1219
1220  v128 = _mm256_castpd256_pd128(a);
1221  __builtin_ia32_storeupd(addr_lo, v128);
1222  v128 = _mm256_extractf128_pd(a, 1);
1223  __builtin_ia32_storeupd(addr_hi, v128);
1224}
1225
1226static __inline void __attribute__((__always_inline__, __nodebug__))
1227_mm256_storeu2_m128i(__m128i *addr_hi, __m128i *addr_lo, __m256i a)
1228{
1229  __m128i v128;
1230
1231  v128 = _mm256_castsi256_si128(a);
1232  __builtin_ia32_storedqu((char *)addr_lo, (__v16qi)v128);
1233  v128 = _mm256_extractf128_si256(a, 1);
1234  __builtin_ia32_storedqu((char *)addr_hi, (__v16qi)v128);
1235}
1236