/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

typedef float __m256 __attribute__ ((__vector_size__ (32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));

/* Arithmetic */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_add_pd(__m256d a, __m256d b)
{
  return a+b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_add_ps(__m256 a, __m256 b)
{
  return a+b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sub_pd(__m256d a, __m256d b)
{
  return a-b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sub_ps(__m256 a, __m256 b)
{
  return a-b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_addsubpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_addsubps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_div_pd(__m256d a, __m256d b)
{
  return a / b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_div_ps(__m256 a, __m256 b)
{
  return a / b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_max_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_maxpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_max_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_maxps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_min_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_minpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_min_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_minps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_mul_pd(__m256d a, __m256d b)
{
  return a * b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_mul_ps(__m256 a, __m256 b)
{
  return a * b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_pd(__m256d a)
{
  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_ps(__m256 a)
{
  return (__m256)__builtin_ia32_sqrtps256((__v8sf)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rsqrt_ps(__m256 a)
{
  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rcp_ps(__m256 a)
{
  return (__m256)__builtin_ia32_rcpps256((__v8sf)a);
}

#define _mm256_round_pd(V, M) __extension__ ({ \
    __m256d __V = (V); \
    (__m256d)__builtin_ia32_roundpd256((__v4df)__V, (M)); })

#define _mm256_round_ps(V, M) __extension__ ({ \
  __m256 __V = (V); \
  (__m256)__builtin_ia32_roundps256((__v8sf)__V, (M)); })

#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)

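/* Usage sketch (illustrative addition, not part of the original header): the
 * arithmetic and rounding helpers above compose like ordinary functions, so a
 * "multiply, add, then floor" kernel might look as follows. The name
 * avx_muladd_floor is purely hypothetical.
 *
 *   static __m256d avx_muladd_floor(__m256d a, __m256d b, __m256d c)
 *   {
 *     __m256d prod = _mm256_mul_pd(a, b);    // four double multiplies
 *     __m256d sum  = _mm256_add_pd(prod, c); // four double adds
 *     return _mm256_floor_pd(sum);           // round each lane toward -infinity
 *   }
 */
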
/* Logical */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_and_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a & (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_and_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a & (__v8si)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_pd(__m256d a, __m256d b)
{
  return (__m256d)(~(__v4di)a & (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_ps(__m256 a, __m256 b)
{
  return (__m256)(~(__v8si)a & (__v8si)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_or_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a | (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_or_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a | (__v8si)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_xor_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a ^ (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_xor_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a ^ (__v8si)b);
}

/* Horizontal arithmetic */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_haddpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_haddps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_hsubpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_hsubps256((__v8sf)a, (__v8sf)b);
}

/* Vector permutations */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_pd(__m128d a, __m128i c)
{
  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)a, (__v2di)c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_pd(__m256d a, __m256i c)
{
  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)a, (__v4di)c);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_ps(__m128 a, __m128i c)
{
  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_ps(__m256 a, __m256i c)
{
  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a, (__v8si)c);
}

#define _mm_permute_pd(A, C) __extension__ ({ \
  __m128d __A = (A); \
  (__m128d)__builtin_shufflevector((__v2df)__A, (__v2df) _mm_setzero_pd(), \
                                   (C) & 0x1, ((C) & 0x2) >> 1); })

#define _mm256_permute_pd(A, C) __extension__ ({ \
  __m256d __A = (A); \
  (__m256d)__builtin_shufflevector((__v4df)__A, (__v4df) _mm256_setzero_pd(), \
                                   (C) & 0x1, ((C) & 0x2) >> 1, \
                                   2 + (((C) & 0x4) >> 2), \
                                   2 + (((C) & 0x8) >> 3)); })

#define _mm_permute_ps(A, C) __extension__ ({ \
  __m128 __A = (A); \
  (__m128)__builtin_shufflevector((__v4sf)__A, (__v4sf) _mm_setzero_ps(), \
                                   (C) & 0x3, ((C) & 0xc) >> 2, \
                                   ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); })

#define _mm256_permute_ps(A, C) __extension__ ({ \
  __m256 __A = (A); \
  (__m256)__builtin_shufflevector((__v8sf)__A, (__v8sf) _mm256_setzero_ps(), \
                                  (C) & 0x3, ((C) & 0xc) >> 2, \
                                  ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6, \
                                  4 + (((C) & 0x03) >> 0), \
                                  4 + (((C) & 0x0c) >> 2), \
                                  4 + (((C) & 0x30) >> 4), \
                                  4 + (((C) & 0xc0) >> 6)); })

#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
  __m256d __V1 = (V1); \
  __m256d __V2 = (V2); \
  (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)__V1, (__v4df)__V2, (M)); })

#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
  __m256 __V1 = (V1); \
  __m256 __V2 = (V2); \
  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)__V1, (__v8sf)__V2, (M)); })

#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
  __m256i __V1 = (V1); \
  __m256i __V2 = (V2); \
  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)__V1, (__v8si)__V2, (M)); })

/* Vector Blend */
#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
  __m256d __V1 = (V1); \
  __m256d __V2 = (V2); \
  (__m256d)__builtin_ia32_blendpd256((__v4df)__V1, (__v4df)__V2, (M)); })

#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
  __m256 __V1 = (V1); \
  __m256 __V2 = (V2); \
  (__m256)__builtin_ia32_blendps256((__v8sf)__V1, (__v8sf)__V2, (M)); })

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_pd(__m256d a, __m256d b, __m256d c)
{
  return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_ps(__m256 a, __m256 b, __m256 c)
{
  return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c);
}

/* Vector Dot Product */
#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
  __m256 __V1 = (V1); \
  __m256 __V2 = (V2); \
  (__m256)__builtin_ia32_dpps256((__v8sf)__V1, (__v8sf)__V2, (M)); })

/* Vector shuffle */
#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
        __m256 __a = (a); \
        __m256 __b = (b); \
        (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \
        (mask) & 0x3,                ((mask) & 0xc) >> 2, \
        (((mask) & 0x30) >> 4) + 8,  (((mask) & 0xc0) >> 6) + 8, \
        ((mask) & 0x3) + 4,          (((mask) & 0xc) >> 2) + 4, \
        (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12); })

#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
        __m256d __a = (a); \
        __m256d __b = (b); \
        (__m256d)__builtin_shufflevector((__v4df)__a, (__v4df)__b, \
        (mask) & 0x1, \
        (((mask) & 0x2) >> 1) + 4, \
        (((mask) & 0x4) >> 2) + 2, \
        (((mask) & 0x8) >> 3) + 6); })

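/* Usage sketch (illustrative addition, not part of the original header): the
 * shuffle control byte is applied per 128-bit lane. With mask 0x5 (binary
 * 0101), _mm256_shuffle_pd takes the odd element of a and the even element of
 * b in each lane:
 *
 *   __m256d a = _mm256_setr_pd(0.0, 1.0, 2.0, 3.0);
 *   __m256d b = _mm256_setr_pd(4.0, 5.0, 6.0, 7.0);
 *   __m256d r = _mm256_shuffle_pd(a, b, 0x5);  // r = { 1.0, 4.0, 3.0, 6.0 }
 */
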
/* Compare */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)  */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */

#define _mm_cmp_pd(a, b, c) __extension__ ({ \
  __m128d __a = (a); \
  __m128d __b = (b); \
  (__m128d)__builtin_ia32_cmppd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ps(a, b, c) __extension__ ({ \
  __m128 __a = (a); \
  __m128 __b = (b); \
  (__m128)__builtin_ia32_cmpps((__v4sf)__a, (__v4sf)__b, (c)); })

#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
  __m256d __a = (a); \
  __m256d __b = (b); \
  (__m256d)__builtin_ia32_cmppd256((__v4df)__a, (__v4df)__b, (c)); })

#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
  __m256 __a = (a); \
  __m256 __b = (b); \
  (__m256)__builtin_ia32_cmpps256((__v8sf)__a, (__v8sf)__b, (c)); })

#define _mm_cmp_sd(a, b, c) __extension__ ({ \
  __m128d __a = (a); \
  __m128d __b = (b); \
  (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ss(a, b, c) __extension__ ({ \
  __m128 __a = (a); \
  __m128 __b = (b); \
  (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); })

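/* Usage sketch (illustrative addition, not part of the original header): the
 * third argument of the cmp macros selects one of the 32 predicates defined
 * above; the result is an all-ones mask in lanes where the predicate holds,
 * which can feed a blend:
 *
 *   __m256d x    = _mm256_set1_pd(1.0);
 *   __m256d y    = _mm256_set1_pd(2.0);
 *   __m256d mask = _mm256_cmp_pd(x, y, _CMP_LT_OQ);   // ordered x < y
 *   __m256d mins = _mm256_blendv_pd(y, x, mask);      // x where x < y, else y
 */
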
/* Vector extract */
#define _mm256_extractf128_pd(A, O) __extension__ ({ \
  __m256d __A = (A); \
  (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)__A, (O)); })

#define _mm256_extractf128_ps(A, O) __extension__ ({ \
  __m256 __A = (A); \
  (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)__A, (O)); })

#define _mm256_extractf128_si256(A, O) __extension__ ({ \
  __m256i __A = (A); \
  (__m128i)__builtin_ia32_vextractf128_si256((__v8si)__A, (O)); })

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi32(__m256i a, int const imm)
{
  __v8si b = (__v8si)a;
  return b[imm];
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi16(__m256i a, int const imm)
{
  __v16hi b = (__v16hi)a;
  return b[imm];
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi8(__m256i a, int const imm)
{
  __v32qi b = (__v32qi)a;
  return b[imm];
}

#ifdef __x86_64__
static __inline long long  __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi64(__m256i a, const int imm)
{
  __v4di b = (__v4di)a;
  return b[imm];
}
#endif

/* Vector insert */
#define _mm256_insertf128_pd(V1, V2, O) __extension__ ({ \
  __m256d __V1 = (V1); \
  __m128d __V2 = (V2); \
  (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)__V1, (__v2df)__V2, (O)); })

#define _mm256_insertf128_ps(V1, V2, O) __extension__ ({ \
  __m256 __V1 = (V1); \
  __m128 __V2 = (V2); \
  (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)__V1, (__v4sf)__V2, (O)); })

#define _mm256_insertf128_si256(V1, V2, O) __extension__ ({ \
  __m256i __V1 = (V1); \
  __m128i __V2 = (V2); \
  (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)__V1, (__v4si)__V2, (O)); })

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi32(__m256i a, int b, int const imm)
{
  __v8si c = (__v8si)a;
  c[imm & 7] = b;
  return (__m256i)c;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi16(__m256i a, int b, int const imm)
{
  __v16hi c = (__v16hi)a;
  c[imm & 15] = b;
  return (__m256i)c;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi8(__m256i a, int b, int const imm)
{
  __v32qi c = (__v32qi)a;
  c[imm & 31] = b;
  return (__m256i)c;
}

#ifdef __x86_64__
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi64(__m256i a, long long b, int const imm)
{
  __v4di c = (__v4di)a;
  c[imm & 3] = b;
  return (__m256i)c;
}
#endif

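/* Usage sketch (illustrative addition, not part of the original header): the
 * insert helpers return a modified copy rather than updating in place, so the
 * result has to be reassigned:
 *
 *   __m256i v = _mm256_set1_epi32(7);
 *   v = _mm256_insert_epi32(v, 42, 3);   // lane 3 now holds 42
 *   int e = _mm256_extract_epi32(v, 3);  // e == 42
 */
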
/* Conversion */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_pd(__m128i a)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_ps(__m256i a)
{
  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) a);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_ps(__m256d a)
{
  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) a);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_epi32(__m256 a)
{
  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_pd(__m128 a)
{
  return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) a);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttpd_epi32(__m256d a)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) a);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_epi32(__m256d a)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) a);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttps_epi32(__m256 a)
{
  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) a);
}

/* Vector replicate */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_movehdup_ps(__m256 a)
{
  return __builtin_shufflevector(a, a, 1, 1, 3, 3, 5, 5, 7, 7);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_moveldup_ps(__m256 a)
{
  return __builtin_shufflevector(a, a, 0, 0, 2, 2, 4, 4, 6, 6);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_movedup_pd(__m256d a)
{
  return __builtin_shufflevector(a, a, 0, 0, 2, 2);
}

/* Unpack and Interleave */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_pd(__m256d a, __m256d b)
{
  return __builtin_shufflevector(a, b, 1, 5, 1+2, 5+2);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_pd(__m256d a, __m256d b)
{
  return __builtin_shufflevector(a, b, 0, 4, 0+2, 4+2);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_ps(__m256 a, __m256 b)
{
  return __builtin_shufflevector(a, b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_ps(__m256 a, __m256 b)
{
  return __builtin_shufflevector(a, b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
}

/* Bit Test */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestzpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestcpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestnzcpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestzps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestcps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestnzcps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestzpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestcpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestnzcpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestzps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestcps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestnzcps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestz256((__v4di)a, (__v4di)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestc256((__v4di)a, (__v4di)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestnzc256((__v4di)a, (__v4di)b);
}

/* Vector extract sign mask */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_pd(__m256d a)
{
  return __builtin_ia32_movmskpd256((__v4df)a);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_ps(__m256 a)
{
  return __builtin_ia32_movmskps256((__v8sf)a);
}

/* Vector zero */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}

/* Vector load with broadcast */
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_broadcast_ss(float const *a)
{
  return (__m128)__builtin_ia32_vbroadcastss(a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_sd(double const *a)
{
  return (__m256d)__builtin_ia32_vbroadcastsd256(a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ss(float const *a)
{
  return (__m256)__builtin_ia32_vbroadcastss256(a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_pd(__m128d const *a)
{
  return (__m256d)__builtin_ia32_vbroadcastf128_pd256(a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ps(__m128 const *a)
{
  return (__m256)__builtin_ia32_vbroadcastf128_ps256(a);
}

/* SIMD load ops */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_load_pd(double const *p)
{
  return *(__m256d *)p;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_load_ps(float const *p)
{
  return *(__m256 *)p;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_pd(double const *p)
{
  struct __loadu_pd {
    __m256d v;
  } __attribute__((packed, may_alias));
  return ((struct __loadu_pd*)p)->v;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_ps(float const *p)
{
  struct __loadu_ps {
    __m256 v;
  } __attribute__((packed, may_alias));
  return ((struct __loadu_ps*)p)->v;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_load_si256(__m256i const *p)
{
  return *p;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_si256(__m256i const *p)
{
  struct __loadu_si256 {
    __m256i v;
  } __attribute__((packed, may_alias));
  return ((struct __loadu_si256*)p)->v;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_lddqu_si256(__m256i const *p)
{
  return (__m256i)__builtin_ia32_lddqu256((char const *)p);
}

/* SIMD store ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_pd(double *p, __m256d a)
{
  *(__m256d *)p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_ps(float *p, __m256 a)
{
  *(__m256 *)p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_pd(double *p, __m256d a)
{
  __builtin_ia32_storeupd256(p, (__v4df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_ps(float *p, __m256 a)
{
  __builtin_ia32_storeups256(p, (__v8sf)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_si256(__m256i *p, __m256i a)
{
  *p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_si256(__m256i *p, __m256i a)
{
  __builtin_ia32_storedqu256((char *)p, (__v32qi)a);
}

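/* Usage sketch (illustrative addition, not part of the original header): the
 * _load_/_store_ forms above require 32-byte aligned addresses, while the
 * _loadu_/_storeu_ forms accept any address. Copying an arbitrarily aligned
 * buffer of eight floats might look like this (copy8 is a hypothetical name):
 *
 *   static void copy8(float *dst, const float *src)
 *   {
 *     __m256 v = _mm256_loadu_ps(src);  // unaligned 8-float load
 *     _mm256_storeu_ps(dst, v);         // unaligned 8-float store
 *   }
 */
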
/* Conditional load ops */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_maskload_pd(double const *p, __m128d m)
{
  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)p, (__v2df)m);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_pd(double const *p, __m256d m)
{
  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)p, (__v4df)m);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_maskload_ps(float const *p, __m128 m)
{
  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)p, (__v4sf)m);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_ps(float const *p, __m256 m)
{
  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)p, (__v8sf)m);
}

/* Conditional store ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_ps(float *p, __m256 m, __m256 a)
{
  __builtin_ia32_maskstoreps256((__v8sf *)p, (__v8sf)m, (__v8sf)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_pd(double *p, __m128d m, __m128d a)
{
  __builtin_ia32_maskstorepd((__v2df *)p, (__v2df)m, (__v2df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_pd(double *p, __m256d m, __m256d a)
{
  __builtin_ia32_maskstorepd256((__v4df *)p, (__v4df)m, (__v4df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_ps(float *p, __m128 m, __m128 a)
{
  __builtin_ia32_maskstoreps((__v4sf *)p, (__v4sf)m, (__v4sf)a);
}

/* Cacheability support ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_si256(__m256i *a, __m256i b)
{
  __builtin_ia32_movntdq256((__v4di *)a, (__v4di)b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_pd(double *a, __m256d b)
{
  __builtin_ia32_movntpd256(a, (__v4df)b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_ps(float *p, __m256 a)
{
  __builtin_ia32_movntps256(p, (__v8sf)a);
}

/* Create vectors */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set_pd(double a, double b, double c, double d)
{
  return (__m256d){ d, c, b, a };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set_ps(float a, float b, float c, float d,
              float e, float f, float g, float h)
{
  return (__m256){ h, g, f, e, d, c, b, a };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi32(int i0, int i1, int i2, int i3,
                 int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i7, i6, i5, i4, i3, i2, i1, i0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi16(short w15, short w14, short w13, short w12,
                 short w11, short w10, short w09, short w08,
                 short w07, short w06, short w05, short w04,
                 short w03, short w02, short w01, short w00)
{
  return (__m256i)(__v16hi){ w00, w01, w02, w03, w04, w05, w06, w07,
                             w08, w09, w10, w11, w12, w13, w14, w15 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi8(char b31, char b30, char b29, char b28,
                char b27, char b26, char b25, char b24,
                char b23, char b22, char b21, char b20,
                char b19, char b18, char b17, char b16,
                char b15, char b14, char b13, char b12,
                char b11, char b10, char b09, char b08,
                char b07, char b06, char b05, char b04,
                char b03, char b02, char b01, char b00)
{
  return (__m256i)(__v32qi){
    b00, b01, b02, b03, b04, b05, b06, b07,
    b08, b09, b10, b11, b12, b13, b14, b15,
    b16, b17, b18, b19, b20, b21, b22, b23,
    b24, b25, b26, b27, b28, b29, b30, b31
  };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ d, c, b, a };
}

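/* Usage sketch (illustrative addition, not part of the original header): the
 * _set_ initializers above take arguments from the most significant element
 * down to the least significant one, while the _setr_ variants below take
 * them in memory order, so these two calls build the same vector:
 *
 *   __m256d a = _mm256_set_pd(3.0, 2.0, 1.0, 0.0);
 *   __m256d b = _mm256_setr_pd(0.0, 1.0, 2.0, 3.0);
 *   // element 0 of both a and b is 0.0; element 3 is 3.0
 */
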
/* Create vectors with elements in reverse order */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setr_pd(double a, double b, double c, double d)
{
  return (__m256d){ a, b, c, d };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setr_ps(float a, float b, float c, float d,
               float e, float f, float g, float h)
{
  return (__m256){ a, b, c, d, e, f, g, h };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi32(int i0, int i1, int i2, int i3,
                  int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i0, i1, i2, i3, i4, i5, i6, i7 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi16(short w15, short w14, short w13, short w12,
                  short w11, short w10, short w09, short w08,
                  short w07, short w06, short w05, short w04,
                  short w03, short w02, short w01, short w00)
{
  return (__m256i)(__v16hi){ w15, w14, w13, w12, w11, w10, w09, w08,
                             w07, w06, w05, w04, w03, w02, w01, w00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi8(char b31, char b30, char b29, char b28,
                 char b27, char b26, char b25, char b24,
                 char b23, char b22, char b21, char b20,
                 char b19, char b18, char b17, char b16,
                 char b15, char b14, char b13, char b12,
                 char b11, char b10, char b09, char b08,
                 char b07, char b06, char b05, char b04,
                 char b03, char b02, char b01, char b00)
{
  return (__m256i)(__v32qi){
    b31, b30, b29, b28, b27, b26, b25, b24,
    b23, b22, b21, b20, b19, b18, b17, b16,
    b15, b14, b13, b12, b11, b10, b09, b08,
    b07, b06, b05, b04, b03, b02, b01, b00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ a, b, c, d };
}

/* Create vectors with repeated elements */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set1_pd(double w)
{
  return (__m256d){ w, w, w, w };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set1_ps(float w)
{
  return (__m256){ w, w, w, w, w, w, w, w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi32(int i)
{
  return (__m256i)(__v8si){ i, i, i, i, i, i, i, i };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi16(short w)
{
  return (__m256i)(__v16hi){ w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi8(char b)
{
  return (__m256i)(__v32qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,
                             b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi64x(long long q)
{
  return (__m256i)(__v4di){ q, q, q, q };
}

/* Create zeroed vectors */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_pd(void)
{
  return (__m256d){ 0, 0, 0, 0 };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_ps(void)
{
  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_si256(void)
{
  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
}

/* Cast between vector types */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_ps(__m256d in)
{
  return (__m256)in;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_si256(__m256d in)
{
  return (__m256i)in;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castps_pd(__m256 in)
{
  return (__m256d)in;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castps_si256(__m256 in)
{
  return (__m256i)in;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_ps(__m256i in)
{
  return (__m256)in;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_pd(__m256i in)
{
  return (__m256d)in;
}

static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd256_pd128(__m256d in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_castps256_ps128(__m256 in)
{
  return __builtin_shufflevector(in, in, 0, 1, 2, 3);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_si128(__m256i in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd128_pd256(__m128d in)
{
  __m128d zero = _mm_setzero_pd();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castps128_ps256(__m128 in)
{
  __m128 zero = _mm_setzero_ps();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 3, 4, 4, 4, 4);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi128_si256(__m128i in)
{
  __m128i zero = _mm_setzero_si128();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}

/* SIMD load ops (unaligned) */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_loadu2_m128(float const *addr_hi, float const *addr_lo)
{
  struct __loadu_ps {
    __m128 v;
  } __attribute__((__packed__, __may_alias__));

  __m256 v256 = _mm256_castps128_ps256(((struct __loadu_ps*)addr_lo)->v);
  return _mm256_insertf128_ps(v256, ((struct __loadu_ps*)addr_hi)->v, 1);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_loadu2_m128d(double const *addr_hi, double const *addr_lo)
{
  struct __loadu_pd {
    __m128d v;
  } __attribute__((__packed__, __may_alias__));

  __m256d v256 = _mm256_castpd128_pd256(((struct __loadu_pd*)addr_lo)->v);
  return _mm256_insertf128_pd(v256, ((struct __loadu_pd*)addr_hi)->v, 1);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_loadu2_m128i(__m128i const *addr_hi, __m128i const *addr_lo)
{
  struct __loadu_si128 {
    __m128i v;
  } __attribute__((packed, may_alias));
  __m256i v256 = _mm256_castsi128_si256(((struct __loadu_si128*)addr_lo)->v);
  return _mm256_insertf128_si256(v256, ((struct __loadu_si128*)addr_hi)->v, 1);
}

/* SIMD store ops (unaligned) */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu2_m128(float *addr_hi, float *addr_lo, __m256 a)
{
  __m128 v128;

  v128 = _mm256_castps256_ps128(a);
  __builtin_ia32_storeups(addr_lo, v128);
  v128 = _mm256_extractf128_ps(a, 1);
  __builtin_ia32_storeups(addr_hi, v128);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu2_m128d(double *addr_hi, double *addr_lo, __m256d a)
{
  __m128d v128;

  v128 = _mm256_castpd256_pd128(a);
  __builtin_ia32_storeupd(addr_lo, v128);
  v128 = _mm256_extractf128_pd(a, 1);
  __builtin_ia32_storeupd(addr_hi, v128);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu2_m128i(__m128i *addr_hi, __m128i *addr_lo, __m256i a)
{
  __m128i v128;

  v128 = _mm256_castsi256_si128(a);
  __builtin_ia32_storedqu((char *)addr_lo, (__v16qi)v128);
  v128 = _mm256_extractf128_si256(a, 1);
  __builtin_ia32_storedqu((char *)addr_hi, (__v16qi)v128);
}

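/* Usage sketch (illustrative addition, not part of the original header): the
 * loadu2/storeu2 helpers move the two 128-bit halves of a 256-bit value to or
 * from independent, possibly unaligned addresses. A hedged example that swaps
 * the halves of an eight-float buffer (swap_halves is a hypothetical name):
 *
 *   static void swap_halves(float *buf)
 *   {
 *     __m256 v = _mm256_loadu2_m128(buf, buf + 4);  // high half from buf, low from buf+4
 *     _mm256_storeu_ps(buf, v);                     // write back with halves exchanged
 *   }
 */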