/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

typedef float __m256 __attribute__ ((__vector_size__ (32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));
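
/* Note (added commentary, not in the original header): __m256, __m256d and
 * __m256i are the public 256-bit vector types; the __v* typedefs above are
 * internal element-typed views used to select element types for the builtins
 * below and for per-element access. */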

/* Arithmetic */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_add_pd(__m256d a, __m256d b)
{
  return a + b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_add_ps(__m256 a, __m256 b)
{
  return a + b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sub_pd(__m256d a, __m256d b)
{
  return a - b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sub_ps(__m256 a, __m256 b)
{
  return a - b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_addsubpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_addsubps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_div_pd(__m256d a, __m256d b)
{
  return a / b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_div_ps(__m256 a, __m256 b)
{
  return a / b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_max_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_maxpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_max_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_maxps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_min_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_minpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_min_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_minps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_mul_pd(__m256d a, __m256d b)
{
  return a * b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_mul_ps(__m256 a, __m256 b)
{
  return a * b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_pd(__m256d a)
{
  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_ps(__m256 a)
{
  return (__m256)__builtin_ia32_sqrtps256((__v8sf)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rsqrt_ps(__m256 a)
{
  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rcp_ps(__m256 a)
{
  return (__m256)__builtin_ia32_rcpps256((__v8sf)a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_round_pd(__m256d v, const int m)
{
  return (__m256d)__builtin_ia32_roundpd256((__v4df)v, m);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_round_ps(__m256 v, const int m)
{
  return (__m256)__builtin_ia32_roundps256((__v8sf)v, m);
}

#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
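
/* Usage sketch (not part of the original header): the _MM_FROUND_* control
 * values, including _MM_FROUND_CEIL and _MM_FROUND_FLOOR above, come from
 * <smmintrin.h>. For example, rounding to nearest with exceptions suppressed:
 *
 *   __m256d r = _mm256_round_pd(v, _MM_FROUND_TO_NEAREST_INT |
 *                                  _MM_FROUND_NO_EXC);
 */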

/* Logical */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_and_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a & (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_and_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a & (__v8si)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_pd(__m256d a, __m256d b)
{
  return (__m256d)(~(__v4di)a & (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_ps(__m256 a, __m256 b)
{
  return (__m256)(~(__v8si)a & (__v8si)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_or_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a | (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_or_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a | (__v8si)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_xor_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a ^ (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_xor_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a ^ (__v8si)b);
}

/* Horizontal arithmetic */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_haddpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_haddps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_hsubpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_hsubps256((__v8sf)a, (__v8sf)b);
}
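
/* Note (added commentary, not in the original header): the horizontal ops
 * work within 128-bit lanes. For example, _mm256_hadd_pd(a, b) returns
 * { a0+a1, b0+b1, a2+a3, b2+b3 }, not a fully cross-lane reduction. */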

/* Vector permutations */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_pd(__m128d a, __m128i c)
{
  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)a, (__v2di)c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_pd(__m256d a, __m256i c)
{
  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)a, (__v4di)c);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_ps(__m128 a, __m128i c)
{
  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_ps(__m256 a, __m256i c)
{
  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a, (__v8si)c);
}

static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_permute_pd(__m128d a, const int c)
{
  return (__m128d)__builtin_ia32_vpermilpd((__v2df)a, c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permute_pd(__m256d a, const int c)
{
  return (__m256d)__builtin_ia32_vpermilpd256((__v4df)a, c);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_permute_ps(__m128 a, const int c)
{
  return (__m128)__builtin_ia32_vpermilps((__v4sf)a, c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permute_ps(__m256 a, const int c)
{
  return (__m256)__builtin_ia32_vpermilps256((__v8sf)a, c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permute2f128_pd(__m256d a, __m256d b, const int c)
{
  return (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)a, (__v4df)b, c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permute2f128_ps(__m256 a, __m256 b, const int c)
{
  return (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)a, (__v8sf)b, c);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_permute2f128_si256(__m256i a, __m256i b, const int c)
{
  return (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)a, (__v8si)b, c);
}
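
/* Note (added commentary, not in the original header): for the vperm2f128
 * ops above, each nibble of the immediate selects a 128-bit half for the
 * corresponding output lane: 0 = low(a), 1 = high(a), 2 = low(b),
 * 3 = high(b); setting bit 3 of a nibble zeroes that output lane instead. */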

/* Vector Blend */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_blend_pd(__m256d a, __m256d b, const int c)
{
  return (__m256d)__builtin_ia32_blendpd256((__v4df)a, (__v4df)b, c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_blend_ps(__m256 a, __m256 b, const int c)
{
  return (__m256)__builtin_ia32_blendps256((__v8sf)a, (__v8sf)b, c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_pd(__m256d a, __m256d b, __m256d c)
{
  return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_ps(__m256 a, __m256 b, __m256 c)
{
  return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c);
}
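
/* Note (added commentary, not in the original header): _mm256_blendv_*
 * choose, per element, b where the sign bit of the corresponding element of
 * c is set and a where it is clear; _mm256_blend_* make the same choice from
 * bits of a compile-time constant. */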

/* Vector Dot Product */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_dp_ps(__m256 a, __m256 b, const int c)
{
  return (__m256)__builtin_ia32_dpps256((__v8sf)a, (__v8sf)b, c);
}
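
/* Note (added commentary, not in the original header): for _mm256_dp_ps the
 * high four bits of c select which elements of each 128-bit lane enter the
 * products, and the low four bits select which result elements receive the
 * sum (the rest are zeroed); the operation runs independently per lane. */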

/* Vector shuffle */
#define _mm256_shuffle_ps(a, b, mask) \
        (__builtin_shufflevector((__v8sf)(a), (__v8sf)(b), \
        (mask) & 0x3,                ((mask) & 0xc) >> 2, \
        (((mask) & 0x30) >> 4) + 8,  (((mask) & 0xc0) >> 6) + 8, \
        ((mask) & 0x3) + 4,          (((mask) & 0xc) >> 2) + 4, \
        (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12))

#define _mm256_shuffle_pd(a, b, mask) \
        (__builtin_shufflevector((__v4df)(a), (__v4df)(b), \
        (mask) & 0x1, \
        (((mask) & 0x2) >> 1) + 4, \
        (((mask) & 0x4) >> 2) + 2, \
        (((mask) & 0x8) >> 3) + 6))
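
/* Usage sketch (not part of the original header):
 *
 *   __m256 r = _mm256_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));
 *
 * applies the familiar 4-element SSE shuffle pattern separately to the low
 * and high 128-bit lanes; _MM_SHUFFLE is defined in <xmmintrin.h>. */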

/* Compare */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)  */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-signaling)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */

#define _mm_cmp_pd(a, b, c) \
  (__m128d)__builtin_ia32_cmppd((__v2df)(a), (__v2df)(b), (c))

#define _mm_cmp_ps(a, b, c) \
  (__m128)__builtin_ia32_cmpps((__v4sf)(a), (__v4sf)(b), (c))

#define _mm256_cmp_pd(a, b, c) \
  (__m256d)__builtin_ia32_cmppd256((__v4df)(a), (__v4df)(b), (c))

#define _mm256_cmp_ps(a, b, c) \
  (__m256)__builtin_ia32_cmpps256((__v8sf)(a), (__v8sf)(b), (c))

#define _mm_cmp_sd(a, b, c) \
  (__m128d)__builtin_ia32_cmpsd((__v2df)(a), (__v2df)(b), (c))

#define _mm_cmp_ss(a, b, c) \
  (__m128)__builtin_ia32_cmpss((__v4sf)(a), (__v4sf)(b), (c))
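
/* Usage sketch (not part of the original header): the _CMP_* predicates feed
 * the c argument. Each result element is all-ones where the predicate holds
 * and all-zeros where it does not, so the result is typically used as a mask:
 *
 *   __m256d lt   = _mm256_cmp_pd(a, b, _CMP_LT_OQ);
 *   __m256d amin = _mm256_blendv_pd(b, a, lt);   // per-element min(a, b)
 */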

/* Vector extract */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm256_extractf128_pd(__m256d a, const int o)
{
  return (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)a, o);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_extractf128_ps(__m256 a, const int o)
{
  return (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)a, o);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_extractf128_si256(__m256i a, const int o)
{
  return (__m128i)__builtin_ia32_vextractf128_si256((__v8si)a, o);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi32(__m256i a, int const imm)
{
  __v8si b = (__v8si)a;
  return b[imm];
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi16(__m256i a, int const imm)
{
  __v16hi b = (__v16hi)a;
  return b[imm];
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi8(__m256i a, int const imm)
{
  __v32qi b = (__v32qi)a;
  return b[imm];
}

#ifdef __x86_64__
static __inline long long __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi64(__m256i a, const int imm)
{
  __v4di b = (__v4di)a;
  return b[imm];
}
#endif

/* Vector insert */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_insertf128_pd(__m256d a, __m128d b, const int o)
{
  return (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)a, (__v2df)b, o);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_insertf128_ps(__m256 a, __m128 b, const int o)
{
  return (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)a, (__v4sf)b, o);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insertf128_si256(__m256i a, __m128i b, const int o)
{
  return (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)a, (__v4si)b, o);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi32(__m256i a, int b, int const imm)
{
  __v8si c = (__v8si)a;
  c[imm & 7] = b;
  return (__m256i)c;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi16(__m256i a, int b, int const imm)
{
  __v16hi c = (__v16hi)a;
  c[imm & 15] = b;
  return (__m256i)c;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi8(__m256i a, int b, int const imm)
{
  __v32qi c = (__v32qi)a;
  c[imm & 31] = b;
  return (__m256i)c;
}

#ifdef __x86_64__
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi64(__m256i a, long long b, int const imm)
{
  __v4di c = (__v4di)a;
  c[imm & 3] = b;
  return (__m256i)c;
}
#endif

/* Conversion */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_pd(__m128i a)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_ps(__m256i a)
{
  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) a);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_ps(__m256d a)
{
  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) a);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_epi32(__m256 a)
{
  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_pd(__m128 a)
{
  return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) a);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttpd_epi32(__m256d a)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) a);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_epi32(__m256d a)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) a);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttps_epi32(__m256 a)
{
  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) a);
}
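
/* Note (added commentary, not in the original header): _mm256_cvttpd_epi32
 * and _mm256_cvttps_epi32 truncate toward zero, while _mm256_cvtpd_epi32 and
 * _mm256_cvtps_epi32 round according to the current MXCSR rounding mode. */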

/* Vector replicate */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_movehdup_ps(__m256 a)
{
  return __builtin_shufflevector(a, a, 1, 1, 3, 3, 5, 5, 7, 7);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_moveldup_ps(__m256 a)
{
  return __builtin_shufflevector(a, a, 0, 0, 2, 2, 4, 4, 6, 6);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_movedup_pd(__m256d a)
{
  return __builtin_shufflevector(a, a, 0, 0, 2, 2);
}

/* Unpack and Interleave */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_pd(__m256d a, __m256d b)
{
  return __builtin_shufflevector(a, b, 1, 5, 1+2, 5+2);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_pd(__m256d a, __m256d b)
{
  return __builtin_shufflevector(a, b, 0, 4, 0+2, 4+2);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_ps(__m256 a, __m256 b)
{
  return __builtin_shufflevector(a, b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_ps(__m256 a, __m256 b)
{
  return __builtin_shufflevector(a, b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
}
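
/* Note (added commentary, not in the original header): as with the other
 * 256-bit shuffles, unpacking interleaves within each 128-bit lane; e.g.
 * _mm256_unpacklo_pd(a, b) yields { a0, b0, a2, b2 }. */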

/* Bit Test */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestzpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestcpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestnzcpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestzps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestcps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestnzcps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestzpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestcpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestnzcpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestzps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestcps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestnzcps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestz256((__v4di)a, (__v4di)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestc256((__v4di)a, (__v4di)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestnzc256((__v4di)a, (__v4di)b);
}
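
/* Note (added commentary, not in the original header): the test ops set ZF
 * as if (a & b) == 0 and CF as if (~a & b) == 0; testz returns ZF, testc
 * returns CF, and testnzc returns 1 only when both flags are clear. The
 * pd/ps forms consider only the sign bits of the elements. */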

/* Vector extract sign mask */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_pd(__m256d a)
{
  return __builtin_ia32_movmskpd256((__v4df)a);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_ps(__m256 a)
{
  return __builtin_ia32_movmskps256((__v8sf)a);
}

/* Vector zero */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}
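
/* Note (added commentary, not in the original header): _mm256_zeroupper
 * clears the upper 128 bits of all YMM registers and is conventionally
 * executed before calling legacy SSE code to avoid AVX-to-SSE transition
 * penalties. */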

/* Vector load with broadcast */
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_broadcast_ss(float const *a)
{
  return (__m128)__builtin_ia32_vbroadcastss(a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_sd(double const *a)
{
  return (__m256d)__builtin_ia32_vbroadcastsd256(a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ss(float const *a)
{
  return (__m256)__builtin_ia32_vbroadcastss256(a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_pd(__m128d const *a)
{
  return (__m256d)__builtin_ia32_vbroadcastf128_pd256(a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ps(__m128 const *a)
{
  return (__m256)__builtin_ia32_vbroadcastf128_ps256(a);
}
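
/* Usage sketch (not part of the original header): each broadcast loads one
 * scalar (or one 128-bit block) from memory and replicates it:
 *
 *   double scale = 0.5;
 *   __m256d s = _mm256_broadcast_sd(&scale);   // { 0.5, 0.5, 0.5, 0.5 }
 */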

/* SIMD load ops */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_load_pd(double const *p)
{
  return *(__m256d *)p;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_load_ps(float const *p)
{
  return *(__m256 *)p;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_pd(double const *p)
{
  return (__m256d)__builtin_ia32_loadupd256(p);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_ps(float const *p)
{
  return (__m256)__builtin_ia32_loadups256(p);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_load_si256(__m256i const *p)
{
  return *p;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_si256(__m256i const *p)
{
  return (__m256i)__builtin_ia32_loaddqu256((char const *)p);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_lddqu_si256(__m256i const *p)
{
  return (__m256i)__builtin_ia32_lddqu256((char const *)p);
}
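
/* Note (added commentary, not in the original header): the _mm256_load_*
 * forms require 32-byte-aligned addresses; the loadu forms accept any
 * alignment. _mm256_lddqu_si256 is an unaligned load that may perform better
 * on some implementations when the data straddles a cache-line boundary. */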

/* SIMD store ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_pd(double *p, __m256d a)
{
  *(__m256d *)p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_ps(float *p, __m256 a)
{
  *(__m256 *)p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_pd(double *p, __m256d a)
{
  __builtin_ia32_storeupd256(p, (__v4df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_ps(float *p, __m256 a)
{
  __builtin_ia32_storeups256(p, (__v8sf)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_si256(__m256i *p, __m256i a)
{
  *p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_si256(__m256i *p, __m256i a)
{
  __builtin_ia32_storedqu256((char *)p, (__v32qi)a);
}

/* Conditional load ops */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_maskload_pd(double const *p, __m128d m)
{
  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)p, (__v2df)m);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_pd(double const *p, __m256d m)
{
  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)p, (__v4df)m);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_maskload_ps(float const *p, __m128 m)
{
  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)p, (__v4sf)m);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_ps(float const *p, __m256 m)
{
  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)p, (__v8sf)m);
}

/* Conditional store ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_ps(float *p, __m256 m, __m256 a)
{
  __builtin_ia32_maskstoreps256((__v8sf *)p, (__v8sf)m, (__v8sf)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_pd(double *p, __m128d m, __m128d a)
{
  __builtin_ia32_maskstorepd((__v2df *)p, (__v2df)m, (__v2df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_pd(double *p, __m256d m, __m256d a)
{
  __builtin_ia32_maskstorepd256((__v4df *)p, (__v4df)m, (__v4df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_ps(float *p, __m128 m, __m128 a)
{
  __builtin_ia32_maskstoreps((__v4sf *)p, (__v4sf)m, (__v4sf)a);
}
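
/* Note (added commentary, not in the original header): the maskload and
 * maskstore ops use the sign bit of each mask element. Masked-off load
 * elements read as zero without touching memory; masked-off store elements
 * leave memory unchanged. */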

/* Cacheability support ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_si256(__m256i *a, __m256i b)
{
  __builtin_ia32_movntdq256((__v4di *)a, (__v4di)b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_pd(double *a, __m256d b)
{
  __builtin_ia32_movntpd256(a, (__v4df)b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_ps(float *p, __m256 a)
{
  __builtin_ia32_movntps256(p, (__v8sf)a);
}
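
/* Note (added commentary, not in the original header): the stream ops are
 * non-temporal stores that may bypass the cache hierarchy; pair them with
 * _mm_sfence() when subsequent reads from other cores must observe the data
 * in order. */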

/* Create vectors */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set_pd(double a, double b, double c, double d)
{
  return (__m256d){ d, c, b, a };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set_ps(float a, float b, float c, float d,
              float e, float f, float g, float h)
{
  return (__m256){ h, g, f, e, d, c, b, a };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi32(int i0, int i1, int i2, int i3,
                 int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i7, i6, i5, i4, i3, i2, i1, i0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi16(short w15, short w14, short w13, short w12,
                 short w11, short w10, short w09, short w08,
                 short w07, short w06, short w05, short w04,
                 short w03, short w02, short w01, short w00)
{
  return (__m256i)(__v16hi){ w00, w01, w02, w03, w04, w05, w06, w07,
                             w08, w09, w10, w11, w12, w13, w14, w15 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi8(char b31, char b30, char b29, char b28,
                char b27, char b26, char b25, char b24,
                char b23, char b22, char b21, char b20,
                char b19, char b18, char b17, char b16,
                char b15, char b14, char b13, char b12,
                char b11, char b10, char b09, char b08,
                char b07, char b06, char b05, char b04,
                char b03, char b02, char b01, char b00)
{
  return (__m256i)(__v32qi){
    b00, b01, b02, b03, b04, b05, b06, b07,
    b08, b09, b10, b11, b12, b13, b14, b15,
    b16, b17, b18, b19, b20, b21, b22, b23,
    b24, b25, b26, b27, b28, b29, b30, b31
  };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ d, c, b, a };
}

/* Create vectors with elements in reverse order */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setr_pd(double a, double b, double c, double d)
{
  return (__m256d){ a, b, c, d };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setr_ps(float a, float b, float c, float d,
               float e, float f, float g, float h)
{
  return (__m256){ a, b, c, d, e, f, g, h };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi32(int i0, int i1, int i2, int i3,
                  int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i0, i1, i2, i3, i4, i5, i6, i7 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi16(short w15, short w14, short w13, short w12,
                  short w11, short w10, short w09, short w08,
                  short w07, short w06, short w05, short w04,
                  short w03, short w02, short w01, short w00)
{
  return (__m256i)(__v16hi){ w15, w14, w13, w12, w11, w10, w09, w08,
                             w07, w06, w05, w04, w03, w02, w01, w00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi8(char b31, char b30, char b29, char b28,
                 char b27, char b26, char b25, char b24,
                 char b23, char b22, char b21, char b20,
                 char b19, char b18, char b17, char b16,
                 char b15, char b14, char b13, char b12,
                 char b11, char b10, char b09, char b08,
                 char b07, char b06, char b05, char b04,
                 char b03, char b02, char b01, char b00)
{
  return (__m256i)(__v32qi){
    b31, b30, b29, b28, b27, b26, b25, b24,
    b23, b22, b21, b20, b19, b18, b17, b16,
    b15, b14, b13, b12, b11, b10, b09, b08,
    b07, b06, b05, b04, b03, b02, b01, b00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ a, b, c, d };
}

/* Create vectors with repeated elements */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set1_pd(double w)
{
  return (__m256d){ w, w, w, w };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set1_ps(float w)
{
  return (__m256){ w, w, w, w, w, w, w, w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi32(int i)
{
  return (__m256i)(__v8si){ i, i, i, i, i, i, i, i };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi16(short w)
{
  return (__m256i)(__v16hi){ w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi8(char b)
{
  return (__m256i)(__v32qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,
                             b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi64x(long long q)
{
  return (__m256i)(__v4di){ q, q, q, q };
}

/* Create zeroed vectors */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_pd(void)
{
  return (__m256d){ 0, 0, 0, 0 };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_ps(void)
{
  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_si256(void)
{
  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
}

/* Cast between vector types */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_ps(__m256d in)
{
  return (__m256)in;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_si256(__m256d in)
{
  return (__m256i)in;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castps_pd(__m256 in)
{
  return (__m256d)in;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castps_si256(__m256 in)
{
  return (__m256i)in;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_ps(__m256i in)
{
  return (__m256)in;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_pd(__m256i in)
{
  return (__m256d)in;
}

static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd256_pd128(__m256d in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_castps256_ps128(__m256 in)
{
  return __builtin_shufflevector(in, in, 0, 1, 2, 3);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_si128(__m256i in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd128_pd256(__m128d in)
{
  __m128d zero = _mm_setzero_pd();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castps128_ps256(__m128 in)
{
  __m128 zero = _mm_setzero_ps();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 3, 4, 4, 4, 4);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi128_si256(__m128i in)
{
  __m128i zero = _mm_setzero_si128();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}
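
/* Note (added commentary, not in the original header): the cast ops are pure
 * reinterpretations and generate no instructions. For the 128-to-256 casts,
 * Intel documents the upper 128 bits of the result as undefined; this
 * implementation happens to zero them. */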