/* avxintrin.h — LLVM revision 212904 */
/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
23
24#ifndef __IMMINTRIN_H
25#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
26#endif
27
28typedef double __v4df __attribute__ ((__vector_size__ (32)));
29typedef float __v8sf __attribute__ ((__vector_size__ (32)));
30typedef long long __v4di __attribute__ ((__vector_size__ (32)));
31typedef int __v8si __attribute__ ((__vector_size__ (32)));
32typedef short __v16hi __attribute__ ((__vector_size__ (32)));
33typedef char __v32qi __attribute__ ((__vector_size__ (32)));
34
35typedef float __m256 __attribute__ ((__vector_size__ (32)));
36typedef double __m256d __attribute__((__vector_size__(32)));
37typedef long long __m256i __attribute__((__vector_size__(32)));
38
39/* Arithmetic */
40static __inline __m256d __attribute__((__always_inline__, __nodebug__))
41_mm256_add_pd(__m256d a, __m256d b)
42{
43  return a+b;
44}
45
46static __inline __m256 __attribute__((__always_inline__, __nodebug__))
47_mm256_add_ps(__m256 a, __m256 b)
48{
49  return a+b;
50}
51
52static __inline __m256d __attribute__((__always_inline__, __nodebug__))
53_mm256_sub_pd(__m256d a, __m256d b)
54{
55  return a-b;
56}
57
58static __inline __m256 __attribute__((__always_inline__, __nodebug__))
59_mm256_sub_ps(__m256 a, __m256 b)
60{
61  return a-b;
62}
63
64static __inline __m256d __attribute__((__always_inline__, __nodebug__))
65_mm256_addsub_pd(__m256d a, __m256d b)
66{
67  return (__m256d)__builtin_ia32_addsubpd256((__v4df)a, (__v4df)b);
68}
69
70static __inline __m256 __attribute__((__always_inline__, __nodebug__))
71_mm256_addsub_ps(__m256 a, __m256 b)
72{
73  return (__m256)__builtin_ia32_addsubps256((__v8sf)a, (__v8sf)b);
74}
75
76static __inline __m256d __attribute__((__always_inline__, __nodebug__))
77_mm256_div_pd(__m256d a, __m256d b)
78{
79  return a / b;
80}
81
82static __inline __m256 __attribute__((__always_inline__, __nodebug__))
83_mm256_div_ps(__m256 a, __m256 b)
84{
85  return a / b;
86}
87
88static __inline __m256d __attribute__((__always_inline__, __nodebug__))
89_mm256_max_pd(__m256d a, __m256d b)
90{
91  return (__m256d)__builtin_ia32_maxpd256((__v4df)a, (__v4df)b);
92}
93
94static __inline __m256 __attribute__((__always_inline__, __nodebug__))
95_mm256_max_ps(__m256 a, __m256 b)
96{
97  return (__m256)__builtin_ia32_maxps256((__v8sf)a, (__v8sf)b);
98}
99
100static __inline __m256d __attribute__((__always_inline__, __nodebug__))
101_mm256_min_pd(__m256d a, __m256d b)
102{
103  return (__m256d)__builtin_ia32_minpd256((__v4df)a, (__v4df)b);
104}
105
106static __inline __m256 __attribute__((__always_inline__, __nodebug__))
107_mm256_min_ps(__m256 a, __m256 b)
108{
109  return (__m256)__builtin_ia32_minps256((__v8sf)a, (__v8sf)b);
110}
111
112static __inline __m256d __attribute__((__always_inline__, __nodebug__))
113_mm256_mul_pd(__m256d a, __m256d b)
114{
115  return a * b;
116}
117
118static __inline __m256 __attribute__((__always_inline__, __nodebug__))
119_mm256_mul_ps(__m256 a, __m256 b)
120{
121  return a * b;
122}
123
124static __inline __m256d __attribute__((__always_inline__, __nodebug__))
125_mm256_sqrt_pd(__m256d a)
126{
127  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)a);
128}
129
130static __inline __m256 __attribute__((__always_inline__, __nodebug__))
131_mm256_sqrt_ps(__m256 a)
132{
133  return (__m256)__builtin_ia32_sqrtps256((__v8sf)a);
134}
135
136static __inline __m256 __attribute__((__always_inline__, __nodebug__))
137_mm256_rsqrt_ps(__m256 a)
138{
139  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a);
140}
141
142static __inline __m256 __attribute__((__always_inline__, __nodebug__))
143_mm256_rcp_ps(__m256 a)
144{
145  return (__m256)__builtin_ia32_rcpps256((__v8sf)a);
146}
147
148static __inline __m256d __attribute__((__always_inline__, __nodebug__))
149_mm256_round_pd(__m256d v, const int m)
150{
151  return (__m256d)__builtin_ia32_roundpd256((__v4df)v, m);
152}
153
154static __inline __m256 __attribute__((__always_inline__, __nodebug__))
155_mm256_round_ps(__m256 v, const int m)
156{
157  return (__m256)__builtin_ia32_roundps256((__v8sf)v, m);
158}
159
160#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
161#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
162#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
163#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
164
165/* Logical */
166static __inline __m256d __attribute__((__always_inline__, __nodebug__))
167_mm256_and_pd(__m256d a, __m256d b)
168{
169  return (__m256d)((__v4di)a & (__v4di)b);
170}
171
172static __inline __m256 __attribute__((__always_inline__, __nodebug__))
173_mm256_and_ps(__m256 a, __m256 b)
174{
175  return (__m256)((__v8si)a & (__v8si)b);
176}
177
178static __inline __m256d __attribute__((__always_inline__, __nodebug__))
179_mm256_andnot_pd(__m256d a, __m256d b)
180{
181  return (__m256d)(~(__v4di)a & (__v4di)b);
182}
183
184static __inline __m256 __attribute__((__always_inline__, __nodebug__))
185_mm256_andnot_ps(__m256 a, __m256 b)
186{
187  return (__m256)(~(__v8si)a & (__v8si)b);
188}
189
190static __inline __m256d __attribute__((__always_inline__, __nodebug__))
191_mm256_or_pd(__m256d a, __m256d b)
192{
193  return (__m256d)((__v4di)a | (__v4di)b);
194}
195
196static __inline __m256 __attribute__((__always_inline__, __nodebug__))
197_mm256_or_ps(__m256 a, __m256 b)
198{
199  return (__m256)((__v8si)a | (__v8si)b);
200}
201
202static __inline __m256d __attribute__((__always_inline__, __nodebug__))
203_mm256_xor_pd(__m256d a, __m256d b)
204{
205  return (__m256d)((__v4di)a ^ (__v4di)b);
206}
207
208static __inline __m256 __attribute__((__always_inline__, __nodebug__))
209_mm256_xor_ps(__m256 a, __m256 b)
210{
211  return (__m256)((__v8si)a ^ (__v8si)b);
212}
213
214/* Horizontal arithmetic */
215static __inline __m256d __attribute__((__always_inline__, __nodebug__))
216_mm256_hadd_pd(__m256d a, __m256d b)
217{
218  return (__m256d)__builtin_ia32_haddpd256((__v4df)a, (__v4df)b);
219}
220
221static __inline __m256 __attribute__((__always_inline__, __nodebug__))
222_mm256_hadd_ps(__m256 a, __m256 b)
223{
224  return (__m256)__builtin_ia32_haddps256((__v8sf)a, (__v8sf)b);
225}
226
227static __inline __m256d __attribute__((__always_inline__, __nodebug__))
228_mm256_hsub_pd(__m256d a, __m256d b)
229{
230  return (__m256d)__builtin_ia32_hsubpd256((__v4df)a, (__v4df)b);
231}
232
233static __inline __m256 __attribute__((__always_inline__, __nodebug__))
234_mm256_hsub_ps(__m256 a, __m256 b)
235{
236  return (__m256)__builtin_ia32_hsubps256((__v8sf)a, (__v8sf)b);
237}
238
239/* Vector permutations */
240static __inline __m128d __attribute__((__always_inline__, __nodebug__))
241_mm_permutevar_pd(__m128d a, __m128i c)
242{
243  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)a, (__v2di)c);
244}
245
246static __inline __m256d __attribute__((__always_inline__, __nodebug__))
247_mm256_permutevar_pd(__m256d a, __m256i c)
248{
249  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)a, (__v4di)c);
250}
251
252static __inline __m128 __attribute__((__always_inline__, __nodebug__))
253_mm_permutevar_ps(__m128 a, __m128i c)
254{
255  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c);
256}
257
258static __inline __m256 __attribute__((__always_inline__, __nodebug__))
259_mm256_permutevar_ps(__m256 a, __m256i c)
260{
261  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a,
262						  (__v8si)c);
263}
264
265static __inline __m128d __attribute__((__always_inline__, __nodebug__))
266_mm_permute_pd(__m128d a, const int c)
267{
268  return (__m128d)__builtin_ia32_vpermilpd((__v2df)a, c);
269}
270
271static __inline __m256d __attribute__((__always_inline__, __nodebug__))
272_mm256_permute_pd(__m256d a, const int c)
273{
274  return (__m256d)__builtin_ia32_vpermilpd256((__v4df)a, c);
275}
276
277static __inline __m128 __attribute__((__always_inline__, __nodebug__))
278_mm_permute_ps(__m128 a, const int c)
279{
280  return (__m128)__builtin_ia32_vpermilps((__v4sf)a, c);
281}
282
283static __inline __m256 __attribute__((__always_inline__, __nodebug__))
284_mm256_permute_ps(__m256 a, const int c)
285{
286  return (__m256)__builtin_ia32_vpermilps256((__v8sf)a, c);
287}
288
289static __inline __m256d __attribute__((__always_inline__, __nodebug__))
290_mm256_permute2f128_pd(__m256d a, __m256d b, const int c)
291{
292  return (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)a, (__v4df)b, c);
293}
294
295static __inline __m256 __attribute__((__always_inline__, __nodebug__))
296_mm256_permute2f128_ps(__m256 a, __m256 b, const int c)
297{
298  return (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)a, (__v8sf)b, c);
299}
300
301static __inline __m256i __attribute__((__always_inline__, __nodebug__))
302_mm256_permute2f128_si256(__m256i a, __m256i b, const int c)
303{
304  return (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)a, (__v8si)b, c);
305}
306
307/* Vector Blend */
308static __inline __m256d __attribute__((__always_inline__, __nodebug__))
309_mm256_blend_pd(__m256d a, __m256d b, const int c)
310{
311  return (__m256d)__builtin_ia32_blendpd256((__v4df)a, (__v4df)b, c);
312}
313
314static __inline __m256 __attribute__((__always_inline__, __nodebug__))
315_mm256_blend_ps(__m256 a, __m256 b, const int c)
316{
317  return (__m256)__builtin_ia32_blendps256((__v8sf)a, (__v8sf)b, c);
318}
319
320static __inline __m256d __attribute__((__always_inline__, __nodebug__))
321_mm256_blendv_pd(__m256d a, __m256d b, __m256d c)
322{
323  return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c);
324}
325
326static __inline __m256 __attribute__((__always_inline__, __nodebug__))
327_mm256_blendv_ps(__m256 a, __m256 b, __m256 c)
328{
329  return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c);
330}
331
332/* Vector Dot Product */
333static __inline __m256 __attribute__((__always_inline__, __nodebug__))
334_mm256_dp_ps(__m256 a, __m256 b, const int c)
335{
336  return (__m256)__builtin_ia32_dpps256((__v8sf)a, (__v8sf)b, c);
337}
338
/* Vector shuffle (VSHUFPS/VSHUFPD semantics: the two 128-bit lanes are
 * shuffled independently, each using the same immediate mask).
 *
 * Fix: the element-4 index previously read "(mask) & 0x3 + 4", which C
 * precedence parses as "(mask) & 7" — wrong selector.  It must be
 * "((mask) & 0x3) + 4" to address a's high lane. */
#define _mm256_shuffle_ps(a, b, mask) \
        (__builtin_shufflevector((__v8sf)(a), (__v8sf)(b), \
        (mask) & 0x3,                ((mask) & 0xc) >> 2, \
        (((mask) & 0x30) >> 4) + 8,  (((mask) & 0xc0) >> 6) + 8, \
        ((mask) & 0x3) + 4,          (((mask) & 0xc) >> 2) + 4, \
        (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12))

#define _mm256_shuffle_pd(a, b, mask) \
        (__builtin_shufflevector((__v4df)(a), (__v4df)(b), \
        (mask) & 0x1, \
        (((mask) & 0x2) >> 1) + 4, \
        (((mask) & 0x4) >> 2) + 2, \
        (((mask) & 0x8) >> 3) + 6))
353
354/* Compare */
355#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
356#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
357#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
358#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
359#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
360#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
361#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
362#define _CMP_ORD_Q    0x07 /* Ordered (nonsignaling)   */
363#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
364#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
365#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
366#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
367#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
368#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
369#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
370#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
371#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
372#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
373#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
374#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
375#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
376#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
377#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
378#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
379#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
380#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign)  */
381#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
382#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
383#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
384#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
385#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
386#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
387
388static __inline __m128d __attribute__((__always_inline__, __nodebug__))
389_mm_cmp_pd(__m128d a, __m128d b, const int c)
390{
391  return (__m128d)__builtin_ia32_cmppd((__v2df)a, (__v2df)b, c);
392}
393
394static __inline __m128 __attribute__((__always_inline__, __nodebug__))
395_mm_cmp_ps(__m128 a, __m128 b, const int c)
396{
397  return (__m128)__builtin_ia32_cmpps((__v4sf)a, (__v4sf)b, c);
398}
399
400static __inline __m256d __attribute__((__always_inline__, __nodebug__))
401_mm256_cmp_pd(__m256d a, __m256d b, const int c)
402{
403  return (__m256d)__builtin_ia32_cmppd256((__v4df)a, (__v4df)b, c);
404}
405
406static __inline __m256 __attribute__((__always_inline__, __nodebug__))
407_mm256_cmp_ps(__m256 a, __m256 b, const int c)
408{
409  return (__m256)__builtin_ia32_cmpps256((__v8sf)a, (__v8sf)b, c);
410}
411
412static __inline __m128d __attribute__((__always_inline__, __nodebug__))
413_mm_cmp_sd(__m128d a, __m128d b, const int c)
414{
415  return (__m128d)__builtin_ia32_cmpsd((__v2df)a, (__v2df)b, c);
416}
417
418static __inline __m128 __attribute__((__always_inline__, __nodebug__))
419_mm_cmp_ss(__m128 a, __m128 b, const int c)
420{
421  return (__m128)__builtin_ia32_cmpss((__v4sf)a, (__v4sf)b, c);
422}
423
424/* Vector extract */
425static __inline __m128d __attribute__((__always_inline__, __nodebug__))
426_mm256_extractf128_pd(__m256d a, const int o)
427{
428  return (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)a, o);
429}
430
431static __inline __m128 __attribute__((__always_inline__, __nodebug__))
432_mm256_extractf128_ps(__m256 a, const int o)
433{
434  return (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)a, o);
435}
436
437static __inline __m128i __attribute__((__always_inline__, __nodebug__))
438_mm256_extractf128_si256(__m256i a, const int o)
439{
440  return (__m128i)__builtin_ia32_vextractf128_si256((__v8si)a, o);
441}
442
443static __inline int __attribute__((__always_inline__, __nodebug__))
444_mm256_extract_epi32(__m256i a, int const imm)
445{
446  __v8si b = (__v8si)a;
447  return b[imm];
448}
449
450static __inline int __attribute__((__always_inline__, __nodebug__))
451_mm256_extract_epi16(__m256i a, int const imm)
452{
453  __v16hi b = (__v16hi)a;
454  return b[imm];
455}
456
457static __inline int __attribute__((__always_inline__, __nodebug__))
458_mm256_extract_epi8(__m256i a, int const imm)
459{
460  __v32qi b = (__v32qi)a;
461  return b[imm];
462}
463
464#ifdef __x86_64__
465static __inline long long  __attribute__((__always_inline__, __nodebug__))
466_mm256_extract_epi64(__m256i a, const int imm)
467{
468  __v4di b = (__v4di)a;
469  return b[imm];
470}
471#endif
472
473/* Vector insert */
474static __inline __m256d __attribute__((__always_inline__, __nodebug__))
475_mm256_insertf128_pd(__m256d a, __m128d b, const int o)
476{
477  return (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)a, (__v2df)b, o);
478}
479
480static __inline __m256 __attribute__((__always_inline__, __nodebug__))
481_mm256_insertf128_ps(__m256 a, __m128 b, const int o)
482{
483  return (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)a, (__v4sf)b, o);
484}
485
486static __inline __m256i __attribute__((__always_inline__, __nodebug__))
487_mm256_insertf128_si256(__m256i a, __m128i b, const int o)
488{
489  return (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)a, (__v4si)b, o);
490}
491
492static __inline __m256i __attribute__((__always_inline__, __nodebug__))
493_mm256_insert_epi32(__m256i a, int b, int const imm)
494{
495  __v8si c = (__v8si)a;
496  c[imm & 7] = b;
497  return (__m256i)c;
498}
499
500static __inline __m256i __attribute__((__always_inline__, __nodebug__))
501_mm256_insert_epi16(__m256i a, int b, int const imm)
502{
503  __v16hi c = (__v16hi)a;
504  c[imm & 15] = b;
505  return (__m256i)c;
506}
507
508static __inline __m256i __attribute__((__always_inline__, __nodebug__))
509_mm256_insert_epi8(__m256i a, int b, int const imm)
510{
511  __v32qi c = (__v32qi)a;
512  c[imm & 31] = b;
513  return (__m256i)c;
514}
515
516#ifdef __x86_64__
517static __inline __m256i __attribute__((__always_inline__, __nodebug__))
518_mm256_insert_epi64(__m256i a, int b, int const imm)
519{
520  __v4di c = (__v4di)a;
521  c[imm & 3] = b;
522  return (__m256i)c;
523}
524#endif
525
526/* Conversion */
527static __inline __m256d __attribute__((__always_inline__, __nodebug__))
528_mm256_cvtepi32_pd(__m128i a)
529{
530  return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) a);
531}
532
533static __inline __m256 __attribute__((__always_inline__, __nodebug__))
534_mm256_cvtepi32_ps(__m256i a)
535{
536  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) a);
537}
538
539static __inline __m128 __attribute__((__always_inline__, __nodebug__))
540_mm256_cvtpd_ps(__m256d a)
541{
542  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) a);
543}
544
545static __inline __m256i __attribute__((__always_inline__, __nodebug__))
546_mm256_cvtps_epi32(__m256 a)
547{
548  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) a);
549}
550
551static __inline __m256d __attribute__((__always_inline__, __nodebug__))
552_mm256_cvtps_pd(__m128 a)
553{
554  return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) a);
555}
556
557static __inline __m128i __attribute__((__always_inline__, __nodebug__))
558_mm256_cvttpd_epi32(__m256d a)
559{
560  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) a);
561}
562
563static __inline __m128i __attribute__((__always_inline__, __nodebug__))
564_mm256_cvtpd_epi32(__m256d a)
565{
566  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) a);
567}
568
569static __inline __m256i __attribute__((__always_inline__, __nodebug__))
570_mm256_cvttps_epi32(__m256 a)
571{
572  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) a);
573}
574
575/* Vector replicate */
576static __inline __m256 __attribute__((__always_inline__, __nodebug__))
577_mm256_movehdup_ps(__m256 a)
578{
579  return __builtin_shufflevector(a, a, 1, 1, 3, 3, 5, 5, 7, 7);
580}
581
582static __inline __m256 __attribute__((__always_inline__, __nodebug__))
583_mm256_moveldup_ps(__m256 a)
584{
585  return __builtin_shufflevector(a, a, 0, 0, 2, 2, 4, 4, 6, 6);
586}
587
588static __inline __m256d __attribute__((__always_inline__, __nodebug__))
589_mm256_movedup_pd(__m256d a)
590{
591  return __builtin_shufflevector(a, a, 0, 0, 2, 2);
592}
593
594/* Unpack and Interleave */
595static __inline __m256d __attribute__((__always_inline__, __nodebug__))
596_mm256_unpackhi_pd(__m256d a, __m256d b)
597{
598  return __builtin_shufflevector(a, b, 1, 5, 1+2, 5+2);
599}
600
601static __inline __m256d __attribute__((__always_inline__, __nodebug__))
602_mm256_unpacklo_pd(__m256d a, __m256d b)
603{
604  return __builtin_shufflevector(a, b, 0, 4, 0+2, 4+2);
605}
606
607static __inline __m256 __attribute__((__always_inline__, __nodebug__))
608_mm256_unpackhi_ps(__m256 a, __m256 b)
609{
610  return __builtin_shufflevector(a, b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
611}
612
613static __inline __m256 __attribute__((__always_inline__, __nodebug__))
614_mm256_unpacklo_ps(__m256 a, __m256 b)
615{
616  return __builtin_shufflevector(a, b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
617}
618
619/* Bit Test */
620static __inline int __attribute__((__always_inline__, __nodebug__))
621_mm_testz_pd(__m128d a, __m128d b)
622{
623  return __builtin_ia32_vtestzpd((__v2df)a, (__v2df)b);
624}
625
626static __inline int __attribute__((__always_inline__, __nodebug__))
627_mm_testc_pd(__m128d a, __m128d b)
628{
629  return __builtin_ia32_vtestcpd((__v2df)a, (__v2df)b);
630}
631
632static __inline int __attribute__((__always_inline__, __nodebug__))
633_mm_testnzc_pd(__m128d a, __m128d b)
634{
635  return __builtin_ia32_vtestnzcpd((__v2df)a, (__v2df)b);
636}
637
638static __inline int __attribute__((__always_inline__, __nodebug__))
639_mm_testz_ps(__m128 a, __m128 b)
640{
641  return __builtin_ia32_vtestzps((__v4sf)a, (__v4sf)b);
642}
643
644static __inline int __attribute__((__always_inline__, __nodebug__))
645_mm_testc_ps(__m128 a, __m128 b)
646{
647  return __builtin_ia32_vtestcps((__v4sf)a, (__v4sf)b);
648}
649
650static __inline int __attribute__((__always_inline__, __nodebug__))
651_mm_testnzc_ps(__m128 a, __m128 b)
652{
653  return __builtin_ia32_vtestnzcps((__v4sf)a, (__v4sf)b);
654}
655
656static __inline int __attribute__((__always_inline__, __nodebug__))
657_mm256_testz_pd(__m256d a, __m256d b)
658{
659  return __builtin_ia32_vtestzpd256((__v4df)a, (__v4df)b);
660}
661
662static __inline int __attribute__((__always_inline__, __nodebug__))
663_mm256_testc_pd(__m256d a, __m256d b)
664{
665  return __builtin_ia32_vtestcpd256((__v4df)a, (__v4df)b);
666}
667
668static __inline int __attribute__((__always_inline__, __nodebug__))
669_mm256_testnzc_pd(__m256d a, __m256d b)
670{
671  return __builtin_ia32_vtestnzcpd256((__v4df)a, (__v4df)b);
672}
673
674static __inline int __attribute__((__always_inline__, __nodebug__))
675_mm256_testz_ps(__m256 a, __m256 b)
676{
677  return __builtin_ia32_vtestzps256((__v8sf)a, (__v8sf)b);
678}
679
680static __inline int __attribute__((__always_inline__, __nodebug__))
681_mm256_testc_ps(__m256 a, __m256 b)
682{
683  return __builtin_ia32_vtestcps256((__v8sf)a, (__v8sf)b);
684}
685
686static __inline int __attribute__((__always_inline__, __nodebug__))
687_mm256_testnzc_ps(__m256 a, __m256 b)
688{
689  return __builtin_ia32_vtestnzcps256((__v8sf)a, (__v8sf)b);
690}
691
692static __inline int __attribute__((__always_inline__, __nodebug__))
693_mm256_testz_si256(__m256i a, __m256i b)
694{
695  return __builtin_ia32_ptestz256((__v4di)a, (__v4di)b);
696}
697
698static __inline int __attribute__((__always_inline__, __nodebug__))
699_mm256_testc_si256(__m256i a, __m256i b)
700{
701  return __builtin_ia32_ptestc256((__v4di)a, (__v4di)b);
702}
703
704static __inline int __attribute__((__always_inline__, __nodebug__))
705_mm256_testnzc_si256(__m256i a, __m256i b)
706{
707  return __builtin_ia32_ptestnzc256((__v4di)a, (__v4di)b);
708}
709
710/* Vector extract sign mask */
711static __inline int __attribute__((__always_inline__, __nodebug__))
712_mm256_movemask_pd(__m256d a)
713{
714  return __builtin_ia32_movmskpd256((__v4df)a);
715}
716
717static __inline int __attribute__((__always_inline__, __nodebug__))
718_mm256_movemask_ps(__m256 a)
719{
720  return __builtin_ia32_movmskps256((__v8sf)a);
721}
722
/* Vector zero — VZEROALL clears all YMM registers; VZEROUPPER clears only
 * the upper 128 bits (used to avoid AVX/SSE transition penalties). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}
735
736/* Vector load with broadcast */
737static __inline __m128 __attribute__((__always_inline__, __nodebug__))
738_mm_broadcast_ss(float const *a)
739{
740  return (__m128)__builtin_ia32_vbroadcastss(a);
741}
742
743static __inline __m256d __attribute__((__always_inline__, __nodebug__))
744_mm256_broadcast_sd(double const *a)
745{
746  return (__m256d)__builtin_ia32_vbroadcastsd256(a);
747}
748
749static __inline __m256 __attribute__((__always_inline__, __nodebug__))
750_mm256_broadcast_ss(float const *a)
751{
752  return (__m256)__builtin_ia32_vbroadcastss256(a);
753}
754
755static __inline __m256d __attribute__((__always_inline__, __nodebug__))
756_mm256_broadcast_pd(__m128d const *a)
757{
758  return (__m256d)__builtin_ia32_vbroadcastf128_pd256(a);
759}
760
761static __inline __m256 __attribute__((__always_inline__, __nodebug__))
762_mm256_broadcast_ps(__m128 const *a)
763{
764  return (__m256)__builtin_ia32_vbroadcastf128_ps256(a);
765}
766
767/* SIMD load ops */
768static __inline __m256d __attribute__((__always_inline__, __nodebug__))
769_mm256_load_pd(double const *p)
770{
771  return *(__m256d *)p;
772}
773
774static __inline __m256 __attribute__((__always_inline__, __nodebug__))
775_mm256_load_ps(float const *p)
776{
777  return *(__m256 *)p;
778}
779
780static __inline __m256d __attribute__((__always_inline__, __nodebug__))
781_mm256_loadu_pd(double const *p)
782{
783  return (__m256d)__builtin_ia32_loadupd256(p);
784}
785
786static __inline __m256 __attribute__((__always_inline__, __nodebug__))
787_mm256_loadu_ps(float const *p)
788{
789  return (__m256)__builtin_ia32_loadups256(p);
790}
791
792static __inline __m256i __attribute__((__always_inline__, __nodebug__))
793_mm256_load_si256(__m256i const *p)
794{
795  return *p;
796}
797
798static __inline __m256i __attribute__((__always_inline__, __nodebug__))
799_mm256_loadu_si256(__m256i const *p)
800{
801  return (__m256i)__builtin_ia32_loaddqu256((char const *)p);
802}
803
804static __inline __m256i __attribute__((__always_inline__, __nodebug__))
805_mm256_lddqu_si256(__m256i const *p)
806{
807  return (__m256i)__builtin_ia32_lddqu256((char const *)p);
808}
809
810/* SIMD store ops */
811static __inline void __attribute__((__always_inline__, __nodebug__))
812_mm256_store_pd(double *p, __m256d a)
813{
814  *(__m256d *)p = a;
815}
816
817static __inline void __attribute__((__always_inline__, __nodebug__))
818_mm256_store_ps(float *p, __m256 a)
819{
820  *(__m256 *)p = a;
821}
822
823static __inline void __attribute__((__always_inline__, __nodebug__))
824_mm256_storeu_pd(double *p, __m256d a)
825{
826  __builtin_ia32_storeupd256(p, (__v4df)a);
827}
828
829static __inline void __attribute__((__always_inline__, __nodebug__))
830_mm256_storeu_ps(float *p, __m256 a)
831{
832  __builtin_ia32_storeups256(p, (__v8sf)a);
833}
834
835static __inline void __attribute__((__always_inline__, __nodebug__))
836_mm256_store_si256(__m256i *p, __m256i a)
837{
838  *p = a;
839}
840
841static __inline void __attribute__((__always_inline__, __nodebug__))
842_mm256_storeu_si256(__m256i *p, __m256i a)
843{
844  __builtin_ia32_storedqu256((char *)p, (__v32qi)a);
845}
846
847/* Conditional load ops */
848static __inline __m128d __attribute__((__always_inline__, __nodebug__))
849_mm_maskload_pd(double const *p, __m128d m)
850{
851  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)p, (__v2df)m);
852}
853
854static __inline __m256d __attribute__((__always_inline__, __nodebug__))
855_mm256_maskload_pd(double const *p, __m256d m)
856{
857  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)p, (__v4df)m);
858}
859
860static __inline __m128 __attribute__((__always_inline__, __nodebug__))
861_mm_maskload_ps(float const *p, __m128 m)
862{
863  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)p, (__v4sf)m);
864}
865
866static __inline __m256 __attribute__((__always_inline__, __nodebug__))
867_mm256_maskload_ps(float const *p, __m256 m)
868{
869  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)p, (__v8sf)m);
870}
871
872/* Conditional store ops */
873static __inline void __attribute__((__always_inline__, __nodebug__))
874_mm256_maskstore_ps(float *p, __m256 m, __m256 a)
875{
876  __builtin_ia32_maskstoreps256((__v8sf *)p, (__v8sf)m, (__v8sf)a);
877}
878
879static __inline void __attribute__((__always_inline__, __nodebug__))
880_mm_maskstore_pd(double *p, __m128d m, __m128d a)
881{
882  __builtin_ia32_maskstorepd((__v2df *)p, (__v2df)m, (__v2df)a);
883}
884
885static __inline void __attribute__((__always_inline__, __nodebug__))
886_mm256_maskstore_pd(double *p, __m256d m, __m256d a)
887{
888  __builtin_ia32_maskstorepd256((__v4df *)p, (__v4df)m, (__v4df)a);
889}
890
891static __inline void __attribute__((__always_inline__, __nodebug__))
892_mm_maskstore_ps(float *p, __m128 m, __m128 a)
893{
894  __builtin_ia32_maskstoreps((__v4sf *)p, (__v4sf)m, (__v4sf)a);
895}
896
897/* Cacheability support ops */
898static __inline void __attribute__((__always_inline__, __nodebug__))
899_mm256_stream_si256(__m256i *a, __m256i b)
900{
901  __builtin_ia32_movntdq256((__v4di *)a, (__v4di)b);
902}
903
904static __inline void __attribute__((__always_inline__, __nodebug__))
905_mm256_stream_pd(double *a, __m256d b)
906{
907  __builtin_ia32_movntpd256(a, (__v4df)b);
908}
909
910static __inline void __attribute__((__always_inline__, __nodebug__))
911_mm256_stream_ps(float *p, __m256 a)
912{
913  __builtin_ia32_movntps256(p, (__v8sf)a);
914}
915
916/* Create vectors */
917static __inline __m256d __attribute__((__always_inline__, __nodebug__))
918_mm256_set_pd(double a, double b, double c, double d)
919{
920  return (__m256d){ d, c, b, a };
921}
922
923static __inline __m256 __attribute__((__always_inline__, __nodebug__))
924_mm256_set_ps(float a, float b, float c, float d,
925	            float e, float f, float g, float h)
926{
927  return (__m256){ h, g, f, e, d, c, b, a };
928}
929
930static __inline __m256i __attribute__((__always_inline__, __nodebug__))
931_mm256_set_epi32(int i0, int i1, int i2, int i3,
932		             int i4, int i5, int i6, int i7)
933{
934  return (__m256i)(__v8si){ i7, i6, i5, i4, i3, i2, i1, i0 };
935}
936
937static __inline __m256i __attribute__((__always_inline__, __nodebug__))
938_mm256_set_epi16(short w15, short w14, short w13, short w12,
939		             short w11, short w10, short w09, short w08,
940		             short w07, short w06, short w05, short w04,
941		             short w03, short w02, short w01, short w00)
942{
943  return (__m256i)(__v16hi){ w00, w01, w02, w03, w04, w05, w06, w07,
944                             w08, w09, w10, w11, w12, w13, w14, w15 };
945}
946
947static __inline __m256i __attribute__((__always_inline__, __nodebug__))
948_mm256_set_epi8(char b31, char b30, char b29, char b28,
949		            char b27, char b26, char b25, char b24,
950		            char b23, char b22, char b21, char b20,
951		            char b19, char b18, char b17, char b16,
952		            char b15, char b14, char b13, char b12,
953		            char b11, char b10, char b09, char b08,
954		            char b07, char b06, char b05, char b04,
955		            char b03, char b02, char b01, char b00)
956{
957  return (__m256i)(__v32qi){
958    b00, b01, b02, b03, b04, b05, b06, b07,
959    b08, b09, b10, b11, b12, b13, b14, b15,
960    b16, b17, b18, b19, b20, b21, b22, b23,
961    b24, b25, b26, b27, b28, b29, b30, b31
962  };
963}
964
965static __inline __m256i __attribute__((__always_inline__, __nodebug__))
966_mm256_set_epi64x(long long a, long long b, long long c, long long d)
967{
968  return (__m256i)(__v4di){ d, c, b, a };
969}
970
971/* Create vectors with elements in reverse order */
972static __inline __m256d __attribute__((__always_inline__, __nodebug__))
973_mm256_setr_pd(double a, double b, double c, double d)
974{
975  return (__m256d){ a, b, c, d };
976}
977
978static __inline __m256 __attribute__((__always_inline__, __nodebug__))
979_mm256_setr_ps(float a, float b, float c, float d,
980		           float e, float f, float g, float h)
981{
982  return (__m256){ a, b, c, d, e, f, g, h };
983}
984
985static __inline __m256i __attribute__((__always_inline__, __nodebug__))
986_mm256_setr_epi32(int i0, int i1, int i2, int i3,
987		              int i4, int i5, int i6, int i7)
988{
989  return (__m256i)(__v8si){ i0, i1, i2, i3, i4, i5, i6, i7 };
990}
991
992static __inline __m256i __attribute__((__always_inline__, __nodebug__))
993_mm256_setr_epi16(short w15, short w14, short w13, short w12,
994		   short w11, short w10, short w09, short w08,
995		   short w07, short w06, short w05, short w04,
996		   short w03, short w02, short w01, short w00)
997{
998  return (__m256i)(__v16hi){ w15, w14, w13, w12, w11, w10, w09, w08,
999			                       w07, w06, w05, w04, w03, w02, w01, w00 };
1000}
1001
1002static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1003_mm256_setr_epi8(char b31, char b30, char b29, char b28,
1004		             char b27, char b26, char b25, char b24,
1005		             char b23, char b22, char b21, char b20,
1006		             char b19, char b18, char b17, char b16,
1007		             char b15, char b14, char b13, char b12,
1008		             char b11, char b10, char b09, char b08,
1009		             char b07, char b06, char b05, char b04,
1010		             char b03, char b02, char b01, char b00)
1011{
1012  return (__m256i)(__v32qi){
1013    b31, b30, b29, b28, b27, b26, b25, b24,
1014		b23, b22, b21, b20, b19, b18, b17, b16,
1015		b15, b14, b13, b12, b11, b10, b09, b08,
1016		b07, b06, b05, b04, b03, b02, b01, b00 };
1017}
1018
1019static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1020_mm256_setr_epi64x(long long a, long long b, long long c, long long d)
1021{
1022  return (__m256i)(__v4di){ a, b, c, d };
1023}
1024
1025/* Create vectors with repeated elements */
1026static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1027_mm256_set1_pd(double w)
1028{
1029  return (__m256d){ w, w, w, w };
1030}
1031
1032static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1033_mm256_set1_ps(float w)
1034{
1035  return (__m256){ w, w, w, w, w, w, w, w };
1036}
1037
1038static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1039_mm256_set1_epi32(int i)
1040{
1041  return (__m256i)(__v8si){ i, i, i, i, i, i, i, i };
1042}
1043
1044static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1045_mm256_set1_epi16(short w)
1046{
1047  return (__m256i)(__v16hi){ w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w };
1048}
1049
1050static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1051_mm256_set1_epi8(char b)
1052{
1053  return (__m256i)(__v32qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,
1054                             b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
1055}
1056
1057static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1058_mm256_set1_epi64x(long long q)
1059{
1060  return (__m256i)(__v4di){ q, q, q, q };
1061}
1062
1063/* Create zeroed vectors */
1064static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1065_mm256_setzero_pd(void)
1066{
1067  return (__m256d){ 0, 0, 0, 0 };
1068}
1069
1070static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1071_mm256_setzero_ps(void)
1072{
1073  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
1074}
1075
1076static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1077_mm256_setzero_si256(void)
1078{
1079  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
1080}
1081
1082/* Cast between vector types */
1083static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1084_mm256_castpd_ps(__m256d in)
1085{
1086  return (__m256)in;
1087}
1088
1089static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1090_mm256_castpd_si256(__m256d in)
1091{
1092  return (__m256i)in;
1093}
1094
1095static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1096_mm256_castps_pd(__m256 in)
1097{
1098  return (__m256d)in;
1099}
1100
1101static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1102_mm256_castps_si256(__m256 in)
1103{
1104  return (__m256i)in;
1105}
1106
1107static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1108_mm256_castsi256_ps(__m256i in)
1109{
1110  return (__m256)in;
1111}
1112
1113static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1114_mm256_castsi256_pd(__m256i in)
1115{
1116  return (__m256d)in;
1117}
1118
1119static __inline __m128d __attribute__((__always_inline__, __nodebug__))
1120_mm256_castpd256_pd128(__m256d in)
1121{
1122  return __builtin_shufflevector(in, in, 0, 1);
1123}
1124
1125static __inline __m128 __attribute__((__always_inline__, __nodebug__))
1126_mm256_castps256_ps128(__m256 in)
1127{
1128  return __builtin_shufflevector(in, in, 0, 1, 2, 3);
1129}
1130
1131static __inline __m128i __attribute__((__always_inline__, __nodebug__))
1132_mm256_castsi256_si128(__m256i in)
1133{
1134  return __builtin_shufflevector(in, in, 0, 1);
1135}
1136
1137static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1138_mm256_castpd128_pd256(__m128d in)
1139{
1140  __m128d zero = _mm_setzero_pd();
1141  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
1142}
1143
1144static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1145_mm256_castps128_ps256(__m128 in)
1146{
1147  __m128 zero = _mm_setzero_ps();
1148  return __builtin_shufflevector(in, zero, 0, 1, 2, 3, 4, 4, 4, 4);
1149}
1150
1151static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1152_mm256_castsi128_si256(__m128i in)
1153{
1154  __m128i zero = _mm_setzero_si128();
1155  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
1156}
1157