/* Copyright (C) 2002, 2003, 2004, 2005, 2007 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING.  If not, write to
   the Free Software Foundation, 51 Franklin Street, Fifth Floor,
   Boston, MA 02110-1301, USA.  */

/* As a special exception, if you include this header file into source
   files compiled by GCC, this header file does not by itself cause
   the resulting executable to be covered by the GNU General Public
   License.  This exception does not however invalidate any other
   reasons why the executable file might be covered by the GNU General
   Public License.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 8.0.  */

#ifndef _XMMINTRIN_H_INCLUDED
#define _XMMINTRIN_H_INCLUDED

#ifndef __SSE__
# error "SSE instruction set not enabled"
#else

/* We need type definitions from the MMX header file.  */
#include <mmintrin.h>

/* Get _mm_malloc () and _mm_free ().  */
#include <mm_malloc.h>

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));

/* Internal data types for implementing the intrinsics.  */
typedef float __v4sf __attribute__ ((__vector_size__ (16)));

/* Create a selector for use with the SHUFPS instruction.  */
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))

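/* Illustrative usage sketch (not part of the original header; variable
   names are hypothetical): the selector packs four 2-bit element indices,
   so _MM_SHUFFLE (3, 2, 1, 0) evaluates to 0xE4 and, used with
   _mm_shuffle_ps further below, selects elements 0 and 1 from the first
   operand and elements 2 and 3 from the second:

     __m128 __r = _mm_shuffle_ps (__a, __b, _MM_SHUFFLE (3, 2, 1, 0));
     // __r = { __a[0], __a[1], __b[2], __b[3] }
*/
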
/* Constants for use with _mm_prefetch.  */
enum _mm_hint
{
  _MM_HINT_T0 = 3,
  _MM_HINT_T1 = 2,
  _MM_HINT_T2 = 1,
  _MM_HINT_NTA = 0
};

/* Bits in the MXCSR.  */
#define _MM_EXCEPT_MASK       0x003f
#define _MM_EXCEPT_INVALID    0x0001
#define _MM_EXCEPT_DENORM     0x0002
#define _MM_EXCEPT_DIV_ZERO   0x0004
#define _MM_EXCEPT_OVERFLOW   0x0008
#define _MM_EXCEPT_UNDERFLOW  0x0010
#define _MM_EXCEPT_INEXACT    0x0020

#define _MM_MASK_MASK         0x1f80
#define _MM_MASK_INVALID      0x0080
#define _MM_MASK_DENORM       0x0100
#define _MM_MASK_DIV_ZERO     0x0200
#define _MM_MASK_OVERFLOW     0x0400
#define _MM_MASK_UNDERFLOW    0x0800
#define _MM_MASK_INEXACT      0x1000

#define _MM_ROUND_MASK        0x6000
#define _MM_ROUND_NEAREST     0x0000
#define _MM_ROUND_DOWN        0x2000
#define _MM_ROUND_UP          0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000

#define _MM_FLUSH_ZERO_MASK   0x8000
#define _MM_FLUSH_ZERO_ON     0x8000
#define _MM_FLUSH_ZERO_OFF    0x0000

/* Create a vector of zeros.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_setzero_ps (void)
{
  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}

/* Perform the respective operation on the lower SPFP (single-precision
   floating-point) values of A and B; the upper three SPFP values are
   passed through from A.  */

static __inline __m128 __attribute__((__always_inline__))
_mm_add_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B);
}

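/* Illustrative usage sketch (not part of the original header; values are
   hypothetical): the "ss" forms only operate on element 0, the remaining
   three elements come from A unchanged.  With _mm_set_ps defined further
   below:

     __m128 __a = _mm_set_ps (40.0f, 30.0f, 20.0f, 10.0f);  // elements 0..3: 10, 20, 30, 40
     __m128 __b = _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f);      // elements 0..3: 1, 2, 3, 4
     __m128 __r = _mm_add_ss (__a, __b);                    // elements 0..3: 11, 20, 30, 40
*/
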
static __inline __m128 __attribute__((__always_inline__))
_mm_sub_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_mul_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_div_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_sqrt_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_rcp_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_rcpss ((__v4sf)__A);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_rsqrt_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_min_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_max_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
}

/* Perform the respective operation on the four SPFP values in A and B.  */

static __inline __m128 __attribute__((__always_inline__))
_mm_add_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_sub_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_mul_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_div_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_sqrt_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_rcp_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_rcpps ((__v4sf)__A);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_rsqrt_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_min_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_max_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B);
}

/* Perform logical bit-wise operations on 128-bit values.  */

static __inline __m128 __attribute__((__always_inline__))
_mm_and_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_andps (__A, __B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_andnot_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_andnps (__A, __B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_or_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_orps (__A, __B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_xor_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_xorps (__A, __B);
}

/* Perform a comparison on the lower SPFP values of A and B.  If the
   comparison is true, place a mask of all ones in the result, otherwise a
   mask of zeros.  The upper three SPFP values are passed through from A.  */

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpeq_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmplt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmple_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpgt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
					(__v4sf) __builtin_ia32_cmpltss ((__v4sf) __B, (__v4sf) __A));
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpge_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
					(__v4sf) __builtin_ia32_cmpless ((__v4sf) __B, (__v4sf) __A));
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpneq_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpnlt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpnle_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpngt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
					(__v4sf) __builtin_ia32_cmpnltss ((__v4sf) __B, (__v4sf) __A));
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpnge_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
					(__v4sf) __builtin_ia32_cmpnless ((__v4sf) __B, (__v4sf) __A));
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpord_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpunord_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B);
}

/* Perform a comparison on the four SPFP values of A and B.  For each
   element, if the comparison is true, place a mask of all ones in the
   result, otherwise a mask of zeros.  */

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpeq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmplt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmple_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpgt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpge_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpneq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpnlt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpnle_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpngt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpnge_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpord_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpunord_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B);
}

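/* Illustrative usage sketch (not part of the original header; variable
   names are hypothetical): the all-ones/all-zeros element masks produced
   by the packed comparisons combine with the bit-wise operations above
   into a branchless per-element select, r[i] = a[i] < b[i] ? c[i] : d[i]:

     __m128 __mask = _mm_cmplt_ps (__a, __b);
     __m128 __r = _mm_or_ps (_mm_and_ps (__mask, __c),
                             _mm_andnot_ps (__mask, __d));
*/
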
/* Compare the lower SPFP values of A and B and return 1 if true
   and 0 if false.  */

static __inline int __attribute__((__always_inline__))
_mm_comieq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_comilt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_comile_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_comigt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_comige_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_comineq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomieq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomilt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomile_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomigt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomige_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomineq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B);
}

/* Convert the lower SPFP value to a 32-bit integer according to the current
   rounding mode.  */
static __inline int __attribute__((__always_inline__))
_mm_cvtss_si32 (__m128 __A)
{
  return __builtin_ia32_cvtss2si ((__v4sf) __A);
}

static __inline int __attribute__((__always_inline__))
_mm_cvt_ss2si (__m128 __A)
{
  return _mm_cvtss_si32 (__A);
}

#ifdef __x86_64__
/* Convert the lower SPFP value to a 64-bit integer according to the current
   rounding mode.  */
static __inline long long __attribute__((__always_inline__))
_mm_cvtss_si64x (__m128 __A)
{
  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
}
#endif

/* Convert the two lower SPFP values to 32-bit integers according to the
   current rounding mode.  Return the integers in packed form.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_cvtps_pi32 (__m128 __A)
{
  return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A);
}

static __inline __m64 __attribute__((__always_inline__))
_mm_cvt_ps2pi (__m128 __A)
{
  return _mm_cvtps_pi32 (__A);
}

/* Truncate the lower SPFP value to a 32-bit integer.  */
static __inline int __attribute__((__always_inline__))
_mm_cvttss_si32 (__m128 __A)
{
  return __builtin_ia32_cvttss2si ((__v4sf) __A);
}

static __inline int __attribute__((__always_inline__))
_mm_cvtt_ss2si (__m128 __A)
{
  return _mm_cvttss_si32 (__A);
}

#ifdef __x86_64__
/* Truncate the lower SPFP value to a 64-bit integer.  */
static __inline long long __attribute__((__always_inline__))
_mm_cvttss_si64x (__m128 __A)
{
  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
}
#endif

/* Truncate the two lower SPFP values to 32-bit integers.  Return the
   integers in packed form.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_cvttps_pi32 (__m128 __A)
{
  return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A);
}

static __inline __m64 __attribute__((__always_inline__))
_mm_cvtt_ps2pi (__m128 __A)
{
  return _mm_cvttps_pi32 (__A);
}

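/* Illustrative sketch (not part of the original header): _mm_cvtss_si32
   rounds according to the current MXCSR rounding mode, whereas
   _mm_cvttss_si32 always truncates toward zero.

     __m128 __x = _mm_set_ss (2.7f);
     int __r = _mm_cvtss_si32 (__x);   // 3 under the default round-to-nearest mode
     int __t = _mm_cvttss_si32 (__x);  // always 2
*/
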
/* Convert B to a SPFP value and insert it as element zero in A.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_cvtsi32_ss (__m128 __A, int __B)
{
  return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cvt_si2ss (__m128 __A, int __B)
{
  return _mm_cvtsi32_ss (__A, __B);
}

#ifdef __x86_64__
/* Convert B to a SPFP value and insert it as element zero in A.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_cvtsi64x_ss (__m128 __A, long long __B)
{
  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
}
#endif

/* Convert the two 32-bit values in B to SPFP form and insert them
   as the two lower elements in A.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_cvtpi32_ps (__m128 __A, __m64 __B)
{
  return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cvt_pi2ps (__m128 __A, __m64 __B)
{
  return _mm_cvtpi32_ps (__A, __B);
}

/* Convert the four signed 16-bit values in A to SPFP form.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_cvtpi16_ps (__m64 __A)
{
  __v4hi __sign;
  __v2si __hisi, __losi;
  __v4sf __r;

  /* This comparison against zero gives us a mask that can be used to
     fill in the missing sign bits in the unpack operations below, so
     that we get signed values after unpacking.  */
  __sign = __builtin_ia32_pcmpgtw ((__v4hi)0LL, (__v4hi)__A);

  /* Convert the four words to doublewords.  */
  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign);
  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign);

  /* Convert the doublewords to floating point two at a time.  */
  __r = (__v4sf) _mm_setzero_ps ();
  __r = __builtin_ia32_cvtpi2ps (__r, __hisi);
  __r = __builtin_ia32_movlhps (__r, __r);
  __r = __builtin_ia32_cvtpi2ps (__r, __losi);

  return (__m128) __r;
}

/* Convert the four unsigned 16-bit values in A to SPFP form.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_cvtpu16_ps (__m64 __A)
{
  __v2si __hisi, __losi;
  __v4sf __r;

  /* Convert the four words to doublewords.  */
  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, (__v4hi)0LL);
  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, (__v4hi)0LL);

  /* Convert the doublewords to floating point two at a time.  */
  __r = (__v4sf) _mm_setzero_ps ();
  __r = __builtin_ia32_cvtpi2ps (__r, __hisi);
  __r = __builtin_ia32_movlhps (__r, __r);
  __r = __builtin_ia32_cvtpi2ps (__r, __losi);

  return (__m128) __r;
}

/* Convert the low four signed 8-bit values in A to SPFP form.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_cvtpi8_ps (__m64 __A)
{
  __v8qi __sign;

  /* This comparison against zero gives us a mask that can be used to
     fill in the missing sign bits in the unpack operations below, so
     that we get signed values after unpacking.  */
  __sign = __builtin_ia32_pcmpgtb ((__v8qi)0LL, (__v8qi)__A);

  /* Convert the four low bytes to words.  */
  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign);

  return _mm_cvtpi16_ps(__A);
}

/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_cvtpu8_ps(__m64 __A)
{
  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, (__v8qi)0LL);
  return _mm_cvtpu16_ps(__A);
}

/* Convert the four signed 32-bit values in A and B to SPFP form.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
{
  __v4sf __zero = (__v4sf) _mm_setzero_ps ();
  __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A);
  __v4sf __sfb = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__B);
  return (__m128) __builtin_ia32_movlhps (__sfa, __sfb);
}

/* Convert the four SPFP values in A to four signed 16-bit integers.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_cvtps_pi16(__m128 __A)
{
  __v4sf __hisf = (__v4sf)__A;
  __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf);
  __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf);
  __v2si __losi = __builtin_ia32_cvtps2pi (__losf);
  return (__m64) __builtin_ia32_packssdw (__hisi, __losi);
}

/* Convert the four SPFP values in A to four signed 8-bit integers.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_cvtps_pi8(__m128 __A)
{
  __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A);
  return (__m64) __builtin_ia32_packsswb (__tmp, (__v4hi)0LL);
}

/* Selects four specific SPFP values from A and B based on MASK.  */
#if 0
static __inline __m128 __attribute__((__always_inline__))
_mm_shuffle_ps (__m128 __A, __m128 __B, int __mask)
{
  return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask);
}
#else
#define _mm_shuffle_ps(A, B, MASK) \
 ((__m128) __builtin_ia32_shufps ((__v4sf)(A), (__v4sf)(B), (MASK)))
#endif

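/* Illustrative usage sketch (not part of the original header; __a is a
   hypothetical operand): a common use of _mm_shuffle_ps is broadcasting a
   single element, here element 2 of __a, across all four positions:

     __m128 __bcast = _mm_shuffle_ps (__a, __a, _MM_SHUFFLE (2, 2, 2, 2));
*/
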
/* Selects and interleaves the upper two SPFP values from A and B.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_unpackhi_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B);
}

/* Selects and interleaves the lower two SPFP values from A and B.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_unpacklo_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B);
}

/* Sets the upper two SPFP values with 64 bits of data loaded from P;
   the lower two values are passed through from A.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_loadh_pi (__m128 __A, __m64 const *__P)
{
  return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (__v2si *)__P);
}

/* Stores the upper two SPFP values of A into P.  */
static __inline void __attribute__((__always_inline__))
_mm_storeh_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storehps ((__v2si *)__P, (__v4sf)__A);
}

/* Moves the upper two values of B into the lower two values of A.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_movehl_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B);
}

/* Moves the lower two values of B into the upper two values of A.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_movelh_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B);
}

/* Sets the lower two SPFP values with 64 bits of data loaded from P;
   the upper two values are passed through from A.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_loadl_pi (__m128 __A, __m64 const *__P)
{
  return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (__v2si *)__P);
}

/* Stores the lower two SPFP values of A into P.  */
static __inline void __attribute__((__always_inline__))
_mm_storel_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storelps ((__v2si *)__P, (__v4sf)__A);
}

/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
static __inline int __attribute__((__always_inline__))
_mm_movemask_ps (__m128 __A)
{
  return __builtin_ia32_movmskps ((__v4sf)__A);
}

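/* Illustrative usage sketch (not part of the original header; variable
   names are hypothetical): combining a packed comparison with
   _mm_movemask_ps tests all four elements with a single branch.

     int __m = _mm_movemask_ps (_mm_cmplt_ps (__a, __b));
     if (__m == 0xf)
       {
         // every element of __a compared less than the matching element of __b
       }
*/
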
/* Return the contents of the control register.  */
static __inline unsigned int __attribute__((__always_inline__))
_mm_getcsr (void)
{
  return __builtin_ia32_stmxcsr ();
}

/* Read exception bits from the control register.  */
static __inline unsigned int __attribute__((__always_inline__))
_MM_GET_EXCEPTION_STATE (void)
{
  return _mm_getcsr() & _MM_EXCEPT_MASK;
}

static __inline unsigned int __attribute__((__always_inline__))
_MM_GET_EXCEPTION_MASK (void)
{
  return _mm_getcsr() & _MM_MASK_MASK;
}

static __inline unsigned int __attribute__((__always_inline__))
_MM_GET_ROUNDING_MODE (void)
{
  return _mm_getcsr() & _MM_ROUND_MASK;
}

static __inline unsigned int __attribute__((__always_inline__))
_MM_GET_FLUSH_ZERO_MODE (void)
{
  return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
}

/* Set the control register to I.  */
static __inline void __attribute__((__always_inline__))
_mm_setcsr (unsigned int __I)
{
  __builtin_ia32_ldmxcsr (__I);
}

/* Set exception bits in the control register.  */
static __inline void __attribute__((__always_inline__))
_MM_SET_EXCEPTION_STATE(unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask);
}

static __inline void __attribute__((__always_inline__))
_MM_SET_EXCEPTION_MASK (unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask);
}

static __inline void __attribute__((__always_inline__))
_MM_SET_ROUNDING_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode);
}

static __inline void __attribute__((__always_inline__))
_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode);
}

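/* Illustrative usage sketch (not part of the original header): the
   composite helpers above read-modify-write the MXCSR register through
   _mm_getcsr/_mm_setcsr, e.g. to switch to truncation temporarily and
   restore the previous mode afterwards:

     unsigned int __saved = _MM_GET_ROUNDING_MODE ();
     _MM_SET_ROUNDING_MODE (_MM_ROUND_TOWARD_ZERO);
     // ... code that relies on round-toward-zero ...
     _MM_SET_ROUNDING_MODE (__saved);
*/
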
/* Create a vector with element 0 as F and the rest zero.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_set_ss (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, 0, 0, 0 };
}

/* Create a vector with all four elements equal to F.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_set1_ps (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
}

static __inline __m128 __attribute__((__always_inline__))
_mm_set_ps1 (float __F)
{
  return _mm_set1_ps (__F);
}

/* Create a vector with element 0 as *P and the rest zero.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_load_ss (float const *__P)
{
  return _mm_set_ss (*__P);
}

/* Create a vector with all four elements equal to *P.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_load1_ps (float const *__P)
{
  return _mm_set1_ps (*__P);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_load_ps1 (float const *__P)
{
  return _mm_load1_ps (__P);
}

/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_load_ps (float const *__P)
{
  return (__m128) *(__v4sf *)__P;
}

/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_loadu_ps (float const *__P)
{
  return (__m128) __builtin_ia32_loadups (__P);
}

/* Load four SPFP values in reverse order.  The address must be aligned.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_loadr_ps (float const *__P)
{
  __v4sf __tmp = *(__v4sf *)__P;
  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3));
}

/* Create the vector [Z Y X W].  */
static __inline __m128 __attribute__((__always_inline__))
_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
{
  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
}

/* Create the vector [W X Y Z].  */
static __inline __m128 __attribute__((__always_inline__))
_mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
}

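/* Illustrative sketch (not part of the original header): _mm_set_ps takes
   its arguments from the highest element down to element 0, _mm_setr_ps
   from element 0 up, so the two calls below build the same vector:

     __m128 __v1 = _mm_set_ps  (4.0f, 3.0f, 2.0f, 1.0f);  // elements 0..3: 1, 2, 3, 4
     __m128 __v2 = _mm_setr_ps (1.0f, 2.0f, 3.0f, 4.0f);  // elements 0..3: 1, 2, 3, 4
*/
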
/* Stores the lower SPFP value.  */
static __inline void __attribute__((__always_inline__))
_mm_store_ss (float *__P, __m128 __A)
{
  *__P = __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
}

/* Store four SPFP values.  The address must be 16-byte aligned.  */
static __inline void __attribute__((__always_inline__))
_mm_store_ps (float *__P, __m128 __A)
{
  *(__v4sf *)__P = (__v4sf)__A;
}

/* Store four SPFP values.  The address need not be 16-byte aligned.  */
static __inline void __attribute__((__always_inline__))
_mm_storeu_ps (float *__P, __m128 __A)
{
  __builtin_ia32_storeups (__P, (__v4sf)__A);
}

/* Store the lower SPFP value across four words.  */
static __inline void __attribute__((__always_inline__))
_mm_store1_ps (float *__P, __m128 __A)
{
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0));
  _mm_storeu_ps (__P, __tmp);
}

static __inline void __attribute__((__always_inline__))
_mm_store_ps1 (float *__P, __m128 __A)
{
  _mm_store1_ps (__P, __A);
}

/* Store four SPFP values in reverse order.  The address must be aligned.  */
static __inline void __attribute__((__always_inline__))
_mm_storer_ps (float *__P, __m128 __A)
{
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3));
  _mm_store_ps (__P, __tmp);
}

/* Sets the low SPFP value of A from the low value of B.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_move_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
}

/* Extracts one of the four words of A.  The selector N must be immediate.  */
#if 0
static __inline int __attribute__((__always_inline__))
_mm_extract_pi16 (__m64 const __A, int const __N)
{
  return __builtin_ia32_vec_ext_v4hi ((__v4hi)__A, __N);
}

static __inline int __attribute__((__always_inline__))
_m_pextrw (__m64 const __A, int const __N)
{
  return _mm_extract_pi16 (__A, __N);
}
#else
#define _mm_extract_pi16(A, N)  __builtin_ia32_vec_ext_v4hi ((__v4hi)(A), (N))
#define _m_pextrw(A, N)         _mm_extract_pi16((A), (N))
#endif

/* Inserts word D into one of four words of A.  The selector N must be
   immediate.  */
#if 0
static __inline __m64 __attribute__((__always_inline__))
_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
{
  return (__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)__A, __D, __N);
}

static __inline __m64 __attribute__((__always_inline__))
_m_pinsrw (__m64 const __A, int const __D, int const __N)
{
  return _mm_insert_pi16 (__A, __D, __N);
}
#else
#define _mm_insert_pi16(A, D, N) \
  ((__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)(A), (D), (N)))
#define _m_pinsrw(A, D, N)       _mm_insert_pi16((A), (D), (N))
#endif

/* Compute the element-wise maximum of signed 16-bit values.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_max_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B);
}

static __inline __m64 __attribute__((__always_inline__))
_m_pmaxsw (__m64 __A, __m64 __B)
{
  return _mm_max_pi16 (__A, __B);
}

/* Compute the element-wise maximum of unsigned 8-bit values.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_max_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B);
}

static __inline __m64 __attribute__((__always_inline__))
_m_pmaxub (__m64 __A, __m64 __B)
{
  return _mm_max_pu8 (__A, __B);
}

/* Compute the element-wise minimum of signed 16-bit values.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_min_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B);
}

static __inline __m64 __attribute__((__always_inline__))
_m_pminsw (__m64 __A, __m64 __B)
{
  return _mm_min_pi16 (__A, __B);
}

/* Compute the element-wise minimum of unsigned 8-bit values.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_min_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B);
}

static __inline __m64 __attribute__((__always_inline__))
_m_pminub (__m64 __A, __m64 __B)
{
  return _mm_min_pu8 (__A, __B);
}

/* Create an 8-bit mask of the signs of 8-bit values.  */
static __inline int __attribute__((__always_inline__))
_mm_movemask_pi8 (__m64 __A)
{
  return __builtin_ia32_pmovmskb ((__v8qi)__A);
}

static __inline int __attribute__((__always_inline__))
_m_pmovmskb (__m64 __A)
{
  return _mm_movemask_pi8 (__A);
}

/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
   in B and produce the high 16 bits of the 32-bit results.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B);
}

static __inline __m64 __attribute__((__always_inline__))
_m_pmulhuw (__m64 __A, __m64 __B)
{
  return _mm_mulhi_pu16 (__A, __B);
}

/* Return a combination of the four 16-bit values in A.  The selector
   must be an immediate.  */
#if 0
static __inline __m64 __attribute__((__always_inline__))
_mm_shuffle_pi16 (__m64 __A, int __N)
{
  return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N);
}

static __inline __m64 __attribute__((__always_inline__))
_m_pshufw (__m64 __A, int __N)
{
  return _mm_shuffle_pi16 (__A, __N);
}
#else
#define _mm_shuffle_pi16(A, N) \
  ((__m64) __builtin_ia32_pshufw ((__v4hi)(A), (N)))
#define _m_pshufw(A, N)         _mm_shuffle_pi16 ((A), (N))
#endif

/* Conditionally store byte elements of A into P.  The high bit of each
   byte in the selector N determines whether the corresponding byte from
   A is stored.  */
static __inline void __attribute__((__always_inline__))
_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
{
  __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
}

static __inline void __attribute__((__always_inline__))
_m_maskmovq (__m64 __A, __m64 __N, char *__P)
{
  _mm_maskmove_si64 (__A, __N, __P);
}

/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_avg_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B);
}

static __inline __m64 __attribute__((__always_inline__))
_m_pavgb (__m64 __A, __m64 __B)
{
  return _mm_avg_pu8 (__A, __B);
}

/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_avg_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B);
}

static __inline __m64 __attribute__((__always_inline__))
_m_pavgw (__m64 __A, __m64 __B)
{
  return _mm_avg_pu16 (__A, __B);
}

/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B.  Return the value in the lower 16-bit word; the
   upper words are cleared.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_sad_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B);
}

static __inline __m64 __attribute__((__always_inline__))
_m_psadbw (__m64 __A, __m64 __B)
{
  return _mm_sad_pu8 (__A, __B);
}

/* Loads one cache line from address P to a location "closer" to the
   processor.  The selector I specifies the type of prefetch operation.  */
#if 0
static __inline void __attribute__((__always_inline__))
_mm_prefetch (void *__P, enum _mm_hint __I)
{
  __builtin_prefetch (__P, 0, __I);
}
#else
#define _mm_prefetch(P, I) \
  __builtin_prefetch ((P), 0, (I))
#endif

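/* Illustrative usage sketch (not part of the original header; __data and
   the offset are hypothetical): a typical use is hinting at data needed a
   few iterations ahead while processing the current elements:

     _mm_prefetch ((char const *)(__data + __i + 64), _MM_HINT_T0);
*/
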
/* Stores the data in A to the address P without polluting the caches.  */
static __inline void __attribute__((__always_inline__))
_mm_stream_pi (__m64 *__P, __m64 __A)
{
  __builtin_ia32_movntq ((unsigned long long *)__P, (unsigned long long)__A);
}

/* Likewise.  The address must be 16-byte aligned.  */
static __inline void __attribute__((__always_inline__))
_mm_stream_ps (float *__P, __m128 __A)
{
  __builtin_ia32_movntps (__P, (__v4sf)__A);
}

/* Guarantees that every preceding store is globally visible before
   any subsequent store.  */
static __inline void __attribute__((__always_inline__))
_mm_sfence (void)
{
  __builtin_ia32_sfence ();
}

/* The execution of the next instruction is delayed by an
   implementation-specific amount of time.  The instruction does not
   modify the architectural state.  */
static __inline void __attribute__((__always_inline__))
_mm_pause (void)
{
  __asm__ __volatile__ ("rep; nop" : : );
}

/* Transpose the 4x4 matrix composed of row[0-3].  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)			\
do {									\
  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);	\
  __v4sf __t0 = __builtin_ia32_shufps (__r0, __r1, 0x44);		\
  __v4sf __t2 = __builtin_ia32_shufps (__r0, __r1, 0xEE);		\
  __v4sf __t1 = __builtin_ia32_shufps (__r2, __r3, 0x44);		\
  __v4sf __t3 = __builtin_ia32_shufps (__r2, __r3, 0xEE);		\
  (row0) = __builtin_ia32_shufps (__t0, __t1, 0x88);			\
  (row1) = __builtin_ia32_shufps (__t0, __t1, 0xDD);			\
  (row2) = __builtin_ia32_shufps (__t2, __t3, 0x88);			\
  (row3) = __builtin_ia32_shufps (__t2, __t3, 0xDD);			\
} while (0)

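/* Illustrative usage sketch (not part of the original header; __m is a
   hypothetical pointer to 16 floats): the macro transposes four row
   vectors in place, so afterwards __row0 holds the original column 0,
   __row1 column 1, and so on:

     __m128 __row0 = _mm_loadu_ps (__m + 0);
     __m128 __row1 = _mm_loadu_ps (__m + 4);
     __m128 __row2 = _mm_loadu_ps (__m + 8);
     __m128 __row3 = _mm_loadu_ps (__m + 12);
     _MM_TRANSPOSE4_PS (__row0, __row1, __row2, __row3);
*/
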
/* For backward source compatibility.  */
#include <emmintrin.h>

#endif /* __SSE__ */
#endif /* _XMMINTRIN_H_INCLUDED */