/* xmmintrin.h, revision 107590.  */
/* Copyright (C) 2002 Free Software Foundation, Inc.

   This file is part of GNU CC.

   GNU CC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   GNU CC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GNU CC; see the file COPYING.  If not, write to
   the Free Software Foundation, 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.  */

/* As a special exception, if you include this header file into source
   files compiled by GCC, this header file does not by itself cause
   the resulting executable to be covered by the GNU General Public
   License.  This exception does not however invalidate any other
   reasons why the executable file might be covered by the GNU General
   Public License.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 5.0.  */

#ifndef _XMMINTRIN_H_INCLUDED
#define _XMMINTRIN_H_INCLUDED

/* We need type definitions from the MMX header file.  */
#include <mmintrin.h>

/* The data type intended for user use.  */
typedef int __m128 __attribute__ ((__mode__(__V4SF__)));

/* Internal data types for implementing the intrinsics.  */
typedef int __v4sf __attribute__ ((__mode__(__V4SF__)));
typedef int __v4si __attribute__ ((__mode__(__V4SI__)));

/* Create a selector for use with the SHUFPS instruction.  */
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))

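/* Illustrative examples (not part of the original header): the selector
   packs four 2-bit element indices, highest destination element first,
   so under this encoding

     _MM_SHUFFLE (3, 2, 1, 0) == 0xE4   keeps each element in place,
     _MM_SHUFFLE (0, 1, 2, 3) == 0x1B   reverses the four elements, and
     _MM_SHUFFLE (0, 0, 0, 0) == 0x00   broadcasts element 0.  */
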
/* Constants for use with _mm_prefetch.  */
enum _mm_hint
{
  _MM_HINT_T0 = 3,
  _MM_HINT_T1 = 2,
  _MM_HINT_T2 = 1,
  _MM_HINT_NTA = 0
};

/* Bits in the MXCSR.  */
#define _MM_EXCEPT_MASK       0x003f
#define _MM_EXCEPT_INVALID    0x0001
#define _MM_EXCEPT_DENORM     0x0002
#define _MM_EXCEPT_DIV_ZERO   0x0004
#define _MM_EXCEPT_OVERFLOW   0x0008
#define _MM_EXCEPT_UNDERFLOW  0x0010
#define _MM_EXCEPT_INEXACT    0x0020

#define _MM_MASK_MASK         0x1f80
#define _MM_MASK_INVALID      0x0080
#define _MM_MASK_DENORM       0x0100
#define _MM_MASK_DIV_ZERO     0x0200
#define _MM_MASK_OVERFLOW     0x0400
#define _MM_MASK_UNDERFLOW    0x0800
#define _MM_MASK_INEXACT      0x1000

#define _MM_ROUND_MASK        0x6000
#define _MM_ROUND_NEAREST     0x0000
#define _MM_ROUND_DOWN        0x2000
#define _MM_ROUND_UP          0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000

#define _MM_FLUSH_ZERO_MASK   0x8000
#define _MM_FLUSH_ZERO_ON     0x8000
#define _MM_FLUSH_ZERO_OFF    0x0000

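/* Illustrative sketch (assumes the _mm_getcsr and _mm_setcsr wrappers
   defined later in this header): the exception, mask, rounding and
   flush-to-zero fields occupy disjoint bits, so each can be updated
   independently with mask-and-or arithmetic, e.g.

     unsigned int __csr = _mm_getcsr ();
     __csr = (__csr & ~_MM_ROUND_MASK) | _MM_ROUND_TOWARD_ZERO;
     _mm_setcsr (__csr);

   selects truncation without disturbing the other fields.  */
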
/* Perform the respective operation on the lower SPFP (single-precision
   floating-point) values of A and B; the upper three SPFP values are
   passed through from A.  */

static __inline __m128
_mm_add_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_sub_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_mul_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_div_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_sqrt_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A);
}

static __inline __m128
_mm_rcp_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_rcpss ((__v4sf)__A);
}

static __inline __m128
_mm_rsqrt_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A);
}

static __inline __m128
_mm_min_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_max_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
}

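/* Illustrative example (hypothetical values): with A = [a0 a1 a2 a3] and
   B = [b0 b1 b2 b3], _mm_add_ss (A, B) yields [a0+b0 a1 a2 a3]; only
   element 0 is computed, the rest are copied unchanged from A.  */
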
/* Perform the respective operation on the four SPFP values in A and B.  */

static __inline __m128
_mm_add_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_sub_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_mul_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_div_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_sqrt_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
}

static __inline __m128
_mm_rcp_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_rcpps ((__v4sf)__A);
}

static __inline __m128
_mm_rsqrt_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A);
}

static __inline __m128
_mm_min_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_max_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B);
}

/* Perform logical bit-wise operations on 128-bit values.  */

static __inline __m128
_mm_and_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_andps (__A, __B);
}

static __inline __m128
_mm_andnot_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_andnps (__A, __B);
}

static __inline __m128
_mm_or_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_orps (__A, __B);
}

static __inline __m128
_mm_xor_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_xorps (__A, __B);
}

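/* Illustrative note: _mm_andnot_ps computes (~A) & B, matching the ANDNPS
   instruction.  A common sketch (uses _mm_set1_ps from later in this
   header; __x is a hypothetical operand) clears the sign bits to take
   element-wise absolute values:

     __m128 __mask = _mm_set1_ps (-0.0f);
     __m128 __abs = _mm_andnot_ps (__mask, __x);

   since -0.0f carries only the sign bit in each element.  */
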
/* Perform a comparison on the lower SPFP values of A and B.  If the
   comparison is true, place a mask of all ones in the result, otherwise a
   mask of zeros.  The upper three SPFP values are passed through from A.  */

static __inline __m128
_mm_cmpeq_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmplt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmple_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpgt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
					(__v4sf)
					__builtin_ia32_cmpltss ((__v4sf) __B,
								(__v4sf)
								__A));
}

static __inline __m128
_mm_cmpge_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
					(__v4sf)
					__builtin_ia32_cmpless ((__v4sf) __B,
								(__v4sf)
								__A));
}

static __inline __m128
_mm_cmpneq_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpnlt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpnle_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpngt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
					(__v4sf)
					__builtin_ia32_cmpnltss ((__v4sf) __B,
								 (__v4sf)
								 __A));
}

static __inline __m128
_mm_cmpnge_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
					(__v4sf)
					__builtin_ia32_cmpnless ((__v4sf) __B,
								 (__v4sf)
								 __A));
}

static __inline __m128
_mm_cmpord_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpunord_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B);
}

/* Perform a comparison on the four SPFP values of A and B.  For each
   element, if the comparison is true, place a mask of all ones in the
   result, otherwise a mask of zeros.  */

static __inline __m128
_mm_cmpeq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmplt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmple_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpgt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpge_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpneq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpnlt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpnle_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpngt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpnge_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpord_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpunord_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B);
}

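/* Illustrative sketch (not part of the original header): because each
   comparison produces a per-element mask of all ones or all zeros, it can
   drive a branchless select with the bit-wise operations above.  An
   element-wise maximum built by hand from hypothetical operands __a, __b:

     __m128 __m = _mm_cmpgt_ps (__a, __b);
     __m128 __r = _mm_or_ps (_mm_and_ps (__m, __a),
                             _mm_andnot_ps (__m, __b));

   _mm_max_ps performs the same operation in a single instruction.  */
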
/* Compare the lower SPFP values of A and B and return 1 if true
   and 0 if false.  */

static __inline int
_mm_comieq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_comilt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_comile_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_comigt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_comige_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_comineq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_ucomieq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_ucomilt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_ucomile_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_ucomigt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_ucomige_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_ucomineq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B);
}

/* Convert the lower SPFP value to a 32-bit integer according to the current
   rounding mode.  */
static __inline int
_mm_cvtss_si32 (__m128 __A)
{
  return __builtin_ia32_cvtss2si ((__v4sf) __A);
}

/* Convert the two lower SPFP values to 32-bit integers according to the
   current rounding mode.  Return the integers in packed form.  */
static __inline __m64
_mm_cvtps_pi32 (__m128 __A)
{
  return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A);
}

/* Truncate the lower SPFP value to a 32-bit integer.  */
static __inline int
_mm_cvttss_si32 (__m128 __A)
{
  return __builtin_ia32_cvttss2si ((__v4sf) __A);
}

/* Truncate the two lower SPFP values to 32-bit integers.  Return the
   integers in packed form.  */
static __inline __m64
_mm_cvttps_pi32 (__m128 __A)
{
  return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A);
}

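/* Illustrative contrast between the two families: under the default
   round-to-nearest-even mode, _mm_cvtss_si32 maps 2.5f to 2 and 3.5f to 4,
   while _mm_cvttss_si32 always truncates toward zero, mapping 2.7f to 2
   and -2.7f to -2 regardless of the rounding mode in the MXCSR.  */
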
/* Convert B to a SPFP value and insert it as element zero in A.  */
static __inline __m128
_mm_cvtsi32_ss (__m128 __A, int __B)
{
  return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B);
}

/* Convert the two 32-bit values in B to SPFP form and insert them
   as the two lower elements in A.  */
static __inline __m128
_mm_cvtpi32_ps (__m128 __A, __m64 __B)
{
  return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B);
}

/* Convert the four signed 16-bit values in A to SPFP form.  */
static __inline __m128
_mm_cvtpi16_ps (__m64 __A)
{
  __v4hi __sign;
  __v2si __hisi, __losi;
  __v4sf __r;

  /* This comparison against zero gives us a mask that can be used to
     fill in the missing sign bits in the unpack operations below, so
     that we get signed values after unpacking.  */
  __sign = (__v4hi) __builtin_ia32_mmx_zero ();
  __sign = __builtin_ia32_pcmpgtw (__sign, (__v4hi)__A);

  /* Convert the four words to doublewords.  */
  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign);
  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign);

  /* Convert the doublewords to floating point two at a time.  */
  __r = (__v4sf) __builtin_ia32_setzerops ();
  __r = __builtin_ia32_cvtpi2ps (__r, __hisi);
  __r = __builtin_ia32_movlhps (__r, __r);
  __r = __builtin_ia32_cvtpi2ps (__r, __losi);

  return (__m128) __r;
}

/* Convert the four unsigned 16-bit values in A to SPFP form.  */
static __inline __m128
_mm_cvtpu16_ps (__m64 __A)
{
  __v4hi __zero = (__v4hi) __builtin_ia32_mmx_zero ();
  __v2si __hisi, __losi;
  __v4sf __r;

  /* Convert the four words to doublewords.  */
  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __zero);
  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __zero);

  /* Convert the doublewords to floating point two at a time.  */
  __r = (__v4sf) __builtin_ia32_setzerops ();
  __r = __builtin_ia32_cvtpi2ps (__r, __hisi);
  __r = __builtin_ia32_movlhps (__r, __r);
  __r = __builtin_ia32_cvtpi2ps (__r, __losi);

  return (__m128) __r;
}

/* Convert the low four signed 8-bit values in A to SPFP form.  */
static __inline __m128
_mm_cvtpi8_ps (__m64 __A)
{
  __v8qi __sign;

  /* This comparison against zero gives us a mask that can be used to
     fill in the missing sign bits in the unpack operations below, so
     that we get signed values after unpacking.  */
  __sign = (__v8qi) __builtin_ia32_mmx_zero ();
  __sign = __builtin_ia32_pcmpgtb (__sign, (__v8qi)__A);

  /* Convert the four low bytes to words.  */
  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign);

  return _mm_cvtpi16_ps(__A);
}

/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
static __inline __m128
_mm_cvtpu8_ps(__m64 __A)
{
  __v8qi __zero = (__v8qi) __builtin_ia32_mmx_zero ();
  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __zero);
  return _mm_cvtpu16_ps(__A);
}

/* Convert the four signed 32-bit values in A and B to SPFP form.  */
static __inline __m128
_mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
{
  __v4sf __zero = (__v4sf) __builtin_ia32_setzerops ();
  __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A);
  __v4sf __sfb = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__B);
  return (__m128) __builtin_ia32_movlhps (__sfa, __sfb);
}

/* Convert the four SPFP values in A to four signed 16-bit integers.  */
static __inline __m64
_mm_cvtps_pi16(__m128 __A)
{
  __v4sf __hisf = (__v4sf)__A;
  __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf);
  __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf);
  __v2si __losi = __builtin_ia32_cvtps2pi (__losf);
  return (__m64) __builtin_ia32_packssdw (__hisi, __losi);
}

/* Convert the four SPFP values in A to four signed 8-bit integers.  */
static __inline __m64
_mm_cvtps_pi8(__m128 __A)
{
  __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A);
  __v4hi __zero = (__v4hi) __builtin_ia32_mmx_zero ();
  return (__m64) __builtin_ia32_packsswb (__tmp, __zero);
}

/* Selects four specific SPFP values from A and B based on MASK.  */
#if 0
static __inline __m128
_mm_shuffle_ps (__m128 __A, __m128 __B, int __mask)
{
  return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask);
}
#else
#define _mm_shuffle_ps(A, B, MASK) \
 ((__m128) __builtin_ia32_shufps ((__v4sf)(A), (__v4sf)(B), (MASK)))
#endif

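/* Illustrative usage (a sketch, not part of the original header): the low
   two result elements are picked from A and the high two from B, each by
   a 2-bit index in MASK.  Broadcasting element 2 of a hypothetical
   vector __v:

     __m128 __z = _mm_shuffle_ps (__v, __v, _MM_SHUFFLE (2, 2, 2, 2));
*/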

/* Selects and interleaves the upper two SPFP values from A and B.  */
static __inline __m128
_mm_unpackhi_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B);
}

/* Selects and interleaves the lower two SPFP values from A and B.  */
static __inline __m128
_mm_unpacklo_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B);
}

/* Sets the upper two SPFP values with 64 bits of data loaded from P;
   the lower two values are passed through from A.  */
static __inline __m128
_mm_loadh_pi (__m128 __A, __m64 *__P)
{
  return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (__v2si *)__P);
}

/* Stores the upper two SPFP values of A into P.  */
static __inline void
_mm_storeh_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storehps ((__v2si *)__P, (__v4sf)__A);
}

/* Moves the upper two values of B into the lower two values of A.  */
static __inline __m128
_mm_movehl_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B);
}

/* Moves the lower two values of B into the upper two values of A.  */
static __inline __m128
_mm_movelh_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B);
}

/* Sets the lower two SPFP values with 64 bits of data loaded from P;
   the upper two values are passed through from A.  */
static __inline __m128
_mm_loadl_pi (__m128 __A, __m64 *__P)
{
  return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (__v2si *)__P);
}

/* Stores the lower two SPFP values of A into P.  */
static __inline void
_mm_storel_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storelps ((__v2si *)__P, (__v4sf)__A);
}

/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
static __inline int
_mm_movemask_ps (__m128 __A)
{
  return __builtin_ia32_movmskps ((__v4sf)__A);
}

/* Return the contents of the control register.  */
static __inline unsigned int
_mm_getcsr (void)
{
  return __builtin_ia32_stmxcsr ();
}

/* Read exception bits from the control register.  */
static __inline unsigned int
_MM_GET_EXCEPTION_STATE (void)
{
  return _mm_getcsr() & _MM_EXCEPT_MASK;
}

static __inline unsigned int
_MM_GET_EXCEPTION_MASK (void)
{
  return _mm_getcsr() & _MM_MASK_MASK;
}

static __inline unsigned int
_MM_GET_ROUNDING_MODE (void)
{
  return _mm_getcsr() & _MM_ROUND_MASK;
}

static __inline unsigned int
_MM_GET_FLUSH_ZERO_MODE (void)
{
  return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
}

/* Set the control register to I.  */
static __inline void
_mm_setcsr (unsigned int __I)
{
  __builtin_ia32_ldmxcsr (__I);
}

/* Set exception bits in the control register.  */
static __inline void
_MM_SET_EXCEPTION_STATE(unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask);
}

static __inline void
_MM_SET_EXCEPTION_MASK (unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask);
}

static __inline void
_MM_SET_ROUNDING_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode);
}

static __inline void
_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode);
}

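/* Illustrative sketch (not part of the original header): a typical pattern
   saves the control register, runs a computation with flush-to-zero
   enabled, then restores the caller's state:

     unsigned int __saved = _mm_getcsr ();
     _MM_SET_FLUSH_ZERO_MODE (_MM_FLUSH_ZERO_ON);
     ...
     _mm_setcsr (__saved);
*/
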
/* Create a vector with element 0 as *P and the rest zero.  */
static __inline __m128
_mm_load_ss (float *__P)
{
  return (__m128) __builtin_ia32_loadss (__P);
}

/* Create a vector with all four elements equal to *P.  */
static __inline __m128
_mm_load1_ps (float *__P)
{
  __v4sf __tmp = __builtin_ia32_loadss (__P);
  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,0,0,0));
}

static __inline __m128
_mm_load_ps1 (float *__P)
{
  return _mm_load1_ps (__P);
}

/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
static __inline __m128
_mm_load_ps (float *__P)
{
  return (__m128) __builtin_ia32_loadaps (__P);
}

/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
static __inline __m128
_mm_loadu_ps (float *__P)
{
  return (__m128) __builtin_ia32_loadups (__P);
}

/* Load four SPFP values in reverse order.  The address must be aligned.  */
static __inline __m128
_mm_loadr_ps (float *__P)
{
  __v4sf __tmp = __builtin_ia32_loadaps (__P);
  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3));
}

/* Create a vector with element 0 as F and the rest zero.  */
static __inline __m128
_mm_set_ss (float __F)
{
  return (__m128) __builtin_ia32_loadss (&__F);
}

/* Create a vector with all four elements equal to F.  */
static __inline __m128
_mm_set1_ps (float __F)
{
  __v4sf __tmp = __builtin_ia32_loadss (&__F);
  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,0,0,0));
}

static __inline __m128
_mm_set_ps1 (float __F)
{
  return _mm_set1_ps (__F);
}

/* Create the vector [Z Y X W].  */
static __inline __m128
_mm_set_ps (float __Z, float __Y, float __X, float __W)
{
  union {
    float __a[4];
    __m128 __v;
  } __u;

  __u.__a[0] = __W;
  __u.__a[1] = __X;
  __u.__a[2] = __Y;
  __u.__a[3] = __Z;

  return __u.__v;
}

/* Create the vector [W X Y Z].  */
static __inline __m128
_mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
  return _mm_set_ps (__W, __X, __Y, __Z);
}

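/* Illustrative note: _mm_set_ps takes its arguments highest element first,
   while _mm_setr_ps takes them in memory order, so

     _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f)
     _mm_setr_ps (1.0f, 2.0f, 3.0f, 4.0f)

   both build the vector whose element 0 is 1.0f and element 3 is 4.0f.  */
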
/* Create a vector of zeros.  */
static __inline __m128
_mm_setzero_ps (void)
{
  return (__m128) __builtin_ia32_setzerops ();
}

/* Stores the lower SPFP value.  */
static __inline void
_mm_store_ss (float *__P, __m128 __A)
{
  __builtin_ia32_storess (__P, (__v4sf)__A);
}

/* Store the lower SPFP value across four words.  */
static __inline void
_mm_store1_ps (float *__P, __m128 __A)
{
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0));
  __builtin_ia32_storeaps (__P, __tmp);
}

static __inline void
_mm_store_ps1 (float *__P, __m128 __A)
{
  _mm_store1_ps (__P, __A);
}

/* Store four SPFP values.  The address must be 16-byte aligned.  */
static __inline void
_mm_store_ps (float *__P, __m128 __A)
{
  __builtin_ia32_storeaps (__P, (__v4sf)__A);
}

/* Store four SPFP values.  The address need not be 16-byte aligned.  */
static __inline void
_mm_storeu_ps (float *__P, __m128 __A)
{
  __builtin_ia32_storeups (__P, (__v4sf)__A);
}

/* Store four SPFP values in reverse order.  The address must be aligned.  */
static __inline void
_mm_storer_ps (float *__P, __m128 __A)
{
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3));
  __builtin_ia32_storeaps (__P, __tmp);
}

/* Sets the low SPFP value of A from the low value of B.  */
static __inline __m128
_mm_move_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
}

/* Extracts one of the four words of A.  The selector N must be immediate.  */
#if 0
static __inline int
_mm_extract_pi16 (__m64 __A, int __N)
{
  return __builtin_ia32_pextrw ((__v4hi)__A, __N);
}
#else
#define _mm_extract_pi16(A, N) \
  __builtin_ia32_pextrw ((__v4hi)(A), (N))
#endif

/* Inserts word D into one of four words of A.  The selector N must be
   immediate.  */
#if 0
static __inline __m64
_mm_insert_pi16 (__m64 __A, int __D, int __N)
{
  return (__m64)__builtin_ia32_pinsrw ((__v4hi)__A, __D, __N);
}
#else
#define _mm_insert_pi16(A, D, N) \
  ((__m64) __builtin_ia32_pinsrw ((__v4hi)(A), (D), (N)))
#endif

/* Compute the element-wise maximum of signed 16-bit values.  */
static __inline __m64
_mm_max_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B);
}

/* Compute the element-wise maximum of unsigned 8-bit values.  */
static __inline __m64
_mm_max_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B);
}

/* Compute the element-wise minimum of signed 16-bit values.  */
static __inline __m64
_mm_min_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B);
}

/* Compute the element-wise minimum of unsigned 8-bit values.  */
static __inline __m64
_mm_min_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B);
}

/* Create an 8-bit mask of the signs of 8-bit values.  */
static __inline int
_mm_movemask_pi8 (__m64 __A)
{
  return __builtin_ia32_pmovmskb ((__v8qi)__A);
}

/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
   in B and produce the high 16 bits of the 32-bit results.  */
static __inline __m64
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B);
}

/* Return a combination of the four 16-bit values in A.  The selector
   must be an immediate.  */
#if 0
static __inline __m64
_mm_shuffle_pi16 (__m64 __A, int __N)
{
  return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N);
}
#else
#define _mm_shuffle_pi16(A, N) \
  ((__m64) __builtin_ia32_pshufw ((__v4hi)(A), (N)))
#endif

/* Conditionally store byte elements of A into P.  The high bit of each
   byte in the selector N determines whether the corresponding byte from
   A is stored.  */
static __inline void
_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
{
  __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
}

/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
static __inline __m64
_mm_avg_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B);
}

/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
static __inline __m64
_mm_avg_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B);
}

/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B.  Return the value in the lower 16-bit word; the
   upper words are cleared.  */
static __inline __m64
_mm_sad_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B);
}

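/* Illustrative example (hypothetical values): for A = [1 2 3 4 5 6 7 8]
   and B = [8 7 6 5 4 3 2 1], _mm_sad_pu8 sums |1-8| + |2-7| + ... + |8-1|
   and returns 32 in the low 16-bit word.  */
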
/* Loads one cache line from address P to a location "closer" to the
   processor.  The selector I specifies the type of prefetch operation.  */
#if 0
static __inline void
_mm_prefetch (void *__P, enum _mm_hint __I)
{
  __builtin_prefetch (__P, 0, __I);
}
#else
#define _mm_prefetch(P, I) \
  __builtin_prefetch ((P), 0, (I))
#endif

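/* Illustrative sketch (not part of the original header): prefetching a
   fixed distance ahead while streaming through a hypothetical array __p
   of __n elements:

     for (__i = 0; __i < __n; __i++)
       {
         _mm_prefetch ((char *)&__p[__i + 16], _MM_HINT_NTA);
         ... process __p[__i] ...
       }

   The prefetch distance (16 elements here) must be tuned to the
   workload.  */
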
/* Stores the data in A to the address P without polluting the caches.  */
static __inline void
_mm_stream_pi (__m64 *__P, __m64 __A)
{
  __builtin_ia32_movntq (__P, (long long)__A);
}

/* Likewise.  The address must be 16-byte aligned.  */
static __inline void
_mm_stream_ps (float *__P, __m128 __A)
{
  __builtin_ia32_movntps (__P, (__v4sf)__A);
}

/* Guarantees that every preceding store is globally visible before
   any subsequent store.  */
static __inline void
_mm_sfence (void)
{
  __builtin_ia32_sfence ();
}

/* The execution of the next instruction is delayed by an
   implementation-specific amount of time.  The instruction does not
   modify the architectural state.  */
static __inline void
_mm_pause (void)
{
  __asm__ __volatile__ ("rep; nop" : : );
}

/* Transpose the 4x4 matrix composed of row[0-3].  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)			\
do {									\
  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);	\
  __v4sf __t0 = __builtin_ia32_shufps (__r0, __r1, 0x44);		\
  __v4sf __t2 = __builtin_ia32_shufps (__r0, __r1, 0xEE);		\
  __v4sf __t1 = __builtin_ia32_shufps (__r2, __r3, 0x44);		\
  __v4sf __t3 = __builtin_ia32_shufps (__r2, __r3, 0xEE);		\
  (row0) = __builtin_ia32_shufps (__t0, __t1, 0x88);			\
  (row1) = __builtin_ia32_shufps (__t0, __t1, 0xDD);			\
  (row2) = __builtin_ia32_shufps (__t2, __t3, 0x88);			\
  (row3) = __builtin_ia32_shufps (__t2, __t3, 0xDD);			\
} while (0)

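/* Illustrative usage (a sketch, not part of the original header): the four
   rows are updated in place, so after

     __m128 __r0 = _mm_load_ps (&__m[0]);
     __m128 __r1 = _mm_load_ps (&__m[4]);
     __m128 __r2 = _mm_load_ps (&__m[8]);
     __m128 __r3 = _mm_load_ps (&__m[12]);
     _MM_TRANSPOSE4_PS (__r0, __r1, __r2, __r3);

   __r0 holds the first column of the hypothetical 4x4 matrix __m, __r1
   the second, and so on.  */
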
#endif /* _XMMINTRIN_H_INCLUDED */