1/* { dg-do run } */
2/* { dg-require-effective-target sse4 } */
3/* { dg-options "-O2 -msse4.1" } */
4
5#ifndef CHECK_H
6#define CHECK_H "sse4_1-check.h"
7#endif
8
9#ifndef TEST
10#define TEST sse4_1_test
11#endif
12
13#include CHECK_H
14
15#include <smmintrin.h>
16
17#define lmskN  0x00
18#define lmsk0  0x01
19#define lmsk1  0x02
20#define lmsk2  0x04
21#define lmsk3  0x08
22#define lmsk01 0x03
23#define lmsk02 0x05
24#define lmsk03 0x09
25#define lmsk12 0x06
26#define lmsk13 0x0A
27#define lmsk23 0x0C
28#define lmskA  0x0F
29
30#define hmskN  0x00
31#define hmskA  0xF0
32#define hmsk0  0x10
33#define hmsk1  0x20
34#define hmsk2  0x40
35#define hmsk3  0x80
36#define hmsk01 0x30
37#define hmsk02 0x50
38#define hmsk03 0x90
39#define hmsk12 0x60
40#define hmsk13 0xA0
41#define hmsk23 0xC0
42
43#ifndef HIMASK
44#define HIMASK hmskA
45#endif
46
47static void
48TEST (void)
49{
50  union
51    {
52      __m128 x;
53      float f[4];
54    } val1, val2, res[16];
55  int masks[16];
56  int i, j;
57
58  val1.f[0] = 2.;
59  val1.f[1] = 3.;
60  val1.f[2] = 4.;
61  val1.f[3] = 5.;
62
63  val2.f[0] = 10.;
64  val2.f[1] = 100.;
65  val2.f[2] = 1000.;
66  val2.f[3] = 10000.;
67
68  res[0].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk0);
69  res[1].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk1);
70  res[2].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk2);
71  res[3].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk3);
72  res[4].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk01);
73  res[5].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk02);
74  res[6].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk03);
75  res[7].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk12);
76  res[8].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk13);
77  res[9].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk23);
78  res[10].x = _mm_dp_ps (val1.x, val2.x, HIMASK | (0x0F & ~lmsk0));
79  res[11].x = _mm_dp_ps (val1.x, val2.x, HIMASK | (0x0F & ~lmsk1));
80  res[12].x = _mm_dp_ps (val1.x, val2.x, HIMASK | (0x0F & ~lmsk2));
81  res[13].x = _mm_dp_ps (val1.x, val2.x, HIMASK | (0x0F & ~lmsk3));
82  res[14].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmskN);
83  res[15].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmskA);
84
85  masks[0] = HIMASK | lmsk0;
86  masks[1] = HIMASK | lmsk1;
87  masks[2] = HIMASK | lmsk2;
88  masks[3] = HIMASK | lmsk3;
89  masks[4] = HIMASK | lmsk01;
90  masks[5] = HIMASK | lmsk02;
91  masks[6] = HIMASK | lmsk03;
92  masks[7] = HIMASK | lmsk12;
93  masks[8] = HIMASK | lmsk13;
94  masks[9] = HIMASK | lmsk23;
95  masks[10] = HIMASK | (0x0F & ~lmsk0);
96  masks[11] = HIMASK | (0x0F & ~lmsk1);
97  masks[12] = HIMASK | (0x0F & ~lmsk2);
98  masks[13] = HIMASK | (0x0F & ~lmsk3);
99  masks[14] = HIMASK | lmskN;
100  masks[15] = HIMASK | lmskA;
101
102  for (i = 0; i <= 15; i++)
103    {
104      float tmp = 0.;
105
106      for (j = 0; j < 4; j++)
107	if ((HIMASK & (0x10 << j)))
108	  tmp += val1.f[j] * val2.f[j];
109
110      for (j = 0; j < 4; j++)
111	if ((masks[i] & (1 << j)) && res[i].f[j] != tmp)
112	  abort ();
113   }
114}
115