1166144Smarius/* { dg-do run } */
2166144Smarius/* { dg-require-effective-target fma4 } */
3166144Smarius/* { dg-options "-O0 -mfma4" } */
4166144Smarius
5166144Smarius#include "fma4-check.h"
6166144Smarius
7166144Smarius#include <x86intrin.h>
8166144Smarius#include <string.h>
9166144Smarius
10166144Smarius#define NUM 20
11166144Smarius
12166144Smariusunion
13166144Smarius{
14166144Smarius  __m128 x[NUM];
15166144Smarius  float f[NUM * 4];
16166144Smarius  __m128d y[NUM];
17166144Smarius  double d[NUM * 2];
18166144Smarius} dst, res, src1, src2, src3;
19166144Smarius
/* Note that in the macc*, msub*, nmacc* and nmsub* instructions, the
   intermediate product is not rounded; only the addition is rounded.  */
22166144Smarius
23166144Smariusstatic void
24166144Smariusinit_nmaccps ()
25166144Smarius{
26166144Smarius  int i;
27166144Smarius  for (i = 0; i < NUM * 4; i++)
28166144Smarius    {
29166144Smarius      src1.f[i] = i;
30166144Smarius      src2.f[i] = i + 10;
31166144Smarius      src3.f[i] = i + 20;
32166144Smarius    }
33166144Smarius}
34166144Smarius
35166144Smariusstatic void
36166144Smariusinit_nmaccpd ()
37166144Smarius{
38166144Smarius  int i;
39166144Smarius  for (i = 0; i < NUM * 4; i++)
40166144Smarius    {
41166144Smarius      src1.d[i] = i;
42166144Smarius      src2.d[i] = i + 10;
43166144Smarius      src3.d[i] = i + 20;
44166144Smarius    }
45166144Smarius}
46166144Smarius
47166144Smariusstatic int
48166144Smariuscheck_nmaccps ()
49166144Smarius{
50166144Smarius  int i, j, check_fails = 0;
51166144Smarius  for (i = 0; i < NUM * 4; i = i + 4)
52166144Smarius    for (j = 0; j < 4; j++)
53166144Smarius      {
54166144Smarius	res.f[i + j] = - (src1.f[i + j] * src2.f[i + j]) + src3.f[i + j];
55166144Smarius	if (dst.f[i + j] != res.f[i + j])
56166144Smarius	  check_fails++;
57166144Smarius      }
58166144Smarius  return check_fails++;
59166144Smarius}
60166144Smarius
61166144Smariusstatic int
62166144Smariuscheck_nmaccpd ()
63166144Smarius{
64166144Smarius  int i, j, check_fails = 0;
65166144Smarius  for (i = 0; i < NUM * 2; i = i + 2)
66166144Smarius    for (j = 0; j < 2; j++)
67166144Smarius      {
68166144Smarius	res.d[i + j] = - (src1.d[i + j] * src2.d[i + j]) + src3.d[i + j];
69166144Smarius	if (dst.d[i + j] != res.d[i + j])
70166144Smarius	  check_fails++;
71166144Smarius      }
72166144Smarius  return check_fails++;
73166144Smarius}
74166144Smarius
75166144Smarius
76166144Smariusstatic int
77166144Smariuscheck_nmaccss ()
78166144Smarius{
79166144Smarius  int i, j, check_fails = 0;
80166144Smarius  for (i = 0; i < NUM * 4; i = i + 4)
81166144Smarius    {
82166144Smarius      res.f[i] = - (src1.f[i] * src2.f[i]) + src3.f[i];
83166144Smarius      if (dst.f[i] != res.f[i])
84225931Smarius	check_fails++;
85166144Smarius    }
86166144Smarius  return check_fails++;
87166144Smarius}
88166144Smarius
89166144Smariusstatic int
90166144Smariuscheck_nmaccsd ()
91166144Smarius{
92166144Smarius  int i, j, check_fails = 0;
93166144Smarius  for (i = 0; i < NUM * 2; i = i + 2)
94166144Smarius    {
95166144Smarius      res.d[i] = - (src1.d[i] * src2.d[i]) + src3.d[i];
96166144Smarius      if (dst.d[i] != res.d[i])
97166144Smarius	check_fails++;
98166144Smarius    }
99166144Smarius  return check_fails++;
100166144Smarius}
101166144Smarius
102166144Smariusstatic void
103166144Smariusfma4_test (void)
104183337Smarius{
105183337Smarius  int i;
106166144Smarius
107166144Smarius  init_nmaccps ();
108166144Smarius
109166144Smarius  for (i = 0; i < NUM; i++)
110166144Smarius    dst.x[i] = _mm_nmacc_ps (src1.x[i], src2.x[i], src3.x[i]);
111166144Smarius
112166144Smarius  if (check_nmaccps ())
113166144Smarius    abort ();
114166144Smarius
115166144Smarius
116166144Smarius  for (i = 0; i < NUM; i++)
117166144Smarius    dst.x[i] = _mm_nmacc_ss (src1.x[i], src2.x[i], src3.x[i]);
118166144Smarius
119166144Smarius  if (check_nmaccss ())
120166144Smarius    abort ();
121166144Smarius
122166144Smarius  init_nmaccpd ();
123166144Smarius
124166144Smarius  for (i = 0; i < NUM; i++)
125166144Smarius    dst.y[i] = _mm_nmacc_pd (src1.y[i], src2.y[i], src3.y[i]);
126166144Smarius
127166144Smarius  if (check_nmaccpd ())
128166144Smarius    abort ();
129166144Smarius
130166144Smarius
131166144Smarius  for (i = 0; i < NUM; i++)
132166144Smarius    dst.y[i] = _mm_nmacc_sd (src1.y[i], src2.y[i], src3.y[i]);
133166144Smarius
134166144Smarius  if (check_nmaccsd ())
135166144Smarius    abort ();
136166144Smarius
137166144Smarius}
138166144Smarius