1/* { dg-require-effective-target fma4 } */
2/* { dg-options "-O2 -mfma4" } */
3
4#include "fma4-check.h"
5
6#include <x86intrin.h>
7#include <string.h>
8
9#define NUM 20
10
/* Shared test buffers.  Each object overlays the same storage as an
   array of NUM __m256 vectors (x), NUM * 8 floats (f), NUM __m256d
   vectors (y) and NUM * 4 doubles (d), so data written through a vector
   member can be read back element by element through the scalar member.
   In this file dst receives the intrinsic results, res receives the
   scalar reference values, and src1/src2/src3 are the three operands.  */
union
{
  __m256 x[NUM];
  float f[NUM * 8];
  __m256d y[NUM];
  double d[NUM * 4];
} dst, res, src1, src2, src3;
18
/* Note that in macc*, msub*, nmacc* and nmsub* instructions, the intermediate
   product is not rounded, only the addition is rounded. */
21
22static void
23init_nmsubps ()
24{
25  int i;
26  for (i = 0; i < NUM * 8; i++)
27    {
28      src1.f[i] = i;
29      src2.f[i] = i + 10;
30      src3.f[i] = i + 20;
31    }
32}
33
34static void
35init_nmsubpd ()
36{
37  int i;
38  for (i = 0; i < NUM * 4; i++)
39    {
40      src1.d[i] = i;
41      src2.d[i] = i + 10;
42      src3.d[i] = i + 20;
43    }
44}
45
46static int
47check_nmsubps ()
48{
49  int i, j, check_fails = 0;
50  for (i = 0; i < NUM * 8; i = i + 8)
51    for (j = 0; j < 8; j++)
52      {
53	res.f[i + j] = - (src1.f[i + j] * src2.f[i + j]) - src3.f[i + j];
54	if (dst.f[i + j] != res.f[i + j])
55	  check_fails++;
56      }
57  return check_fails++;
58}
59
60static int
61check_nmsubpd ()
62{
63  int i, j, check_fails = 0;
64  for (i = 0; i < NUM * 4; i = i + 4)
65    for (j = 0; j < 4; j++)
66      {
67	res.d[i + j] = - (src1.d[i + j] * src2.d[i + j]) - src3.d[i + j];
68	if (dst.d[i + j] != res.d[i + j])
69	  check_fails++;
70      }
71  return check_fails++;
72}
73
74static void
75fma4_test (void)
76{
77  int i;
78
79  init_nmsubps ();
80
81  for (i = 0; i < NUM; i++)
82    dst.x[i] = _mm256_nmsub_ps (src1.x[i], src2.x[i], src3.x[i]);
83
84  if (check_nmsubps (&dst.x[i], &src1.f[i * 4], &src2.f[i * 4], &src3.f[i * 4]))
85    abort ();
86
87  init_nmsubpd ();
88
89  for (i = 0; i < NUM; i++)
90    dst.y[i] = _mm256_nmsub_pd (src1.y[i], src2.y[i], src3.y[i]);
91
92  if (check_nmsubpd (&dst.y[i], &src1.d[i * 2], &src2.d[i * 2], &src3.d[i * 2]))
93    abort ();
94
95}
96