/* Test that the compiler properly vectorizes floating point multiply and add
   operations into packed instructions such as vfmaddps on FMA4 systems.  */
3
4/* { dg-do compile { target { ! { ia32 } } } } */
5/* { dg-options "-O2 -mfma4 -ftree-vectorize -mtune=generic" } */
6
/* Prototype for exit, declared directly here rather than via a header.  */
extern void exit (int);

/* 32-byte GNU vector types: one of float elements, one of double
   elements.  Both may alias any other type.  */
typedef float __m256
  __attribute__ ((__vector_size__ (32), __may_alias__));
typedef double __m256d
  __attribute__ ((__vector_size__ (32), __may_alias__));

/* Number of elements in each of the arrays processed by the test loops.  */
#define SIZE 10240

/* Storage for the loop operands.  Each object overlays a float array and
   a double array of SIZE elements; the vector-typed members are never
   accessed by the loops and are presumably present only to give the
   arrays vector alignment -- NOTE(review): confirm.
   The loops compute into `a' from `b', `c' and `d'.  */
union
{
  __m256 f_align;
  __m256d d_align;
  float f[SIZE];
  double d[SIZE];
} a, b, c, d;
20
21void
22flt_mul_add (void)
23{
24  int i;
25
26  for (i = 0; i < SIZE; i++)
27    a.f[i] = (b.f[i] * c.f[i]) + d.f[i];
28}
29
30void
31dbl_mul_add (void)
32{
33  int i;
34
35  for (i = 0; i < SIZE; i++)
36    a.d[i] = (b.d[i] * c.d[i]) + d.d[i];
37}
38
39void
40flt_mul_sub (void)
41{
42  int i;
43
44  for (i = 0; i < SIZE; i++)
45    a.f[i] = (b.f[i] * c.f[i]) - d.f[i];
46}
47
48void
49dbl_mul_sub (void)
50{
51  int i;
52
53  for (i = 0; i < SIZE; i++)
54    a.d[i] = (b.d[i] * c.d[i]) - d.d[i];
55}
56
57void
58flt_neg_mul_add (void)
59{
60  int i;
61
62  for (i = 0; i < SIZE; i++)
63    a.f[i] = (-(b.f[i] * c.f[i])) + d.f[i];
64}
65
66void
67dbl_neg_mul_add (void)
68{
69  int i;
70
71  for (i = 0; i < SIZE; i++)
72    a.d[i] = (-(b.d[i] * c.d[i])) + d.d[i];
73}
74
/* Call every kernel once so that each of them is referenced, then
   terminate through exit with status 0.  The single precision kernels
   run first, followed by the double precision ones.  */
int
main (void)
{
  /* Single precision kernels.  */
  flt_mul_add ();
  flt_mul_sub ();
  flt_neg_mul_add ();

  /* Double precision kernels.  */
  dbl_mul_add ();
  dbl_mul_sub ();
  dbl_neg_mul_add ();

  exit (0);
}
86
87/* { dg-final { scan-assembler "vfmaddps" } } */
88/* { dg-final { scan-assembler "vfmaddpd" } } */
89/* { dg-final { scan-assembler "vfmsubps" } } */
90/* { dg-final { scan-assembler "vfmsubpd" } } */
91/* { dg-final { scan-assembler "vfnmaddps" } } */
92/* { dg-final { scan-assembler "vfnmaddpd" } } */
93