1/* { dg-do run } */
2/* { dg-require-effective-target sse4 } */
3/* { dg-options "-O2 -msse4.1" } */
4/* { dg-skip-if "no M_PI" { vxworks_kernel } } */
5
6#include "sse4_1-check.h"
7
8#include <smmintrin.h>
9#include <math.h>
10#include <string.h>
11
12#define NUM 64
13
14static void
15init_round (float *src)
16{
17  int i, sign = 1;
18  float f = rand ();
19
20  for (i = 0; i < NUM; i++)
21    {
22      src[i] = (i + 1)* f * M_PI * sign;
23      if (i < (NUM / 2))
24	{
25          if ((i % 6) == 0)
26	    f = f * src[i];
27        }
28      else if (i == (NUM / 2))
29	f = rand ();
30      else if ((i % 6) == 0)
31	f = 1 / (f * (i + 1) * src[i] * M_PI *sign);
32      sign = -sign;
33    }
34}
35
/* Round F to an integral value with the x87 FPU and return the result.
   The callers use this as the reference value for _mm_round_ss.

   TYPE selects the rounding mode:
     - if bit 2 (0x04) is set, the x87 control word is left unchanged,
       so the rounding mode currently in effect is used;
     - otherwise bits 0-1 of TYPE are copied into the rounding-control
       field of the control word (bits 10-11) and the six exception
       mask bits (0x003F) are set while the value is rounded.

   The control word that was active on entry is restored before
   returning.  */
static float
do_round (float f, int type)
{
  /* Unsigned, because the control word and its masks are 16-bit
     patterns; 0xFFFF is not representable in a signed short.  */
  unsigned short saved_cw, new_cw, clr_mask;
  float ret;

  if ((type & 4))
    {
      /* Keep every bit of the current control word.  */
      type = 0;
      clr_mask = 0xFFFF;
    }
  else
    {
      /* Clear the rounding-control and exception-mask fields, then
	 install the requested mode with all exceptions masked.  */
      type = 0x003F | ((type & 3) << 10);
      clr_mask = (unsigned short) ~0x0C3F;
    }

  __asm__ __volatile__ ("fstcw %0" : "=m" (saved_cw));
  new_cw = (unsigned short) ((saved_cw & clr_mask) | type);

  /* Load, switch mode, round, store and restore in ONE asm statement.
     These instructions communicate only through x87 state (the register
     stack and the control word) that the compiler cannot see, so they
     must not be split into separate statements that it is free to
     reorder or interleave with its own floating-point code.  The value
     pushed by flds is popped again by fstps, leaving the register stack
     as it was found.  */
  __asm__ __volatile__ ("flds %1\n\t"
			"fldcw %2\n\t"
			"frndint\n\t"
			"fstps %0\n\t"
			"fldcw %3"
			: "=m" (ret)
			: "m" (f), "m" (new_cw), "m" (saved_cw));
  return ret;
}
65
66static void
67sse4_1_test (void)
68{
69  int i, j;
70  float f;
71  union
72    {
73      __m128 x[NUM / 4];
74      float f[NUM];
75    } dst, src;
76
77  init_round (src.f);
78  memset (&dst, 0, NUM * sizeof(float));
79
80  for (i = 0; i < NUM / 4 ; i++)
81    dst.x[i] =  _mm_round_ss (dst.x[i], src.x[i], _MM_FROUND_RINT);
82
83  for (i = 0; i < NUM; i += 4)
84    {
85      for (j = 0; j < 3; j++)
86	if (dst.f[i + j + 1] != 0.0)
87	  abort ();
88
89      f = do_round (src.f[i], 0x04);
90      if (f != dst.f[i])
91	abort ();
92    }
93
94  for (i = 0; i < NUM / 4 ; i++)
95    dst.x[i] =  _mm_round_ss (dst.x[i], src.x[i], _MM_FROUND_NEARBYINT);
96
97  for (i = 0; i < NUM; i += 4)
98    {
99      for (j = 0; j < 3; j++)
100	if (dst.f[i + j + 1] != 0.0)
101	  abort ();
102
103      f = do_round (src.f[i], 0x0c);
104      if (f != dst.f[i])
105	abort ();
106    }
107}
108