/* { dg-do compile } */
/* { dg-require-effective-target ia32 } */
/* { dg-options "-O3 -msse2 -fdump-rtl-csa" } */
/* { dg-skip-if "no stdint" { vxworks_kernel } } */

#include <emmintrin.h>
#include <stdint.h>

/* size_t is taken from the compiler's built-in __SIZE_TYPE__ macro rather
   than from a library header.  */
typedef __SIZE_TYPE__ size_t;
/* 16-byte GNU vector types: vFloat has float elements, vDouble has double
   elements.  */
typedef float vFloat __attribute__ ((__vector_size__ (16)));
typedef double vDouble __attribute__ ((__vector_size__ (16)));
/* Descriptor for a block of data: a base pointer plus three size fields.
   In the code visible in this file, bar () reads only `data` (cast to
   float *) and `w` (its loop bound); `h` and `bytes` are never read.  */
typedef struct buf
{
  void *data;
  unsigned long h;
  unsigned long  w;
  size_t bytes;
} buf;

/* Generic job header: a link pointer, an opaque info pointer, a callback
   and an error slot.  It is embedded as the first member of fj.
   NOTE(review): `next` and the `func` parameter use the tag `struct Job`
   (capital J), which is a different tag from this `struct job` and is never
   completed in the visible code; only pointers to it are formed, so this
   compiles.  Left as written -- confirm before "correcting" the spelling.  */
typedef struct job
{
  struct Job *next;
  void * info;
  long (*func)(struct Job *job);
  long error;
} job;

28typedef struct fj
29{
30    job hd;
31    buf src;
32    buf dest;
33    float g;
34    unsigned int flags;
35} fj;
36
/* Lookup tables read by bar (): `r` entries are subtracted from the working
   values and `t` entries multiply the scale factors.  Neither has an
   initialiser, so as objects with static storage both are all zeros.  */
static const double r[256], t[256];

39long bar (const buf *src, const buf *dest, float g, unsigned int flags)
40{
41  float *d0 = (float*) src->data;
42  float *d1 = (float*) dest->data;
43  uintptr_t w = dest->w;
44  uintptr_t idx;
45  vFloat p0;
46  static const vFloat m0;
47  static const vDouble p[3], m, b;
48  float *sr = d0;
49  float *dr = d1;
50  for( idx = 0; idx + 8 <= w; idx += 8 )
51  {
52    vFloat f0 = _mm_loadu_ps (sr);
53    vFloat f1 = _mm_loadu_ps (sr + 4);
54    sr += 8;
55    vFloat fa0 = _mm_andnot_ps (m0, f0);
56    vFloat fa1 = _mm_andnot_ps (m0, f1);
57    vDouble v0 = _mm_cvtps_pd (fa0);
58    vDouble v1 = _mm_cvtps_pd (_mm_movehl_ps (fa0, fa0));
59    vDouble v2 = _mm_cvtps_pd (fa1);
60    vDouble v3 = _mm_cvtps_pd (_mm_movehl_ps (fa1, fa1));
61    vDouble  vi0, vi1, vi2, vi3;
62    __m128i b0, b1, b2, b3;
63    b0 = _mm_packs_epi32 (_mm_packs_epi32 (b0, b1), _mm_packs_epi32 (b2, b3));
64    b1 = _mm_srli_epi64 (b0, 32);
65    unsigned int i0 = _mm_cvtsi128_si32 (b0);
66    unsigned int i2 = _mm_cvtsi128_si32 (b1);
67    v0 -= _mm_loadh_pd (_mm_load_sd (r + (i0 & 0xff)), r + (i0 >> 16));
68    v1 -= _mm_loadh_pd (_mm_load_sd (r + (i2 & 0xff)), r + (i2 >> 16));
69    b0 = _mm_unpackhi_epi64 (b0, b0);
70    b1 = _mm_unpackhi_epi64 (b1, b1);
71    unsigned int i4 = _mm_cvtsi128_si32 (b0);
72    unsigned int i6 = _mm_cvtsi128_si32 (b1);
73    v2 -= _mm_loadh_pd (_mm_load_sd (r + (i4 & 0xff)), r + (i4 >> 16));
74    v3 -= _mm_loadh_pd (_mm_load_sd (r + (i6 & 0xff)), r + (i6 >> 16));
75    v0 = p[0] + (p[1] + p[2] * v0) * v0;
76    v1 = p[0] + (p[1] + p[2] * v1) * v1;
77    v2 = p[0] + (p[1] + p[2] * v2) * v2;
78    v3 = p[0] + (p[1] + p[2] * v3) * v3;
79    vi0 = (vDouble) _mm_slli_epi64 ((__m128i)((vi0 + b) + m), 52);
80    vi1 = (vDouble) _mm_slli_epi64 ((__m128i)((vi1 + b) + m), 52);
81    vi2 = (vDouble) _mm_slli_epi64 ((__m128i)((vi2 + b) + m), 52);
82    vi3 = (vDouble) _mm_slli_epi64 ((__m128i)((vi3 + b) + m), 52);
83    vi0 *= _mm_loadh_pd (_mm_load_sd (t + (i0 & 0xff)), t + (i0 >> 16));
84    vi1 *= _mm_loadh_pd (_mm_load_sd (t + (i2 & 0xff)), t + (i2 >> 16));
85    vi2 *= _mm_loadh_pd (_mm_load_sd (t + (i4 & 0xff)), t + (i4 >> 16));
86    vi3 *= _mm_loadh_pd (_mm_load_sd (t + (i6 & 0xff)), t + (i6 >> 16));
87    v0 *= vi0;
88    v1 *= vi1;
89    v2 *= vi2;
90    v3 *= vi3;
91    vFloat r0 = _mm_movelh_ps (_mm_cvtpd_ps( v0 ), _mm_cvtpd_ps (v1));
92    vFloat r1 = _mm_movelh_ps (_mm_cvtpd_ps( v2 ), _mm_cvtpd_ps (v3));
93    vFloat z0 = _mm_cmpeq_ps (f0, _mm_setzero_ps());
94    vFloat z1 = _mm_cmpeq_ps (f1, _mm_setzero_ps());
95    r0 = _mm_andnot_ps (z0, r0);
96    r1 = _mm_andnot_ps (z1, r1);
97    z0 = _mm_and_ps (z0, p0);
98    z1 = _mm_and_ps (z1, p0);
99    r0 = _mm_or_ps (r0, z0);
100    r1 = _mm_or_ps (r1, z1);
101    _mm_storeu_ps (dr, r0);
102    _mm_storeu_ps (dr + 4, r1);
103    dr += 8;
104  }
105  return 0;
106}
107
108long foo (job *j )
109{
110  fj *jd = (fj*) j;
111  return bar (&jd->src, &jd->dest, jd->g, jd->flags);
112}
113
/* { dg-final { scan-rtl-dump-not "deleted 1 dead insns" "csa" } } */
/* { dg-final { cleanup-rtl-dump "csa" } } */
