/* { dg-do compile } */
/* { dg-options "-O2 -msse2" } */

/* 16-byte GCC vector types used by the locally defined SSE helpers in
   this file.  __v4sf and __m128 are both declared as vectors of float;
   __v2di is a vector of long long, used as the operand type of the
   integer and/or built-ins.  */
typedef float __v4sf __attribute__ ((__vector_size__ (16)));
typedef float __m128 __attribute__ ((__vector_size__ (16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));

8static __inline __m128
9_mm_cmpeq_ps (__m128 __A, __m128 __B)
10{
11  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
12}
13
14static __inline __m128
15_mm_setr_ps (float __Z, float __Y, float __X, float __W)
16{
17  return __extension__ (__m128)(__v4sf){__Z, __Y, __X, __W };
18}
19
20static __inline __m128
21_mm_and_si128 (__m128 __A, __m128 __B)
22{
23  return (__m128)__builtin_ia32_pand128 ((__v2di)__A, (__v2di)__B);
24}
25
26static __inline __m128
27_mm_or_si128 (__m128 __A, __m128 __B)
28{
29  return (__m128)__builtin_ia32_por128 ((__v2di)__A, (__v2di)__B);
30}
31
32typedef union
33{
34  __m128 xmmi;
35  int si[4];
36}
37__attribute__ ((aligned (16))) um128;
38
39um128 u;
40
41static inline int
42sse_max_abs_indexf (float *v, int step, int n)
43{
44  __m128 m1, mm;
45  __m128 mim, mi, msk;
46  um128 u, ui;
47  int n4, step2, step3;
48  mm = __builtin_ia32_andps ((__m128) (__v4sf)
49			     { 0.0, v[step], v[step2], v[step3] }
50			     , u.xmmi);
51  if (n4)
52    {
53      int i;
54      for (i = 0; i < n4; ++i);
55      msk = (__m128) _mm_cmpeq_ps (m1, mm);
56      mim = _mm_or_si128 (_mm_and_si128 (msk, mi), mim);
57    }
58  ui.xmmi = (__m128) mim;
59  return ui.si[n];
60}
61
/* Despite the name, this does not swap anything: it stores the single
   value *r2 into each of the first (n / 4) * 4 elements of r1.  r2 is
   never advanced or written, and the trailing n % 4 elements of r1 are
   left untouched.
   NOTE(review): presumably the remains of a real row swap after
   testcase reduction; behavior is deliberately left unchanged.  */
static void
sse_swap_rowf (float *r1, float *r2, int n)
{
  /* Round n down to a multiple of four.  */
  int n4 = (n / 4) * 4;
  float *r14end = r1 + n4;
  while (r1 < r14end)
    {
      *r1 = *r2;
      r1++;
    }
}

/* External helpers called from ludcompf.  They are only declared here;
   no definition appears in this part of the file.  */
void swap_index (int *, int, int);
void sse_add_rowf (float *, float *, float, int);

/* Driver loop over the rows of M (presumably what is left of an LU
   decomposition with row pivoting, going by the names -- confirm).

   M     float matrix storage; row i starts at m + i * nw.
   NW    distance in floats between consecutive rows.
   PROW  int array handed to swap_index whenever rows are exchanged.
   N     loop bound: i runs over 0 .. n - 2, j over i + 1 .. n - 1.

   For each i, sse_max_abs_indexf (pm + i, nw, n - i) yields an offset
   vi.  When vi is nonzero, sse_swap_rowf and swap_index are called for
   rows i and i + vi.  Then sse_add_rowf is called once per following
   row with a factor of -1.0 on the n - i - 1 elements starting at
   column i + 1.

   The local `s' is initialized but never used; it is kept as is since
   this file is a compiler testcase.  */
void
ludcompf (float *m, int nw, int *prow, int n)
{
  int i, s = 0;
  float *pm;
  /* pm tracks the start of row i.  */
  for (i = 0, pm = m; i < n - 1; ++i, pm += nw)
    {
      int vi = sse_max_abs_indexf (pm + i, nw, n - i);
      float *pt;
      int j;
      if (vi != 0)
	{
	  sse_swap_rowf (pm, pm + vi * nw, nw);
	  swap_index (prow, i, i + vi);
	}
      /* pt tracks the start of row j, beginning one row below pm.  */
      for (j = i + 1, pt = pm + nw; j < n; ++j, pt += nw)
	sse_add_rowf (pt + i + 1, pm + i + 1, -1.0, n - i - 1);
    }
}
96