/* { dg-do compile } */
/* { dg-options "-O2 -msse2" } */
/* { dg-require-effective-target sse2 } */

/* Compile-only test built from local re-definitions of a few SSE/SSE2
   wrappers and three small routines that use them.  The directives above
   request compilation only, at -O2 with SSE2 enabled.

   NOTE(review): several locals below are read without ever being assigned
   and one loop has an empty body.  This looks like a reduced compiler test
   case, so those constructs are presumably intentional -- confirm before
   "fixing" any of them.  */

typedef float __v4sf __attribute__ ((__vector_size__ (16)));
typedef float __m128 __attribute__ ((__vector_size__ (16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));

/* These two functions are called by ludcompf but are not defined in this
   file.  They are declared explicitly because implicit function
   declarations are not valid since C99.  The prototypes mirror what an
   implicit declaration would have produced for the calls below: an int
   return type, and arguments as passed after the default promotions (the
   -1.0 literal is a double).  */
extern int swap_index (int *, int, int);
extern int sse_add_rowf (float *, float *, double, int);

/* Thin wrapper around __builtin_ia32_cmpeqps applied to __A and __B.  */
static __inline __m128
_mm_cmpeq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
}

/* Build a vector whose elements are __Z, __Y, __X, __W in that order.  */
static __inline __m128
_mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
  return __extension__ (__m128)(__v4sf){__Z, __Y, __X, __W };
}

/* Thin wrapper around __builtin_ia32_pand128.  Unlike the usual intrinsic
   of this name, this local version takes and returns __m128; the operands
   are cast to __v2di for the builtin and the result is cast back.  */
static __inline __m128
_mm_and_si128 (__m128 __A, __m128 __B)
{
  return (__m128)__builtin_ia32_pand128 ((__v2di)__A, (__v2di)__B);
}

/* Thin wrapper around __builtin_ia32_por128, with the same __m128 in/out
   convention as _mm_and_si128 above.  */
static __inline __m128
_mm_or_si128 (__m128 __A, __m128 __B)
{
  return (__m128)__builtin_ia32_por128 ((__v2di)__A, (__v2di)__B);
}

/* 16-byte aligned union giving access to a vector either as a whole
   (xmmi) or as four ints (si).  */
typedef union
{
  __m128 xmmi;
  int si[4];
}
__attribute__ ((aligned (16))) um128;

/* File-scope object.  It is shadowed by the local `u' inside
   sse_max_abs_indexf and is not referenced anywhere else in this file.  */
um128 u;

/* Returns ui.si[n] after storing `mim' into ui.xmmi.

   The values step2, step3, n4, m1, mi, mim and the local `u' are all read
   without being assigned anywhere in this function.
   NOTE(review): presumably left over from test-case reduction -- confirm.  */
static inline int
sse_max_abs_indexf (float *v, int step, int n)
{
  __m128 m1, mm;
  __m128 mim, mi, msk;
  um128 u, ui;
  int n4, step2, step3;
  mm = __builtin_ia32_andps ((__m128) (__v4sf)
			     { 0.0, v[step], v[step2], v[step3] }
			     , u.xmmi);
  if (n4)
    {
      int i;
      /* The trailing semicolon gives this loop an empty body, so the two
	 statements after it run once, not once per iteration.  */
      for (i = 0; i < n4; ++i);
      msk = (__m128) _mm_cmpeq_ps (m1, mm);
      mim = _mm_or_si128 (_mm_and_si128 (msk, mi), mim);
    }
  ui.xmmi = (__m128) mim;
  return ui.si[n];
}

/* Stores *r2 into each of the first (n / 4) * 4 elements starting at r1.
   r2 itself is never advanced, so the same source element is read on
   every iteration.  */
static void
sse_swap_rowf (float *r1, float *r2, int n)
{
  int n4 = (n / 4) * 4;
  float *r14end = r1 + n4;
  while (r1 < r14end)
    {
      *r1 = *r2;
      r1++;
    }
}

/* Outer driver.  For i = 0 .. n - 2, with pm advanced by nw elements per
   iteration:
     - calls sse_max_abs_indexf (pm + i, nw, n - i);
     - if the result vi is non-zero, calls sse_swap_rowf on pm and
       pm + vi * nw, then swap_index (prow, i, i + vi);
     - for each j = i + 1 .. n - 1, with pt advanced by nw per step, calls
       sse_add_rowf (pt + i + 1, pm + i + 1, -1.0, n - i - 1).
   The local `s' is initialised but never used.  */
void
ludcompf (float *m, int nw, int *prow, int n)
{
  int i, s = 0;
  float *pm;
  for (i = 0, pm = m; i < n - 1; ++i, pm += nw)
    {
      int vi = sse_max_abs_indexf (pm + i, nw, n - i);
      float *pt;
      int j;
      if (vi != 0)
	{
	  sse_swap_rowf (pm, pm + vi * nw, nw);
	  swap_index (prow, i, i + vi);
	}
      for (j = i + 1, pt = pm + nw; j < n; ++j, pt += nw)
	sse_add_rowf (pt + i + 1, pm + i + 1, -1.0, n - i - 1);
    }
}