/* { dg-do compile } */
/* { dg-options "-O2 -msse2" } */

/* Compile-only test: the directives above request compilation with
   -O2 -msse2 and nothing is executed.

   NOTE(review): the code below looks like a reduced compiler test case.
   Several locals are read without ever being assigned, one loop has an
   empty body, and a local shadows a global.  These oddities are kept
   exactly as they are on purpose; do not "clean them up" without
   confirming the test still exercises what it was written for.  */

/* 16-byte vector types, declared locally instead of including the
   SSE intrinsic headers.  */
typedef float __v4sf __attribute__ ((__vector_size__ (16)));
typedef float __m128 __attribute__ ((__vector_size__ (16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));

/* Thin wrapper around __builtin_ia32_cmpeqps on two __m128 operands.  */
static __inline __m128
_mm_cmpeq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
}

/* Build a __m128 from four floats, in the order given.  Not referenced
   elsewhere in this file.  */
static __inline __m128
_mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
  return __extension__ (__m128)(__v4sf){__Z, __Y, __X, __W };
}

/* Thin wrapper around __builtin_ia32_pand128; the __m128 operands are
   cast to __v2di for the builtin and the result is cast back.  */
static __inline __m128
_mm_and_si128 (__m128 __A, __m128 __B)
{
  return (__m128)__builtin_ia32_pand128 ((__v2di)__A, (__v2di)__B);
}

/* Thin wrapper around __builtin_ia32_por128; same casting scheme as
   _mm_and_si128 above.  */
static __inline __m128
_mm_or_si128 (__m128 __A, __m128 __B)
{
  return (__m128)__builtin_ia32_por128 ((__v2di)__A, (__v2di)__B);
}

/* 16-byte aligned union giving access to a __m128 as four ints.  */
typedef union
{
  __m128 xmmi;
  int si[4];
}
__attribute__ ((aligned (16))) um128;

/* File-scope object; shadowed by the local `u' in sse_max_abs_indexf.  */
um128 u;

/* Helper called by ludcompf with (row pointer, row stride, remaining
   count).  Returns element N of the int view of a vector built with
   the and/or/cmpeq wrappers above.

   NOTE(review): m1, mi, mim, n4, step2, step3 and the local `u' are
   all read without being assigned anywhere in this function, so the
   result is indeterminate at run time.  That is harmless for a
   compile-only test and is left untouched.  */
static inline int
sse_max_abs_indexf (float *v, int step, int n)
{
  __m128 m1, mm;
  __m128 mim, mi, msk;
  um128 u, ui;
  int n4, step2, step3;
  mm = __builtin_ia32_andps ((__m128) (__v4sf)
			     { 0.0, v[step], v[step2], v[step3] }
			     , u.xmmi);
  if (n4)
    {
      int i;
      /* The trailing semicolon gives this loop an empty body; the two
	 statements below therefore execute once, after the loop.  */
      for (i = 0; i < n4; ++i);
      msk = (__m128) _mm_cmpeq_ps (m1, mm);
      mim = _mm_or_si128 (_mm_and_si128 (msk, mi), mim);
    }
  ui.xmmi = (__m128) mim;
  return ui.si[n];
}

/* Store *R2 into each of the first (N / 4) * 4 elements of R1.
   R2 is never advanced and nothing is written through it, so despite
   the name no exchange takes place.  */
static void
sse_swap_rowf (float *r1, float *r2, int n)
{
  int n4 = (n / 4) * 4;
  float *r14end = r1 + n4;
  while (r1 < r14end)
    {
      *r1 = *r2;
      r1++;
    }
}

/* Declared here, defined elsewhere (not in this file).  */
void swap_index (int *, int, int);
void sse_add_rowf (float *, float *, float, int);

/* Driver for the helpers above.  For each I in [0, N - 1), with PM
   pointing at M + I * NW:
     - ask sse_max_abs_indexf for an index VI, starting at PM + I with
       stride NW and count N - I;
     - when VI is non-zero, call sse_swap_rowf on the rows at PM and
       PM + VI * NW, and swap_index on PROW for I and I + VI;
     - for every later row J, call sse_add_rowf with factor -1.0 on the
       N - I - 1 elements that start at column I + 1.
   The local `s' is initialised but never used.  */
void
ludcompf (float *m, int nw, int *prow, int n)
{
  int i, s = 0;
  float *pm;
  for (i = 0, pm = m; i < n - 1; ++i, pm += nw)
    {
      int vi = sse_max_abs_indexf (pm + i, nw, n - i);
      float *pt;
      int j;
      if (vi != 0)
	{
	  sse_swap_rowf (pm, pm + vi * nw, nw);
	  swap_index (prow, i, i + vi);
	}
      for (j = i + 1, pt = pm + nw; j < n; ++j, pt += nw)
	sse_add_rowf (pt + i + 1, pm + i + 1, -1.0, n - i - 1);
    }
}