/* { dg-do compile } */
/* { dg-require-effective-target ia32 } */
/* { dg-options "-O3 -msse2 -fdump-rtl-csa" } */
/* { dg-skip-if "no stdint" { vxworks_kernel } } */

/* Compile-only test: the translation unit is built with the options given
   above and the "csa" RTL dump is then scanned by the final directives at
   the bottom of the file.  The code is never executed by the test, and
   several locals in bar are read before being written; they are kept
   exactly as they are because the generated RTL is what is being checked
   (presumably an artefact of testcase reduction -- do not "fix" them).  */

#include <emmintrin.h>
#include <stdint.h>

typedef __SIZE_TYPE__ size_t;
typedef float vFloat __attribute__ ((__vector_size__ (16)));
typedef double vDouble __attribute__ ((__vector_size__ (16)));

/* Descriptor of a data buffer.  bar only reads the data and w members.  */
typedef struct buf
{
  void *data;
  unsigned long h;
  unsigned long w;
  size_t bytes;
} buf;

/* Job header.  NOTE(review): struct Job (capital J) is an incomplete type
   distinct from struct job; it is only ever used through pointers here.  */
typedef struct job
{
  struct Job *next;
  void * info;
  long (*func)(struct Job *job);
  long error;
} job;

/* A job header followed by the arguments that foo forwards to bar.  */
typedef struct fj
{
  job hd;
  buf src;
  buf dest;
  float g;
  unsigned int flags;
} fj;

/* Lookup tables indexed inside bar; zero-initialized static storage.  */
static const double r[256], t[256];

/* Walk src->data and dest->data as float arrays in steps of eight
   elements while idx + 8 <= dest->w, storing two 4-float vectors to the
   destination per iteration.  Always returns 0.

   The parameters g and flags are not used.  The locals p0, b0..b3 and
   vi0..vi3 are read without having been initialized, so the stored values
   are indeterminate if the loop body ever runs.  */
long bar (const buf *src, const buf *dest, float g, unsigned int flags)
{
  float *d0 = (float*) src->data;
  float *d1 = (float*) dest->data;
  uintptr_t w = dest->w;
  uintptr_t idx;
  vFloat p0;
  static const vFloat m0;
  static const vDouble p[3], m, b;
  float *sr = d0;
  float *dr = d1;
  for( idx = 0; idx + 8 <= w; idx += 8 )
    {
      /* Unaligned load of eight source floats.  */
      vFloat f0 = _mm_loadu_ps (sr);
      vFloat f1 = _mm_loadu_ps (sr + 4);
      sr += 8;
      /* Clear the bits selected by m0, then widen each pair of floats
	 to a vector of two doubles.  */
      vFloat fa0 = _mm_andnot_ps (m0, f0);
      vFloat fa1 = _mm_andnot_ps (m0, f1);
      vDouble v0 = _mm_cvtps_pd (fa0);
      vDouble v1 = _mm_cvtps_pd (_mm_movehl_ps (fa0, fa0));
      vDouble v2 = _mm_cvtps_pd (fa1);
      vDouble v3 = _mm_cvtps_pd (_mm_movehl_ps (fa1, fa1));
      vDouble vi0, vi1, vi2, vi3;
      __m128i b0, b1, b2, b3;
      /* b0..b3 are uninitialized at this point (see function comment).  */
      b0 = _mm_packs_epi32 (_mm_packs_epi32 (b0, b1), _mm_packs_epi32 (b2, b3));
      b1 = _mm_srli_epi64 (b0, 32);
      /* Extract 32-bit lanes to use as table indices.  The low index is
	 masked to 0..255; the high one (value >> 16) is not masked.  */
      unsigned int i0 = _mm_cvtsi128_si32 (b0);
      unsigned int i2 = _mm_cvtsi128_si32 (b1);
      v0 -= _mm_loadh_pd (_mm_load_sd (r + (i0 & 0xff)), r + (i0 >> 16));
      v1 -= _mm_loadh_pd (_mm_load_sd (r + (i2 & 0xff)), r + (i2 >> 16));
      b0 = _mm_unpackhi_epi64 (b0, b0);
      b1 = _mm_unpackhi_epi64 (b1, b1);
      unsigned int i4 = _mm_cvtsi128_si32 (b0);
      unsigned int i6 = _mm_cvtsi128_si32 (b1);
      v2 -= _mm_loadh_pd (_mm_load_sd (r + (i4 & 0xff)), r + (i4 >> 16));
      v3 -= _mm_loadh_pd (_mm_load_sd (r + (i6 & 0xff)), r + (i6 >> 16));
      /* Quadratic in v with coefficients p[0], p[1], p[2].  */
      v0 = p[0] + (p[1] + p[2] * v0) * v0;
      v1 = p[0] + (p[1] + p[2] * v1) * v1;
      v2 = p[0] + (p[1] + p[2] * v2) * v2;
      v3 = p[0] + (p[1] + p[2] * v3) * v3;
      /* vi0..vi3 are uninitialized on the right-hand side.  The sum is
	 reinterpreted as integers and each 64-bit lane shifted left by
	 52 bits before being viewed as doubles again.  */
      vi0 = (vDouble) _mm_slli_epi64 ((__m128i)((vi0 + b) + m), 52);
      vi1 = (vDouble) _mm_slli_epi64 ((__m128i)((vi1 + b) + m), 52);
      vi2 = (vDouble) _mm_slli_epi64 ((__m128i)((vi2 + b) + m), 52);
      vi3 = (vDouble) _mm_slli_epi64 ((__m128i)((vi3 + b) + m), 52);
      vi0 *= _mm_loadh_pd (_mm_load_sd (t + (i0 & 0xff)), t + (i0 >> 16));
      vi1 *= _mm_loadh_pd (_mm_load_sd (t + (i2 & 0xff)), t + (i2 >> 16));
      vi2 *= _mm_loadh_pd (_mm_load_sd (t + (i4 & 0xff)), t + (i4 >> 16));
      vi3 *= _mm_loadh_pd (_mm_load_sd (t + (i6 & 0xff)), t + (i6 >> 16));
      v0 *= vi0;
      v1 *= vi1;
      v2 *= vi2;
      v3 *= vi3;
      /* Narrow back to float and recombine into two 4-float vectors.  */
      vFloat r0 = _mm_movelh_ps (_mm_cvtpd_ps( v0 ), _mm_cvtpd_ps (v1));
      vFloat r1 = _mm_movelh_ps (_mm_cvtpd_ps( v2 ), _mm_cvtpd_ps (v3));
      /* Lanes whose input compared equal to zero take their value from
	 p0 (uninitialized); all other lanes keep the computed result.  */
      vFloat z0 = _mm_cmpeq_ps (f0, _mm_setzero_ps());
      vFloat z1 = _mm_cmpeq_ps (f1, _mm_setzero_ps());
      r0 = _mm_andnot_ps (z0, r0);
      r1 = _mm_andnot_ps (z1, r1);
      z0 = _mm_and_ps (z0, p0);
      z1 = _mm_and_ps (z1, p0);
      r0 = _mm_or_ps (r0, z0);
      r1 = _mm_or_ps (r1, z1);
      /* Unaligned store of eight result floats.  */
      _mm_storeu_ps (dr, r0);
      _mm_storeu_ps (dr + 4, r1);
      dr += 8;
    }
  return 0;
}

/* Reinterpret the job pointer as a pointer to the enclosing fj and call
   bar with the arguments stored there.  Returns whatever bar returns.  */
long foo (job *j )
{
  fj *jd = (fj*) j;
  return bar (&jd->src, &jd->dest, jd->g, jd->flags);
}

/* { dg-final { scan-rtl-dump-not "deleted 1 dead insns" "csa" } } */
/* { dg-final { cleanup-rtl-dump "csa" } } */