/* tmmintrin.h revision 1.1.1.1 */
/*===---- tmmintrin.h - Implementation of SSSE3 intrinsics on PowerPC ------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.

   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
#endif

#ifndef TMMINTRIN_H_
#define TMMINTRIN_H_

#if defined(__linux__) && defined(__ppc64__)

#include <altivec.h>

/* We need definitions from the SSE header files.  */
#include <pmmintrin.h>

/* Absolute value of packed signed 16-bit integers.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi16 (__m128i __A)
{
  return (__m128i) vec_abs ((__v8hi) __A);
}

/* Absolute value of packed signed 32-bit integers.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi32 (__m128i __A)
{
  return (__m128i) vec_abs ((__v4si) __A);
}

/* Absolute value of packed signed 8-bit integers.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi8 (__m128i __A)
{
  return (__m128i) vec_abs ((__v16qi) __A);
}

/* 64-bit variants splat the __m64 operand into both halves of a
   128-bit vector, operate on it, and return the low doubleword.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi16 (__m64 __A)
{
  __v8hi __B = (__v8hi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi32 (__m64 __A)
{
  __v4si __B = (__v4si) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi8 (__m64 __A)
{
  __v16qi __B = (__v16qi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

/* Concatenate __A:__B and extract 16 bytes starting __count bytes in.
   When __count is a compile-time constant below 16, vec_sld can be used
   directly (it requires a literal shift); the byte reversals adjust for
   little-endian element order.  Otherwise fall back to octet shifts.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count)
{
  if (__builtin_constant_p (__count) && __count < 16)
    {
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
      __B = (__m128i) vec_reve ((__v16qu) __B);
#endif
      __A = (__m128i) vec_sld ((__v16qu) __B, (__v16qu) __A, __count);
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
#endif
      return __A;
    }

  if (__count == 0)
    return __B;

  if (__count >= 16)
    {
      if (__count >= 32)
	{
	  /* Shifting out everything yields zero.  */
	  const __v16qu __zero = { 0 };
	  return (__m128i) __zero;
	}
      else
	{
	  /* Only bytes of __A remain; shift them into place.  */
	  const __v16qu __shift =
	    vec_splats ((unsigned char) ((__count - 16) * 8));
#ifdef __LITTLE_ENDIAN__
	  return (__m128i) vec_sro ((__v16qu) __A, __shift);
#else
	  return (__m128i) vec_slo ((__v16qu) __A, __shift);
#endif
	}
    }
  else
    {
      /* Result combines the top of __B with the bottom of __A.  */
      const __v16qu __shiftA =
	vec_splats ((unsigned char) ((16 - __count) * 8));
      const __v16qu __shiftB = vec_splats ((unsigned char) (__count * 8));
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_slo ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_sro ((__v16qu) __B, __shiftB);
#else
      __A = (__m128i) vec_sro ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_slo ((__v16qu) __B, __shiftB);
#endif
      return (__m128i) vec_or ((__v16qu) __A, (__v16qu) __B);
    }
}

/* 64-bit align: concatenate __A:__B (16 bytes total) and extract the
   8 bytes starting __count bytes in; counts >= 16 yield zero.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_pi8 (__m64 __A, __m64 __B, unsigned int __count)
{
  if (__count < 16)
    {
      __v2du __C = { __B, __A };
#ifdef __LITTLE_ENDIAN__
      const __v4su __shift = { __count << 3, 0, 0, 0 };
      __C = (__v2du) vec_sro ((__v16qu) __C, (__v16qu) __shift);
#else
      const __v4su __shift = { 0, 0, 0, __count << 3 };
      __C = (__v2du) vec_slo ((__v16qu) __C, (__v16qu) __shift);
#endif
      return (__m64) __C[0];
    }
  else
    {
      const __m64 __zero = { 0 };
      return __zero;
    }
}

/* Horizontal add of adjacent 16-bit pairs: gather even-position and
   odd-position halfwords with permutes, then add the two vectors.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

/* Horizontal add of adjacent 32-bit pairs.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi32 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

/* Horizontal add with signed saturation: vec_sum4s adds adjacent pairs
   into 32-bit sums, and vec_packs packs them back with saturation.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = { 0 }, __D = { 0 };
  __C = vec_sum4s ((__v8hi) __A, __C);
  __D = vec_sum4s ((__v8hi) __B, __D);
  __C = (__v4si) vec_packs (__C, __D);
  return (__m128i) __C;
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_pi16 (__m64 __A, __m64 __B)
{
  const __v4si __zero = { 0 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v4si __D = vec_sum4s (__C, __zero);
  __C = vec_packs (__D, __D);
  return (__m64) ((__v2du) __C)[1];
}

/* Horizontal subtract of adjacent 16-bit pairs (even minus odd).  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi32 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 };
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

/* Horizontal subtract with signed saturation.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_subs (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __P);
  __v8hi __E = vec_perm (__C, __C, __Q);
  __C = vec_subs (__D, __E);
  return (__m64) ((__v2du) __C)[1];
}

/* Byte shuffle: vec_perm implements the index lookup (it only uses the
   low bits of each selector byte); bytes whose selector has the sign
   bit set are then forced to zero via the select mask.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __vector __bool char __select = vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __C = vec_perm ((__v16qi) __A, (__v16qi) __A, (__v16qu) __B);
  return (__m128i) vec_sel (__C, __zero, __select);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi8 (__m64 __A, __m64 __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __vector __bool char __select = vec_cmplt ((__v16qi) __D, __zero);
  __C = vec_perm ((__v16qi) __C, (__v16qi) __C, (__v16qu) __D);
  __C = vec_sel (__C, __zero, __select);
  return (__m64) ((__v2du) (__C))[0];
}

/* Sign operations: build a -1/0/+1 multiplier from the sign of each
   element of __B (negative -> -1, positive -> +1, zero -> 0) and
   multiply __A by it.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __selectneg = (__v16qi) vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __selectpos =
    (__v16qi) vec_neg ((__v16qi) vec_cmpgt ((__v16qi) __B, __zero));
  __v16qi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi16 (__m128i __A, __m128i __B)
{
  const __v8hi __zero = { 0 };
  __v8hi __selectneg = (__v8hi) vec_cmplt ((__v8hi) __B, __zero);
  __v8hi __selectpos =
    (__v8hi) vec_neg ((__v8hi) vec_cmpgt ((__v8hi) __B, __zero));
  __v8hi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi32 (__m128i __A, __m128i __B)
{
  const __v4si __zero = { 0 };
  __v4si __selectneg = (__v4si) vec_cmplt ((__v4si) __B, __zero);
  __v4si __selectpos =
    (__v4si) vec_neg ((__v4si) vec_cmpgt ((__v4si) __B, __zero));
  __v4si __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv);
}

/* 64-bit sign variants delegate to the 128-bit implementations.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi8 (__m64 __A, __m64 __B)
{
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi32 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

/* Multiply unsigned bytes of __A by signed bytes of __B, then add
   adjacent 16-bit products with signed saturation.  The 0x00ff mask
   makes the sign-extended unpack of __A behave as a zero-extend.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_epi16 (__m128i __A, __m128i __B)
{
  __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __v8hi __C = vec_and (vec_unpackh ((__v16qi) __A), __unsigned);
  __v8hi __D = vec_and (vec_unpackl ((__v16qi) __A), __unsigned);
  __v8hi __E = vec_unpackh ((__v16qi) __B);
  __v8hi __F = vec_unpackl ((__v16qi) __B);
  __C = vec_mul (__C, __E);
  __D = vec_mul (__D, __F);
  const __v16qu __odds =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __E = vec_perm (__C, __D, __odds);
  __F = vec_perm (__C, __D, __evens);
  return (__m128i) vec_adds (__E, __F);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __C = vec_unpackl ((__v16qi) __C);
  const __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __C = vec_and (__C, __unsigned);
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __D = vec_unpackl ((__v16qi) __D);
  __D = vec_mul (__C, __D);
  const __v16qu __odds =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __C = vec_perm (__D, __D, __odds);
  __D = vec_perm (__D, __D, __evens);
  __C = vec_adds (__C, __D);
  return (__m64) ((__v2du) (__C))[0];
}

/* Multiply signed 16-bit elements, producing intermediate 32-bit
   products; shift right 14, add one, shift right one more (round to
   nearest), and pack the high results back to 16 bits.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = vec_unpackh ((__v8hi) __A);
  __v4si __D = vec_unpackh ((__v8hi) __B);
  __C = vec_mul (__C, __D);
  __D = vec_unpackl ((__v8hi) __A);
  __v4si __E = vec_unpackl ((__v8hi) __B);
  __D = vec_mul (__D, __E);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  __D = vec_sr (__D, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __D = vec_add (__D, __ones);
  __D = vec_sr (__D, (__v4su) __ones);
  return (__m128i) vec_pack (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_pi16 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __C = vec_unpackh ((__v8hi) __C);
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __D = vec_unpackh ((__v8hi) __D);
  __C = vec_mul (__C, __D);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __v8hi __E = vec_pack (__C, __D);
  return (__m64) ((__v2du) (__E))[0];
}

#else
#include_next <tmmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */

#endif /* TMMINTRIN_H_ */