/* Copyright (C) 2003-2020 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
#endif

#ifndef TMMINTRIN_H_
#define TMMINTRIN_H_

#include <altivec.h>
#include <assert.h>

/* We need definitions from the SSE header files.  */
#include <pmmintrin.h>

/* SSSE3 PABSW: per-element absolute value of eight signed 16-bit ints.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi16 (__m128i __A)
{
  return (__m128i) vec_abs ((__v8hi) __A);
}

/* SSSE3 PABSD: per-element absolute value of four signed 32-bit ints.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi32 (__m128i __A)
{
  return (__m128i) vec_abs ((__v4si) __A);
}

/* SSSE3 PABSB: per-element absolute value of sixteen signed 8-bit ints.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi8 (__m128i __A)
{
  return (__m128i) vec_abs ((__v16qi) __A);
}

/* 64-bit (MMX-width) variants: splat the __m64 into both doublewords of
   a 128-bit vector, operate, and return the low doubleword.  */

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi16 (__m64 __A)
{
  __v8hi __B = (__v8hi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi32 (__m64 __A)
{
  __v4si __B = (__v4si) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi8 (__m64 __A)
{
  __v16qi __B = (__v16qi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

/* SSSE3 PALIGNR: concatenate __A:__B, shift right by __count bytes and
   return the low 16 bytes.  __count >= 32 yields zero; 16 <= __count < 32
   shifts only __A.  When __count is a compile-time constant < 16 we can
   use vec_sld (which requires a literal shift); the operands are
   byte-reversed around it on little-endian so the left-shift matches the
   x86 right-shift semantics.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count)
{
  if (__builtin_constant_p (__count) && __count < 16)
    {
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
      __B = (__m128i) vec_reve ((__v16qu) __B);
#endif
      __A = (__m128i) vec_sld ((__v16qu) __B, (__v16qu) __A, __count);
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
#endif
      return __A;
    }

  if (__count == 0)
    return __B;

  if (__count >= 16)
    {
      if (__count >= 32)
	{
	  const __v16qu __zero = { 0 };
	  return (__m128i) __zero;
	}
      else
	{
	  /* Only __A contributes; shift it right by (__count - 16) bytes
	     (vec_sro/vec_slo take the bit count in the shift vector).  */
	  const __v16qu __shift =
	    vec_splats ((unsigned char) ((__count - 16) * 8));
#ifdef __LITTLE_ENDIAN__
	  return (__m128i) vec_sro ((__v16qu) __A, __shift);
#else
	  return (__m128i) vec_slo ((__v16qu) __A, __shift);
#endif
	}
    }
  else
    {
      /* General case: OR together the two partial byte shifts.  */
      const __v16qu __shiftA =
	vec_splats ((unsigned char) ((16 - __count) * 8));
      const __v16qu __shiftB = vec_splats ((unsigned char) (__count * 8));
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_slo ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_sro ((__v16qu) __B, __shiftB);
#else
      __A = (__m128i) vec_sro ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_slo ((__v16qu) __B, __shiftB);
#endif
      return (__m128i) vec_or ((__v16qu) __A, (__v16qu) __B);
    }
}

/* 64-bit PALIGNR: shift the 16-byte __A:__B pair right by __count bytes
   and return the low doubleword; __count >= 16 yields zero.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_pi8 (__m64 __A, __m64 __B, unsigned int __count)
{
  if (__count < 16)
    {
      __v2du __C = { __B, __A };
#ifdef __LITTLE_ENDIAN__
      const __v4su __shift = { __count << 3, 0, 0, 0 };
      __C = (__v2du) vec_sro ((__v16qu) __C, (__v16qu) __shift);
#else
      const __v4su __shift = { 0, 0, 0, __count << 3 };
      __C = (__v2du) vec_slo ((__v16qu) __C, (__v16qu) __shift);
#endif
      return (__m64) __C[0];
    }
  else
    {
      const __m64 __zero = { 0 };
      return __zero;
    }
}

/* SSSE3 PHADDW: horizontal add of adjacent 16-bit pairs.  __P gathers the
   even-index elements of __A:__B, __Q the odd-index ones; their sum is the
   pairwise result.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

/* SSSE3 PHADDD: horizontal add of adjacent 32-bit pairs.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

/* 64-bit PHADDW: both inputs fit in one vector, so the permute patterns
   repeat and the result is taken from element [1].  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

/* 64-bit PHADDD.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi32 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

/* SSSE3 PHADDSW: horizontal add with signed saturation.  vec_sum4s adds
   adjacent halfword pairs into 32-bit sums; vec_packs saturates them back
   to 16 bits.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = { 0 }, __D = { 0 };
  __C = vec_sum4s ((__v8hi) __A, __C);
  __D = vec_sum4s ((__v8hi) __B, __D);
  __C = (__v4si) vec_packs (__C, __D);
  return (__m128i) __C;
}

/* 64-bit PHADDSW.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_pi16 (__m64 __A, __m64 __B)
{
  const __v4si __zero = { 0 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v4si __D = vec_sum4s (__C, __zero);
  __C = vec_packs (__D, __D);
  return (__m64) ((__v2du) __C)[1];
}

/* SSSE3 PHSUBW: horizontal subtract of adjacent 16-bit pairs
   (even element minus odd element).  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

/* SSSE3 PHSUBD: horizontal subtract of adjacent 32-bit pairs.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

/* 64-bit PHSUBW.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

/* 64-bit PHSUBD.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi32 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 };
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

/* SSSE3 PHSUBSW: horizontal subtract with signed saturation.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_subs (__C, __D);
}

/* 64-bit PHSUBSW.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __P);
  __v8hi __E = vec_perm (__C, __C, __Q);
  __C = vec_subs (__D, __E);
  return (__m64) ((__v2du) __C)[1];
}

/* SSSE3 PSHUFB: byte shuffle of __A selected by the low nibbles of __B,
   except that a selector byte with its sign bit set zeroes the result
   lane — hence the vec_cmplt/vec_sel pass after the vec_perm.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __vector __bool char __select = vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __C = vec_perm ((__v16qi) __A, (__v16qi) __A, (__v16qu) __B);
  return (__m128i) vec_sel (__C, __zero, __select);
}

/* 64-bit PSHUFB.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi8 (__m64 __A, __m64 __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __vector __bool char __select = vec_cmplt ((__v16qi) __D, __zero);
  __C = vec_perm ((__v16qi) __C, (__v16qi) __C, (__v16qu) __D);
  __C = vec_sel (__C, __zero, __select);
  return (__m64) ((__v2du) (__C))[0];
}

/* SSSE3 PSIGNB: multiply each byte of __A by the sign of the matching
   byte of __B.  __conv is built as (-1 where B < 0) + (+1 where B > 0),
   i.e. -1/0/+1 per lane, then multiplied into __A.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __selectneg = (__v16qi) vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __selectpos =
    (__v16qi) vec_neg ((__v16qi) vec_cmpgt ((__v16qi) __B, __zero));
  __v16qi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv);
}

/* SSSE3 PSIGNW.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi16 (__m128i __A, __m128i __B)
{
  const __v8hi __zero = { 0 };
  __v8hi __selectneg = (__v8hi) vec_cmplt ((__v8hi) __B, __zero);
  __v8hi __selectpos =
    (__v8hi) vec_neg ((__v8hi) vec_cmpgt ((__v8hi) __B, __zero));
  __v8hi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv);
}

/* SSSE3 PSIGND.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi32 (__m128i __A, __m128i __B)
{
  const __v4si __zero = { 0 };
  __v4si __selectneg = (__v4si) vec_cmplt ((__v4si) __B, __zero);
  __v4si __selectpos =
    (__v4si) vec_neg ((__v4si) vec_cmpgt ((__v4si) __B, __zero));
  __v4si __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv);
}

/* 64-bit PSIGNB: widen to 128 bits and delegate to _mm_sign_epi8.
   (The unused `__zero' local present in earlier revisions was removed;
   the zero comparison happens inside the delegated function.)  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi8 (__m64 __A, __m64 __B)
{
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

/* 64-bit PSIGNW: delegates to _mm_sign_epi16.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

/* 64-bit PSIGND: delegates to _mm_sign_epi32.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi32 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

/* SSSE3 PMADDUBSW: multiply unsigned bytes of __A by signed bytes of
   __B, then add adjacent 16-bit products with signed saturation.  The
   bytes of __A are sign-unpacked and masked back to 0..255 with
   `__unsigned'; the odd/even permutes pair the products for vec_adds.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_epi16 (__m128i __A, __m128i __B)
{
  __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __v8hi __C = vec_and (vec_unpackh ((__v16qi) __A), __unsigned);
  __v8hi __D = vec_and (vec_unpackl ((__v16qi) __A), __unsigned);
  __v8hi __E = vec_unpackh ((__v16qi) __B);
  __v8hi __F = vec_unpackl ((__v16qi) __B);
  __C = vec_mul (__C, __E);
  __D = vec_mul (__D, __F);
  const __v16qu __odds =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __E = vec_perm (__C, __D, __odds);
  __F = vec_perm (__C, __D, __evens);
  return (__m128i) vec_adds (__E, __F);
}

/* 64-bit PMADDUBSW: only one unpack is needed per operand.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __C = vec_unpackl ((__v16qi) __C);
  const __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __C = vec_and (__C, __unsigned);
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __D = vec_unpackl ((__v16qi) __D);
  __D = vec_mul (__C, __D);
  const __v16qu __odds =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __C = vec_perm (__D, __D, __odds);
  __D = vec_perm (__D, __D, __evens);
  __C = vec_adds (__C, __D);
  return (__m64) ((__v2du) (__C))[0];
}

/* SSSE3 PMULHRSW: per element, ((a * b) >> 14) is incremented and then
   shifted right once more — i.e. the high 16 bits of the product rounded
   to nearest — computed here in 32-bit intermediates and packed back.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = vec_unpackh ((__v8hi) __A);
  __v4si __D = vec_unpackh ((__v8hi) __B);
  __C = vec_mul (__C, __D);
  __D = vec_unpackl ((__v8hi) __A);
  __v4si __E = vec_unpackl ((__v8hi) __B);
  __D = vec_mul (__D, __E);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  __D = vec_sr (__D, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __D = vec_add (__D, __ones);
  __D = vec_sr (__D, (__v4su) __ones);
  return (__m128i) vec_pack (__C, __D);
}

/* 64-bit PMULHRSW: only the high-unpacked half carries the result; the
   second vec_pack operand is discarded by taking doubleword [0].  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_pi16 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __C = vec_unpackh ((__v8hi) __C);
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __D = vec_unpackh ((__v8hi) __D);
  __C = vec_mul (__C, __D);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __v8hi __E = vec_pack (__C, __D);
  return (__m64) ((__v2du) (__E))[0];
}

#endif