/*	$NetBSD: immintrin.h,v 1.1 2023/08/07 01:07:36 rin Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef	_SYS_CRYPTO_ARCH_X86_IMMINTRIN_H
#define	_SYS_CRYPTO_ARCH_X86_IMMINTRIN_H

#include <sys/types.h>

/*
 * This kludgerous header file provides definitions for the Intel
 * intrinsics that work with GCC and Clang, because <immintrin.h> is
 * not available during the kernel build and arranging to make it
 * available is complicated.  Please fix this properly!
 */
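/*
 * A minimal usage sketch (hypothetical caller, hypothetical pointer p):
 * add 1 to each of four packed 32-bit lanes behind a possibly
 * unaligned pointer, using only the wrappers defined below:
 *
 *	__m128i x = _mm_loadu_si128((const __m128i_u *)p);
 *	x = _mm_add_epi32(x, _mm_set1_epi32(1));
 *	_mm_storeu_si128((__m128i_u *)p, x);
 */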
#if defined(__GNUC__) && !defined(__clang__)

#define	_INTRINSATTR							\
	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
#define	_PACKALIAS

typedef float __m128 __attribute__((__vector_size__(16), __may_alias__));
typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef int __v4si __attribute__((__vector_size__(16)));
typedef unsigned __v4su __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

#elif defined(__clang__)

typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i
    __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef int __v4si __attribute__((__vector_size__(16)));
typedef unsigned __v4su __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

#define	_INTRINSATTR							\
	__attribute__((__always_inline__, __nodebug__, __target__("sse2"), \
	    __min_vector_width__(128)))
#define	_PACKALIAS							\
	__attribute__((__packed__, __may_alias__))

#else

#error Please teach me how to do Intel intrinsics for your compiler!

#endif
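/*
 * _INTRINSATTR forces each wrapper below to be inlined.  _PACKALIAS
 * decorates the single-member structs used by the unaligned load and
 * store wrappers: Clang needs the members marked packed/may_alias,
 * while on GCC the unaligned __m128i_u type already carries the
 * necessary attributes, so it expands to nothing.
 */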
#define	_SSSE3_ATTR	__attribute__((target("ssse3")))

_INTRINSATTR
static __inline __m128i
_mm_add_epi32(__m128i __a, __m128i __b)
{
	return (__m128i)((__v4su)__a + (__v4su)__b);
}

/*
 * GCC's palignr builtin takes the shift count in bits, Clang's in
 * bytes, hence the 8*(int)(bytes) in the GCC variant.
 */
#if defined(__GNUC__) && !defined(__clang__)
#define	_mm_alignr_epi8(hi,lo,bytes)					\
	(__m128i)__builtin_ia32_palignr128((__v2di)(__m128i)(hi),	\
	    (__v2di)(__m128i)(lo), 8*(int)(bytes))
#elif defined(__clang__)
#define	_mm_alignr_epi8(hi,lo,bytes)					\
	(__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(hi),	\
	    (__v16qi)(__m128i)(lo), (int)(bytes))
#endif

_INTRINSATTR
static __inline __m128
_mm_load1_ps(const float *__p)
{
	return __extension__ (__m128)(__v4sf) { *__p, *__p, *__p, *__p };
}

_INTRINSATTR
static __inline __m128i
_mm_loadu_si128(const __m128i_u *__p)
{
	return ((const struct { __m128i_u __v; } _PACKALIAS *)__p)->__v;
}

_INTRINSATTR
static __inline __m128i
_mm_loadu_si32(const void *__p)
{
	int32_t __v = ((const struct { int32_t __v; } _PACKALIAS *)__p)->__v;
	return __extension__ (__m128i)(__v4si){ __v, 0, 0, 0 };
}

_INTRINSATTR
static __inline __m128i
_mm_loadu_si64(const void *__p)
{
	int64_t __v = ((const struct { int64_t __v; } _PACKALIAS *)__p)->__v;
	return __extension__ (__m128i)(__v2di){ __v, 0 };
}

_INTRINSATTR
static __inline __m128i
_mm_load_si128(const __m128i *__p)
{
	return *__p;
}

_INTRINSATTR
static __inline __m128
_mm_movehl_ps(__m128 __v0, __m128 __v1)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128)__builtin_ia32_movhlps((__v4sf)__v0, (__v4sf)__v1);
#elif defined(__clang__)
	return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, 6,7,2,3);
#endif
}

_INTRINSATTR
static __inline __m128
_mm_movelh_ps(__m128 __v0, __m128 __v1)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128)__builtin_ia32_movlhps((__v4sf)__v0, (__v4sf)__v1);
#elif defined(__clang__)
	return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, 0,1,4,5);
#endif
}

_INTRINSATTR
static __inline __m128i
_mm_set1_epi16(int16_t __v)
{
	return __extension__ (__m128i)(__v8hi){
	    __v, __v, __v, __v, __v, __v, __v, __v
	};
}

_INTRINSATTR
static __inline __m128i
_mm_set1_epi32(int32_t __v)
{
	return __extension__ (__m128i)(__v4si){ __v, __v, __v, __v };
}

_INTRINSATTR
static __inline __m128i
_mm_set1_epi64x(int64_t __v)
{
	return __extension__ (__m128i)(__v2di){ __v, __v };
}

_INTRINSATTR
static __inline __m128i
_mm_set_epi32(int32_t __v3, int32_t __v2, int32_t __v1, int32_t __v0)
{
	return __extension__ (__m128i)(__v4si){ __v0, __v1, __v2, __v3 };
}

_INTRINSATTR
static __inline __m128i
_mm_set_epi64x(int64_t __v1, int64_t __v0)
{
	return __extension__ (__m128i)(__v2di){ __v0, __v1 };
}

_INTRINSATTR
static __inline __m128
_mm_setzero_ps(void)
{
	return __extension__ (__m128){ 0, 0, 0, 0 };
}

_INTRINSATTR
static __inline __m128i
_mm_setzero_si128(void)
{
	return _mm_set1_epi64x(0);
}

_INTRINSATTR _SSSE3_ATTR
static __inline __m128i
_mm_shuffle_epi8(__m128i __vtbl, __m128i __vidx)
{
	return (__m128i)__builtin_ia32_pshufb128((__v16qi)__vtbl,
	    (__v16qi)__vidx);
}

#define	_mm_shuffle_epi32(v,m)						\
	(__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(v), (int)(m))

#define	_mm_shuffle_ps(x,y,m)						\
	(__m128)__builtin_ia32_shufps((__v4sf)(__m128)(x),		\
	    (__v4sf)(__m128)(y), (int)(m))
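/*
 * The shuffle immediate m packs four 2-bit source-lane selectors, low
 * result lane in the low bits: result lane i comes from source lane
 * ((m >> (2*i)) & 3).  For example, _mm_shuffle_epi32(v, 0x1b), with
 * selectors 3,2,1,0, reverses the four 32-bit lanes of v.  For
 * _mm_shuffle_ps, the two low result lanes select from x and the two
 * high result lanes select from y.
 */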
_INTRINSATTR
static __inline __m128i
_mm_slli_epi32(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_pslldi128((__v4si)__v, (int)__bits);
}

_INTRINSATTR
static __inline __m128i
_mm_slli_epi64(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_psllqi128((__v2di)__v, (int)__bits);
}

/*
 * As with palignr, GCC's whole-vector byte-shift builtins count in
 * bits, while Clang's _byteshift variants count in bytes.
 */
#if defined(__GNUC__) && !defined(__clang__)
#define	_mm_slli_si128(v,bytes)						\
	(__m128i)__builtin_ia32_pslldqi128((__v2di)(__m128i)(v),	\
	    8*(int)(bytes))
#elif defined(__clang__)
#define	_mm_slli_si128(v,bytes)						\
	(__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(v), \
	    (int)(bytes))
#endif

_INTRINSATTR
static __inline __m128i
_mm_srli_epi32(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_psrldi128((__v4si)__v, (int)__bits);
}

_INTRINSATTR
static __inline __m128i
_mm_srli_epi64(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_psrlqi128((__v2di)__v, (int)__bits);
}

#if defined(__GNUC__) && !defined(__clang__)
#define	_mm_srli_si128(v,bytes)						\
	(__m128i)__builtin_ia32_psrldqi128((__v2di)(__m128i)(v),	\
	    8*(int)(bytes))
#elif defined(__clang__)
#define	_mm_srli_si128(v,bytes)						\
	(__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(v), \
	    (int)(bytes))
#endif

_INTRINSATTR
static __inline void
_mm_storeu_si128(__m128i_u *__p, __m128i __v)
{
	((struct { __m128i_u __v; } _PACKALIAS *)__p)->__v = __v;
}

_INTRINSATTR
static __inline void
_mm_storeu_si32(void *__p, __m128i __v)
{
	((struct { int32_t __v; } _PACKALIAS *)__p)->__v = ((__v4si)__v)[0];
}

_INTRINSATTR
static __inline void
_mm_storeu_si64(void *__p, __m128i __v)
{
	((struct { int64_t __v; } _PACKALIAS *)__p)->__v = ((__v2di)__v)[0];
}

_INTRINSATTR
static __inline void
_mm_store_si128(__m128i *__p, __m128i __v)
{
	*__p = __v;
}

_INTRINSATTR
static __inline __m128i
_mm_sub_epi64(__m128i __x, __m128i __y)
{
	return (__m128i)((__v2du)__x - (__v2du)__y);
}

_INTRINSATTR
static __inline __m128i
_mm_unpackhi_epi32(__m128i __lo, __m128i __hi)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128i)__builtin_ia32_punpckhdq128((__v4si)__lo,
	    (__v4si)__hi);
#elif defined(__clang__)
	return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi,
	    2,6,3,7);
#endif
}

_INTRINSATTR
static __inline __m128i
_mm_unpacklo_epi32(__m128i __lo, __m128i __hi)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128i)__builtin_ia32_punpckldq128((__v4si)__lo,
	    (__v4si)__hi);
#elif defined(__clang__)
	return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi,
	    0,4,1,5);
#endif
}

_INTRINSATTR
static __inline __m128i
_mm_unpacklo_epi64(__m128i __lo, __m128i __hi)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128i)__builtin_ia32_punpcklqdq128((__v2di)__lo,
	    (__v2di)__hi);
#elif defined(__clang__)
	return (__m128i)__builtin_shufflevector((__v2di)__lo, (__v2di)__hi,
	    0,2);
#endif
}

#endif	/* _SYS_CRYPTO_ARCH_X86_IMMINTRIN_H */