pmmintrin.h revision 314564
1/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __PMMINTRIN_H 25#define __PMMINTRIN_H 26 27#include <emmintrin.h> 28 29/* Define the default attributes for the functions in this file. */ 30#define __DEFAULT_FN_ATTRS \ 31 __attribute__((__always_inline__, __nodebug__, __target__("sse3"))) 32 33/// \brief Loads data from an unaligned memory location to elements in a 128-bit 34/// vector. If the address of the data is not 16-byte aligned, the 35/// instruction may read two adjacent aligned blocks of memory to retrieve 36/// the requested data. 37/// 38/// \headerfile <x86intrin.h> 39/// 40/// This intrinsic corresponds to the <c> VLDDQU </c> instruction. 41/// 42/// \param __p 43/// A pointer to a 128-bit integer vector containing integer values. 44/// \returns A 128-bit vector containing the moved values. 45static __inline__ __m128i __DEFAULT_FN_ATTRS 46_mm_lddqu_si128(__m128i const *__p) 47{ 48 return (__m128i)__builtin_ia32_lddqu((char const *)__p); 49} 50 51/// \brief Adds the even-indexed values and subtracts the odd-indexed values of 52/// two 128-bit vectors of [4 x float]. 53/// 54/// \headerfile <x86intrin.h> 55/// 56/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction. 57/// 58/// \param __a 59/// A 128-bit vector of [4 x float] containing the left source operand. 60/// \param __b 61/// A 128-bit vector of [4 x float] containing the right source operand. 62/// \returns A 128-bit vector of [4 x float] containing the alternating sums and 63/// differences of both operands. 64static __inline__ __m128 __DEFAULT_FN_ATTRS 65_mm_addsub_ps(__m128 __a, __m128 __b) 66{ 67 return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b); 68} 69 70/// \brief Horizontally adds the adjacent pairs of values contained in two 71/// 128-bit vectors of [4 x float]. 72/// 73/// \headerfile <x86intrin.h> 74/// 75/// This intrinsic corresponds to the <c> VHADDPS </c> instruction. 76/// 77/// \param __a 78/// A 128-bit vector of [4 x float] containing one of the source operands. 79/// The horizontal sums of the values are stored in the lower bits of the 80/// destination. 81/// \param __b 82/// A 128-bit vector of [4 x float] containing one of the source operands. 83/// The horizontal sums of the values are stored in the upper bits of the 84/// destination. 85/// \returns A 128-bit vector of [4 x float] containing the horizontal sums of 86/// both operands. 87static __inline__ __m128 __DEFAULT_FN_ATTRS 88_mm_hadd_ps(__m128 __a, __m128 __b) 89{ 90 return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b); 91} 92 93/// \brief Horizontally subtracts the adjacent pairs of values contained in two 94/// 128-bit vectors of [4 x float]. 95/// 96/// \headerfile <x86intrin.h> 97/// 98/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction. 99/// 100/// \param __a 101/// A 128-bit vector of [4 x float] containing one of the source operands. 102/// The horizontal differences between the values are stored in the lower 103/// bits of the destination. 104/// \param __b 105/// A 128-bit vector of [4 x float] containing one of the source operands. 106/// The horizontal differences between the values are stored in the upper 107/// bits of the destination. 108/// \returns A 128-bit vector of [4 x float] containing the horizontal 109/// differences of both operands. 110static __inline__ __m128 __DEFAULT_FN_ATTRS 111_mm_hsub_ps(__m128 __a, __m128 __b) 112{ 113 return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b); 114} 115 116/// \brief Moves and duplicates high-order (odd-indexed) values from a 128-bit 117/// vector of [4 x float] to float values stored in a 128-bit vector of 118/// [4 x float]. 119/// 120/// \headerfile <x86intrin.h> 121/// 122/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction. 123/// 124/// \param __a 125/// A 128-bit vector of [4 x float]. \n 126/// Bits [127:96] of the source are written to bits [127:96] and [95:64] of 127/// the destination. \n 128/// Bits [63:32] of the source are written to bits [63:32] and [31:0] of the 129/// destination. 130/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated 131/// values. 132static __inline__ __m128 __DEFAULT_FN_ATTRS 133_mm_movehdup_ps(__m128 __a) 134{ 135 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3); 136} 137 138/// \brief Duplicates low-order (even-indexed) values from a 128-bit vector of 139/// [4 x float] to float values stored in a 128-bit vector of [4 x float]. 140/// 141/// \headerfile <x86intrin.h> 142/// 143/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction. 144/// 145/// \param __a 146/// A 128-bit vector of [4 x float] \n 147/// Bits [95:64] of the source are written to bits [127:96] and [95:64] of 148/// the destination. \n 149/// Bits [31:0] of the source are written to bits [63:32] and [31:0] of the 150/// destination. 151/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated 152/// values. 153static __inline__ __m128 __DEFAULT_FN_ATTRS 154_mm_moveldup_ps(__m128 __a) 155{ 156 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2); 157} 158 159/// \brief Adds the even-indexed values and subtracts the odd-indexed values of 160/// two 128-bit vectors of [2 x double]. 161/// 162/// \headerfile <x86intrin.h> 163/// 164/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction. 165/// 166/// \param __a 167/// A 128-bit vector of [2 x double] containing the left source operand. 168/// \param __b 169/// A 128-bit vector of [2 x double] containing the right source operand. 170/// \returns A 128-bit vector of [2 x double] containing the alternating sums 171/// and differences of both operands. 172static __inline__ __m128d __DEFAULT_FN_ATTRS 173_mm_addsub_pd(__m128d __a, __m128d __b) 174{ 175 return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b); 176} 177 178/// \brief Horizontally adds the pairs of values contained in two 128-bit 179/// vectors of [2 x double]. 180/// 181/// \headerfile <x86intrin.h> 182/// 183/// This intrinsic corresponds to the <c> VHADDPD </c> instruction. 184/// 185/// \param __a 186/// A 128-bit vector of [2 x double] containing one of the source operands. 187/// The horizontal sum of the values is stored in the lower bits of the 188/// destination. 189/// \param __b 190/// A 128-bit vector of [2 x double] containing one of the source operands. 191/// The horizontal sum of the values is stored in the upper bits of the 192/// destination. 193/// \returns A 128-bit vector of [2 x double] containing the horizontal sums of 194/// both operands. 195static __inline__ __m128d __DEFAULT_FN_ATTRS 196_mm_hadd_pd(__m128d __a, __m128d __b) 197{ 198 return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b); 199} 200 201/// \brief Horizontally subtracts the pairs of values contained in two 128-bit 202/// vectors of [2 x double]. 203/// 204/// \headerfile <x86intrin.h> 205/// 206/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction. 207/// 208/// \param __a 209/// A 128-bit vector of [2 x double] containing one of the source operands. 210/// The horizontal difference of the values is stored in the lower bits of 211/// the destination. 212/// \param __b 213/// A 128-bit vector of [2 x double] containing one of the source operands. 214/// The horizontal difference of the values is stored in the upper bits of 215/// the destination. 216/// \returns A 128-bit vector of [2 x double] containing the horizontal 217/// differences of both operands. 218static __inline__ __m128d __DEFAULT_FN_ATTRS 219_mm_hsub_pd(__m128d __a, __m128d __b) 220{ 221 return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b); 222} 223 224/// \brief Moves and duplicates one double-precision value to double-precision 225/// values stored in a 128-bit vector of [2 x double]. 226/// 227/// \headerfile <x86intrin.h> 228/// 229/// \code 230/// __m128d _mm_loaddup_pd(double const * dp); 231/// \endcode 232/// 233/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction. 234/// 235/// \param dp 236/// A pointer to a double-precision value to be moved and duplicated. 237/// \returns A 128-bit vector of [2 x double] containing the moved and 238/// duplicated values. 239#define _mm_loaddup_pd(dp) _mm_load1_pd(dp) 240 241/// \brief Moves and duplicates the double-precision value in the lower bits of 242/// a 128-bit vector of [2 x double] to double-precision values stored in a 243/// 128-bit vector of [2 x double]. 244/// 245/// \headerfile <x86intrin.h> 246/// 247/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction. 248/// 249/// \param __a 250/// A 128-bit vector of [2 x double]. Bits [63:0] are written to bits 251/// [127:64] and [63:0] of the destination. 252/// \returns A 128-bit vector of [2 x double] containing the moved and 253/// duplicated values. 254static __inline__ __m128d __DEFAULT_FN_ATTRS 255_mm_movedup_pd(__m128d __a) 256{ 257 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); 258} 259 260#define _MM_DENORMALS_ZERO_ON (0x0040) 261#define _MM_DENORMALS_ZERO_OFF (0x0000) 262 263#define _MM_DENORMALS_ZERO_MASK (0x0040) 264 265#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK) 266#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x))) 267 268/// \brief Establishes a linear address memory range to be monitored and puts 269/// the processor in the monitor event pending state. Data stored in the 270/// monitored address range causes the processor to exit the pending state. 271/// 272/// \headerfile <x86intrin.h> 273/// 274/// This intrinsic corresponds to the <c> MONITOR </c> instruction. 275/// 276/// \param __p 277/// The memory range to be monitored. The size of the range is determined by 278/// CPUID function 0000_0005h. 279/// \param __extensions 280/// Optional extensions for the monitoring state. 281/// \param __hints 282/// Optional hints for the monitoring state. 283static __inline__ void __DEFAULT_FN_ATTRS 284_mm_monitor(void const *__p, unsigned __extensions, unsigned __hints) 285{ 286 __builtin_ia32_monitor((void *)__p, __extensions, __hints); 287} 288 289/// \brief Used with the MONITOR instruction to wait while the processor is in 290/// the monitor event pending state. Data stored in the monitored address 291/// range causes the processor to exit the pending state. 292/// 293/// \headerfile <x86intrin.h> 294/// 295/// This intrinsic corresponds to the <c> MWAIT </c> instruction. 296/// 297/// \param __extensions 298/// Optional extensions for the monitoring state, which may vary by 299/// processor. 300/// \param __hints 301/// Optional hints for the monitoring state, which may vary by processor. 302static __inline__ void __DEFAULT_FN_ATTRS 303_mm_mwait(unsigned __extensions, unsigned __hints) 304{ 305 __builtin_ia32_mwait(__extensions, __hints); 306} 307 308#undef __DEFAULT_FN_ATTRS 309 310#endif /* __PMMINTRIN_H */ 311