/* Copyright (C) 2011-2020 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */

#if !defined _X86INTRIN_H_INCLUDED
# error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef _BMI2INTRIN_H_INCLUDED
#define _BMI2INTRIN_H_INCLUDED

extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u32 (unsigned int __X, unsigned int __Y)
{
  return ((__X << (32 - __Y)) >> (32 - __Y));
}

extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P)
{
  unsigned long long __res = (unsigned long long) __X * __Y;
  *__P = (unsigned int) (__res >> 32);
  return (unsigned int) __res;
}

#ifdef __PPC64__
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u64 (unsigned long long __X, unsigned long long __Y)
{
  return ((__X << (64 - __Y)) >> (64 - __Y));
}

/* __int128 requires base 64-bit.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u64 (unsigned long long __X, unsigned long long __Y,
	   unsigned long long *__P)
{
  unsigned __int128 __res = (unsigned __int128) __X * __Y;
  *__P = (unsigned long long) (__res >> 64);
  return (unsigned long long) __res;
}

#ifdef _ARCH_PWR7
/* popcount and bpermd require Power7 minimum.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u64 (unsigned long long __X, unsigned long long __M)
{
  unsigned long __result = 0x0UL;
  const unsigned long __mask = 0x8000000000000000UL;
  unsigned long __m = __M;
  unsigned long __c, __t;
  unsigned long __p;

  /* The pop-count of the mask gives the number of bits from the
     source to process.  This is also needed to shift bits from the
     source into the correct position for the result.  */
  __p = 64 - __builtin_popcountl (__M);

  /* The loop runs once for each '1' bit in the mask, clearing
     each mask bit as it is processed.  */
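  /* Worked example of the pdep semantics this loop implements: the
     low-order bits of __X are deposited, in ascending order, into the
     '1' positions of __M.  With __M = 0x30 (bits 4 and 5 set), bit 0
     of __X lands in bit 4 and bit 1 of __X in bit 5, so
     _pdep_u64 (0x3, 0x30) == 0x30 and _pdep_u64 (0x2, 0x30) == 0x20.  */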
  while (__m != 0)
    {
      __c = __builtin_clzl (__m);
      __t = __X << (__p - __c);
      __m ^= (__mask >> __c);
      __result |= (__t & (__mask >> __c));
      __p++;
    }
  return __result;
}

extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u64 (unsigned long long __X, unsigned long long __M)
{
  unsigned long __p = 0x4040404040404040UL; // initial bit permute control
  const unsigned long __mask = 0x8000000000000000UL;
  unsigned long __m = __M;
  unsigned long __c;
  unsigned long __result;

  /* If the mask is constant and selects 8 bits or fewer, we can use
     the Power8 bit-permute (bpermd) instruction.  */
  if (__builtin_constant_p (__M) && (__builtin_popcountl (__M) <= 8))
    {
      /* If the pext mask is constant, then the popcount is also
	 constant, so we can evaluate the following loop at compile
	 time and use a constant bit permute vector.  */
      for (long __i = 0; __i < __builtin_popcountl (__M); __i++)
	{
	  __c = __builtin_clzl (__m);
	  __p = (__p << 8) | __c;
	  __m ^= (__mask >> __c);
	}
      __result = __builtin_bpermd (__p, __X);
    }
  else
    {
      __p = 64 - __builtin_popcountl (__M);
      __result = 0;
      /* We could use a for loop here, but that combined with
	 -funroll-loops can expand to a lot of code.  The while
	 loop avoids unrolling and the compiler commons the xor
	 from clearing the mask bit with the (__m != 0) test.  The
	 result is a more compact loop setup and body.  */
      while (__m != 0)
	{
	  unsigned long __t;
	  __c = __builtin_clzl (__m);
	  __t = (__X & (__mask >> __c)) >> (__p - __c);
	  __m ^= (__mask >> __c);
	  __result |= (__t);
	  __p++;
	}
    }
  return __result;
}

/* These 32-bit implementations depend on 64-bit pdep/pext, which in
   turn depend on _ARCH_PWR7.  */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u32 (unsigned int __X, unsigned int __Y)
{
  return _pdep_u64 (__X, __Y);
}

extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u32 (unsigned int __X, unsigned int __Y)
{
  return _pext_u64 (__X, __Y);
}
#endif /* _ARCH_PWR7 */
#endif /* __PPC64__ */

#endif /* _BMI2INTRIN_H_INCLUDED */
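/* Usage sketch, illustrative only; pack_byte_msbs is a hypothetical
   example function, not part of this header.  On powerpc64le, code
   that previously included <x86intrin.h> for BMI2 can keep its
   intrinsic calls unchanged:

     #include <x86intrin.h>

     unsigned long long
     pack_byte_msbs (unsigned long long __v)
     {
       return _pext_u64 (__v, 0x8080808080808080ULL);
     }

   The mask is a compile-time constant with eight set bits, so the
   __builtin_constant_p path in _pext_u64 above applies and the call
   can reduce to a single bpermd, gathering the most significant bit
   of each byte into the low eight bits of the result.  */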