/* Copyright (C) 2011-2020 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
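/* As an illustrative sketch of that note (an example, not a required
   rewrite): a call such as _bzhi_u32 (__x, __n), where __n is known to
   lie in the range 1..31, can typically be replaced by portable C along
   the lines of

     __x & ((1U << __n) - 1U)

   which any compiler can optimize directly for its own target.  */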

#if !defined _X86INTRIN_H_INCLUDED
# error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef _BMI2INTRIN_H_INCLUDED
#define _BMI2INTRIN_H_INCLUDED

extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u32 (unsigned int __X, unsigned int __Y)
{
  return ((__X << (32 - __Y)) >> (32 - __Y));
}
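/* For illustration: _bzhi_u32 returns __X with all bits at positions
   __Y and above cleared, so _bzhi_u32 (0xFFFFFFFF, 8) yields 0xFF.
   Note that the shift-based emulation above assumes 1 <= __Y <= 32;
   __Y == 0 would shift a 32-bit value by 32, which is undefined
   behaviour in C.  */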

extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P)
{
  unsigned long long __res = (unsigned long long) __X * __Y;
  *__P = (unsigned int) (__res >> 32);
  return (unsigned int) __res;
}
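/* For illustration: _mulx_u32 forms the full 64-bit product of __X and
   __Y, stores the high 32 bits through __P, and returns the low 32
   bits.  For example, with __X = 0xFFFFFFFF and __Y = 2 the product is
   0x1FFFFFFFE, so the return value is 0xFFFFFFFE and *__P is 1.  */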

#ifdef  __PPC64__
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u64 (unsigned long long __X, unsigned long long __Y)
{
  return ((__X << (64 - __Y)) >> (64 - __Y));
}

/* __int128 requires base 64-bit.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u64 (unsigned long long __X, unsigned long long __Y,
           unsigned long long *__P)
{
  unsigned __int128 __res = (unsigned __int128) __X * __Y;
  *__P = (unsigned long long) (__res >> 64);
  return (unsigned long long) __res;
}

#ifdef  _ARCH_PWR7
/* The popcount and bpermd instructions require POWER7 at minimum.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u64 (unsigned long long __X, unsigned long long __M)
{
  unsigned long __result = 0x0UL;
  const unsigned long __mask = 0x8000000000000000UL;
  unsigned long __m = __M;
  unsigned long __c, __t;
  unsigned long __p;

  /* The pop-count of the mask gives the number of bits from the
     source to process.  It is also needed to shift bits from the
     source into the correct position for the result.  */
  __p = 64 - __builtin_popcountl (__M);

  /* The loop runs once for each '1' bit in the mask, clearing each
     mask bit as it is processed.  */
  while (__m != 0)
    {
      __c = __builtin_clzl (__m);
      __t = __X << (__p - __c);
      __m ^= (__mask >> __c);
      __result |= (__t & (__mask >> __c));
      __p++;
    }
  return __result;
}
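/* For illustration: _pdep_u64 deposits the low-order bits of __X into
   the bit positions selected by the '1' bits of __M, from least to
   most significant.  For example, _pdep_u64 (0x5, 0xF0F0) yields 0x50:
   source bits 1 and 0 land in mask positions 6 and 4.  */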

extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u64 (unsigned long long __X, unsigned long long __M)
{
  unsigned long __p = 0x4040404040404040UL; /* Initial bit permute control.  */
  const unsigned long __mask = 0x8000000000000000UL;
  unsigned long __m = __M;
  unsigned long __c;
  unsigned long __result;

  /* If the mask is constant and selects 8 bits or fewer, we can use
     the POWER8 bit permute (bpermd) instruction.  */
  if (__builtin_constant_p (__M) && (__builtin_popcountl (__M) <= 8))
    {
      /* Also, if the pext mask is constant then the popcount is
         constant, so we can evaluate the following loop at compile
         time and use a constant bit permute vector.  */
      for (long __i = 0; __i < __builtin_popcountl (__M); __i++)
        {
          __c = __builtin_clzl (__m);
          __p = (__p << 8) | __c;
          __m ^= (__mask >> __c);
        }
      __result = __builtin_bpermd (__p, __X);
    }
  else
    {
      __p = 64 - __builtin_popcountl (__M);
      __result = 0;
      /* We could use a for loop here, but that combined with
         -funroll-loops can expand to a lot of code.  The while
         loop avoids unrolling, and the compiler commons the xor
         that clears the mask bit with the (__m != 0) test.  The
         result is a more compact loop setup and body.  */
      while (__m != 0)
        {
          unsigned long __t;
          __c = __builtin_clzl (__m);
          __t = (__X & (__mask >> __c)) >> (__p - __c);
          __m ^= (__mask >> __c);
          __result |= (__t);
          __p++;
        }
    }
  return __result;
}
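/* For illustration: _pext_u64 performs the inverse of _pdep_u64's
   deposit: it gathers the bits of __X at the positions where __M has a
   '1' and packs them into the low-order bits of the result.  For
   example, _pext_u64 (0x50, 0xF0F0) yields 0x5, and
   _pext_u64 (__x, 0x0F0F0F0F0F0F0F0FUL) packs the low nibble of each
   byte of __x into the low 32 bits of the result.  */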

/* These 32-bit implementations depend on the 64-bit pdep/pext above,
   which in turn depend on _ARCH_PWR7.  */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u32 (unsigned int __X, unsigned int __Y)
{
  return _pdep_u64 (__X, __Y);
}

extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u32 (unsigned int __X, unsigned int __Y)
{
  return _pext_u64 (__X, __Y);
}
#endif /* _ARCH_PWR7  */
#endif /* __PPC64__  */

#endif /* _BMI2INTRIN_H_INCLUDED */