1/*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===------------------------------------------------------------------------===
8 */
9
10#ifndef __IMMINTRIN_H
11#error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
12#endif /* __IMMINTRIN_H */
13
14#ifndef __AMXINTRIN_H
15#define __AMXINTRIN_H
16#ifdef __x86_64__
17
/* Attribute set shared by the inline wrapper functions in this header:
 * always inline the wrapper, emit no debug info for it, and enable the
 * "amx-tile" target feature for its body. The macro is #undef'd again at the
 * end of the header. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
20
21/// Load tile configuration from a 64-byte memory location specified by
22/// "mem_addr". The tile configuration includes the tile type palette, the
23/// number of bytes per row, and the number of rows. If the specified
24/// palette_id is zero, that signifies the init state for both the tile
25/// config and the tile data, and the tiles are zeroed. Any invalid
26/// configurations will result in #GP fault.
27///
28/// \headerfile <x86intrin.h>
29///
30/// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
31///
32/// \param __config
33///    A pointer to 512-bits configuration
34static __inline__ void __DEFAULT_FN_ATTRS
35_tile_loadconfig(const void *__config)
36{
37  __builtin_ia32_tile_loadconfig(__config);
38}
39
40/// Stores the current tile configuration to a 64-byte memory location
41/// specified by "mem_addr". The tile configuration includes the tile type
42/// palette, the number of bytes per row, and the number of rows. If tiles
43/// are not configured, all zeroes will be stored to memory.
44///
45/// \headerfile <x86intrin.h>
46///
47/// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
48///
49/// \param __config
50///    A pointer to 512-bits configuration
51static __inline__ void __DEFAULT_FN_ATTRS
52_tile_storeconfig(void *__config)
53{
54  __builtin_ia32_tile_storeconfig(__config);
55}
56
57/// Release the tile configuration to return to the init state, which
58/// releases all storage it currently holds.
59///
60/// \headerfile <x86intrin.h>
61///
62/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
63static __inline__ void __DEFAULT_FN_ATTRS
64_tile_release(void)
65{
66  __builtin_ia32_tilerelease();
67}
68
/// Loads tile rows from the memory specified by the "base" address and
/// "stride" into the destination tile "dst", using the tile configuration
/// previously configured via "_tile_loadconfig".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
///
/// \param dst
///    A destination tile. Max size is 1024 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
#define _tile_loadd(dst, base, stride)                                         \
  __builtin_ia32_tileloadd64((dst), ((const void *)(base)),                    \
                             (__SIZE_TYPE__)(stride))
85
/// Loads tile rows from the memory specified by the "base" address and
/// "stride" into the destination tile "dst", using the tile configuration
/// previously configured via "_tile_loadconfig". This intrinsic additionally
/// hints to the implementation that the data will likely not be reused in the
/// near future, so data caching can be optimized accordingly.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
///
/// \param dst
///    A destination tile. Max size is 1024 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
#define _tile_stream_loadd(dst, base, stride)                                  \
  __builtin_ia32_tileloaddt164((dst), ((const void *)(base)),                  \
                               (__SIZE_TYPE__)(stride))
104
/// Stores the tile specified by "src" to the memory specified by the "base"
/// address and "stride", using the tile configuration previously configured
/// via "_tile_loadconfig".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
///
/// \param src
///    The source tile whose rows are stored. Max size is 1024 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be stored in memory.
#define _tile_stored(src, base, stride)                                        \
  __builtin_ia32_tilestored64((src), ((void *)(base)), (__SIZE_TYPE__)(stride))
121
/// Zeroes the tile specified by "tile".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
///
/// \param tile
///    The destination tile to be zeroed. Max size is 1024 Bytes.
#define _tile_zero(tile)                                                       \
  __builtin_ia32_tilezero((tile))
131
/// Computes the dot-product of bytes in tiles with a source/destination
/// accumulator. Groups of 4 adjacent signed 8-bit integers in "src0" are
/// multiplied with the corresponding signed 8-bit integers in "src1",
/// producing 4 intermediate 32-bit results. These 4 results are summed with
/// the corresponding 32-bit integer in "dst", and the 32-bit result is stored
/// back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbssd(dst, src0, src1)                                          \
  __builtin_ia32_tdpbssd((dst), (src0), (src1))
149
/// Computes the dot-product of bytes in tiles with a source/destination
/// accumulator. Groups of 4 adjacent signed 8-bit integers in "src0" are
/// multiplied with the corresponding unsigned 8-bit integers in "src1",
/// producing 4 intermediate 32-bit results. These 4 results are summed with
/// the corresponding 32-bit integer in "dst", and the 32-bit result is stored
/// back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbsud(dst, src0, src1)                                          \
  __builtin_ia32_tdpbsud((dst), (src0), (src1))
167
/// Computes the dot-product of bytes in tiles with a source/destination
/// accumulator. Groups of 4 adjacent unsigned 8-bit integers in "src0" are
/// multiplied with the corresponding signed 8-bit integers in "src1",
/// producing 4 intermediate 32-bit results. These 4 results are summed with
/// the corresponding 32-bit integer in "dst", and the 32-bit result is stored
/// back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbusd(dst, src0, src1)                                          \
  __builtin_ia32_tdpbusd((dst), (src0), (src1))
185
/// Computes the dot-product of bytes in tiles with a source/destination
/// accumulator. Groups of 4 adjacent unsigned 8-bit integers in "src0" are
/// multiplied with the corresponding unsigned 8-bit integers in "src1",
/// producing 4 intermediate 32-bit results. These 4 results are summed with
/// the corresponding 32-bit integer in "dst", and the 32-bit result is stored
/// back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbuud(dst, src0, src1)                                          \
  __builtin_ia32_tdpbuud((dst), (src0), (src1))
203
/// Computes the dot-product of BF16 (16-bit) floating-point pairs in tiles
/// "src0" and "src1", accumulates the intermediate single-precision (32-bit)
/// floating-point elements with the elements in "dst", and stores the 32-bit
/// result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbf16ps(dst, src0, src1)                                        \
  __builtin_ia32_tdpbf16ps((dst), (src0), (src1))
221
222#undef __DEFAULT_FN_ATTRS
223
224#endif /* __x86_64__ */
225#endif /* __AMXINTRIN_H */
226