1/*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===------------------------------------------------------------------------=== 8 */ 9 10#ifndef __IMMINTRIN_H 11#error "Never use <amxintrin.h> directly; include <immintrin.h> instead." 12#endif /* __IMMINTRIN_H */ 13 14#ifndef __AMXINTRIN_H 15#define __AMXINTRIN_H 16#ifdef __x86_64__ 17 18#define __DEFAULT_FN_ATTRS \ 19 __attribute__((__always_inline__, __nodebug__, __target__("amx-tile"))) 20 21/// Load tile configuration from a 64-byte memory location specified by 22/// "mem_addr". The tile configuration includes the tile type palette, the 23/// number of bytes per row, and the number of rows. If the specified 24/// palette_id is zero, that signifies the init state for both the tile 25/// config and the tile data, and the tiles are zeroed. Any invalid 26/// configurations will result in #GP fault. 27/// 28/// \headerfile <x86intrin.h> 29/// 30/// This intrinsic corresponds to the <c> LDTILECFG </c> instruction. 31/// 32/// \param __config 33/// A pointer to 512-bits configuration 34static __inline__ void __DEFAULT_FN_ATTRS 35_tile_loadconfig(const void *__config) 36{ 37 __builtin_ia32_tile_loadconfig(__config); 38} 39 40/// Stores the current tile configuration to a 64-byte memory location 41/// specified by "mem_addr". The tile configuration includes the tile type 42/// palette, the number of bytes per row, and the number of rows. If tiles 43/// are not configured, all zeroes will be stored to memory. 44/// 45/// \headerfile <x86intrin.h> 46/// 47/// This intrinsic corresponds to the <c> STTILECFG </c> instruction. 48/// 49/// \param __config 50/// A pointer to 512-bits configuration 51static __inline__ void __DEFAULT_FN_ATTRS 52_tile_storeconfig(void *__config) 53{ 54 __builtin_ia32_tile_storeconfig(__config); 55} 56 57/// Release the tile configuration to return to the init state, which 58/// releases all storage it currently holds. 59/// 60/// \headerfile <x86intrin.h> 61/// 62/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction. 63static __inline__ void __DEFAULT_FN_ATTRS 64_tile_release(void) 65{ 66 __builtin_ia32_tilerelease(); 67} 68 69/// Load tile rows from memory specifieid by "base" address and "stride" into 70/// destination tile "dst" using the tile configuration previously configured 71/// via "_tile_loadconfig". 72/// 73/// \headerfile <x86intrin.h> 74/// 75/// This intrinsic corresponds to the <c> TILELOADD </c> instruction. 76/// 77/// \param dst 78/// A destination tile. Max size is 1024 Bytes. 79/// \param base 80/// A pointer to base address. 81/// \param stride 82/// The stride between the rows' data to be loaded in memory. 83#define _tile_loadd(dst, base, stride) \ 84 __builtin_ia32_tileloadd64((dst), ((const void *)(base)), (__SIZE_TYPE__)(stride)) 85 86/// Load tile rows from memory specifieid by "base" address and "stride" into 87/// destination tile "dst" using the tile configuration previously configured 88/// via "_tile_loadconfig". This intrinsic provides a hint to the implementation 89/// that the data will likely not be reused in the near future and the data 90/// caching can be optimized accordingly. 91/// 92/// \headerfile <x86intrin.h> 93/// 94/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction. 95/// 96/// \param dst 97/// A destination tile. Max size is 1024 Bytes. 98/// \param base 99/// A pointer to base address. 100/// \param stride 101/// The stride between the rows' data to be loaded in memory. 102#define _tile_stream_loadd(dst, base, stride) \ 103 __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), (__SIZE_TYPE__)(stride)) 104 105/// Store the tile specified by "src" to memory specifieid by "base" address and 106/// "stride" using the tile configuration previously configured via 107/// "_tile_loadconfig". 108/// 109/// \headerfile <x86intrin.h> 110/// 111/// This intrinsic corresponds to the <c> TILESTORED </c> instruction. 112/// 113/// \param dst 114/// A destination tile. Max size is 1024 Bytes. 115/// \param base 116/// A pointer to base address. 117/// \param stride 118/// The stride between the rows' data to be stored in memory. 119#define _tile_stored(dst, base, stride) \ 120 __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride)) 121 122/// Zero the tile specified by "tdest". 123/// 124/// \headerfile <x86intrin.h> 125/// 126/// This intrinsic corresponds to the <c> TILEZERO </c> instruction. 127/// 128/// \param tile 129/// The destination tile to be zero. Max size is 1024 Bytes. 130#define _tile_zero(tile) __builtin_ia32_tilezero((tile)) 131 132/// Compute dot-product of bytes in tiles with a source/destination accumulator. 133/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with 134/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit 135/// results. Sum these 4 results with the corresponding 32-bit integer in "dst", 136/// and store the 32-bit result back to tile "dst". 137/// 138/// \headerfile <x86intrin.h> 139/// 140/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction. 141/// 142/// \param dst 143/// The destination tile. Max size is 1024 Bytes. 144/// \param src0 145/// The 1st source tile. Max size is 1024 Bytes. 146/// \param src1 147/// The 2nd source tile. Max size is 1024 Bytes. 148#define _tile_dpbssd(dst, src0, src1) __builtin_ia32_tdpbssd((dst), (src0), (src1)) 149 150/// Compute dot-product of bytes in tiles with a source/destination accumulator. 151/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with 152/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate 153/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer 154/// in "dst", and store the 32-bit result back to tile "dst". 155/// 156/// \headerfile <x86intrin.h> 157/// 158/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction. 159/// 160/// \param dst 161/// The destination tile. Max size is 1024 Bytes. 162/// \param src0 163/// The 1st source tile. Max size is 1024 Bytes. 164/// \param src1 165/// The 2nd source tile. Max size is 1024 Bytes. 166#define _tile_dpbsud(dst, src0, src1) __builtin_ia32_tdpbsud((dst), (src0), (src1)) 167 168/// Compute dot-product of bytes in tiles with a source/destination accumulator. 169/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with 170/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit 171/// results. Sum these 4 results with the corresponding 32-bit integer in "dst", 172/// and store the 32-bit result back to tile "dst". 173/// 174/// \headerfile <x86intrin.h> 175/// 176/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction. 177/// 178/// \param dst 179/// The destination tile. Max size is 1024 Bytes. 180/// \param src0 181/// The 1st source tile. Max size is 1024 Bytes. 182/// \param src1 183/// The 2nd source tile. Max size is 1024 Bytes. 184#define _tile_dpbusd(dst, src0, src1) __builtin_ia32_tdpbusd((dst), (src0), (src1)) 185 186/// Compute dot-product of bytes in tiles with a source/destination accumulator. 187/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with 188/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate 189/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in 190/// "dst", and store the 32-bit result back to tile "dst". 191/// 192/// \headerfile <x86intrin.h> 193/// 194/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction. 195/// 196/// \param dst 197/// The destination tile. Max size is 1024 Bytes. 198/// \param src0 199/// The 1st source tile. Max size is 1024 Bytes. 200/// \param src1 201/// The 2nd source tile. Max size is 1024 Bytes. 202#define _tile_dpbuud(dst, src0, src1) __builtin_ia32_tdpbuud((dst), (src0), (src1)) 203 204/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and 205/// src1, accumulating the intermediate single-precision (32-bit) floating-point 206/// elements with elements in "dst", and store the 32-bit result back to tile 207/// "dst". 208/// 209/// \headerfile <x86intrin.h> 210/// 211/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction. 212/// 213/// \param dst 214/// The destination tile. Max size is 1024 Bytes. 215/// \param src0 216/// The 1st source tile. Max size is 1024 Bytes. 217/// \param src1 218/// The 2nd source tile. Max size is 1024 Bytes. 219#define _tile_dpbf16ps(dst, src0, src1) \ 220 __builtin_ia32_tdpbf16ps((dst), (src0), (src1)) 221 222#undef __DEFAULT_FN_ATTRS 223 224#endif /* __x86_64__ */ 225#endif /* __AMXINTRIN_H */ 226