1/*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===------------------------------------------------------------------------=== 8 */ 9 10#ifndef __IMMINTRIN_H 11#error "Never use <amxintrin.h> directly; include <immintrin.h> instead." 12#endif /* __IMMINTRIN_H */ 13 14#ifndef __AMXINTRIN_H 15#define __AMXINTRIN_H 16#ifdef __x86_64__ 17 18/* Define the default attributes for the functions in this file. */ 19#define __DEFAULT_FN_ATTRS_TILE \ 20 __attribute__((__always_inline__, __nodebug__, __target__("amx-tile"))) 21#define __DEFAULT_FN_ATTRS_INT8 \ 22 __attribute__((__always_inline__, __nodebug__, __target__("amx-int8"))) 23#define __DEFAULT_FN_ATTRS_BF16 \ 24 __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16"))) 25#define __DEFAULT_FN_ATTRS_FP16 \ 26 __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16"))) 27 28/// Load tile configuration from a 64-byte memory location specified by 29/// "mem_addr". The tile configuration includes the tile type palette, the 30/// number of bytes per row, and the number of rows. If the specified 31/// palette_id is zero, that signifies the init state for both the tile 32/// config and the tile data, and the tiles are zeroed. Any invalid 33/// configurations will result in #GP fault. 34/// 35/// \headerfile <immintrin.h> 36/// 37/// This intrinsic corresponds to the <c> LDTILECFG </c> instruction. 38/// 39/// \param __config 40/// A pointer to 512-bits configuration 41static __inline__ void __DEFAULT_FN_ATTRS_TILE 42_tile_loadconfig(const void *__config) { 43 __builtin_ia32_tile_loadconfig(__config); 44} 45 46/// Stores the current tile configuration to a 64-byte memory location 47/// specified by "mem_addr". The tile configuration includes the tile type 48/// palette, the number of bytes per row, and the number of rows. If tiles 49/// are not configured, all zeroes will be stored to memory. 50/// 51/// \headerfile <immintrin.h> 52/// 53/// This intrinsic corresponds to the <c> STTILECFG </c> instruction. 54/// 55/// \param __config 56/// A pointer to 512-bits configuration 57static __inline__ void __DEFAULT_FN_ATTRS_TILE 58_tile_storeconfig(void *__config) { 59 __builtin_ia32_tile_storeconfig(__config); 60} 61 62/// Release the tile configuration to return to the init state, which 63/// releases all storage it currently holds. 64/// 65/// \headerfile <immintrin.h> 66/// 67/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction. 68static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) { 69 __builtin_ia32_tilerelease(); 70} 71 72/// Load tile rows from memory specifieid by "base" address and "stride" into 73/// destination tile "dst" using the tile configuration previously configured 74/// via "_tile_loadconfig". 75/// 76/// \headerfile <immintrin.h> 77/// 78/// This intrinsic corresponds to the <c> TILELOADD </c> instruction. 79/// 80/// \param dst 81/// A destination tile. Max size is 1024 Bytes. 82/// \param base 83/// A pointer to base address. 84/// \param stride 85/// The stride between the rows' data to be loaded in memory. 86#define _tile_loadd(dst, base, stride) \ 87 __builtin_ia32_tileloadd64((dst), ((const void *)(base)), \ 88 (__SIZE_TYPE__)(stride)) 89 90/// Load tile rows from memory specifieid by "base" address and "stride" into 91/// destination tile "dst" using the tile configuration previously configured 92/// via "_tile_loadconfig". This intrinsic provides a hint to the implementation 93/// that the data will likely not be reused in the near future and the data 94/// caching can be optimized accordingly. 95/// 96/// \headerfile <immintrin.h> 97/// 98/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction. 99/// 100/// \param dst 101/// A destination tile. Max size is 1024 Bytes. 102/// \param base 103/// A pointer to base address. 104/// \param stride 105/// The stride between the rows' data to be loaded in memory. 106#define _tile_stream_loadd(dst, base, stride) \ 107 __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), \ 108 (__SIZE_TYPE__)(stride)) 109 110/// Store the tile specified by "src" to memory specifieid by "base" address and 111/// "stride" using the tile configuration previously configured via 112/// "_tile_loadconfig". 113/// 114/// \headerfile <immintrin.h> 115/// 116/// This intrinsic corresponds to the <c> TILESTORED </c> instruction. 117/// 118/// \param dst 119/// A destination tile. Max size is 1024 Bytes. 120/// \param base 121/// A pointer to base address. 122/// \param stride 123/// The stride between the rows' data to be stored in memory. 124#define _tile_stored(dst, base, stride) \ 125 __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride)) 126 127/// Zero the tile specified by "tdest". 128/// 129/// \headerfile <immintrin.h> 130/// 131/// This intrinsic corresponds to the <c> TILEZERO </c> instruction. 132/// 133/// \param tile 134/// The destination tile to be zero. Max size is 1024 Bytes. 135#define _tile_zero(tile) __builtin_ia32_tilezero((tile)) 136 137/// Compute dot-product of bytes in tiles with a source/destination accumulator. 138/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with 139/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit 140/// results. Sum these 4 results with the corresponding 32-bit integer in "dst", 141/// and store the 32-bit result back to tile "dst". 142/// 143/// \headerfile <immintrin.h> 144/// 145/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction. 146/// 147/// \param dst 148/// The destination tile. Max size is 1024 Bytes. 149/// \param src0 150/// The 1st source tile. Max size is 1024 Bytes. 151/// \param src1 152/// The 2nd source tile. Max size is 1024 Bytes. 153#define _tile_dpbssd(dst, src0, src1) \ 154 __builtin_ia32_tdpbssd((dst), (src0), (src1)) 155 156/// Compute dot-product of bytes in tiles with a source/destination accumulator. 157/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with 158/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate 159/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer 160/// in "dst", and store the 32-bit result back to tile "dst". 161/// 162/// \headerfile <immintrin.h> 163/// 164/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction. 165/// 166/// \param dst 167/// The destination tile. Max size is 1024 Bytes. 168/// \param src0 169/// The 1st source tile. Max size is 1024 Bytes. 170/// \param src1 171/// The 2nd source tile. Max size is 1024 Bytes. 172#define _tile_dpbsud(dst, src0, src1) \ 173 __builtin_ia32_tdpbsud((dst), (src0), (src1)) 174 175/// Compute dot-product of bytes in tiles with a source/destination accumulator. 176/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with 177/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit 178/// results. Sum these 4 results with the corresponding 32-bit integer in "dst", 179/// and store the 32-bit result back to tile "dst". 180/// 181/// \headerfile <immintrin.h> 182/// 183/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction. 184/// 185/// \param dst 186/// The destination tile. Max size is 1024 Bytes. 187/// \param src0 188/// The 1st source tile. Max size is 1024 Bytes. 189/// \param src1 190/// The 2nd source tile. Max size is 1024 Bytes. 191#define _tile_dpbusd(dst, src0, src1) \ 192 __builtin_ia32_tdpbusd((dst), (src0), (src1)) 193 194/// Compute dot-product of bytes in tiles with a source/destination accumulator. 195/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with 196/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate 197/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in 198/// "dst", and store the 32-bit result back to tile "dst". 199/// 200/// \headerfile <immintrin.h> 201/// 202/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction. 203/// 204/// \param dst 205/// The destination tile. Max size is 1024 Bytes. 206/// \param src0 207/// The 1st source tile. Max size is 1024 Bytes. 208/// \param src1 209/// The 2nd source tile. Max size is 1024 Bytes. 210#define _tile_dpbuud(dst, src0, src1) \ 211 __builtin_ia32_tdpbuud((dst), (src0), (src1)) 212 213/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and 214/// src1, accumulating the intermediate single-precision (32-bit) floating-point 215/// elements with elements in "dst", and store the 32-bit result back to tile 216/// "dst". 217/// 218/// \headerfile <immintrin.h> 219/// 220/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction. 221/// 222/// \param dst 223/// The destination tile. Max size is 1024 Bytes. 224/// \param src0 225/// The 1st source tile. Max size is 1024 Bytes. 226/// \param src1 227/// The 2nd source tile. Max size is 1024 Bytes. 228#define _tile_dpbf16ps(dst, src0, src1) \ 229 __builtin_ia32_tdpbf16ps((dst), (src0), (src1)) 230 231/// AMX tile register size can be configured, the maximum size is 16x64=1024 232/// bytes. Since there is no 2D type in llvm IR, we use vector type to 233/// represent 2D tile and the fixed size is maximum amx tile register size. 234typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64))); 235 236/// This is internal intrinsic. C/C++ user should avoid calling it directly. 237static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 238_tile_loadd_internal(unsigned short m, unsigned short n, const void *base, 239 __SIZE_TYPE__ stride) { 240 return __builtin_ia32_tileloadd64_internal(m, n, base, 241 (__SIZE_TYPE__)(stride)); 242} 243 244/// This is internal intrinsic. C/C++ user should avoid calling it directly. 245static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 246_tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base, 247 __SIZE_TYPE__ stride) { 248 return __builtin_ia32_tileloaddt164_internal(m, n, base, 249 (__SIZE_TYPE__)(stride)); 250} 251 252/// This is internal intrinsic. C/C++ user should avoid calling it directly. 253static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 254_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k, 255 _tile1024i dst, _tile1024i src1, _tile1024i src2) { 256 return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2); 257} 258 259/// This is internal intrinsic. C/C++ user should avoid calling it directly. 260static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 261_tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k, 262 _tile1024i dst, _tile1024i src1, _tile1024i src2) { 263 return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2); 264} 265 266/// This is internal intrinsic. C/C++ user should avoid calling it directly. 267static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 268_tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k, 269 _tile1024i dst, _tile1024i src1, _tile1024i src2) { 270 return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2); 271} 272 273/// This is internal intrinsic. C/C++ user should avoid calling it directly. 274static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 275_tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k, 276 _tile1024i dst, _tile1024i src1, _tile1024i src2) { 277 return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2); 278} 279 280/// This is internal intrinsic. C/C++ user should avoid calling it directly. 281static __inline__ void __DEFAULT_FN_ATTRS_INT8 282_tile_stored_internal(unsigned short m, unsigned short n, void *base, 283 __SIZE_TYPE__ stride, _tile1024i tile) { 284 return __builtin_ia32_tilestored64_internal(m, n, base, 285 (__SIZE_TYPE__)(stride), tile); 286} 287 288/// This is internal intrinsic. C/C++ user should avoid calling it directly. 289static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16 290_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k, 291 _tile1024i dst, _tile1024i src1, _tile1024i src2) { 292 return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2); 293} 294 295/// This is internal intrinsic. C/C++ user should avoid calling it directly. 296static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP16 297_tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k, 298 _tile1024i dst, _tile1024i src1, _tile1024i src2) { 299 return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2); 300} 301 302/// This struct pack the shape and tile data together for user. We suggest 303/// initializing the struct as early as possible, because compiler depends 304/// on the shape information to do configure. The constant value is preferred 305/// for optimization by compiler. 306typedef struct __tile1024i_str { 307 const unsigned short row; 308 const unsigned short col; 309 _tile1024i tile; 310} __tile1024i; 311 312/// Load tile rows from memory specifieid by "base" address and "stride" into 313/// destination tile "dst". 314/// 315/// \headerfile <immintrin.h> 316/// 317/// This intrinsic corresponds to the <c> TILELOADD </c> instruction. 318/// 319/// \param dst 320/// A destination tile. Max size is 1024 Bytes. 321/// \param base 322/// A pointer to base address. 323/// \param stride 324/// The stride between the rows' data to be loaded in memory. 325__DEFAULT_FN_ATTRS_TILE 326static __inline__ void __tile_loadd(__tile1024i *dst, const void *base, 327 __SIZE_TYPE__ stride) { 328 dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride); 329} 330 331/// Load tile rows from memory specifieid by "base" address and "stride" into 332/// destination tile "dst". This intrinsic provides a hint to the implementation 333/// that the data will likely not be reused in the near future and the data 334/// caching can be optimized accordingly. 335/// 336/// \headerfile <immintrin.h> 337/// 338/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction. 339/// 340/// \param dst 341/// A destination tile. Max size is 1024 Bytes. 342/// \param base 343/// A pointer to base address. 344/// \param stride 345/// The stride between the rows' data to be loaded in memory. 346__DEFAULT_FN_ATTRS_TILE 347static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base, 348 __SIZE_TYPE__ stride) { 349 dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride); 350} 351 352/// Compute dot-product of bytes in tiles with a source/destination accumulator. 353/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with 354/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit 355/// results. Sum these 4 results with the corresponding 32-bit integer in "dst", 356/// and store the 32-bit result back to tile "dst". 357/// 358/// \headerfile <immintrin.h> 359/// 360/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction. 361/// 362/// \param dst 363/// The destination tile. Max size is 1024 Bytes. 364/// \param src0 365/// The 1st source tile. Max size is 1024 Bytes. 366/// \param src1 367/// The 2nd source tile. Max size is 1024 Bytes. 368__DEFAULT_FN_ATTRS_INT8 369static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0, 370 __tile1024i src1) { 371 dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile, 372 src0.tile, src1.tile); 373} 374 375/// Compute dot-product of bytes in tiles with a source/destination accumulator. 376/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with 377/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate 378/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer 379/// in "dst", and store the 32-bit result back to tile "dst". 380/// 381/// \headerfile <immintrin.h> 382/// 383/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction. 384/// 385/// \param dst 386/// The destination tile. Max size is 1024 Bytes. 387/// \param src0 388/// The 1st source tile. Max size is 1024 Bytes. 389/// \param src1 390/// The 2nd source tile. Max size is 1024 Bytes. 391__DEFAULT_FN_ATTRS_INT8 392static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0, 393 __tile1024i src1) { 394 dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile, 395 src0.tile, src1.tile); 396} 397 398/// Compute dot-product of bytes in tiles with a source/destination accumulator. 399/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with 400/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit 401/// results. Sum these 4 results with the corresponding 32-bit integer in "dst", 402/// and store the 32-bit result back to tile "dst". 403/// 404/// \headerfile <immintrin.h> 405/// 406/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction. 407/// 408/// \param dst 409/// The destination tile. Max size is 1024 Bytes. 410/// \param src0 411/// The 1st source tile. Max size is 1024 Bytes. 412/// \param src1 413/// The 2nd source tile. Max size is 1024 Bytes. 414__DEFAULT_FN_ATTRS_INT8 415static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0, 416 __tile1024i src1) { 417 dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile, 418 src0.tile, src1.tile); 419} 420 421/// Compute dot-product of bytes in tiles with a source/destination accumulator. 422/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with 423/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate 424/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in 425/// "dst", and store the 32-bit result back to tile "dst". 426/// 427/// \headerfile <immintrin.h> 428/// 429/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction. 430/// 431/// \param dst 432/// The destination tile. Max size is 1024 Bytes. 433/// \param src0 434/// The 1st source tile. Max size is 1024 Bytes. 435/// \param src1 436/// The 2nd source tile. Max size is 1024 Bytes. 437__DEFAULT_FN_ATTRS_INT8 438static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0, 439 __tile1024i src1) { 440 dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile, 441 src0.tile, src1.tile); 442} 443 444/// Store the tile specified by "src" to memory specifieid by "base" address and 445/// "stride". 446/// 447/// \headerfile <immintrin.h> 448/// 449/// This intrinsic corresponds to the <c> TILESTORED </c> instruction. 450/// 451/// \param base 452/// A pointer to base address. 453/// \param stride 454/// The stride between the rows' data to be stored in memory. 455__DEFAULT_FN_ATTRS_TILE 456static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride, 457 __tile1024i src) { 458 _tile_stored_internal(src.row, src.col, base, stride, src.tile); 459} 460 461/// Zero the tile specified by "dst". 462/// 463/// \headerfile <immintrin.h> 464/// 465/// This intrinsic corresponds to the <c> TILEZERO </c> instruction. 466/// 467/// \param dst 468/// The destination tile to be zero. Max size is 1024 Bytes. 469__DEFAULT_FN_ATTRS_TILE 470static __inline__ void __tile_zero(__tile1024i *dst) { 471 dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col); 472} 473 474/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and 475/// src1, accumulating the intermediate single-precision (32-bit) floating-point 476/// elements with elements in "dst", and store the 32-bit result back to tile 477/// "dst". 478/// 479/// \headerfile <immintrin.h> 480/// 481/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction. 482/// 483/// \param dst 484/// The destination tile. Max size is 1024 Bytes. 485/// \param src0 486/// The 1st source tile. Max size is 1024 Bytes. 487/// \param src1 488/// The 2nd source tile. Max size is 1024 Bytes. 489__DEFAULT_FN_ATTRS_BF16 490static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0, 491 __tile1024i src1) { 492 dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile, 493 src0.tile, src1.tile); 494} 495 496/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and 497/// src1, accumulating the intermediate single-precision (32-bit) floating-point 498/// elements with elements in "dst", and store the 32-bit result back to tile 499/// "dst". 500/// 501/// \headerfile <immintrin.h> 502/// 503/// This intrinsic corresponds to the <c> TDPFP16PS </c> instruction. 504/// 505/// \param dst 506/// The destination tile. Max size is 1024 Bytes. 507/// \param src0 508/// The 1st source tile. Max size is 1024 Bytes. 509/// \param src1 510/// The 2nd source tile. Max size is 1024 Bytes. 511__DEFAULT_FN_ATTRS_FP16 512static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0, 513 __tile1024i src1) { 514 dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile, 515 src0.tile, src1.tile); 516} 517 518#undef __DEFAULT_FN_ATTRS_TILE 519#undef __DEFAULT_FN_ATTRS_INT8 520#undef __DEFAULT_FN_ATTRS_BF16 521#undef __DEFAULT_FN_ATTRS_FP16 522 523#endif /* __x86_64__ */ 524#endif /* __AMXINTRIN_H */ 525