1284677Sdim//===-- AMDGPUKernelCodeT.h - Print AMDGPU assembly code ---------*- C++ -*-===// 2284677Sdim// 3284677Sdim// The LLVM Compiler Infrastructure 4284677Sdim// 5284677Sdim// This file is distributed under the University of Illinois Open Source 6284677Sdim// License. See LICENSE.TXT for details. 7284677Sdim// 8284677Sdim//===----------------------------------------------------------------------===// 9284677Sdim/// \file AMDKernelCodeT.h 10284677Sdim//===----------------------------------------------------------------------===// 11284677Sdim 12284677Sdim#ifndef AMDKERNELCODET_H 13284677Sdim#define AMDKERNELCODET_H 14284677Sdim 15285181Sdim#include "llvm/MC/SubtargetFeature.h" 16285181Sdim 17284677Sdim#include <cstddef> 18284677Sdim#include <cstdint> 19284677Sdim 20285181Sdim#include "llvm/Support/Debug.h" 21284677Sdim//---------------------------------------------------------------------------// 22284677Sdim// AMD Kernel Code, and its dependencies // 23284677Sdim//---------------------------------------------------------------------------// 24284677Sdim 25284677Sdimtypedef uint8_t hsa_powertwo8_t; 26284677Sdimtypedef uint32_t hsa_ext_code_kind_t; 27284677Sdimtypedef uint8_t hsa_ext_brig_profile8_t; 28284677Sdimtypedef uint8_t hsa_ext_brig_machine_model8_t; 29284677Sdimtypedef uint64_t hsa_ext_control_directive_present64_t; 30284677Sdimtypedef uint16_t hsa_ext_exception_kind16_t; 31284677Sdimtypedef uint32_t hsa_ext_code_kind32_t; 32284677Sdim 33284677Sdimtypedef struct hsa_dim3_s { 34284677Sdim uint32_t x; 35284677Sdim uint32_t y; 36284677Sdim uint32_t z; 37284677Sdim} hsa_dim3_t; 38284677Sdim 39284677Sdim/// The version of the amd_*_code_t struct. Minor versions must be 40284677Sdim/// backward compatible. 41284677Sdimtypedef uint32_t amd_code_version32_t; 42284677Sdimenum amd_code_version_t { 43284677Sdim AMD_CODE_VERSION_MAJOR = 0, 44284677Sdim AMD_CODE_VERSION_MINOR = 1 45284677Sdim}; 46284677Sdim 47284677Sdim/// The values used to define the number of bytes to use for the 48284677Sdim/// swizzle element size. 49284677Sdimenum amd_element_byte_size_t { 50284677Sdim AMD_ELEMENT_2_BYTES = 0, 51284677Sdim AMD_ELEMENT_4_BYTES = 1, 52284677Sdim AMD_ELEMENT_8_BYTES = 2, 53284677Sdim AMD_ELEMENT_16_BYTES = 3 54284677Sdim}; 55284677Sdim 56284677Sdim/// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and 57284677Sdim/// COMPUTE_PGM_RSRC2 registers. 58284677Sdimtypedef uint64_t amd_compute_pgm_resource_register64_t; 59284677Sdim 60284677Sdim/// Every amd_*_code_t has the following properties, which are composed of 61284677Sdim/// a number of bit fields. Every bit field has a mask (AMD_CODE_PROPERTY_*), 62284677Sdim/// bit width (AMD_CODE_PROPERTY_*_WIDTH, and bit shift amount 63284677Sdim/// (AMD_CODE_PROPERTY_*_SHIFT) for convenient access. Unused bits must be 0. 64284677Sdim/// 65284677Sdim/// (Note that bit fields cannot be used as their layout is 66284677Sdim/// implementation defined in the C standard and so cannot be used to 67284677Sdim/// specify an ABI) 68284677Sdimtypedef uint32_t amd_code_property32_t; 69284677Sdimenum amd_code_property_mask_t { 70284677Sdim 71284677Sdim /// Enable the setup of the SGPR user data registers 72284677Sdim /// (AMD_CODE_PROPERTY_ENABLE_SGPR_*), see documentation of amd_kernel_code_t 73284677Sdim /// for initial register state. 74284677Sdim /// 75284677Sdim /// The total number of SGPRuser data registers requested must not 76284677Sdim /// exceed 16. Any requests beyond 16 will be ignored. 77284677Sdim /// 78284677Sdim /// Used to set COMPUTE_PGM_RSRC2.USER_SGPR (set to total count of 79284677Sdim /// SGPR user data registers enabled up to 16). 80284677Sdim 81284677Sdim AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT = 0, 82284677Sdim AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH = 1, 83284677Sdim AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT, 84284677Sdim 85284677Sdim AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT = 1, 86284677Sdim AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH = 1, 87284677Sdim AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT, 88284677Sdim 89284677Sdim AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT = 2, 90284677Sdim AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH = 1, 91284677Sdim AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT, 92284677Sdim 93284677Sdim AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT = 3, 94284677Sdim AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH = 1, 95284677Sdim AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT, 96284677Sdim 97284677Sdim AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT = 4, 98284677Sdim AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH = 1, 99284677Sdim AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT, 100284677Sdim 101284677Sdim AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT = 5, 102284677Sdim AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH = 1, 103284677Sdim AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT, 104284677Sdim 105284677Sdim AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT = 6, 106284677Sdim AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH = 1, 107284677Sdim AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT, 108284677Sdim 109284677Sdim AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT = 7, 110284677Sdim AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH = 1, 111284677Sdim AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT, 112284677Sdim 113284677Sdim AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT = 8, 114284677Sdim AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH = 1, 115284677Sdim AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT, 116284677Sdim 117284677Sdim AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT = 9, 118284677Sdim AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1, 119284677Sdim AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT, 120284677Sdim 121284677Sdim /// Control wave ID base counter for GDS ordered-append. Used to set 122284677Sdim /// COMPUTE_DISPATCH_INITIATOR.ORDERED_APPEND_ENBL. (Not sure if 123284677Sdim /// ORDERED_APPEND_MODE also needs to be settable) 124284677Sdim AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 10, 125284677Sdim AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1, 126284677Sdim AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS = ((1 << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT, 127284677Sdim 128284677Sdim /// The interleave (swizzle) element size in bytes required by the 129284677Sdim /// code for private memory. This must be 2, 4, 8 or 16. This value 130284677Sdim /// is provided to the finalizer when it is invoked and is recorded 131284677Sdim /// here. The hardware will interleave the memory requests of each 132284677Sdim /// lane of a wavefront by this element size to ensure each 133284677Sdim /// work-item gets a distinct memory memory location. Therefore, the 134284677Sdim /// finalizer ensures that all load and store operations done to 135284677Sdim /// private memory do not exceed this size. For example, if the 136284677Sdim /// element size is 4 (32-bits or dword) and a 64-bit value must be 137284677Sdim /// loaded, the finalizer will generate two 32-bit loads. This 138284677Sdim /// ensures that the interleaving will get the work-item 139284677Sdim /// specific dword for both halves of the 64-bit value. If it just 140284677Sdim /// did a 64-bit load then it would get one dword which belonged to 141284677Sdim /// its own work-item, but the second dword would belong to the 142284677Sdim /// adjacent lane work-item since the interleaving is in dwords. 143284677Sdim /// 144284677Sdim /// The value used must match the value that the runtime configures 145284677Sdim /// the GPU flat scratch (SH_STATIC_MEM_CONFIG.ELEMENT_SIZE). This 146284677Sdim /// is generally DWORD. 147284677Sdim /// 148285181Sdim /// uSE VALUES FROM THE AMD_ELEMENT_BYTE_SIZE_T ENUM. 149284677Sdim AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 11, 150284677Sdim AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH = 2, 151284677Sdim AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE = ((1 << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT, 152284677Sdim 153284677Sdim /// Are global memory addresses 64 bits. Must match 154284677Sdim /// amd_kernel_code_t.hsail_machine_model == 155284677Sdim /// HSA_MACHINE_LARGE. Must also match 156284677Sdim /// SH_MEM_CONFIG.PTR32 (GFX6 (SI)/GFX7 (CI)), 157284677Sdim /// SH_MEM_CONFIG.ADDRESS_MODE (GFX8 (VI)+). 158284677Sdim AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 13, 159284677Sdim AMD_CODE_PROPERTY_IS_PTR64_WIDTH = 1, 160284677Sdim AMD_CODE_PROPERTY_IS_PTR64 = ((1 << AMD_CODE_PROPERTY_IS_PTR64_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_PTR64_SHIFT, 161284677Sdim 162284677Sdim /// Indicate if the generated ISA is using a dynamically sized call 163284677Sdim /// stack. This can happen if calls are implemented using a call 164284677Sdim /// stack and recursion, alloca or calls to indirect functions are 165284677Sdim /// present. In these cases the Finalizer cannot compute the total 166284677Sdim /// private segment size at compile time. In this case the 167284677Sdim /// workitem_private_segment_byte_size only specifies the statically 168284677Sdim /// know private segment size, and additional space must be added 169284677Sdim /// for the call stack. 170284677Sdim AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 14, 171284677Sdim AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH = 1, 172284677Sdim AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK = ((1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT, 173284677Sdim 174284677Sdim /// Indicate if code generated has support for debugging. 175284677Sdim AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 15, 176284677Sdim AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH = 1, 177285181Sdim AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT, 178285181Sdim 179285181Sdim AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT = 15, 180285181Sdim AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH = 1, 181285181Sdim AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT 182284677Sdim}; 183284677Sdim 184284677Sdim/// @brief The hsa_ext_control_directives_t specifies the values for the HSAIL 185284677Sdim/// control directives. These control how the finalizer generates code. This 186284677Sdim/// struct is used both as an argument to hsaFinalizeKernel to specify values for 187284677Sdim/// the control directives, and is used in HsaKernelCode to record the values of 188284677Sdim/// the control directives that the finalize used when generating the code which 189284677Sdim/// either came from the finalizer argument or explicit HSAIL control 190284677Sdim/// directives. See the definition of the control directives in HSA Programmer's 191284677Sdim/// Reference Manual which also defines how the values specified as finalizer 192284677Sdim/// arguments have to agree with the control directives in the HSAIL code. 193284677Sdimtypedef struct hsa_ext_control_directives_s { 194284677Sdim /// This is a bit set indicating which control directives have been 195284677Sdim /// specified. If the value is 0 then there are no control directives specified 196284677Sdim /// and the rest of the fields can be ignored. The bits are accessed using the 197284677Sdim /// hsa_ext_control_directives_present_mask_t. Any control directive that is not 198284677Sdim /// enabled in this bit set must have the value of all 0s. 199284677Sdim hsa_ext_control_directive_present64_t enabled_control_directives; 200284677Sdim 201284677Sdim /// If enableBreakExceptions is not enabled then must be 0, otherwise must be 202284677Sdim /// non-0 and specifies the set of HSAIL exceptions that must have the BREAK 203284677Sdim /// policy enabled. If this set is not empty then the generated code may have 204284677Sdim /// lower performance than if the set is empty. If the kernel being finalized 205284677Sdim /// has any enablebreakexceptions control directives, then the values specified 206284677Sdim /// by this argument are unioned with the values in these control 207284677Sdim /// directives. If any of the functions the kernel calls have an 208284677Sdim /// enablebreakexceptions control directive, then they must be equal or a 209284677Sdim /// subset of, this union. 210284677Sdim hsa_ext_exception_kind16_t enable_break_exceptions; 211284677Sdim 212284677Sdim /// If enableDetectExceptions is not enabled then must be 0, otherwise must be 213284677Sdim /// non-0 and specifies the set of HSAIL exceptions that must have the DETECT 214284677Sdim /// policy enabled. If this set is not empty then the generated code may have 215284677Sdim /// lower performance than if the set is empty. However, an implementation 216284677Sdim /// should endeavour to make the performance impact small. If the kernel being 217284677Sdim /// finalized has any enabledetectexceptions control directives, then the 218284677Sdim /// values specified by this argument are unioned with the values in these 219284677Sdim /// control directives. If any of the functions the kernel calls have an 220284677Sdim /// enabledetectexceptions control directive, then they must be equal or a 221284677Sdim /// subset of, this union. 222284677Sdim hsa_ext_exception_kind16_t enable_detect_exceptions; 223284677Sdim 224284677Sdim /// If maxDynamicGroupSize is not enabled then must be 0, and any amount of 225284677Sdim /// dynamic group segment can be allocated for a dispatch, otherwise the value 226284677Sdim /// specifies the maximum number of bytes of dynamic group segment that can be 227284677Sdim /// allocated for a dispatch. If the kernel being finalized has any 228284677Sdim /// maxdynamicsize control directives, then the values must be the same, and 229284677Sdim /// must be the same as this argument if it is enabled. This value can be used 230284677Sdim /// by the finalizer to determine the maximum number of bytes of group memory 231284677Sdim /// used by each work-group by adding this value to the group memory required 232284677Sdim /// for all group segment variables used by the kernel and all functions it 233284677Sdim /// calls, and group memory used to implement other HSAIL features such as 234284677Sdim /// fbarriers and the detect exception operations. This can allow the finalizer 235284677Sdim /// to determine the expected number of work-groups that can be executed by a 236284677Sdim /// compute unit and allow more resources to be allocated to the work-items if 237284677Sdim /// it is known that fewer work-groups can be executed due to group memory 238284677Sdim /// limitations. 239284677Sdim uint32_t max_dynamic_group_size; 240284677Sdim 241284677Sdim /// If maxFlatGridSize is not enabled then must be 0, otherwise must be greater 242284677Sdim /// than 0. See HSA Programmer's Reference Manual description of 243284677Sdim /// maxflatgridsize control directive. 244284677Sdim uint32_t max_flat_grid_size; 245284677Sdim 246284677Sdim /// If maxFlatWorkgroupSize is not enabled then must be 0, otherwise must be 247284677Sdim /// greater than 0. See HSA Programmer's Reference Manual description of 248284677Sdim /// maxflatworkgroupsize control directive. 249284677Sdim uint32_t max_flat_workgroup_size; 250284677Sdim 251284677Sdim /// If requestedWorkgroupsPerCu is not enabled then must be 0, and the 252284677Sdim /// finalizer is free to generate ISA that may result in any number of 253284677Sdim /// work-groups executing on a single compute unit. Otherwise, the finalizer 254284677Sdim /// should attempt to generate ISA that will allow the specified number of 255284677Sdim /// work-groups to execute on a single compute unit. This is only a hint and 256284677Sdim /// can be ignored by the finalizer. If the kernel being finalized, or any of 257284677Sdim /// the functions it calls, has a requested control directive, then the values 258284677Sdim /// must be the same. This can be used to determine the number of resources 259284677Sdim /// that should be allocated to a single work-group and work-item. For example, 260284677Sdim /// a low value may allow more resources to be allocated, resulting in higher 261284677Sdim /// per work-item performance, as it is known there will never be more than the 262284677Sdim /// specified number of work-groups actually executing on the compute 263284677Sdim /// unit. Conversely, a high value may allocate fewer resources, resulting in 264284677Sdim /// lower per work-item performance, which is offset by the fact it allows more 265284677Sdim /// work-groups to actually execute on the compute unit. 266284677Sdim uint32_t requested_workgroups_per_cu; 267284677Sdim 268284677Sdim /// If not enabled then all elements for Dim3 must be 0, otherwise every 269284677Sdim /// element must be greater than 0. See HSA Programmer's Reference Manual 270284677Sdim /// description of requiredgridsize control directive. 271284677Sdim hsa_dim3_t required_grid_size; 272284677Sdim 273284677Sdim /// If requiredWorkgroupSize is not enabled then all elements for Dim3 must be 274284677Sdim /// 0, and the produced code can be dispatched with any legal work-group range 275284677Sdim /// consistent with the dispatch dimensions. Otherwise, the code produced must 276284677Sdim /// always be dispatched with the specified work-group range. No element of the 277284677Sdim /// specified range must be 0. It must be consistent with required_dimensions 278284677Sdim /// and max_flat_workgroup_size. If the kernel being finalized, or any of the 279284677Sdim /// functions it calls, has a requiredworkgroupsize control directive, then the 280284677Sdim /// values must be the same. Specifying a value can allow the finalizer to 281284677Sdim /// optimize work-group id operations, and if the number of work-items in the 282284677Sdim /// work-group is less than the WAVESIZE then barrier operations can be 283284677Sdim /// optimized to just a memory fence. 284284677Sdim hsa_dim3_t required_workgroup_size; 285284677Sdim 286284677Sdim /// If requiredDim is not enabled then must be 0 and the produced kernel code 287284677Sdim /// can be dispatched with 1, 2 or 3 dimensions. If enabled then the value is 288284677Sdim /// 1..3 and the code produced must only be dispatched with a dimension that 289284677Sdim /// matches. Other values are illegal. If the kernel being finalized, or any of 290284677Sdim /// the functions it calls, has a requireddimsize control directive, then the 291284677Sdim /// values must be the same. This can be used to optimize the code generated to 292284677Sdim /// compute the absolute and flat work-group and work-item id, and the dim 293284677Sdim /// HSAIL operations. 294284677Sdim uint8_t required_dim; 295284677Sdim 296284677Sdim /// Reserved. Must be 0. 297284677Sdim uint8_t reserved[75]; 298284677Sdim} hsa_ext_control_directives_t; 299284677Sdim 300284677Sdim/// AMD Kernel Code Object (amd_kernel_code_t). GPU CP uses the AMD Kernel 301284677Sdim/// Code Object to set up the hardware to execute the kernel dispatch. 302284677Sdim/// 303284677Sdim/// Initial Kernel Register State. 304284677Sdim/// 305284677Sdim/// Initial kernel register state will be set up by CP/SPI prior to the start 306284677Sdim/// of execution of every wavefront. This is limited by the constraints of the 307284677Sdim/// current hardware. 308284677Sdim/// 309284677Sdim/// The order of the SGPR registers is defined, but the Finalizer can specify 310284677Sdim/// which ones are actually setup in the amd_kernel_code_t object using the 311284677Sdim/// enable_sgpr_* bit fields. The register numbers used for enabled registers 312284677Sdim/// are dense starting at SGPR0: the first enabled register is SGPR0, the next 313284677Sdim/// enabled register is SGPR1 etc.; disabled registers do not have an SGPR 314284677Sdim/// number. 315284677Sdim/// 316284677Sdim/// The initial SGPRs comprise up to 16 User SRGPs that are set up by CP and 317284677Sdim/// apply to all waves of the grid. It is possible to specify more than 16 User 318284677Sdim/// SGPRs using the enable_sgpr_* bit fields, in which case only the first 16 319284677Sdim/// are actually initialized. These are then immediately followed by the System 320284677Sdim/// SGPRs that are set up by ADC/SPI and can have different values for each wave 321284677Sdim/// of the grid dispatch. 322284677Sdim/// 323284677Sdim/// SGPR register initial state is defined as follows: 324284677Sdim/// 325284677Sdim/// Private Segment Buffer (enable_sgpr_private_segment_buffer): 326284677Sdim/// Number of User SGPR registers: 4. V# that can be used, together with 327284677Sdim/// Scratch Wave Offset as an offset, to access the Private/Spill/Arg 328284677Sdim/// segments using a segment address. It must be set as follows: 329284677Sdim/// - Base address: of the scratch memory area used by the dispatch. It 330284677Sdim/// does not include the scratch wave offset. It will be the per process 331284677Sdim/// SH_HIDDEN_PRIVATE_BASE_VMID plus any offset from this dispatch (for 332284677Sdim/// example there may be a per pipe offset, or per AQL Queue offset). 333284677Sdim/// - Stride + data_format: Element Size * Index Stride (???) 334284677Sdim/// - Cache swizzle: ??? 335284677Sdim/// - Swizzle enable: SH_STATIC_MEM_CONFIG.SWIZZLE_ENABLE (must be 1 for 336284677Sdim/// scratch) 337284677Sdim/// - Num records: Flat Scratch Work Item Size / Element Size (???) 338284677Sdim/// - Dst_sel_*: ??? 339284677Sdim/// - Num_format: ??? 340284677Sdim/// - Element_size: SH_STATIC_MEM_CONFIG.ELEMENT_SIZE (will be DWORD, must 341284677Sdim/// agree with amd_kernel_code_t.privateElementSize) 342284677Sdim/// - Index_stride: SH_STATIC_MEM_CONFIG.INDEX_STRIDE (will be 64 as must 343284677Sdim/// be number of wavefront lanes for scratch, must agree with 344284677Sdim/// amd_kernel_code_t.wavefrontSize) 345284677Sdim/// - Add tid enable: 1 346284677Sdim/// - ATC: from SH_MEM_CONFIG.PRIVATE_ATC, 347284677Sdim/// - Hash_enable: ??? 348284677Sdim/// - Heap: ??? 349284677Sdim/// - Mtype: from SH_STATIC_MEM_CONFIG.PRIVATE_MTYPE 350284677Sdim/// - Type: 0 (a buffer) (???) 351284677Sdim/// 352284677Sdim/// Dispatch Ptr (enable_sgpr_dispatch_ptr): 353284677Sdim/// Number of User SGPR registers: 2. 64 bit address of AQL dispatch packet 354284677Sdim/// for kernel actually executing. 355284677Sdim/// 356284677Sdim/// Queue Ptr (enable_sgpr_queue_ptr): 357284677Sdim/// Number of User SGPR registers: 2. 64 bit address of AmdQueue object for 358284677Sdim/// AQL queue on which the dispatch packet was queued. 359284677Sdim/// 360284677Sdim/// Kernarg Segment Ptr (enable_sgpr_kernarg_segment_ptr): 361284677Sdim/// Number of User SGPR registers: 2. 64 bit address of Kernarg segment. This 362284677Sdim/// is directly copied from the kernargPtr in the dispatch packet. Having CP 363284677Sdim/// load it once avoids loading it at the beginning of every wavefront. 364284677Sdim/// 365284677Sdim/// Dispatch Id (enable_sgpr_dispatch_id): 366284677Sdim/// Number of User SGPR registers: 2. 64 bit Dispatch ID of the dispatch 367284677Sdim/// packet being executed. 368284677Sdim/// 369284677Sdim/// Flat Scratch Init (enable_sgpr_flat_scratch_init): 370284677Sdim/// Number of User SGPR registers: 2. This is 2 SGPRs. 371284677Sdim/// 372284677Sdim/// For CI/VI: 373284677Sdim/// The first SGPR is a 32 bit byte offset from SH_MEM_HIDDEN_PRIVATE_BASE 374284677Sdim/// to base of memory for scratch for this dispatch. This is the same offset 375284677Sdim/// used in computing the Scratch Segment Buffer base address. The value of 376284677Sdim/// Scratch Wave Offset must be added by the kernel code and moved to 377284677Sdim/// SGPRn-4 for use as the FLAT SCRATCH BASE in flat memory instructions. 378284677Sdim/// 379285181Sdim/// The second SGPR is 32 bit byte size of a single work-item's scratch 380284677Sdim/// memory usage. This is directly loaded from the dispatch packet Private 381284677Sdim/// Segment Byte Size and rounded up to a multiple of DWORD. 382284677Sdim/// 383284677Sdim/// \todo [Does CP need to round this to >4 byte alignment?] 384284677Sdim/// 385284677Sdim/// The kernel code must move to SGPRn-3 for use as the FLAT SCRATCH SIZE in 386284677Sdim/// flat memory instructions. Having CP load it once avoids loading it at 387284677Sdim/// the beginning of every wavefront. 388284677Sdim/// 389284677Sdim/// For PI: 390284677Sdim/// This is the 64 bit base address of the scratch backing memory for 391284677Sdim/// allocated by CP for this dispatch. 392284677Sdim/// 393284677Sdim/// Private Segment Size (enable_sgpr_private_segment_size): 394284677Sdim/// Number of User SGPR registers: 1. The 32 bit byte size of a single 395285181Sdim/// work-item's scratch memory allocation. This is the value from the dispatch 396284677Sdim/// packet. Private Segment Byte Size rounded up by CP to a multiple of DWORD. 397284677Sdim/// 398284677Sdim/// \todo [Does CP need to round this to >4 byte alignment?] 399284677Sdim/// 400284677Sdim/// Having CP load it once avoids loading it at the beginning of every 401284677Sdim/// wavefront. 402284677Sdim/// 403284677Sdim/// \todo [This will not be used for CI/VI since it is the same value as 404284677Sdim/// the second SGPR of Flat Scratch Init. However, it is need for PI which 405284677Sdim/// changes meaning of Flat Scratchg Init..] 406284677Sdim/// 407284677Sdim/// Grid Work-Group Count X (enable_sgpr_grid_workgroup_count_x): 408284677Sdim/// Number of User SGPR registers: 1. 32 bit count of the number of 409284677Sdim/// work-groups in the X dimension for the grid being executed. Computed from 410284677Sdim/// the fields in the HsaDispatchPacket as 411284677Sdim/// ((gridSize.x+workgroupSize.x-1)/workgroupSize.x). 412284677Sdim/// 413284677Sdim/// Grid Work-Group Count Y (enable_sgpr_grid_workgroup_count_y): 414284677Sdim/// Number of User SGPR registers: 1. 32 bit count of the number of 415284677Sdim/// work-groups in the Y dimension for the grid being executed. Computed from 416284677Sdim/// the fields in the HsaDispatchPacket as 417284677Sdim/// ((gridSize.y+workgroupSize.y-1)/workgroupSize.y). 418284677Sdim/// 419284677Sdim/// Only initialized if <16 previous SGPRs initialized. 420284677Sdim/// 421284677Sdim/// Grid Work-Group Count Z (enable_sgpr_grid_workgroup_count_z): 422284677Sdim/// Number of User SGPR registers: 1. 32 bit count of the number of 423284677Sdim/// work-groups in the Z dimension for the grid being executed. Computed 424284677Sdim/// from the fields in the HsaDispatchPacket as 425284677Sdim/// ((gridSize.z+workgroupSize.z-1)/workgroupSize.z). 426284677Sdim/// 427284677Sdim/// Only initialized if <16 previous SGPRs initialized. 428284677Sdim/// 429284677Sdim/// Work-Group Id X (enable_sgpr_workgroup_id_x): 430284677Sdim/// Number of System SGPR registers: 1. 32 bit work group id in X dimension 431284677Sdim/// of grid for wavefront. Always present. 432284677Sdim/// 433284677Sdim/// Work-Group Id Y (enable_sgpr_workgroup_id_y): 434284677Sdim/// Number of System SGPR registers: 1. 32 bit work group id in Y dimension 435284677Sdim/// of grid for wavefront. 436284677Sdim/// 437284677Sdim/// Work-Group Id Z (enable_sgpr_workgroup_id_z): 438284677Sdim/// Number of System SGPR registers: 1. 32 bit work group id in Z dimension 439284677Sdim/// of grid for wavefront. If present then Work-group Id Y will also be 440284677Sdim/// present 441284677Sdim/// 442284677Sdim/// Work-Group Info (enable_sgpr_workgroup_info): 443285181Sdim/// Number of System SGPR registers: 1. {first_wave, 14'b0000, 444284677Sdim/// ordered_append_term[10:0], threadgroup_size_in_waves[5:0]} 445284677Sdim/// 446284677Sdim/// Private Segment Wave Byte Offset 447284677Sdim/// (enable_sgpr_private_segment_wave_byte_offset): 448284677Sdim/// Number of System SGPR registers: 1. 32 bit byte offset from base of 449284677Sdim/// dispatch scratch base. Must be used as an offset with Private/Spill/Arg 450284677Sdim/// segment address when using Scratch Segment Buffer. It must be added to 451284677Sdim/// Flat Scratch Offset if setting up FLAT SCRATCH for flat addressing. 452284677Sdim/// 453284677Sdim/// 454284677Sdim/// The order of the VGPR registers is defined, but the Finalizer can specify 455284677Sdim/// which ones are actually setup in the amd_kernel_code_t object using the 456284677Sdim/// enableVgpr* bit fields. The register numbers used for enabled registers 457284677Sdim/// are dense starting at VGPR0: the first enabled register is VGPR0, the next 458284677Sdim/// enabled register is VGPR1 etc.; disabled registers do not have an VGPR 459284677Sdim/// number. 460284677Sdim/// 461284677Sdim/// VGPR register initial state is defined as follows: 462284677Sdim/// 463284677Sdim/// Work-Item Id X (always initialized): 464284677Sdim/// Number of registers: 1. 32 bit work item id in X dimension of work-group 465284677Sdim/// for wavefront lane. 466284677Sdim/// 467284677Sdim/// Work-Item Id X (enable_vgpr_workitem_id > 0): 468284677Sdim/// Number of registers: 1. 32 bit work item id in Y dimension of work-group 469284677Sdim/// for wavefront lane. 470284677Sdim/// 471284677Sdim/// Work-Item Id X (enable_vgpr_workitem_id > 0): 472284677Sdim/// Number of registers: 1. 32 bit work item id in Z dimension of work-group 473284677Sdim/// for wavefront lane. 474284677Sdim/// 475284677Sdim/// 476284677Sdim/// The setting of registers is being done by existing GPU hardware as follows: 477284677Sdim/// 1) SGPRs before the Work-Group Ids are set by CP using the 16 User Data 478284677Sdim/// registers. 479284677Sdim/// 2) Work-group Id registers X, Y, Z are set by SPI which supports any 480284677Sdim/// combination including none. 481284677Sdim/// 3) Scratch Wave Offset is also set by SPI which is why its value cannot 482284677Sdim/// be added into the value Flat Scratch Offset which would avoid the 483284677Sdim/// Finalizer generated prolog having to do the add. 484284677Sdim/// 4) The VGPRs are set by SPI which only supports specifying either (X), 485284677Sdim/// (X, Y) or (X, Y, Z). 486284677Sdim/// 487284677Sdim/// Flat Scratch Dispatch Offset and Flat Scratch Size are adjacent SGRRs so 488284677Sdim/// they can be moved as a 64 bit value to the hardware required SGPRn-3 and 489284677Sdim/// SGPRn-4 respectively using the Finalizer ?FLAT_SCRATCH? Register. 490284677Sdim/// 491284677Sdim/// The global segment can be accessed either using flat operations or buffer 492284677Sdim/// operations. If buffer operations are used then the Global Buffer used to 493284677Sdim/// access HSAIL Global/Readonly/Kernarg (which are combine) segments using a 494284677Sdim/// segment address is not passed into the kernel code by CP since its base 495284677Sdim/// address is always 0. Instead the Finalizer generates prolog code to 496284677Sdim/// initialize 4 SGPRs with a V# that has the following properties, and then 497284677Sdim/// uses that in the buffer instructions: 498284677Sdim/// - base address of 0 499284677Sdim/// - no swizzle 500284677Sdim/// - ATC=1 501284677Sdim/// - MTYPE set to support memory coherence specified in 502284677Sdim/// amd_kernel_code_t.globalMemoryCoherence 503284677Sdim/// 504284677Sdim/// When the Global Buffer is used to access the Kernarg segment, must add the 505284677Sdim/// dispatch packet kernArgPtr to a kernarg segment address before using this V#. 506284677Sdim/// Alternatively scalar loads can be used if the kernarg offset is uniform, as 507284677Sdim/// the kernarg segment is constant for the duration of the kernel execution. 508284677Sdim/// 509285181Sdim 510284677Sdimtypedef struct amd_kernel_code_s { 511285181Sdim uint32_t amd_kernel_code_version_major; 512285181Sdim uint32_t amd_kernel_code_version_minor; 513285181Sdim uint16_t amd_machine_kind; 514285181Sdim uint16_t amd_machine_version_major; 515285181Sdim uint16_t amd_machine_version_minor; 516285181Sdim uint16_t amd_machine_version_stepping; 517284677Sdim 518284677Sdim /// Byte offset (possibly negative) from start of amd_kernel_code_t 519284677Sdim /// object to kernel's entry point instruction. The actual code for 520284677Sdim /// the kernel is required to be 256 byte aligned to match hardware 521284677Sdim /// requirements (SQ cache line is 16). The code must be position 522284677Sdim /// independent code (PIC) for AMD devices to give runtime the 523284677Sdim /// option of copying code to discrete GPU memory or APU L2 524284677Sdim /// cache. The Finalizer should endeavour to allocate all kernel 525284677Sdim /// machine code in contiguous memory pages so that a device 526284677Sdim /// pre-fetcher will tend to only pre-fetch Kernel Code objects, 527284677Sdim /// improving cache performance. 528284677Sdim int64_t kernel_code_entry_byte_offset; 529284677Sdim 530284677Sdim /// Range of bytes to consider prefetching expressed as an offset 531284677Sdim /// and size. The offset is from the start (possibly negative) of 532284677Sdim /// amd_kernel_code_t object. Set both to 0 if no prefetch 533284677Sdim /// information is available. 534284677Sdim int64_t kernel_code_prefetch_byte_offset; 535284677Sdim uint64_t kernel_code_prefetch_byte_size; 536284677Sdim 537284677Sdim /// Number of bytes of scratch backing memory required for full 538284677Sdim /// occupancy of target chip. This takes into account the number of 539284677Sdim /// bytes of scratch per work-item, the wavefront size, the maximum 540284677Sdim /// number of wavefronts per CU, and the number of CUs. This is an 541284677Sdim /// upper limit on scratch. If the grid being dispatched is small it 542284677Sdim /// may only need less than this. If the kernel uses no scratch, or 543284677Sdim /// the Finalizer has not computed this value, it must be 0. 544284677Sdim uint64_t max_scratch_backing_memory_byte_size; 545284677Sdim 546284677Sdim /// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and 547284677Sdim /// COMPUTE_PGM_RSRC2 registers. 548285181Sdim uint64_t compute_pgm_resource_registers; 549284677Sdim 550284677Sdim /// Code properties. See amd_code_property_mask_t for a full list of 551284677Sdim /// properties. 552285181Sdim uint32_t code_properties; 553284677Sdim 554284677Sdim /// The amount of memory required for the combined private, spill 555284677Sdim /// and arg segments for a work-item in bytes. If 556284677Sdim /// is_dynamic_callstack is 1 then additional space must be added to 557284677Sdim /// this value for the call stack. 558284677Sdim uint32_t workitem_private_segment_byte_size; 559284677Sdim 560284677Sdim /// The amount of group segment memory required by a work-group in 561284677Sdim /// bytes. This does not include any dynamically allocated group 562284677Sdim /// segment memory that may be added when the kernel is 563284677Sdim /// dispatched. 564284677Sdim uint32_t workgroup_group_segment_byte_size; 565284677Sdim 566284677Sdim /// Number of byte of GDS required by kernel dispatch. Must be 0 if 567284677Sdim /// not using GDS. 568284677Sdim uint32_t gds_segment_byte_size; 569284677Sdim 570284677Sdim /// The size in bytes of the kernarg segment that holds the values 571284677Sdim /// of the arguments to the kernel. This could be used by CP to 572284677Sdim /// prefetch the kernarg segment pointed to by the dispatch packet. 573284677Sdim uint64_t kernarg_segment_byte_size; 574284677Sdim 575284677Sdim /// Number of fbarrier's used in the kernel and all functions it 576284677Sdim /// calls. If the implementation uses group memory to allocate the 577284677Sdim /// fbarriers then that amount must already be included in the 578284677Sdim /// workgroup_group_segment_byte_size total. 579284677Sdim uint32_t workgroup_fbarrier_count; 580284677Sdim 581284677Sdim /// Number of scalar registers used by a wavefront. This includes 582284677Sdim /// the special SGPRs for VCC, Flat Scratch Base, Flat Scratch Size 583284677Sdim /// and XNACK (for GFX8 (VI)). It does not include the 16 SGPR added if a 584284677Sdim /// trap handler is enabled. Used to set COMPUTE_PGM_RSRC1.SGPRS. 585284677Sdim uint16_t wavefront_sgpr_count; 586284677Sdim 587284677Sdim /// Number of vector registers used by each work-item. Used to set 588284677Sdim /// COMPUTE_PGM_RSRC1.VGPRS. 589284677Sdim uint16_t workitem_vgpr_count; 590284677Sdim 591284677Sdim /// If reserved_vgpr_count is 0 then must be 0. Otherwise, this is the 592284677Sdim /// first fixed VGPR number reserved. 593284677Sdim uint16_t reserved_vgpr_first; 594284677Sdim 595284677Sdim /// The number of consecutive VGPRs reserved by the client. If 596284677Sdim /// is_debug_supported then this count includes VGPRs reserved 597284677Sdim /// for debugger use. 598284677Sdim uint16_t reserved_vgpr_count; 599284677Sdim 600284677Sdim /// If reserved_sgpr_count is 0 then must be 0. Otherwise, this is the 601284677Sdim /// first fixed SGPR number reserved. 602284677Sdim uint16_t reserved_sgpr_first; 603284677Sdim 604284677Sdim /// The number of consecutive SGPRs reserved by the client. If 605284677Sdim /// is_debug_supported then this count includes SGPRs reserved 606284677Sdim /// for debugger use. 607284677Sdim uint16_t reserved_sgpr_count; 608284677Sdim 609284677Sdim /// If is_debug_supported is 0 then must be 0. Otherwise, this is the 610284677Sdim /// fixed SGPR number used to hold the wave scratch offset for the 611284677Sdim /// entire kernel execution, or uint16_t(-1) if the register is not 612284677Sdim /// used or not known. 613284677Sdim uint16_t debug_wavefront_private_segment_offset_sgpr; 614284677Sdim 615284677Sdim /// If is_debug_supported is 0 then must be 0. Otherwise, this is the 616284677Sdim /// fixed SGPR number of the first of 4 SGPRs used to hold the 617284677Sdim /// scratch V# used for the entire kernel execution, or uint16_t(-1) 618284677Sdim /// if the registers are not used or not known. 619284677Sdim uint16_t debug_private_segment_buffer_sgpr; 620284677Sdim 621284677Sdim /// The maximum byte alignment of variables used by the kernel in 622284677Sdim /// the specified memory segment. Expressed as a power of two. Must 623284677Sdim /// be at least HSA_POWERTWO_16. 624285181Sdim uint8_t kernarg_segment_alignment; 625285181Sdim uint8_t group_segment_alignment; 626285181Sdim uint8_t private_segment_alignment; 627284677Sdim 628284677Sdim /// Wavefront size expressed as a power of two. Must be a power of 2 629284677Sdim /// in range 1..64 inclusive. Used to support runtime query that 630284677Sdim /// obtains wavefront size, which may be used by application to 631284677Sdim /// allocated dynamic group memory and set the dispatch work-group 632284677Sdim /// size. 633285181Sdim uint8_t wavefront_size; 634284677Sdim 635285181Sdim int32_t call_convention; 636285181Sdim uint8_t reserved3[12]; 637285181Sdim uint64_t runtime_loader_kernel_symbol; 638285181Sdim uint64_t control_directives[16]; 639284677Sdim} amd_kernel_code_t; 640284677Sdim 641284677Sdim#endif // AMDKERNELCODET_H 642