1//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==// 2// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6// 7//===----------------------------------------------------------------------===// 8/// \file 9/// This file implements the targeting of the RegisterBankInfo class for 10/// AMDGPU. 11/// \todo This should be generated by TableGen. 12//===----------------------------------------------------------------------===// 13 14#include "AMDGPURegisterBankInfo.h" 15#include "AMDGPUInstrInfo.h" 16#include "AMDGPUSubtarget.h" 17#include "MCTargetDesc/AMDGPUMCTargetDesc.h" 18#include "SIMachineFunctionInfo.h" 19#include "SIRegisterInfo.h" 20#include "llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h" 21#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 22#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 23#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 24#include "llvm/CodeGen/GlobalISel/RegisterBank.h" 25#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" 26#include "llvm/CodeGen/TargetRegisterInfo.h" 27#include "llvm/CodeGen/TargetSubtargetInfo.h" 28#include "llvm/IR/Constants.h" 29 30#define GET_TARGET_REGBANK_IMPL 31#include "AMDGPUGenRegisterBank.inc" 32 33// This file will be TableGen'ed at some point. 34#include "AMDGPUGenRegisterBankInfo.def" 35 36using namespace llvm; 37using namespace MIPatternMatch; 38 39namespace { 40 41// Observer to apply a register bank to new registers created by LegalizerHelper. 
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
  const AMDGPURegisterBankInfo &RBI;
  MachineRegisterInfo &MRI;
  // Bank assigned to any new register that has neither a class nor a bank.
  const RegisterBank *NewBank;
  // Instructions created while this observer was attached. Banks are applied
  // in the destructor, once the instructions' operands have been filled in.
  SmallVector<MachineInstr *, 4> NewInsts;

public:
  ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
                      MachineRegisterInfo &MRI_, const RegisterBank *RB)
    : RBI(RBI_), MRI(MRI_), NewBank(RB) {}

  ~ApplyRegBankMapping() {
    // Deferred from createdInstr(): at creation time the instructions had no
    // operands yet, so banks could not have been assigned there.
    for (MachineInstr *MI : NewInsts)
      applyBank(*MI);
  }

  /// Set any registers that don't have a set register class or bank to SALU.
  void applyBank(MachineInstr &MI) {
    const unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
        Opc == AMDGPU::G_SEXT) {
      // LegalizerHelper wants to use the basic legalization artifacts when
      // widening etc. We don't handle selection with vcc in artifact sources,
      // so we need to use a select instead to handle these properly.
      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
      if (SrcBank == &AMDGPU::VCCRegBank) {
        const LLT S32 = LLT::scalar(32);
        assert(MRI.getType(SrcReg) == LLT::scalar(1));
        assert(MRI.getType(DstReg) == S32);
        assert(NewBank == &AMDGPU::VGPRRegBank);

        // Replace the extension with a select, which really uses the boolean
        // source.
        MachineIRBuilder B(MI);
        // sext produces all-ones (-1) for true; anyext/zext produce 1.
        auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
        auto False = B.buildConstant(S32, 0);
        B.buildSelect(DstReg, SrcReg, True, False);
        MRI.setRegBank(True.getReg(0), *NewBank);
        MRI.setRegBank(False.getReg(0), *NewBank);
        MI.eraseFromParent();
      }

      // DstReg was captured above, so this is safe even after erasing MI.
      assert(!MRI.getRegClassOrRegBank(DstReg));
      MRI.setRegBank(DstReg, *NewBank);
      return;
    }

#ifndef NDEBUG
    // A trunc result must never have ended up in the condition bank.
    if (Opc == AMDGPU::G_TRUNC) {
      Register DstReg = MI.getOperand(0).getReg();
      const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
      assert(DstBank != &AMDGPU::VCCRegBank);
    }
#endif

    for (MachineOperand &Op : MI.operands()) {
      if (!Op.isReg())
        continue;

      // Leave registers that already have a class or bank untouched.
      Register Reg = Op.getReg();
      if (MRI.getRegClassOrRegBank(Reg))
        continue;

      const RegisterBank *RB = NewBank;
      if (MRI.getType(Reg) == LLT::scalar(1)) {
        assert(NewBank == &AMDGPU::VGPRRegBank &&
               "s1 operands should only be used for vector bools");
        assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
                MI.getOpcode() != AMDGPU::G_ANYEXT) &&
               "not expecting legalization artifacts here");
        RB = &AMDGPU::VCCRegBank;
      }

      MRI.setRegBank(Reg, *RB);
    }
  }

  void erasingInstr(MachineInstr &MI) override {}

  void createdInstr(MachineInstr &MI) override {
    // At this point, the instruction was just inserted and has no operands.
    NewInsts.push_back(&MI);
  }

  void changingInstr(MachineInstr &MI) override {}
  void changedInstr(MachineInstr &MI) override {}
};

} // end anonymous namespace
AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
  : AMDGPUGenRegisterBankInfo(),
    Subtarget(ST),
    TRI(Subtarget.getRegisterInfo()),
    TII(Subtarget.getInstrInfo()) {

  // HACK: Until this is fully tablegen'd.
  // Run the one-time sanity checks only on the first construction.
  static bool AlreadyInit = false;
  if (AlreadyInit)
    return;

  AlreadyInit = true;

  // Verify the generated bank IDs line up with the generated bank objects.
  assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
         &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
         &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
}

/// \returns true if \p Bank is one of the per-lane vector banks (VGPR or
/// AGPR).
static bool isVectorRegisterBank(const RegisterBank &Bank) {
  unsigned BankID = Bank.getID();
  return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
}

unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                          const RegisterBank &Src,
                                          unsigned Size) const {
  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
  // A plain copy cannot move a vector-bank value into a scalar register, so
  // report it as unbeatably expensive.
  if (Dst.getID() == AMDGPU::SGPRRegBankID &&
      isVectorRegisterBank(Src)) {
    return std::numeric_limits<unsigned>::max();
  }

  // Bool values are tricky, because the meaning is based on context. The SCC
  // and VCC banks are for the natural scalar and vector conditions produced by
  // a compare.
  //
  // Legalization doesn't know about the necessary context, so an s1 use may
  // have been a truncate from an arbitrary value, in which case a copy (lowered
  // as a compare with 0) needs to be inserted.
  if (Size == 1 &&
      (Dst.getID() == AMDGPU::SGPRRegBankID) &&
      (isVectorRegisterBank(Src) ||
       Src.getID() == AMDGPU::SGPRRegBankID ||
       Src.getID() == AMDGPU::VCCRegBankID))
    return std::numeric_limits<unsigned>::max();

  // Copies out of the condition bank are likewise not expressible as a plain
  // copy.
  if (Src.getID() == AMDGPU::VCCRegBankID)
    return std::numeric_limits<unsigned>::max();

  // There is no direct copy between AGPRs.
  if (Dst.getID() == AMDGPU::AGPRRegBankID &&
      Src.getID() == AMDGPU::AGPRRegBankID)
    return 4;

  return RegisterBankInfo::copyCost(Dst, Src, Size);
}

unsigned AMDGPURegisterBankInfo::getBreakDownCost(
  const ValueMapping &ValMapping,
  const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
    return 10; // This is expensive.

  // NOTE(review): the early return above already fires for
  // NumBreakDowns >= 2, so a mapping reaching this assert cannot have
  // NumBreakDowns == 2 — confirm whether the condition was meant to be '> 2'.
  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.

  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  return 1;
}

const RegisterBank &
AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
                                               LLT Ty) const {
  if (&RC == &AMDGPU::SReg_1RegClass)
    return AMDGPU::VCCRegBank;

  // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
  // VCC-like use.
  if (TRI->isSGPRClass(&RC)) {
    // FIXME: This probably came from a copy from a physical register, which
    // should be inferable from the copied to-type. We don't have many boolean
    // physical register constraints so just assume a normal SGPR for now.
    if (!Ty.isValid())
      return AMDGPU::SGPRRegBank;

    return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
  }

  return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
}

/// Build one InstructionMapping per row of \p Table. Each row supplies the
/// register banks (and a relative cost) for the operand indices listed in
/// \p RegSrcOpIdx; any explicit def not listed there defaults to VGPR.
template <unsigned NumOps>
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::addMappingFromTable(
    const MachineInstr &MI, const MachineRegisterInfo &MRI,
    const std::array<unsigned, NumOps> RegSrcOpIdx,
    ArrayRef<OpRegBankEntry<NumOps>> Table) const {

  InstructionMappings AltMappings;

  SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());

  // Cache operand sizes once; they are identical for every table row.
  unsigned Sizes[NumOps];
  for (unsigned I = 0; I < NumOps; ++I) {
    Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
    Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
  }

  // Default all explicit defs to VGPR; rows may overwrite an entry below when
  // the def index also appears in RegSrcOpIdx.
  for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
    Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
  }

  // getInstrMapping's default mapping uses ID 1, so start at 2.
  unsigned MappingID = 2;
  for (const auto &Entry : Table) {
    for (unsigned I = 0; I < NumOps; ++I) {
      int OpIdx = RegSrcOpIdx[I];
      Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
    }

    AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
                                                 getOperandsMapping(Operands),
                                                 Operands.size()));
  }

  return AltMappings;
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_readlane: {
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Need a readfirstlane for the index.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    // dst, value, lane index — operand 1 is the intrinsic ID, hence the gap.
    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_writelane: {
    static const OpRegBankEntry<4> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need readfirstlane of first op
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of second op
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of both ops
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
    };

    // dst, value, lane index, old vdst value — operand 1 is the intrinsic ID.
    // (The previous "rsrc, voffset, offset" comment here appeared copy-pasted
    // from the buffer_load case.)
    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
    return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {

  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load: {
    static const OpRegBankEntry<3> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Waterfall loop needed for rsrc. In the worst case this will execute
      // approximately an extra 10 * wavesize + 2 instructions.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1000 }
    };

    // rsrc, voffset, offset
    const std::array<unsigned, 3> RegSrcOpIdx = { { 2, 3, 4 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_s_buffer_load: {
    static const OpRegBankEntry<2> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Only need 1 register in loop
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },

      // Have to waterfall the resource.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },

      // Have to waterfall the resource, and the offset.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
    };

    // rsrc, offset
    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
    return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    // VGPR = M0, VGPR
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need a readfirstlane for m0
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_s_sendmsg:
  case Intrinsic::amdgcn_s_sendmsghalt: {
    // FIXME: Should have no register for immediate
    static const OpRegBankEntry<1> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID }, 1 },

      // Need readlane
      { { AMDGPU::VGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
    return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

/// \returns true if the access carries "amdgpu.noclobber" metadata, i.e. the
/// frontend proved the location is not written before this load.
static bool memOpHasNoClobbered(const MachineMemOperand *MMO) {
  const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue());
  return I && I->getMetadata("amdgpu.noclobber");
}

// FIXME: Returns uniform if there's no source value information. This is
// probably wrong.
static bool isScalarLoadLegal(const MachineInstr &MI) {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned AS = MMO->getAddrSpace();
  const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
                       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;

  // There are no extending SMRD/SMEM loads, and they require 4-byte alignment.
  return MMO->getSize() >= 4 && MMO->getAlignment() >= 4 &&
         // Can't do a scalar atomic load.
         !MMO->isAtomic() &&
         // Don't use scalar loads for volatile accesses to non-constant address
         // spaces.
         (IsConst || !MMO->isVolatile()) &&
         // Memory must be known constant, or not written before this load.
         (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&
         AMDGPUInstrInfo::isUniformMMO(MMO);
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
    const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();


  InstructionMappings AltMappings;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_CONSTANT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    // A boolean constant can additionally live in the condition bank.
    if (Size == 1) {
      static const OpRegBankEntry<1> Table[3] = {
        { { AMDGPU::VGPRRegBankID }, 1 },
        { { AMDGPU::SGPRRegBankID }, 1 },
        { { AMDGPU::VCCRegBankID }, 1 }
      };

      return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
    }

    LLVM_FALLTHROUGH;
  }
  case TargetOpcode::G_FCONSTANT:
  case TargetOpcode::G_FRAME_INDEX:
  case TargetOpcode::G_GLOBAL_VALUE: {
    static const OpRegBankEntry<1> Table[2] = {
      { { AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID }, 1 }
    };

    return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
  }
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    if (Size == 1) {
      // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
      const InstructionMapping &SCCMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
        3); // Num Operands
      AltMappings.push_back(&SCCMapping);

      const InstructionMapping &VCCMapping0 = getInstructionMapping(
        2, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&VCCMapping0);
      return AltMappings;
    }

    // Only 64-bit ops need alternatives beyond the default mapping.
    if (Size != 64)
      break;

    const InstructionMapping &SSMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 2, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VVMapping);

    const InstructionMapping &SVMapping = getInstructionMapping(
      3, 3, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SVMapping);

    // SGPR in LHS is slightly preferable, so make it VS more expensive than
    // SV.
    // NOTE(review): this mapping reuses ID 3 (same as SVMapping above) with a
    // different cost — presumably it should be a distinct ID (4); confirm
    // against getInstructionMapping's uniquing behavior.
    const InstructionMapping &VSMapping = getInstructionMapping(
      3, 4, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VSMapping);
    break;
  }
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
    unsigned PtrSize = PtrTy.getSizeInBits();
    unsigned AS = PtrTy.getAddressSpace();
    LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());

    // Scalar (SMEM) loads are only offered for address spaces where they are
    // legal and for accesses isScalarLoadLegal approves.
    if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
         AS != AMDGPUAS::PRIVATE_ADDRESS) &&
        isScalarLoadLegal(MI)) {
      const InstructionMapping &SSMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
        2); // Num Operands
      AltMappings.push_back(&SSMapping);
    }

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 1, getOperandsMapping(
        {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy),
         AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
      2); // Num Operands
    AltMappings.push_back(&VVMapping);

    // It may be possible to have a vgpr = load sgpr mapping here, because
    // the mubuf instructions support this kind of load, but probably for only
    // gfx7 and older.  However, the addressing mode matching in the instruction
    // selector should be able to do a better job of detecting and selecting
    // these kinds of loads from the vgpr = load vgpr mapping.

    return AltMappings;

  }
  case TargetOpcode::G_ICMP: {
    // TODO: Should report 32-bit for scalar output type.
    unsigned Size = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &SVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SVMapping);

    const InstructionMapping &VSMapping = getInstructionMapping(3, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(4, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_SELECT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_SMIN:
  case TargetOpcode::G_SMAX:
  case TargetOpcode::G_UMIN:
  case TargetOpcode::G_UMAX: {
    static const OpRegBankEntry<3> Table[4] = {
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Scalar requires cmp+select, and extends if 16-bit.
      // FIXME: Should there be separate costs for 32 and 16-bit
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&VVMapping);
    return AltMappings;
  }
  case AMDGPU::G_BRCOND: {
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);

    // TODO: Change type to 32 for scalar
    const InstructionMapping &SMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&SMapping);

    // NOTE(review): this alternative reuses mapping ID 1 (same as SMapping);
    // presumably it should be ID 2 — confirm against getInstructionMapping's
    // uniquing behavior.
    const InstructionMapping &VMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
      2); // Num Operands
    AltMappings.push_back(&VMapping);
    return AltMappings;
  }
  case AMDGPU::G_INTRINSIC:
    return getInstrAlternativeMappingsIntrinsic(MI, MRI);
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
    return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
  default:
    break;
  }
  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}

/// Split the 64-bit value \p Reg into two 32-bit halves of type \p HalfTy with
/// a G_UNMERGE_VALUES, appending the new registers to \p Regs. Both halves
/// inherit \p Reg's register bank.
void AMDGPURegisterBankInfo::split64BitValueForMapping(
  MachineIRBuilder &B,
  SmallVector<Register, 2> &Regs,
  LLT HalfTy,
  Register Reg) const {
  assert(HalfTy.getSizeInBits() == 32);
  MachineRegisterInfo *MRI = B.getMRI();
  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
  MRI->setRegBank(LoLHS, *Bank);
  MRI->setRegBank(HiLHS, *Bank);

  Regs.push_back(LoLHS);
  Regs.push_back(HiLHS);

  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
    .addDef(LoLHS)
    .addDef(HiLHS)
    .addUse(Reg);
}

/// Replace the current type each register in \p Regs has with \p NewTy
static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
                          LLT NewTy) {
  for (Register Reg : Regs) {
    // Only the interpretation changes; the bit width must be unchanged.
    assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
    MRI.setType(Reg, NewTy);
  }
}

/// \returns a type with half the elements (for vectors) or half the bits (for
/// scalars) of \p Ty.
static LLT getHalfSizedType(LLT Ty) {
  if (Ty.isVector()) {
    assert(Ty.getNumElements() % 2 == 0);
    return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
  }

  assert(Ty.getSizeInBits() % 2 == 0);
  return LLT::scalar(Ty.getSizeInBits() / 2);
}

/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
/// execute the instruction for each unique combination of values in all lanes
/// in the wave.
The block will be split such that rest of the instructions are 720/// moved to a new block. 721/// 722/// Essentially performs this loop: 723// 724/// Save Execution Mask 725/// For (Lane : Wavefront) { 726/// Enable Lane, Disable all other lanes 727/// SGPR = read SGPR value for current lane from VGPR 728/// VGPRResult[Lane] = use_op SGPR 729/// } 730/// Restore Execution Mask 731/// 732/// There is additional complexity to try for compare values to identify the 733/// unique values used. 734bool AMDGPURegisterBankInfo::executeInWaterfallLoop( 735 MachineIRBuilder &B, 736 iterator_range<MachineBasicBlock::iterator> Range, 737 SmallSet<Register, 4> &SGPROperandRegs, 738 MachineRegisterInfo &MRI) const { 739 SmallVector<Register, 4> ResultRegs; 740 SmallVector<Register, 4> InitResultRegs; 741 SmallVector<Register, 4> PhiRegs; 742 743 MachineBasicBlock &MBB = B.getMBB(); 744 MachineFunction *MF = &B.getMF(); 745 746 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass(); 747 const unsigned WaveAndOpc = Subtarget.isWave32() ? 748 AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; 749 const unsigned MovTermOpc = Subtarget.isWave32() ? 750 AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term; 751 const unsigned XorTermOpc = Subtarget.isWave32() ? 752 AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; 753 const unsigned AndSaveExecOpc = Subtarget.isWave32() ? 754 AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; 755 const unsigned ExecReg = Subtarget.isWave32() ? 
756 AMDGPU::EXEC_LO : AMDGPU::EXEC; 757 758 for (MachineInstr &MI : Range) { 759 for (MachineOperand &Def : MI.defs()) { 760 LLT ResTy = MRI.getType(Def.getReg()); 761 const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI); 762 ResultRegs.push_back(Def.getReg()); 763 Register InitReg = B.buildUndef(ResTy).getReg(0); 764 Register PhiReg = MRI.createGenericVirtualRegister(ResTy); 765 InitResultRegs.push_back(InitReg); 766 PhiRegs.push_back(PhiReg); 767 MRI.setRegBank(PhiReg, *DefBank); 768 MRI.setRegBank(InitReg, *DefBank); 769 } 770 } 771 772 Register SaveExecReg = MRI.createVirtualRegister(WaveRC); 773 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC); 774 775 // Don't bother using generic instructions/registers for the exec mask. 776 B.buildInstr(TargetOpcode::IMPLICIT_DEF) 777 .addDef(InitSaveExecReg); 778 779 Register PhiExec = MRI.createVirtualRegister(WaveRC); 780 Register NewExec = MRI.createVirtualRegister(WaveRC); 781 782 // To insert the loop we need to split the block. Move everything before this 783 // point to a new block, and insert a new empty block before this instruction. 784 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); 785 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); 786 MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock(); 787 MachineFunction::iterator MBBI(MBB); 788 ++MBBI; 789 MF->insert(MBBI, LoopBB); 790 MF->insert(MBBI, RestoreExecBB); 791 MF->insert(MBBI, RemainderBB); 792 793 LoopBB->addSuccessor(RestoreExecBB); 794 LoopBB->addSuccessor(LoopBB); 795 796 // Move the rest of the block into a new block. 
797 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); 798 RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end()); 799 800 MBB.addSuccessor(LoopBB); 801 RestoreExecBB->addSuccessor(RemainderBB); 802 803 B.setInsertPt(*LoopBB, LoopBB->end()); 804 805 B.buildInstr(TargetOpcode::PHI) 806 .addDef(PhiExec) 807 .addReg(InitSaveExecReg) 808 .addMBB(&MBB) 809 .addReg(NewExec) 810 .addMBB(LoopBB); 811 812 for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) { 813 B.buildInstr(TargetOpcode::G_PHI) 814 .addDef(std::get<2>(Result)) 815 .addReg(std::get<0>(Result)) // Initial value / implicit_def 816 .addMBB(&MBB) 817 .addReg(std::get<1>(Result)) // Mid-loop value. 818 .addMBB(LoopBB); 819 } 820 821 const DebugLoc &DL = B.getDL(); 822 823 // Figure out the iterator range after splicing the instructions. 824 auto NewBegin = std::prev(LoopBB->end()); 825 826 // Move the instruction into the loop. Note we moved everything after 827 // Range.end() already into a new block, so Range.end() is no longer valid. 828 LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end()); 829 830 auto NewEnd = LoopBB->end(); 831 832 MachineBasicBlock::iterator I = Range.begin(); 833 B.setInsertPt(*LoopBB, I); 834 835 Register CondReg; 836 837 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) { 838 for (MachineOperand &Op : MI.uses()) { 839 if (!Op.isReg() || Op.isDef()) 840 continue; 841 842 if (SGPROperandRegs.count(Op.getReg())) { 843 LLT OpTy = MRI.getType(Op.getReg()); 844 unsigned OpSize = OpTy.getSizeInBits(); 845 846 // Can only do a readlane of 32-bit pieces. 847 if (OpSize == 32) { 848 // Avoid extra copies in the simple case of one 32-bit register. 849 Register CurrentLaneOpReg 850 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 851 MRI.setType(CurrentLaneOpReg, OpTy); 852 853 constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI); 854 // Read the next variant <- also loop target. 
855 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), 856 CurrentLaneOpReg) 857 .addReg(Op.getReg()); 858 859 Register NewCondReg = MRI.createVirtualRegister(WaveRC); 860 bool First = CondReg == AMDGPU::NoRegister; 861 if (First) 862 CondReg = NewCondReg; 863 864 // Compare the just read M0 value to all possible Idx values. 865 B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64) 866 .addDef(NewCondReg) 867 .addReg(CurrentLaneOpReg) 868 .addReg(Op.getReg()); 869 Op.setReg(CurrentLaneOpReg); 870 871 if (!First) { 872 Register AndReg = MRI.createVirtualRegister(WaveRC); 873 874 // If there are multiple operands to consider, and the conditions. 875 B.buildInstr(WaveAndOpc) 876 .addDef(AndReg) 877 .addReg(NewCondReg) 878 .addReg(CondReg); 879 CondReg = AndReg; 880 } 881 } else { 882 LLT S32 = LLT::scalar(32); 883 SmallVector<Register, 8> ReadlanePieces; 884 885 // The compares can be done as 64-bit, but the extract needs to be done 886 // in 32-bit pieces. 887 888 bool Is64 = OpSize % 64 == 0; 889 890 LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32); 891 unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64 892 : AMDGPU::V_CMP_EQ_U32_e64; 893 894 // The compares can be done as 64-bit, but the extract needs to be done 895 // in 32-bit pieces. 896 897 // Insert the unmerge before the loop. 
898 899 B.setMBB(MBB); 900 auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg()); 901 B.setInstr(*I); 902 903 unsigned NumPieces = Unmerge->getNumOperands() - 1; 904 for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) { 905 Register UnmergePiece = Unmerge.getReg(PieceIdx); 906 907 Register CurrentLaneOpReg; 908 if (Is64) { 909 Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32); 910 Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32); 911 912 MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass); 913 MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass); 914 MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass); 915 916 // Read the next variant <- also loop target. 917 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), 918 CurrentLaneOpRegLo) 919 .addReg(UnmergePiece, 0, AMDGPU::sub0); 920 921 // Read the next variant <- also loop target. 922 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), 923 CurrentLaneOpRegHi) 924 .addReg(UnmergePiece, 0, AMDGPU::sub1); 925 926 CurrentLaneOpReg = 927 B.buildMerge(LLT::scalar(64), 928 {CurrentLaneOpRegLo, CurrentLaneOpRegHi}) 929 .getReg(0); 930 931 MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass); 932 933 if (OpTy.getScalarSizeInBits() == 64) { 934 // If we need to produce a 64-bit element vector, so use the 935 // merged pieces 936 ReadlanePieces.push_back(CurrentLaneOpReg); 937 } else { 938 // 32-bit element type. 939 ReadlanePieces.push_back(CurrentLaneOpRegLo); 940 ReadlanePieces.push_back(CurrentLaneOpRegHi); 941 } 942 } else { 943 CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32); 944 MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass); 945 MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass); 946 947 // Read the next variant <- also loop target. 
948 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), 949 CurrentLaneOpReg) 950 .addReg(UnmergePiece); 951 ReadlanePieces.push_back(CurrentLaneOpReg); 952 } 953 954 Register NewCondReg = MRI.createVirtualRegister(WaveRC); 955 bool First = CondReg == AMDGPU::NoRegister; 956 if (First) 957 CondReg = NewCondReg; 958 959 B.buildInstr(CmpOp) 960 .addDef(NewCondReg) 961 .addReg(CurrentLaneOpReg) 962 .addReg(UnmergePiece); 963 964 if (!First) { 965 Register AndReg = MRI.createVirtualRegister(WaveRC); 966 967 // If there are multiple operands to consider, and the conditions. 968 B.buildInstr(WaveAndOpc) 969 .addDef(AndReg) 970 .addReg(NewCondReg) 971 .addReg(CondReg); 972 CondReg = AndReg; 973 } 974 } 975 976 // FIXME: Build merge seems to switch to CONCAT_VECTORS but not 977 // BUILD_VECTOR 978 if (OpTy.isVector()) { 979 auto Merge = B.buildBuildVector(OpTy, ReadlanePieces); 980 Op.setReg(Merge.getReg(0)); 981 } else { 982 auto Merge = B.buildMerge(OpTy, ReadlanePieces); 983 Op.setReg(Merge.getReg(0)); 984 } 985 986 MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank); 987 } 988 } 989 } 990 } 991 992 B.setInsertPt(*LoopBB, LoopBB->end()); 993 994 // Update EXEC, save the original EXEC value to VCC. 995 B.buildInstr(AndSaveExecOpc) 996 .addDef(NewExec) 997 .addReg(CondReg, RegState::Kill); 998 999 MRI.setSimpleHint(NewExec, CondReg); 1000 1001 // Update EXEC, switch all done bits to 0 and all todo bits to 1. 1002 B.buildInstr(XorTermOpc) 1003 .addDef(ExecReg) 1004 .addReg(ExecReg) 1005 .addReg(NewExec); 1006 1007 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use 1008 // s_cbranch_scc0? 1009 1010 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover. 1011 B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ) 1012 .addMBB(LoopBB); 1013 1014 // Save the EXEC mask before the loop. 1015 BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg) 1016 .addReg(ExecReg); 1017 1018 // Restore the EXEC mask after the loop. 
  // Restore EXEC to its value from before the waterfall loop, using a
  // terminator mov in the dedicated restore block.
  B.setMBB(*RestoreExecBB);
  B.buildInstr(MovTermOpc)
    .addDef(ExecReg)
    .addReg(SaveExecReg);

  // Set the insert point after the original instruction, so any new
  // instructions will be in the remainder.
  B.setInsertPt(*RemainderBB, RemainderBB->begin());

  return true;
}

// Return any unique registers used by \p MI at \p OpIndices that need to be
// handled in a waterfall loop. Returns these registers in \p
// SGPROperandRegs. Returns true if there are any operands to handle and a
// waterfall loop is necessary.
bool AMDGPURegisterBankInfo::collectWaterfallOperands(
  SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
  MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
  for (unsigned Op : OpIndices) {
    assert(MI.getOperand(Op).isUse());
    Register Reg = MI.getOperand(Op).getReg();
    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
    // Only operands currently assigned to the VGPR bank need to be made
    // uniform with readfirstlane inside the loop.
    if (OpBank->getID() == AMDGPU::VGPRRegBankID)
      SGPROperandRegs.insert(Reg);
  }

  // If nothing was collected, no operands need to be replaced, so no need to
  // loop.
  return !SGPROperandRegs.empty();
}

// Convenience overload: run the waterfall loop over the single instruction
// \p MI. Returns false (and emits nothing) if no operand actually needs it.
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  // Use a set to avoid extra readfirstlanes in the case where multiple operands
  // are the same register.
  SmallSet<Register, 4> SGPROperandRegs;

  if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
    return false;

  // Wrap the single instruction in a one-element range for the main
  // range-based implementation.
  MachineBasicBlock::iterator I = MI.getIterator();
  return executeInWaterfallLoop(B, make_range(I, std::next(I)),
                                SGPROperandRegs, MRI);
}

// Convenience overload that builds the MachineIRBuilder at \p MI itself.
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  MachineIRBuilder B(MI);
  return executeInWaterfallLoop(B, MI, MRI, OpIndices);
}

// Legalize an operand that must be an SGPR by inserting a readfirstlane.
void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
    MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
  Register Reg = MI.getOperand(OpIdx).getReg();
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  // Already usable as an SGPR operand; nothing to do.
  if (Bank != &AMDGPU::VGPRRegBank)
    return;

  MachineIRBuilder B(MI);
  Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
    .addDef(SGPR)
    .addReg(Reg);

  // Preserve the generic type on the new SGPR result.
  MRI.setType(SGPR, MRI.getType(Reg));

  const TargetRegisterClass *Constrained =
      constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
  (void)Constrained;
  assert(Constrained && "Failed to constrain readfirstlane src reg");

  MI.getOperand(OpIdx).setReg(SGPR);
}

// When regbankselect repairs registers, it will insert a repair instruction
// which defines the repaired register. Then it calls applyMapping and expects
// that the targets will either delete or rewrite the instruction that
// originally wrote to the repaired registers. Because of this, we end up in a
// situation where we have 2 instructions defining the same registers.
// Return a def instruction of \p Reg other than \p MI, or null if \p MI is
// the only def. Used to find the regbankselect repair instruction (see the
// comment above).
static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI,
                                     Register Reg,
                                     const MachineInstr &MI) {
  // Is there some way we can assert that there are exactly 2 def instructions?
  for (MachineInstr &Other : MRI.def_instructions(Reg)) {
    if (&Other != &MI)
      return &Other;
  }

  return nullptr;
}

// Break a wide (> 128-bit) non-SMRD load into 128-bit pieces using the
// legalizer, then extract the individual result registers expected by the
// operand mapper. Returns false if no splitting was needed or possible.
bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI,
                        const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
                        MachineRegisterInfo &MRI) const {
  Register DstReg = MI.getOperand(0).getReg();
  const LLT LoadTy = MRI.getType(DstReg);
  unsigned LoadSize = LoadTy.getSizeInBits();
  const unsigned MaxNonSmrdLoadSize = 128;
  // 128-bit loads are supported for all instruction types.
  if (LoadSize <= MaxNonSmrdLoadSize)
    return false;

  SmallVector<unsigned, 16> DefRegs(OpdMapper.getVRegs(0));
  SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1));

  // If the pointer is an SGPR, we have nothing to do.
  if (SrcRegs.empty()) {
    const RegisterBank *PtrBank =
      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
    if (PtrBank == &AMDGPU::SGPRRegBank)
      return false;
    SrcRegs.push_back(MI.getOperand(1).getReg());
  }

  assert(LoadSize % MaxNonSmrdLoadSize == 0);

  // We want to get the repair instruction now, because it will help us
  // determine which instruction the legalizer inserts that will also
  // write to DstReg.
  MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI);

  // RegBankSelect only emits scalar types, so we need to reset the pointer
  // operand to a pointer type.
  Register BasePtrReg = SrcRegs[0];
  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
  MRI.setType(BasePtrReg, PtrTy);

  MachineIRBuilder B(MI);

  // Split into as many 128-bit vector pieces as needed; the observer applies
  // the VGPR bank to every register the legalizer creates.
  unsigned SplitElts =
      MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits();
  const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType());
  ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
  GISelObserverWrapper Observer(&O);
  B.setChangeObserver(Observer);
  LegalizerHelper Helper(B.getMF(), Observer, B);
  if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) !=
      LegalizerHelper::Legalized)
    return false;

  // At this point, the legalizer has split the original load into smaller
  // loads.  At the end of lowering, it inserts an instruction (LegalizedInst)
  // that combines the outputs of the lower loads and writes it to DstReg.
  // The register bank selector has also added the RepairInst which writes to
  // DstReg as well.

  MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst);

  // Replace the output of the LegalizedInst with a temporary register, since
  // RepairInst already defines DstReg.
  Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg));
  LegalizedInst->getOperand(0).setReg(TmpReg);
  B.setInsertPt(*RepairInst->getParent(), RepairInst);

  // Extract each expected result register out of the combined temporary.
  for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) {
    Register IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
    B.buildConstant(IdxReg, DefIdx);
    MRI.setRegBank(IdxReg, AMDGPU::VGPRRegBank);
    B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg);
  }

  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
  return true;
}

// Apply the default mapping to an image intrinsic, then waterfall the
// resource (and sampler, if present) operands, which must be SGPRs.
// \p RsrcIdx is the rsrc argument index relative to the IR intrinsic call.
bool AMDGPURegisterBankInfo::applyMappingImage(
    MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    MachineRegisterInfo &MRI, int RsrcIdx) const {
  const int NumDefs = MI.getNumExplicitDefs();

  // The reported argument index is relative to the IR intrinsic call arguments,
  // so we need to shift by the number of defs and the intrinsic ID.
  RsrcIdx += NumDefs + 1;

  // Insert copies to VGPR arguments.
  applyDefaultMapping(OpdMapper);

  // Fixup any SGPR arguments.
  SmallVector<unsigned, 4> SGPRIndexes;
  for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
    if (!MI.getOperand(I).isReg())
      continue;

    // If this intrinsic has a sampler, it immediately follows rsrc.
    if (I == RsrcIdx || I == RsrcIdx + 1)
      SGPRIndexes.push_back(I);
  }

  executeInWaterfallLoop(MI, MRI, SGPRIndexes);
  return true;
}

// Map an integer min/max opcode to the comparison predicate used to expand it
// as compare + select.
// FIXME: Duplicated from LegalizerHelper
static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::G_SMIN:
    return CmpInst::ICMP_SLT;
  case TargetOpcode::G_SMAX:
    return CmpInst::ICMP_SGT;
  case TargetOpcode::G_UMIN:
    return CmpInst::ICMP_ULT;
  case TargetOpcode::G_UMAX:
    return CmpInst::ICMP_UGT;
  default:
    llvm_unreachable("not in integer min/max");
  }
}

// Expand a scalar min/max into icmp + select, using an s32 boolean on the
// SGPR bank. Erases the original instruction.
// FIXME: Duplicated from LegalizerHelper, except changing the boolean type.
void AMDGPURegisterBankInfo::lowerScalarMinMax(MachineIRBuilder &B,
                                               MachineInstr &MI) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();

  const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
  LLT CmpType = LLT::scalar(32);

  auto Cmp = B.buildICmp(Pred, CmpType, Src0, Src1);
  B.buildSelect(Dst, Cmp, Src0, Src1);

  B.getMRI()->setRegBank(Cmp.getReg(0), AMDGPU::SGPRRegBank);
  MI.eraseFromParent();
}

// For cases where only a single copy is inserted for matching register banks.
// Replace the register in the instruction operand with the repaired one.
static void substituteSimpleCopyRegs(
  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
  SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
  if (!SrcReg.empty()) {
    assert(SrcReg.size() == 1);
    OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
  }
}

/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
                                                MachineRegisterInfo &MRI,
                                                Register Reg) const {
  // Packed subtargets can store <N x s16> directly; nothing to do.
  if (!Subtarget.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  LLT StoreVT = MRI.getType(Reg);
  if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
    return Reg;

  // Unpacked layout: widen each s16 element into its own s32 lane.
  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(Unmerge.getReg(I));

  const LLT S32 = LLT::scalar(32);
  int NumElts = StoreVT.getNumElements();

  return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}

// Match \p Reg as either a constant, or base + constant offset. Returns a
// null base register when \p Reg itself is a constant, and offset 0 when no
// constant component is found.
static std::pair<Register, unsigned>
getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
  int64_t Const;
  if (mi_match(Reg, MRI, m_ICst(Const)))
    return std::make_pair(Register(), Const);

  Register Base;
  if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
    return std::make_pair(Base, Const);

  // TODO: Handle G_OR used for add case
  return std::make_pair(Reg, 0);
}

// Split a buffer offset into a base register (for the voffset field) and an
// immediate that fits the 12-bit MUBUF immoffset field (max 4095).
std::pair<Register, unsigned>
AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
                                           Register OrigOffset) const {
  const unsigned MaxImm = 4095;
  Register BaseReg;
  unsigned ImmOffset;
  const LLT S32 = LLT::scalar(32);

  std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
                                                           OrigOffset);

  unsigned C1 = 0;
  if (ImmOffset != 0) {
    // If the immediate value is too big for the immoffset field, put the value
    // and -4096 into the immoffset field so that the value that is copied/added
    // for the voffset field is a multiple of 4096, and it stands more chance
    // of being CSEd with the copy/add for another similar load/store.
    // However, do not do that rounding down to a multiple of 4096 if that is a
    // negative number, as it appears to be illegal to have a negative offset
    // in the vgpr, even if adding the immediate offset makes it positive.
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      Overflow += ImmOffset;
      ImmOffset = 0;
    }

    C1 = ImmOffset;
    if (Overflow != 0) {
      // Fold the out-of-range part back into the base register.
      if (!BaseReg)
        BaseReg = B.buildConstant(S32, Overflow).getReg(0);
      else {
        auto OverflowVal = B.buildConstant(S32, Overflow);
        BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
      }
    }
  }

  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return {BaseReg, C1};
}

// True if \p Reg is a materialized constant zero.
static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
  int64_t C;
  return mi_match(Reg, MRI, m_ICst(C)) && C == 0;
}

// Decompose the packed cachepolicy immediate into its glc/slc/dlc bits.
static unsigned extractGLC(unsigned CachePolicy) {
  return CachePolicy & 1;
}

static unsigned extractSLC(unsigned CachePolicy) {
  return (CachePolicy >> 1) & 1;
}

static unsigned extractDLC(unsigned CachePolicy) {
  return (CachePolicy >> 2) & 1;
}

// Select a raw buffer store intrinsic directly to a MUBUF store instruction,
// waterfalling the rsrc (operand 2) and soffset (operand 4) if they are not
// already uniform.
MachineInstr *
AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
                                             MachineInstr &MI) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  executeInWaterfallLoop(B, MI, MRI, {2, 4});

  // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer.

  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);

  int EltSize = Ty.getScalarSizeInBits();
  int Size = Ty.getSizeInBits();

  // FIXME: Broken integer truncstore.
  if (EltSize != 32)
    report_fatal_error("unhandled intrinsic store");

  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  const int MemSize = (*MI.memoperands_begin())->getSize();

  Register RSrc = MI.getOperand(2).getReg();
  Register VOffset = MI.getOperand(3).getReg();
  Register SOffset = MI.getOperand(4).getReg();
  unsigned CachePolicy = MI.getOperand(5).getImm();

  unsigned ImmOffset;
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);

  // Use the OFFEN form only when there is a nonzero variable offset.
  const bool Offen = !isZero(VOffset, MRI);

  // Pick the opcode from the memory size (byte/short/dword, possibly
  // multi-dword), not the register size.
  unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact;
  switch (8 * MemSize) {
  case 8:
    Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
                  AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
    break;
  case 16:
    Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
                  AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
    break;
  default:
    Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
                  AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
    if (Size > 32)
      Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
    break;
  }

  // Set the insertion point back to the instruction in case it was moved into a
  // loop.
  B.setInstr(MI);

  MachineInstrBuilder MIB = B.buildInstr(Opc)
    .addUse(VData);

  if (Offen)
    MIB.addUse(VOffset);

  MIB.addUse(RSrc)
     .addUse(SOffset)
     .addImm(ImmOffset)
     .addImm(extractGLC(CachePolicy))
     .addImm(extractSLC(CachePolicy))
     .addImm(0) // tfe: FIXME: Remove from inst
     .addImm(extractDLC(CachePolicy))
     .cloneMemRefs(MI);

  // FIXME: We need a way to report failure from applyMappingImpl.
  // Insert constrain copies before inserting the loop.
  if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
    report_fatal_error("failed to constrain selected store intrinsic");

  return MIB;
}

// Emit an SGPR -> VGPR copy of \p SrcReg into \p DstReg using v_mov_b32 (one
// per 32-bit half for 64-bit values) so the EXEC dependency is explicit.
// Returns true if all registers were successfully constrained.
bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
                                        Register SrcReg) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy.getSizeInBits() == 32) {
    // Use a v_mov_b32 here to make the exec dependency explicit.
    B.buildInstr(AMDGPU::V_MOV_B32_e32)
      .addDef(DstReg)
      .addUse(SrcReg);
    return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
           constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
  }

  // 64-bit case: move each 32-bit half separately and reassemble with a
  // REG_SEQUENCE.
  Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  B.buildInstr(AMDGPU::V_MOV_B32_e32)
    .addDef(TmpReg0)
    .addUse(SrcReg, 0, AMDGPU::sub0);
  B.buildInstr(AMDGPU::V_MOV_B32_e32)
    .addDef(TmpReg1)
    .addUse(SrcReg, 0, AMDGPU::sub1);
  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(DstReg)
    .addUse(TmpReg0)
    .addImm(AMDGPU::sub0)
    .addUse(TmpReg1)
    .addImm(AMDGPU::sub1);

  return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
         constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
}

// Perform target-specific lowering for instructions whose mapping requires
// more than the default copy insertion.
void AMDGPURegisterBankInfo::applyMappingImpl(
    const OperandsMapper &OpdMapper) const {
  MachineInstr &MI = OpdMapper.getMI();
  unsigned Opc = MI.getOpcode();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();
  switch (Opc) {
  case AMDGPU::G_PHI: {
    // Only s1 phis (booleans) need special handling here.
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy != LLT::scalar(1))
      break;

    const LLT S32 = LLT::scalar(32);
    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank ==
&AMDGPU::VCCRegBank) { 1484 applyDefaultMapping(OpdMapper); 1485 // The standard handling only considers the result register bank for 1486 // phis. For VCC, blindly inserting a copy when the phi is lowered will 1487 // produce an invalid copy. We can only copy with some kind of compare to 1488 // get a vector boolean result. Insert a regitser bank copy that will be 1489 // correctly lowered to a compare. 1490 MachineIRBuilder B(*MI.getParent()->getParent()); 1491 1492 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 1493 Register SrcReg = MI.getOperand(I).getReg(); 1494 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); 1495 1496 if (SrcBank != &AMDGPU::VCCRegBank) { 1497 MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB(); 1498 B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator()); 1499 1500 auto Copy = B.buildCopy(LLT::scalar(1), SrcReg); 1501 MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank); 1502 MI.getOperand(I).setReg(Copy.getReg(0)); 1503 } 1504 } 1505 1506 return; 1507 } 1508 1509 // Phi handling is strange and only considers the bank of the destination. 1510 substituteSimpleCopyRegs(OpdMapper, 0); 1511 1512 // Promote SGPR/VGPR booleans to s32 1513 MachineFunction *MF = MI.getParent()->getParent(); 1514 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); 1515 GISelObserverWrapper Observer(&ApplyBank); 1516 MachineIRBuilder B(MI); 1517 LegalizerHelper Helper(*MF, Observer, B); 1518 1519 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) 1520 llvm_unreachable("widen scalar should have succeeded"); 1521 1522 return; 1523 } 1524 case AMDGPU::G_ICMP: 1525 case AMDGPU::G_UADDO: 1526 case AMDGPU::G_USUBO: 1527 case AMDGPU::G_UADDE: 1528 case AMDGPU::G_SADDE: 1529 case AMDGPU::G_USUBE: 1530 case AMDGPU::G_SSUBE: { 1531 unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 
0 : 1; 1532 Register DstReg = MI.getOperand(BoolDstOp).getReg(); 1533 1534 const RegisterBank *DstBank = 1535 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 1536 if (DstBank != &AMDGPU::SGPRRegBank) 1537 break; 1538 1539 const bool HasCarryIn = MI.getNumOperands() == 5; 1540 1541 // If this is a scalar compare, promote the result to s32, as the selection 1542 // will end up using a copy to a 32-bit vreg. 1543 const LLT S32 = LLT::scalar(32); 1544 Register NewDstReg = MRI.createGenericVirtualRegister(S32); 1545 MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank); 1546 MI.getOperand(BoolDstOp).setReg(NewDstReg); 1547 MachineIRBuilder B(MI); 1548 1549 if (HasCarryIn) { 1550 Register NewSrcReg = MRI.createGenericVirtualRegister(S32); 1551 MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank); 1552 B.buildZExt(NewSrcReg, MI.getOperand(4).getReg()); 1553 MI.getOperand(4).setReg(NewSrcReg); 1554 } 1555 1556 MachineBasicBlock *MBB = MI.getParent(); 1557 B.setInsertPt(*MBB, std::next(MI.getIterator())); 1558 B.buildTrunc(DstReg, NewDstReg); 1559 return; 1560 } 1561 case AMDGPU::G_SELECT: { 1562 Register DstReg = MI.getOperand(0).getReg(); 1563 LLT DstTy = MRI.getType(DstReg); 1564 1565 SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1)); 1566 if (CondRegs.empty()) 1567 CondRegs.push_back(MI.getOperand(1).getReg()); 1568 else { 1569 assert(CondRegs.size() == 1); 1570 } 1571 1572 const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI); 1573 if (CondBank == &AMDGPU::SGPRRegBank) { 1574 MachineIRBuilder B(MI); 1575 const LLT S32 = LLT::scalar(32); 1576 Register NewCondReg = MRI.createGenericVirtualRegister(S32); 1577 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); 1578 1579 MI.getOperand(1).setReg(NewCondReg); 1580 B.buildZExt(NewCondReg, CondRegs[0]); 1581 } 1582 1583 if (DstTy.getSizeInBits() != 64) 1584 break; 1585 1586 MachineIRBuilder B(MI); 1587 LLT HalfTy = getHalfSizedType(DstTy); 1588 1589 SmallVector<Register, 2> 
DefRegs(OpdMapper.getVRegs(0)); 1590 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 1591 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3)); 1592 1593 // All inputs are SGPRs, nothing special to do. 1594 if (DefRegs.empty()) { 1595 assert(Src1Regs.empty() && Src2Regs.empty()); 1596 break; 1597 } 1598 1599 if (Src1Regs.empty()) 1600 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 1601 else { 1602 setRegsToType(MRI, Src1Regs, HalfTy); 1603 } 1604 1605 if (Src2Regs.empty()) 1606 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg()); 1607 else 1608 setRegsToType(MRI, Src2Regs, HalfTy); 1609 1610 setRegsToType(MRI, DefRegs, HalfTy); 1611 1612 B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]); 1613 B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]); 1614 1615 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 1616 MI.eraseFromParent(); 1617 return; 1618 } 1619 case AMDGPU::G_BRCOND: { 1620 Register CondReg = MI.getOperand(0).getReg(); 1621 // FIXME: Should use legalizer helper, but should change bool ext type. 1622 const RegisterBank *CondBank = 1623 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 1624 1625 if (CondBank == &AMDGPU::SGPRRegBank) { 1626 MachineIRBuilder B(MI); 1627 const LLT S32 = LLT::scalar(32); 1628 Register NewCondReg = MRI.createGenericVirtualRegister(S32); 1629 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); 1630 1631 MI.getOperand(0).setReg(NewCondReg); 1632 B.buildZExt(NewCondReg, CondReg); 1633 return; 1634 } 1635 1636 break; 1637 } 1638 case AMDGPU::G_AND: 1639 case AMDGPU::G_OR: 1640 case AMDGPU::G_XOR: { 1641 // 64-bit and is only available on the SALU, so split into 2 32-bit ops if 1642 // there is a VGPR input. 
1643 Register DstReg = MI.getOperand(0).getReg(); 1644 LLT DstTy = MRI.getType(DstReg); 1645 1646 if (DstTy.getSizeInBits() == 1) { 1647 const RegisterBank *DstBank = 1648 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 1649 if (DstBank == &AMDGPU::VCCRegBank) 1650 break; 1651 1652 MachineFunction *MF = MI.getParent()->getParent(); 1653 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); 1654 GISelObserverWrapper Observer(&ApplyBank); 1655 MachineIRBuilder B(MI); 1656 LegalizerHelper Helper(*MF, Observer, B); 1657 1658 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) != 1659 LegalizerHelper::Legalized) 1660 llvm_unreachable("widen scalar should have succeeded"); 1661 return; 1662 } 1663 1664 if (DstTy.getSizeInBits() != 64) 1665 break; 1666 1667 LLT HalfTy = getHalfSizedType(DstTy); 1668 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 1669 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1)); 1670 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 1671 1672 // All inputs are SGPRs, nothing special to do. 1673 if (DefRegs.empty()) { 1674 assert(Src0Regs.empty() && Src1Regs.empty()); 1675 break; 1676 } 1677 1678 assert(DefRegs.size() == 2); 1679 assert(Src0Regs.size() == Src1Regs.size() && 1680 (Src0Regs.empty() || Src0Regs.size() == 2)); 1681 1682 // Depending on where the source registers came from, the generic code may 1683 // have decided to split the inputs already or not. If not, we still need to 1684 // extract the values. 
1685 MachineIRBuilder B(MI); 1686 1687 if (Src0Regs.empty()) 1688 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg()); 1689 else 1690 setRegsToType(MRI, Src0Regs, HalfTy); 1691 1692 if (Src1Regs.empty()) 1693 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 1694 else 1695 setRegsToType(MRI, Src1Regs, HalfTy); 1696 1697 setRegsToType(MRI, DefRegs, HalfTy); 1698 1699 B.buildInstr(Opc) 1700 .addDef(DefRegs[0]) 1701 .addUse(Src0Regs[0]) 1702 .addUse(Src1Regs[0]); 1703 1704 B.buildInstr(Opc) 1705 .addDef(DefRegs[1]) 1706 .addUse(Src0Regs[1]) 1707 .addUse(Src1Regs[1]); 1708 1709 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 1710 MI.eraseFromParent(); 1711 return; 1712 } 1713 case AMDGPU::G_ADD: 1714 case AMDGPU::G_SUB: 1715 case AMDGPU::G_MUL: { 1716 Register DstReg = MI.getOperand(0).getReg(); 1717 LLT DstTy = MRI.getType(DstReg); 1718 if (DstTy != LLT::scalar(16)) 1719 break; 1720 1721 const RegisterBank *DstBank = 1722 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 1723 if (DstBank == &AMDGPU::VGPRRegBank) 1724 break; 1725 1726 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. 
1727 MachineFunction *MF = MI.getParent()->getParent(); 1728 MachineIRBuilder B(MI); 1729 ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank); 1730 GISelObserverWrapper Observer(&ApplySALU); 1731 LegalizerHelper Helper(*MF, Observer, B); 1732 1733 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) != 1734 LegalizerHelper::Legalized) 1735 llvm_unreachable("widen scalar should have succeeded"); 1736 return; 1737 } 1738 case AMDGPU::G_SMIN: 1739 case AMDGPU::G_SMAX: 1740 case AMDGPU::G_UMIN: 1741 case AMDGPU::G_UMAX: { 1742 Register DstReg = MI.getOperand(0).getReg(); 1743 const RegisterBank *DstBank = 1744 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 1745 if (DstBank == &AMDGPU::VGPRRegBank) 1746 break; 1747 1748 MachineFunction *MF = MI.getParent()->getParent(); 1749 MachineIRBuilder B(MI); 1750 1751 // Turn scalar min/max into a compare and select. 1752 LLT Ty = MRI.getType(DstReg); 1753 LLT S32 = LLT::scalar(32); 1754 LLT S16 = LLT::scalar(16); 1755 1756 if (Ty == S16) { 1757 ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank); 1758 GISelObserverWrapper Observer(&ApplySALU); 1759 LegalizerHelper Helper(*MF, Observer, B); 1760 1761 // Need to widen to s32, and expand as cmp + select. 1762 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) 1763 llvm_unreachable("widenScalar should have succeeded"); 1764 1765 // FIXME: This is relying on widenScalar leaving MI in place. 
1766 lowerScalarMinMax(B, MI); 1767 } else 1768 lowerScalarMinMax(B, MI); 1769 1770 return; 1771 } 1772 case AMDGPU::G_SEXT: 1773 case AMDGPU::G_ZEXT: { 1774 Register SrcReg = MI.getOperand(1).getReg(); 1775 LLT SrcTy = MRI.getType(SrcReg); 1776 bool Signed = Opc == AMDGPU::G_SEXT; 1777 1778 MachineIRBuilder B(MI); 1779 const RegisterBank *SrcBank = 1780 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 1781 1782 Register DstReg = MI.getOperand(0).getReg(); 1783 LLT DstTy = MRI.getType(DstReg); 1784 if (DstTy.isScalar() && 1785 SrcBank != &AMDGPU::SGPRRegBank && 1786 SrcBank != &AMDGPU::VCCRegBank && 1787 // FIXME: Should handle any type that round to s64 when irregular 1788 // breakdowns supported. 1789 DstTy.getSizeInBits() == 64 && 1790 SrcTy.getSizeInBits() <= 32) { 1791 const LLT S32 = LLT::scalar(32); 1792 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 1793 1794 // Extend to 32-bit, and then extend the low half. 1795 if (Signed) { 1796 // TODO: Should really be buildSExtOrCopy 1797 B.buildSExtOrTrunc(DefRegs[0], SrcReg); 1798 1799 // Replicate sign bit from 32-bit extended part. 1800 auto ShiftAmt = B.buildConstant(S32, 31); 1801 MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank); 1802 B.buildAShr(DefRegs[1], DefRegs[0], ShiftAmt); 1803 } else { 1804 B.buildZExtOrTrunc(DefRegs[0], SrcReg); 1805 B.buildConstant(DefRegs[1], 0); 1806 } 1807 1808 MRI.setRegBank(DstReg, *SrcBank); 1809 MI.eraseFromParent(); 1810 return; 1811 } 1812 1813 if (SrcTy != LLT::scalar(1)) 1814 return; 1815 1816 if (SrcBank == &AMDGPU::VCCRegBank) { 1817 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 1818 1819 const RegisterBank *DstBank = &AMDGPU::VGPRRegBank; 1820 1821 unsigned DstSize = DstTy.getSizeInBits(); 1822 // 64-bit select is SGPR only 1823 const bool UseSel64 = DstSize > 32 && 1824 SrcBank->getID() == AMDGPU::SGPRRegBankID; 1825 1826 // TODO: Should s16 select be legal? 1827 LLT SelType = UseSel64 ? 
LLT::scalar(64) : LLT::scalar(32); 1828 auto True = B.buildConstant(SelType, Signed ? -1 : 1); 1829 auto False = B.buildConstant(SelType, 0); 1830 1831 MRI.setRegBank(True.getReg(0), *DstBank); 1832 MRI.setRegBank(False.getReg(0), *DstBank); 1833 MRI.setRegBank(DstReg, *DstBank); 1834 1835 if (DstSize > 32) { 1836 B.buildSelect(DefRegs[0], SrcReg, True, False); 1837 B.buildCopy(DefRegs[1], DefRegs[0]); 1838 } else if (DstSize < 32) { 1839 auto Sel = B.buildSelect(SelType, SrcReg, True, False); 1840 MRI.setRegBank(Sel.getReg(0), *DstBank); 1841 B.buildTrunc(DstReg, Sel); 1842 } else { 1843 B.buildSelect(DstReg, SrcReg, True, False); 1844 } 1845 1846 MI.eraseFromParent(); 1847 return; 1848 } 1849 1850 // Fixup the case with an s1 src that isn't a condition register. Use shifts 1851 // instead of introducing a compare to avoid an unnecessary condition 1852 // register (and since there's no scalar 16-bit compares). 1853 auto Ext = B.buildAnyExt(DstTy, SrcReg); 1854 auto ShiftAmt = B.buildConstant(LLT::scalar(32), DstTy.getSizeInBits() - 1); 1855 auto Shl = B.buildShl(DstTy, Ext, ShiftAmt); 1856 1857 if (MI.getOpcode() == AMDGPU::G_SEXT) 1858 B.buildAShr(DstReg, Shl, ShiftAmt); 1859 else 1860 B.buildLShr(DstReg, Shl, ShiftAmt); 1861 1862 MRI.setRegBank(DstReg, *SrcBank); 1863 MRI.setRegBank(Ext.getReg(0), *SrcBank); 1864 MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank); 1865 MRI.setRegBank(Shl.getReg(0), *SrcBank); 1866 MI.eraseFromParent(); 1867 return; 1868 } 1869 case AMDGPU::G_BUILD_VECTOR: 1870 case AMDGPU::G_BUILD_VECTOR_TRUNC: { 1871 Register DstReg = MI.getOperand(0).getReg(); 1872 LLT DstTy = MRI.getType(DstReg); 1873 if (DstTy != LLT::vector(2, 16)) 1874 break; 1875 1876 assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty()); 1877 substituteSimpleCopyRegs(OpdMapper, 1); 1878 substituteSimpleCopyRegs(OpdMapper, 2); 1879 1880 const RegisterBank *DstBank = 1881 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 1882 if (DstBank == 
&AMDGPU::SGPRRegBank) 1883 break; // Can use S_PACK_* instructions. 1884 1885 MachineIRBuilder B(MI); 1886 1887 Register Lo = MI.getOperand(1).getReg(); 1888 Register Hi = MI.getOperand(2).getReg(); 1889 const LLT S32 = LLT::scalar(32); 1890 1891 const RegisterBank *BankLo = 1892 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 1893 const RegisterBank *BankHi = 1894 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 1895 1896 Register ZextLo; 1897 Register ShiftHi; 1898 1899 if (Opc == AMDGPU::G_BUILD_VECTOR) { 1900 ZextLo = B.buildZExt(S32, Lo).getReg(0); 1901 MRI.setRegBank(ZextLo, *BankLo); 1902 1903 Register ZextHi = B.buildZExt(S32, Hi).getReg(0); 1904 MRI.setRegBank(ZextHi, *BankHi); 1905 1906 auto ShiftAmt = B.buildConstant(S32, 16); 1907 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); 1908 1909 ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0); 1910 MRI.setRegBank(ShiftHi, *BankHi); 1911 } else { 1912 Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0); 1913 MRI.setRegBank(MaskLo, *BankLo); 1914 1915 auto ShiftAmt = B.buildConstant(S32, 16); 1916 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); 1917 1918 ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0); 1919 MRI.setRegBank(ShiftHi, *BankHi); 1920 1921 ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0); 1922 MRI.setRegBank(ZextLo, *BankLo); 1923 } 1924 1925 auto Or = B.buildOr(S32, ZextLo, ShiftHi); 1926 MRI.setRegBank(Or.getReg(0), *DstBank); 1927 1928 B.buildBitcast(DstReg, Or); 1929 MI.eraseFromParent(); 1930 return; 1931 } 1932 case AMDGPU::G_EXTRACT_VECTOR_ELT: { 1933 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 1934 1935 assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty()); 1936 1937 LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); 1938 MachineIRBuilder B(MI); 1939 1940 const ValueMapping &DstMapping 1941 = OpdMapper.getInstrMapping().getOperandMapping(0); 1942 const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank; 1943 
const RegisterBank *SrcBank = 1944 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 1945 1946 Register DstReg = MI.getOperand(0).getReg(); 1947 Register SrcReg = MI.getOperand(1).getReg(); 1948 Register IdxReg = MI.getOperand(2).getReg(); 1949 1950 // If this is a VGPR result only because the index was a VGPR result, the 1951 // actual indexing will be done on the SGPR source vector, which will 1952 // produce a scalar result. We need to copy to the VGPR result inside the 1953 // waterfall loop. 1954 const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank && 1955 SrcBank == &AMDGPU::SGPRRegBank; 1956 if (DstRegs.empty()) { 1957 applyDefaultMapping(OpdMapper); 1958 1959 executeInWaterfallLoop(MI, MRI, { 2 }); 1960 1961 if (NeedCopyToVGPR) { 1962 // We don't want a phi for this temporary reg. 1963 Register TmpReg = MRI.createGenericVirtualRegister(DstTy); 1964 MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank); 1965 MI.getOperand(0).setReg(TmpReg); 1966 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 1967 1968 // Use a v_mov_b32 here to make the exec dependency explicit. 1969 buildVCopy(B, DstReg, TmpReg); 1970 } 1971 1972 return; 1973 } 1974 1975 assert(DstTy.getSizeInBits() == 64); 1976 1977 LLT SrcTy = MRI.getType(SrcReg); 1978 const LLT S32 = LLT::scalar(32); 1979 LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32); 1980 1981 auto CastSrc = B.buildBitcast(Vec32, SrcReg); 1982 auto One = B.buildConstant(S32, 1); 1983 1984 // Split the vector index into 32-bit pieces. Prepare to move all of the 1985 // new instructions into a waterfall loop if necessary. 1986 // 1987 // Don't put the bitcast or constant in the loop. 1988 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); 1989 1990 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 
1991 auto IdxLo = B.buildShl(S32, IdxReg, One); 1992 auto IdxHi = B.buildAdd(S32, IdxLo, One); 1993 1994 auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo); 1995 auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi); 1996 1997 MRI.setRegBank(DstReg, *DstBank); 1998 MRI.setRegBank(CastSrc.getReg(0), *SrcBank); 1999 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); 2000 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); 2001 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); 2002 2003 SmallSet<Register, 4> OpsToWaterfall; 2004 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) { 2005 MI.eraseFromParent(); 2006 return; 2007 } 2008 2009 // Remove the original instruction to avoid potentially confusing the 2010 // waterfall loop logic. 2011 B.setInstr(*Span.begin()); 2012 MI.eraseFromParent(); 2013 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), 2014 OpsToWaterfall, MRI); 2015 2016 if (NeedCopyToVGPR) { 2017 MachineBasicBlock *LoopBB = Extract1->getParent(); 2018 Register TmpReg0 = MRI.createGenericVirtualRegister(S32); 2019 Register TmpReg1 = MRI.createGenericVirtualRegister(S32); 2020 MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank); 2021 MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank); 2022 2023 Extract0->getOperand(0).setReg(TmpReg0); 2024 Extract1->getOperand(0).setReg(TmpReg1); 2025 2026 B.setInsertPt(*LoopBB, ++Extract1->getIterator()); 2027 2028 buildVCopy(B, DstRegs[0], TmpReg0); 2029 buildVCopy(B, DstRegs[1], TmpReg1); 2030 } 2031 2032 return; 2033 } 2034 case AMDGPU::G_INSERT_VECTOR_ELT: { 2035 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); 2036 2037 assert(OpdMapper.getVRegs(0).empty()); 2038 assert(OpdMapper.getVRegs(1).empty()); 2039 assert(OpdMapper.getVRegs(3).empty()); 2040 2041 if (InsRegs.empty()) { 2042 applyDefaultMapping(OpdMapper); 2043 executeInWaterfallLoop(MI, MRI, { 3 }); 2044 return; 2045 } 2046 2047 Register DstReg = MI.getOperand(0).getReg(); 2048 Register SrcReg 
= MI.getOperand(1).getReg(); 2049 Register InsReg = MI.getOperand(2).getReg(); 2050 Register IdxReg = MI.getOperand(3).getReg(); 2051 LLT SrcTy = MRI.getType(SrcReg); 2052 LLT InsTy = MRI.getType(InsReg); 2053 (void)InsTy; 2054 2055 assert(InsTy.getSizeInBits() == 64); 2056 2057 const LLT S32 = LLT::scalar(32); 2058 LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32); 2059 2060 MachineIRBuilder B(MI); 2061 auto CastSrc = B.buildBitcast(Vec32, SrcReg); 2062 auto One = B.buildConstant(S32, 1); 2063 2064 // Split the vector index into 32-bit pieces. Prepare to move all of the 2065 // new instructions into a waterfall loop if necessary. 2066 // 2067 // Don't put the bitcast or constant in the loop. 2068 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); 2069 2070 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 2071 auto IdxLo = B.buildShl(S32, IdxReg, One); 2072 auto IdxHi = B.buildAdd(S32, IdxLo, One); 2073 2074 auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo); 2075 auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi); 2076 B.buildBitcast(DstReg, InsHi); 2077 2078 const RegisterBank *DstBank = 2079 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2080 const RegisterBank *SrcBank = 2081 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2082 const RegisterBank *InsSrcBank = 2083 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2084 2085 MRI.setRegBank(InsReg, *InsSrcBank); 2086 MRI.setRegBank(CastSrc.getReg(0), *SrcBank); 2087 MRI.setRegBank(InsLo.getReg(0), *DstBank); 2088 MRI.setRegBank(InsHi.getReg(0), *DstBank); 2089 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); 2090 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); 2091 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); 2092 2093 2094 SmallSet<Register, 4> OpsToWaterfall; 2095 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) { 2096 
MI.eraseFromParent(); 2097 return; 2098 } 2099 2100 B.setInstr(*Span.begin()); 2101 MI.eraseFromParent(); 2102 2103 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), 2104 OpsToWaterfall, MRI); 2105 return; 2106 } 2107 case AMDGPU::G_INTRINSIC: { 2108 switch (MI.getIntrinsicID()) { 2109 case Intrinsic::amdgcn_s_buffer_load: { 2110 // FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS 2111 executeInWaterfallLoop(MI, MRI, { 2, 3 }); 2112 return; 2113 } 2114 case Intrinsic::amdgcn_readlane: { 2115 substituteSimpleCopyRegs(OpdMapper, 2); 2116 2117 assert(OpdMapper.getVRegs(0).empty()); 2118 assert(OpdMapper.getVRegs(3).empty()); 2119 2120 // Make sure the index is an SGPR. It doesn't make sense to run this in a 2121 // waterfall loop, so assume it's a uniform value. 2122 constrainOpWithReadfirstlane(MI, MRI, 3); // Index 2123 return; 2124 } 2125 case Intrinsic::amdgcn_writelane: { 2126 assert(OpdMapper.getVRegs(0).empty()); 2127 assert(OpdMapper.getVRegs(2).empty()); 2128 assert(OpdMapper.getVRegs(3).empty()); 2129 2130 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val 2131 constrainOpWithReadfirstlane(MI, MRI, 2); // Source value 2132 constrainOpWithReadfirstlane(MI, MRI, 3); // Index 2133 return; 2134 } 2135 default: 2136 break; 2137 } 2138 break; 2139 } 2140 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { 2141 auto IntrID = MI.getIntrinsicID(); 2142 switch (IntrID) { 2143 case Intrinsic::amdgcn_buffer_load: { 2144 executeInWaterfallLoop(MI, MRI, { 2 }); 2145 return; 2146 } 2147 case Intrinsic::amdgcn_ds_ordered_add: 2148 case Intrinsic::amdgcn_ds_ordered_swap: { 2149 // This is only allowed to execute with 1 lane, so readfirstlane is safe. 
2150 assert(OpdMapper.getVRegs(0).empty()); 2151 substituteSimpleCopyRegs(OpdMapper, 3); 2152 constrainOpWithReadfirstlane(MI, MRI, 2); // M0 2153 return; 2154 } 2155 case Intrinsic::amdgcn_ds_gws_init: 2156 case Intrinsic::amdgcn_ds_gws_barrier: 2157 case Intrinsic::amdgcn_ds_gws_sema_br: { 2158 // Only the first lane is executes, so readfirstlane is safe. 2159 substituteSimpleCopyRegs(OpdMapper, 1); 2160 constrainOpWithReadfirstlane(MI, MRI, 2); // M0 2161 return; 2162 } 2163 case Intrinsic::amdgcn_ds_gws_sema_v: 2164 case Intrinsic::amdgcn_ds_gws_sema_p: 2165 case Intrinsic::amdgcn_ds_gws_sema_release_all: { 2166 // Only the first lane is executes, so readfirstlane is safe. 2167 constrainOpWithReadfirstlane(MI, MRI, 1); // M0 2168 return; 2169 } 2170 case Intrinsic::amdgcn_s_sendmsg: 2171 case Intrinsic::amdgcn_s_sendmsghalt: { 2172 // FIXME: Should this use a waterfall loop? 2173 constrainOpWithReadfirstlane(MI, MRI, 2); // M0 2174 return; 2175 } 2176 case Intrinsic::amdgcn_raw_buffer_load: 2177 case Intrinsic::amdgcn_raw_buffer_load_format: 2178 case Intrinsic::amdgcn_raw_tbuffer_load: 2179 case Intrinsic::amdgcn_raw_buffer_store: 2180 case Intrinsic::amdgcn_raw_buffer_store_format: 2181 case Intrinsic::amdgcn_raw_tbuffer_store: { 2182 applyDefaultMapping(OpdMapper); 2183 executeInWaterfallLoop(MI, MRI, {2, 4}); 2184 return; 2185 } 2186 case Intrinsic::amdgcn_struct_buffer_load: 2187 case Intrinsic::amdgcn_struct_buffer_store: 2188 case Intrinsic::amdgcn_struct_tbuffer_load: 2189 case Intrinsic::amdgcn_struct_tbuffer_store: { 2190 applyDefaultMapping(OpdMapper); 2191 executeInWaterfallLoop(MI, MRI, {2, 5}); 2192 return; 2193 } 2194 default: { 2195 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = 2196 AMDGPU::lookupRsrcIntrinsic(IntrID)) { 2197 // Non-images can have complications from operands that allow both SGPR 2198 // and VGPR. For now it's too complicated to figure out the final opcode 2199 // to derive the register bank from the MCInstrDesc. 
        // Image intrinsics have SGPR-only rsrc (and sampler) operands; the
        // image-specific lowering repairs any VGPR values.
        if (RSrcIntrin->IsImage) {
          applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
          return;
        }
      }

      break;
    }
    }
    break;
  }
  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD: {
    // applyMappingWideLoad returns true if it fully handled the load.
    if (applyMappingWideLoad(MI, OpdMapper, MRI))
      return;
    break;
  }
  default:
    break;
  }

  // Anything not handled above just gets repairing copies inserted for the
  // chosen mapping.
  return applyDefaultMapping(OpdMapper);
}

/// Return true if every register operand of \p MI that already has an
/// assigned register bank is in the SGPR bank; operands with no assigned
/// bank yet are ignored.
bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) {
    if (!MI.getOperand(i).isReg())
      continue;
    Register Reg = MI.getOperand(i).getReg();
    if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
      if (Bank->getID() != AMDGPU::SGPRRegBankID)
        return false;
    }
  }
  return true;
}

/// Produce a mapping that assigns every register operand of \p MI to the
/// SGPR bank at the operand's current size.
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());

  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
    OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
  }
  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}

/// Produce a mapping for a VALU instruction: the result is mapped to VGPR;
/// 1-bit sources are mapped to VCC and all wider sources to VGPR (the first
/// source keeps an already-assigned bank, if any).
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
  unsigned OpdIdx = 0;
  // The result operand is always a VGPR.
  unsigned Size0 = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
  OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);

  // Skip over an intrinsic ID operand if present; it has no register mapping.
  if (MI.getOperand(OpdIdx).isIntrinsicID())
    OpdsMapping[OpdIdx++] = nullptr;

  Register Reg1 = MI.getOperand(OpdIdx).getReg();
  unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI);

  // The first source keeps its assigned bank if it has one; otherwise a
  // 1-bit value defaults to VCC and anything wider to VGPR.
  unsigned DefaultBankID = Size1 == 1 ?
    AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
  unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI, DefaultBankID);

  OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1);

  // Remaining sources: 1-bit values map to VCC, everything else to VGPR.
  for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) {
    const MachineOperand &MO = MI.getOperand(OpdIdx);
    if (!MO.isReg())
      continue;

    unsigned Size = getSizeInBits(MO.getReg(), MRI, *TRI);
    unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
    OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(BankID, Size);
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}

/// Produce a mapping that assigns every register operand of \p MI to the
/// VGPR bank at the operand's current size.
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());

  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
    const MachineOperand &Op = MI.getOperand(I);
    if (!Op.isReg())
      continue;

    unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
    OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}

/// Produce a mapping for an image intrinsic: the rsrc operand at \p RsrcIdx
/// (and a sampler immediately following it, if any) must be SGPRs; all other
/// register operands are mapped to VGPRs.
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
                                        const MachineInstr
&MI,
                                        int RsrcIdx) const {
  // The reported argument index is relative to the IR intrinsic call
  // arguments, so we need to shift by the number of defs and the intrinsic
  // ID.
  RsrcIdx += MI.getNumExplicitDefs() + 1;

  const int NumOps = MI.getNumOperands();
  SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);

  // TODO: Should packed/unpacked D16 difference be reported here as part of
  // the value mapping?
  for (int I = 0; I != NumOps; ++I) {
    if (!MI.getOperand(I).isReg())
      continue;

    Register OpReg = MI.getOperand(I).getReg();
    unsigned Size = getSizeInBits(OpReg, MRI, *TRI);

    // FIXME: Probably need a new intrinsic register bank searchable table to
    // handle arbitrary intrinsics easily.
    //
    // If this has a sampler, it immediately follows rsrc.
    const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;

    if (MustBeSGPR) {
      // This operand must ultimately be an SGPR, but report whatever bank it
      // currently has as legal; applyMapping repairs a VGPR value later (see
      // applyMappingImage).
      unsigned NewBank = getRegBankID(OpReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
    } else {
      // Some operands must be VGPR, and these are easy to copy to.
      OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    }
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
}

/// Choose a mapping for a generic load. A scalar (SMRD) SGPR mapping is used
/// when the pointer is already in the SGPR bank, the address space is not
/// LDS/region/private, and the load is legal as a scalar load; otherwise the
/// load is mapped to VGPRs.
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 2> OpdsMapping(2);
  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
  LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
  Register PtrReg = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AS = PtrTy.getAddressSpace();
  unsigned PtrSize = PtrTy.getSizeInBits();

  const ValueMapping *ValMapping;
  const ValueMapping *PtrMapping;

  const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);

  if (PtrBank == &AMDGPU::SGPRRegBank &&
      (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
       AS != AMDGPUAS::PRIVATE_ADDRESS) &&
      isScalarLoadLegal(MI)) {
    // We have a uniform instruction so we want to use an SMRD load.
    ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
    PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
  } else {
    ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy);
    PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
  }

  OpdsMapping[0] = ValMapping;
  OpdsMapping[1] = PtrMapping;
  const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
      1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
  return Mapping;

  // FIXME: Do we want to add a mapping for FLAT load, or should we just
  // handle that during instruction selection?
2386} 2387 2388unsigned 2389AMDGPURegisterBankInfo::getRegBankID(Register Reg, 2390 const MachineRegisterInfo &MRI, 2391 const TargetRegisterInfo &TRI, 2392 unsigned Default) const { 2393 const RegisterBank *Bank = getRegBank(Reg, MRI, TRI); 2394 return Bank ? Bank->getID() : Default; 2395} 2396 2397 2398static unsigned regBankUnion(unsigned RB0, unsigned RB1) { 2399 return (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) ? 2400 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 2401} 2402 2403static int regBankBoolUnion(int RB0, int RB1) { 2404 if (RB0 == -1) 2405 return RB1; 2406 if (RB1 == -1) 2407 return RB0; 2408 2409 // vcc, vcc -> vcc 2410 // vcc, sgpr -> vcc 2411 // vcc, vgpr -> vcc 2412 if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID) 2413 return AMDGPU::VCCRegBankID; 2414 2415 // vcc, vgpr -> vgpr 2416 return regBankUnion(RB0, RB1); 2417} 2418 2419const RegisterBankInfo::ValueMapping * 2420AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg, 2421 const MachineRegisterInfo &MRI, 2422 const TargetRegisterInfo &TRI) const { 2423 // Lie and claim anything is legal, even though this needs to be an SGPR 2424 // applyMapping will have to deal with it as a waterfall loop. 
  unsigned Bank = getRegBankID(Reg, MRI, TRI, AMDGPU::SGPRRegBankID);
  unsigned Size = getSizeInBits(Reg, MRI, TRI);
  return AMDGPU::getValueMapping(Bank, Size);
}

/// Map \p Reg to the VGPR bank at its current size.
const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
                                         const MachineRegisterInfo &MRI,
                                         const TargetRegisterInfo &TRI) const {
  unsigned Size = getSizeInBits(Reg, MRI, TRI);
  return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
}

/// Map \p Reg to the AGPR bank at its current size.
const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
                                         const MachineRegisterInfo &MRI,
                                         const TargetRegisterInfo &TRI) const {
  unsigned Size = getSizeInBits(Reg, MRI, TRI);
  return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
}

///
/// This function must return a legal mapping, because
/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
/// in RegBankSelect::Mode::Fast. Any mapping that would cause a
/// VGPR-to-SGPR copy to be generated is illegal.
///
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  if (MI.isRegSequence()) {
    // If any input is a VGPR, the result must be a VGPR. The default handling
    // assumes any copy between banks is legal.
    unsigned BankID = AMDGPU::SGPRRegBankID;

    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
      auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI, *TRI);
      // It doesn't make sense to use vcc or scc banks here, so just ignore
      // them.
2466 if (OpBank != AMDGPU::SGPRRegBankID) { 2467 BankID = AMDGPU::VGPRRegBankID; 2468 break; 2469 } 2470 } 2471 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 2472 2473 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID)); 2474 return getInstructionMapping( 2475 1, /*Cost*/ 1, 2476 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); 2477 } 2478 2479 // The default handling is broken and doesn't handle illegal SGPR->VGPR copies 2480 // properly. 2481 // 2482 // TODO: There are additional exec masking dependencies to analyze. 2483 if (MI.getOpcode() == TargetOpcode::G_PHI) { 2484 // TODO: Generate proper invalid bank enum. 2485 int ResultBank = -1; 2486 Register DstReg = MI.getOperand(0).getReg(); 2487 2488 // Sometimes the result may have already been assigned a bank. 2489 if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI)) 2490 ResultBank = DstBank->getID(); 2491 2492 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 2493 Register Reg = MI.getOperand(I).getReg(); 2494 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); 2495 2496 // FIXME: Assuming VGPR for any undetermined inputs. 
2497 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) { 2498 ResultBank = AMDGPU::VGPRRegBankID; 2499 break; 2500 } 2501 2502 // FIXME: Need to promote SGPR case to s32 2503 unsigned OpBank = Bank->getID(); 2504 ResultBank = regBankBoolUnion(ResultBank, OpBank); 2505 } 2506 2507 assert(ResultBank != -1); 2508 2509 unsigned Size = MRI.getType(DstReg).getSizeInBits(); 2510 2511 const ValueMapping &ValMap = 2512 getValueMapping(0, Size, getRegBank(ResultBank)); 2513 return getInstructionMapping( 2514 1, /*Cost*/ 1, 2515 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); 2516 } 2517 2518 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI); 2519 if (Mapping.isValid()) 2520 return Mapping; 2521 2522 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 2523 2524 switch (MI.getOpcode()) { 2525 default: 2526 return getInvalidInstructionMapping(); 2527 2528 case AMDGPU::G_AND: 2529 case AMDGPU::G_OR: 2530 case AMDGPU::G_XOR: { 2531 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2532 if (Size == 1) { 2533 const RegisterBank *DstBank 2534 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI); 2535 2536 unsigned TargetBankID = -1; 2537 unsigned BankLHS = -1; 2538 unsigned BankRHS = -1; 2539 if (DstBank) { 2540 TargetBankID = DstBank->getID(); 2541 if (DstBank == &AMDGPU::VCCRegBank) { 2542 TargetBankID = AMDGPU::VCCRegBankID; 2543 BankLHS = AMDGPU::VCCRegBankID; 2544 BankRHS = AMDGPU::VCCRegBankID; 2545 } else { 2546 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI, 2547 AMDGPU::SGPRRegBankID); 2548 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, 2549 AMDGPU::SGPRRegBankID); 2550 } 2551 } else { 2552 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI, 2553 AMDGPU::VCCRegBankID); 2554 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, 2555 AMDGPU::VCCRegBankID); 2556 2557 // Both inputs should be true booleans to produce a boolean result. 
2558 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) { 2559 TargetBankID = AMDGPU::VGPRRegBankID; 2560 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) { 2561 TargetBankID = AMDGPU::VCCRegBankID; 2562 BankLHS = AMDGPU::VCCRegBankID; 2563 BankRHS = AMDGPU::VCCRegBankID; 2564 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) { 2565 TargetBankID = AMDGPU::SGPRRegBankID; 2566 } 2567 } 2568 2569 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size); 2570 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size); 2571 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size); 2572 break; 2573 } 2574 2575 if (Size == 64) { 2576 2577 if (isSALUMapping(MI)) { 2578 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size); 2579 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0]; 2580 } else { 2581 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size); 2582 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI/*, DefaultBankID*/); 2583 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size); 2584 2585 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI/*, DefaultBankID*/); 2586 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size); 2587 } 2588 2589 break; 2590 } 2591 2592 LLVM_FALLTHROUGH; 2593 } 2594 case AMDGPU::G_PTR_ADD: 2595 case AMDGPU::G_ADD: 2596 case AMDGPU::G_SUB: 2597 case AMDGPU::G_MUL: 2598 case AMDGPU::G_SHL: 2599 case AMDGPU::G_LSHR: 2600 case AMDGPU::G_ASHR: 2601 case AMDGPU::G_UADDO: 2602 case AMDGPU::G_USUBO: 2603 case AMDGPU::G_UADDE: 2604 case AMDGPU::G_SADDE: 2605 case AMDGPU::G_USUBE: 2606 case AMDGPU::G_SSUBE: 2607 case AMDGPU::G_SMIN: 2608 case AMDGPU::G_SMAX: 2609 case AMDGPU::G_UMIN: 2610 case AMDGPU::G_UMAX: 2611 if (isSALUMapping(MI)) 2612 return getDefaultMappingSOP(MI); 2613 LLVM_FALLTHROUGH; 2614 2615 case AMDGPU::G_FADD: 2616 case AMDGPU::G_FSUB: 2617 case AMDGPU::G_FPTOSI: 2618 case AMDGPU::G_FPTOUI: 
2619 case AMDGPU::G_FMUL: 2620 case AMDGPU::G_FMA: 2621 case AMDGPU::G_FMAD: 2622 case AMDGPU::G_FSQRT: 2623 case AMDGPU::G_FFLOOR: 2624 case AMDGPU::G_FCEIL: 2625 case AMDGPU::G_FRINT: 2626 case AMDGPU::G_SITOFP: 2627 case AMDGPU::G_UITOFP: 2628 case AMDGPU::G_FPTRUNC: 2629 case AMDGPU::G_FPEXT: 2630 case AMDGPU::G_FEXP2: 2631 case AMDGPU::G_FLOG2: 2632 case AMDGPU::G_FMINNUM: 2633 case AMDGPU::G_FMAXNUM: 2634 case AMDGPU::G_FMINNUM_IEEE: 2635 case AMDGPU::G_FMAXNUM_IEEE: 2636 case AMDGPU::G_FCANONICALIZE: 2637 case AMDGPU::G_INTRINSIC_TRUNC: 2638 case AMDGPU::G_AMDGPU_FFBH_U32: 2639 return getDefaultMappingVOP(MI); 2640 case AMDGPU::G_UMULH: 2641 case AMDGPU::G_SMULH: { 2642 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI)) 2643 return getDefaultMappingSOP(MI); 2644 return getDefaultMappingVOP(MI); 2645 } 2646 case AMDGPU::G_IMPLICIT_DEF: { 2647 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2648 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 2649 break; 2650 } 2651 case AMDGPU::G_FCONSTANT: 2652 case AMDGPU::G_CONSTANT: 2653 case AMDGPU::G_GLOBAL_VALUE: 2654 case AMDGPU::G_BLOCK_ADDR: 2655 case AMDGPU::G_READCYCLECOUNTER: { 2656 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2657 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 2658 break; 2659 } 2660 case AMDGPU::G_FRAME_INDEX: { 2661 // TODO: This should be the same as other constants, but eliminateFrameIndex 2662 // currently assumes VALU uses. 2663 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2664 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 2665 break; 2666 } 2667 case AMDGPU::G_INSERT: { 2668 unsigned BankID = isSALUMapping(MI) ? 
AMDGPU::SGPRRegBankID : 2669 AMDGPU::VGPRRegBankID; 2670 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 2671 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 2672 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI); 2673 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); 2674 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); 2675 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize); 2676 OpdsMapping[3] = nullptr; 2677 break; 2678 } 2679 case AMDGPU::G_EXTRACT: { 2680 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); 2681 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 2682 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 2683 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); 2684 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); 2685 OpdsMapping[2] = nullptr; 2686 break; 2687 } 2688 case AMDGPU::G_BUILD_VECTOR: 2689 case AMDGPU::G_BUILD_VECTOR_TRUNC: { 2690 LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); 2691 if (DstTy == LLT::vector(2, 16)) { 2692 unsigned DstSize = DstTy.getSizeInBits(); 2693 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 2694 unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); 2695 unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); 2696 unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID); 2697 2698 OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize); 2699 OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize); 2700 OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize); 2701 break; 2702 } 2703 2704 LLVM_FALLTHROUGH; 2705 } 2706 case AMDGPU::G_MERGE_VALUES: 2707 case AMDGPU::G_CONCAT_VECTORS: { 2708 unsigned Bank = isSALUMapping(MI) ? 
2709 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 2710 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2711 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 2712 2713 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); 2714 // Op1 and Dst should use the same register bank. 2715 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i) 2716 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize); 2717 break; 2718 } 2719 case AMDGPU::G_BITCAST: 2720 case AMDGPU::G_INTTOPTR: 2721 case AMDGPU::G_PTRTOINT: 2722 case AMDGPU::G_CTLZ: 2723 case AMDGPU::G_CTLZ_ZERO_UNDEF: 2724 case AMDGPU::G_CTTZ: 2725 case AMDGPU::G_CTTZ_ZERO_UNDEF: 2726 case AMDGPU::G_CTPOP: 2727 case AMDGPU::G_BSWAP: 2728 case AMDGPU::G_BITREVERSE: 2729 case AMDGPU::G_FABS: 2730 case AMDGPU::G_FNEG: { 2731 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2732 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); 2733 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); 2734 break; 2735 } 2736 case AMDGPU::G_TRUNC: { 2737 Register Dst = MI.getOperand(0).getReg(); 2738 Register Src = MI.getOperand(1).getReg(); 2739 unsigned Bank = getRegBankID(Src, MRI, *TRI); 2740 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); 2741 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); 2742 OpdsMapping[0] = DstSize == 1 && Bank != AMDGPU::SGPRRegBankID ? 
2743 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize) : 2744 AMDGPU::getValueMapping(Bank, DstSize); 2745 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize); 2746 break; 2747 } 2748 case AMDGPU::G_ZEXT: 2749 case AMDGPU::G_SEXT: 2750 case AMDGPU::G_ANYEXT: { 2751 Register Dst = MI.getOperand(0).getReg(); 2752 Register Src = MI.getOperand(1).getReg(); 2753 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); 2754 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); 2755 2756 unsigned DstBank; 2757 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI); 2758 assert(SrcBank); 2759 switch (SrcBank->getID()) { 2760 case AMDGPU::SGPRRegBankID: 2761 DstBank = AMDGPU::SGPRRegBankID; 2762 break; 2763 default: 2764 DstBank = AMDGPU::VGPRRegBankID; 2765 break; 2766 } 2767 2768 // TODO: Should anyext be split into 32-bit part as well? 2769 if (MI.getOpcode() == AMDGPU::G_ANYEXT) { 2770 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize); 2771 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank->getID(), SrcSize); 2772 } else { 2773 // Scalar extend can use 64-bit BFE, but VGPRs require extending to 2774 // 32-bits, and then to 64. 2775 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize); 2776 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(), 2777 SrcSize); 2778 } 2779 break; 2780 } 2781 case AMDGPU::G_FCMP: { 2782 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 2783 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); 2784 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 2785 OpdsMapping[1] = nullptr; // Predicate Operand. 
2786 OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size); 2787 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 2788 break; 2789 } 2790 case AMDGPU::G_STORE: { 2791 assert(MI.getOperand(0).isReg()); 2792 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2793 // FIXME: We need to specify a different reg bank once scalar stores 2794 // are supported. 2795 const ValueMapping *ValMapping = 2796 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 2797 // FIXME: Depending on the type of store, the pointer could be in 2798 // the SGPR Reg bank. 2799 // FIXME: Pointer size should be based on the address space. 2800 const ValueMapping *PtrMapping = 2801 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64); 2802 2803 OpdsMapping[0] = ValMapping; 2804 OpdsMapping[1] = PtrMapping; 2805 break; 2806 } 2807 2808 case AMDGPU::G_ICMP: { 2809 auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()); 2810 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 2811 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); 2812 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); 2813 2814 bool CanUseSCC = Op2Bank == AMDGPU::SGPRRegBankID && 2815 Op3Bank == AMDGPU::SGPRRegBankID && 2816 (Size == 32 || (Size == 64 && 2817 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) && 2818 Subtarget.hasScalarCompareEq64())); 2819 2820 unsigned Op0Bank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; 2821 2822 // TODO: Use 32-bit for scalar output size. 2823 // SCC results will need to be copied to a 32-bit SGPR virtual register. 2824 const unsigned ResultSize = 1; 2825 2826 OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, ResultSize); 2827 OpdsMapping[1] = nullptr; // Predicate Operand. 
2828 OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size); 2829 OpdsMapping[3] = AMDGPU::getValueMapping(Op3Bank, Size); 2830 break; 2831 } 2832 case AMDGPU::G_EXTRACT_VECTOR_ELT: { 2833 // VGPR index can be used for waterfall when indexing a SGPR vector. 2834 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); 2835 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2836 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 2837 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 2838 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); 2839 unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank); 2840 2841 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize); 2842 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize); 2843 2844 // The index can be either if the source vector is VGPR. 2845 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize); 2846 break; 2847 } 2848 case AMDGPU::G_INSERT_VECTOR_ELT: { 2849 unsigned OutputBankID = isSALUMapping(MI) ? 2850 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 2851 2852 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2853 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 2854 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); 2855 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); 2856 unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), 2857 MRI, *TRI); 2858 unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); 2859 2860 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize); 2861 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, VecSize); 2862 OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(InsertEltBankID, 2863 InsertSize); 2864 2865 // The index can be either if the source vector is VGPR. 
2866 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize); 2867 break; 2868 } 2869 case AMDGPU::G_UNMERGE_VALUES: { 2870 unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : 2871 AMDGPU::VGPRRegBankID; 2872 2873 // Op1 and Dst should use the same register bank. 2874 // FIXME: Shouldn't this be the default? Why do we need to handle this? 2875 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 2876 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI); 2877 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size); 2878 } 2879 break; 2880 } 2881 case AMDGPU::G_INTRINSIC: { 2882 switch (MI.getIntrinsicID()) { 2883 default: 2884 return getInvalidInstructionMapping(); 2885 case Intrinsic::amdgcn_div_fmas: 2886 case Intrinsic::amdgcn_div_fixup: 2887 case Intrinsic::amdgcn_trig_preop: 2888 case Intrinsic::amdgcn_sin: 2889 case Intrinsic::amdgcn_cos: 2890 case Intrinsic::amdgcn_log_clamp: 2891 case Intrinsic::amdgcn_rcp: 2892 case Intrinsic::amdgcn_rcp_legacy: 2893 case Intrinsic::amdgcn_rsq: 2894 case Intrinsic::amdgcn_rsq_legacy: 2895 case Intrinsic::amdgcn_rsq_clamp: 2896 case Intrinsic::amdgcn_ldexp: 2897 case Intrinsic::amdgcn_frexp_mant: 2898 case Intrinsic::amdgcn_frexp_exp: 2899 case Intrinsic::amdgcn_fract: 2900 case Intrinsic::amdgcn_cvt_pkrtz: 2901 case Intrinsic::amdgcn_cvt_pknorm_i16: 2902 case Intrinsic::amdgcn_cvt_pknorm_u16: 2903 case Intrinsic::amdgcn_cvt_pk_i16: 2904 case Intrinsic::amdgcn_cvt_pk_u16: 2905 case Intrinsic::amdgcn_fmed3: 2906 case Intrinsic::amdgcn_cubeid: 2907 case Intrinsic::amdgcn_cubema: 2908 case Intrinsic::amdgcn_cubesc: 2909 case Intrinsic::amdgcn_cubetc: 2910 case Intrinsic::amdgcn_sffbh: 2911 case Intrinsic::amdgcn_fmad_ftz: 2912 case Intrinsic::amdgcn_mbcnt_lo: 2913 case Intrinsic::amdgcn_mbcnt_hi: 2914 case Intrinsic::amdgcn_ubfe: 2915 case Intrinsic::amdgcn_sbfe: 2916 case Intrinsic::amdgcn_mul_u24: 2917 case Intrinsic::amdgcn_mul_i24: 2918 case Intrinsic::amdgcn_lerp: 2919 case 
Intrinsic::amdgcn_sad_u8: 2920 case Intrinsic::amdgcn_msad_u8: 2921 case Intrinsic::amdgcn_sad_hi_u8: 2922 case Intrinsic::amdgcn_sad_u16: 2923 case Intrinsic::amdgcn_qsad_pk_u16_u8: 2924 case Intrinsic::amdgcn_mqsad_pk_u16_u8: 2925 case Intrinsic::amdgcn_mqsad_u32_u8: 2926 case Intrinsic::amdgcn_cvt_pk_u8_f32: 2927 case Intrinsic::amdgcn_alignbit: 2928 case Intrinsic::amdgcn_alignbyte: 2929 case Intrinsic::amdgcn_fdot2: 2930 case Intrinsic::amdgcn_sdot2: 2931 case Intrinsic::amdgcn_udot2: 2932 case Intrinsic::amdgcn_sdot4: 2933 case Intrinsic::amdgcn_udot4: 2934 case Intrinsic::amdgcn_sdot8: 2935 case Intrinsic::amdgcn_udot8: 2936 case Intrinsic::amdgcn_wwm: 2937 case Intrinsic::amdgcn_wqm: 2938 return getDefaultMappingVOP(MI); 2939 case Intrinsic::amdgcn_ds_swizzle: 2940 case Intrinsic::amdgcn_ds_permute: 2941 case Intrinsic::amdgcn_ds_bpermute: 2942 case Intrinsic::amdgcn_update_dpp: 2943 return getDefaultMappingAllVGPR(MI); 2944 case Intrinsic::amdgcn_kernarg_segment_ptr: 2945 case Intrinsic::amdgcn_s_getpc: 2946 case Intrinsic::amdgcn_groupstaticsize: { 2947 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2948 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 2949 break; 2950 } 2951 case Intrinsic::amdgcn_wqm_vote: { 2952 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2953 OpdsMapping[0] = OpdsMapping[2] 2954 = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size); 2955 break; 2956 } 2957 case Intrinsic::amdgcn_s_buffer_load: { 2958 // FIXME: This should be moved to G_INTRINSIC_W_SIDE_EFFECTS 2959 Register RSrc = MI.getOperand(2).getReg(); // SGPR 2960 Register Offset = MI.getOperand(3).getReg(); // SGPR/imm 2961 2962 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2963 unsigned Size2 = MRI.getType(RSrc).getSizeInBits(); 2964 unsigned Size3 = MRI.getType(Offset).getSizeInBits(); 2965 2966 unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI); 2967 unsigned OffsetBank = 
getRegBankID(Offset, MRI, *TRI); 2968 2969 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size0); 2970 OpdsMapping[1] = nullptr; // intrinsic id 2971 2972 // Lie and claim everything is legal, even though some need to be 2973 // SGPRs. applyMapping will have to deal with it as a waterfall loop. 2974 OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc 2975 OpdsMapping[3] = AMDGPU::getValueMapping(OffsetBank, Size3); 2976 OpdsMapping[4] = nullptr; 2977 break; 2978 } 2979 case Intrinsic::amdgcn_div_scale: { 2980 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2981 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 2982 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size); 2983 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size); 2984 2985 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); 2986 OpdsMapping[3] = AMDGPU::getValueMapping( 2987 getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI), SrcSize); 2988 OpdsMapping[4] = AMDGPU::getValueMapping( 2989 getRegBankID(MI.getOperand(4).getReg(), MRI, *TRI), SrcSize); 2990 2991 break; 2992 } 2993 case Intrinsic::amdgcn_class: { 2994 Register Src0Reg = MI.getOperand(2).getReg(); 2995 Register Src1Reg = MI.getOperand(3).getReg(); 2996 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits(); 2997 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits(); 2998 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2999 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize); 3000 OpdsMapping[2] = AMDGPU::getValueMapping(getRegBankID(Src0Reg, MRI, *TRI), 3001 Src0Size); 3002 OpdsMapping[3] = AMDGPU::getValueMapping(getRegBankID(Src1Reg, MRI, *TRI), 3003 Src1Size); 3004 break; 3005 } 3006 case Intrinsic::amdgcn_icmp: 3007 case Intrinsic::amdgcn_fcmp: { 3008 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3009 // This is 
not VCCRegBank because this is not used in boolean contexts. 3010 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 3011 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 3012 unsigned Op1Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); 3013 unsigned Op2Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); 3014 OpdsMapping[2] = AMDGPU::getValueMapping(Op1Bank, OpSize); 3015 OpdsMapping[3] = AMDGPU::getValueMapping(Op2Bank, OpSize); 3016 break; 3017 } 3018 case Intrinsic::amdgcn_readlane: { 3019 // This must be an SGPR, but accept a VGPR. 3020 Register IdxReg = MI.getOperand(3).getReg(); 3021 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); 3022 unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID); 3023 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); 3024 LLVM_FALLTHROUGH; 3025 } 3026 case Intrinsic::amdgcn_readfirstlane: { 3027 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3028 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 3029 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 3030 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 3031 break; 3032 } 3033 case Intrinsic::amdgcn_writelane: { 3034 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3035 Register SrcReg = MI.getOperand(2).getReg(); 3036 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits(); 3037 unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID); 3038 Register IdxReg = MI.getOperand(3).getReg(); 3039 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); 3040 unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID); 3041 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 3042 3043 // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted 3044 // to legalize. 
3045 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize); 3046 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); 3047 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 3048 break; 3049 } 3050 case Intrinsic::amdgcn_if_break: { 3051 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3052 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3053 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 3054 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3055 break; 3056 } 3057 case Intrinsic::amdgcn_mfma_f32_4x4x1f32: 3058 case Intrinsic::amdgcn_mfma_f32_4x4x4f16: 3059 case Intrinsic::amdgcn_mfma_i32_4x4x4i8: 3060 case Intrinsic::amdgcn_mfma_f32_4x4x2bf16: 3061 case Intrinsic::amdgcn_mfma_f32_16x16x1f32: 3062 case Intrinsic::amdgcn_mfma_f32_16x16x4f32: 3063 case Intrinsic::amdgcn_mfma_f32_16x16x4f16: 3064 case Intrinsic::amdgcn_mfma_f32_16x16x16f16: 3065 case Intrinsic::amdgcn_mfma_i32_16x16x4i8: 3066 case Intrinsic::amdgcn_mfma_i32_16x16x16i8: 3067 case Intrinsic::amdgcn_mfma_f32_16x16x2bf16: 3068 case Intrinsic::amdgcn_mfma_f32_16x16x8bf16: 3069 case Intrinsic::amdgcn_mfma_f32_32x32x1f32: 3070 case Intrinsic::amdgcn_mfma_f32_32x32x2f32: 3071 case Intrinsic::amdgcn_mfma_f32_32x32x4f16: 3072 case Intrinsic::amdgcn_mfma_f32_32x32x8f16: 3073 case Intrinsic::amdgcn_mfma_i32_32x32x4i8: 3074 case Intrinsic::amdgcn_mfma_i32_32x32x8i8: 3075 case Intrinsic::amdgcn_mfma_f32_32x32x2bf16: 3076 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: { 3077 // Default for MAI intrinsics. 3078 // srcC can also be an immediate which can be folded later. 3079 // FIXME: Should we eventually add an alternative mapping with AGPR src 3080 // for srcA/srcB? 
3081 // 3082 // vdst, srcA, srcB, srcC 3083 OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 3084 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 3085 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 3086 OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 3087 break; 3088 } 3089 } 3090 break; 3091 } 3092 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { 3093 auto IntrID = MI.getIntrinsicID(); 3094 switch (IntrID) { 3095 case Intrinsic::amdgcn_s_getreg: 3096 case Intrinsic::amdgcn_s_memtime: 3097 case Intrinsic::amdgcn_s_memrealtime: 3098 case Intrinsic::amdgcn_s_get_waveid_in_workgroup: { 3099 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3100 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3101 break; 3102 } 3103 case Intrinsic::amdgcn_ds_append: 3104 case Intrinsic::amdgcn_ds_consume: 3105 case Intrinsic::amdgcn_ds_fadd: 3106 case Intrinsic::amdgcn_ds_fmin: 3107 case Intrinsic::amdgcn_ds_fmax: 3108 case Intrinsic::amdgcn_atomic_inc: 3109 case Intrinsic::amdgcn_atomic_dec: 3110 return getDefaultMappingAllVGPR(MI); 3111 case Intrinsic::amdgcn_ds_ordered_add: 3112 case Intrinsic::amdgcn_ds_ordered_swap: { 3113 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3114 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 3115 unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, 3116 AMDGPU::SGPRRegBankID); 3117 OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32); 3118 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 3119 break; 3120 } 3121 case Intrinsic::amdgcn_exp_compr: 3122 OpdsMapping[0] = nullptr; // IntrinsicID 3123 // FIXME: These are immediate values which can't be read from registers. 
3124 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 3125 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 3126 // FIXME: Could we support packed types here? 3127 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 3128 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 3129 // FIXME: These are immediate values which can't be read from registers. 3130 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 3131 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 3132 break; 3133 case Intrinsic::amdgcn_exp: 3134 // FIXME: Could we support packed types here? 3135 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 3136 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 3137 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 3138 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 3139 break; 3140 case Intrinsic::amdgcn_buffer_load: { 3141 Register RSrc = MI.getOperand(2).getReg(); // SGPR 3142 Register VIndex = MI.getOperand(3).getReg(); // VGPR 3143 Register Offset = MI.getOperand(4).getReg(); // SGPR/VGPR/imm 3144 3145 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3146 unsigned Size2 = MRI.getType(RSrc).getSizeInBits(); 3147 unsigned Size3 = MRI.getType(VIndex).getSizeInBits(); 3148 unsigned Size4 = MRI.getType(Offset).getSizeInBits(); 3149 3150 unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI); 3151 unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI); 3152 3153 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0); 3154 OpdsMapping[1] = nullptr; // intrinsic id 3155 3156 // Lie and claim everything is legal, even though some need to be 3157 // SGPRs. applyMapping will have to deal with it as a waterfall loop. 
3158 OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc 3159 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size3); 3160 OpdsMapping[4] = AMDGPU::getValueMapping(OffsetBank, Size4); 3161 OpdsMapping[5] = nullptr; 3162 OpdsMapping[6] = nullptr; 3163 break; 3164 } 3165 case Intrinsic::amdgcn_s_sendmsg: 3166 case Intrinsic::amdgcn_s_sendmsghalt: { 3167 // This must be an SGPR, but accept a VGPR. 3168 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, 3169 AMDGPU::SGPRRegBankID); 3170 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); 3171 break; 3172 } 3173 case Intrinsic::amdgcn_end_cf: 3174 case Intrinsic::amdgcn_init_exec: { 3175 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 3176 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3177 break; 3178 } 3179 case Intrinsic::amdgcn_else: { 3180 unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 3181 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 3182 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize); 3183 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize); 3184 break; 3185 } 3186 case Intrinsic::amdgcn_kill: { 3187 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 3188 break; 3189 } 3190 case Intrinsic::amdgcn_raw_buffer_load: 3191 case Intrinsic::amdgcn_raw_tbuffer_load: { 3192 // FIXME: Should make intrinsic ID the last operand of the instruction, 3193 // then this would be the same as store 3194 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 3195 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 3196 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 3197 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 3198 break; 3199 } 3200 case Intrinsic::amdgcn_raw_buffer_store: 3201 case Intrinsic::amdgcn_raw_buffer_store_format: 3202 case 
Intrinsic::amdgcn_raw_tbuffer_store: { 3203 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 3204 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 3205 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 3206 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 3207 break; 3208 } 3209 case Intrinsic::amdgcn_struct_buffer_load: 3210 case Intrinsic::amdgcn_struct_tbuffer_load: { 3211 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 3212 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 3213 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 3214 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 3215 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 3216 break; 3217 } 3218 case Intrinsic::amdgcn_struct_buffer_store: 3219 case Intrinsic::amdgcn_struct_tbuffer_store: { 3220 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 3221 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 3222 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 3223 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 3224 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 3225 break; 3226 } 3227 case Intrinsic::amdgcn_init_exec_from_input: { 3228 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 3229 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3230 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3231 break; 3232 } 3233 case Intrinsic::amdgcn_ds_gws_init: 3234 case Intrinsic::amdgcn_ds_gws_barrier: 3235 case Intrinsic::amdgcn_ds_gws_sema_br: { 3236 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 3237 3238 // This must be an SGPR, but accept a VGPR. 
3239 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, 3240 AMDGPU::SGPRRegBankID); 3241 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); 3242 break; 3243 } 3244 case Intrinsic::amdgcn_ds_gws_sema_v: 3245 case Intrinsic::amdgcn_ds_gws_sema_p: 3246 case Intrinsic::amdgcn_ds_gws_sema_release_all: { 3247 // This must be an SGPR, but accept a VGPR. 3248 unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI, 3249 AMDGPU::SGPRRegBankID); 3250 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32); 3251 break; 3252 } 3253 default: 3254 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = 3255 AMDGPU::lookupRsrcIntrinsic(IntrID)) { 3256 // Non-images can have complications from operands that allow both SGPR 3257 // and VGPR. For now it's too complicated to figure out the final opcode 3258 // to derive the register bank from the MCInstrDesc. 3259 if (RSrcIntrin->IsImage) 3260 return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg); 3261 } 3262 3263 return getInvalidInstructionMapping(); 3264 } 3265 break; 3266 } 3267 case AMDGPU::G_SELECT: { 3268 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3269 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, 3270 AMDGPU::SGPRRegBankID); 3271 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI, 3272 AMDGPU::SGPRRegBankID); 3273 bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID && 3274 Op3Bank == AMDGPU::SGPRRegBankID; 3275 3276 unsigned CondBankDefault = SGPRSrcs ? 3277 AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; 3278 unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI, 3279 CondBankDefault); 3280 if (CondBank == AMDGPU::SGPRRegBankID) 3281 CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; 3282 else if (CondBank == AMDGPU::VGPRRegBankID) 3283 CondBank = AMDGPU::VCCRegBankID; 3284 3285 unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ? 
3286 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 3287 3288 assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID); 3289 3290 // TODO: Should report 32-bit for scalar condition type. 3291 if (Size == 64) { 3292 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); 3293 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1); 3294 OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); 3295 OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); 3296 } else { 3297 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size); 3298 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1); 3299 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size); 3300 OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size); 3301 } 3302 3303 break; 3304 } 3305 3306 case AMDGPU::G_LOAD: 3307 case AMDGPU::G_ZEXTLOAD: 3308 case AMDGPU::G_SEXTLOAD: 3309 return getInstrMappingForLoad(MI); 3310 3311 case AMDGPU::G_ATOMICRMW_XCHG: 3312 case AMDGPU::G_ATOMICRMW_ADD: 3313 case AMDGPU::G_ATOMICRMW_SUB: 3314 case AMDGPU::G_ATOMICRMW_AND: 3315 case AMDGPU::G_ATOMICRMW_OR: 3316 case AMDGPU::G_ATOMICRMW_XOR: 3317 case AMDGPU::G_ATOMICRMW_MAX: 3318 case AMDGPU::G_ATOMICRMW_MIN: 3319 case AMDGPU::G_ATOMICRMW_UMAX: 3320 case AMDGPU::G_ATOMICRMW_UMIN: 3321 case AMDGPU::G_ATOMICRMW_FADD: 3322 case AMDGPU::G_ATOMIC_CMPXCHG: 3323 case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: { 3324 return getDefaultMappingAllVGPR(MI); 3325 } 3326 case AMDGPU::G_BRCOND: { 3327 unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI, 3328 AMDGPU::SGPRRegBankID); 3329 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1); 3330 if (Bank != AMDGPU::SGPRRegBankID) 3331 Bank = AMDGPU::VCCRegBankID; 3332 3333 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1); 3334 break; 3335 } 3336 } 3337 3338 return getInstructionMapping(/*ID*/1, /*Cost*/1, 3339 getOperandsMapping(OpdsMapping), 3340 MI.getNumOperands()); 3341} 3342