1343171Sdim//===-- SIModeRegister.cpp - Mode Register --------------------------------===// 2343171Sdim// 3353358Sdim// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4353358Sdim// See https://llvm.org/LICENSE.txt for license information. 5353358Sdim// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6343171Sdim// 7343171Sdim//===----------------------------------------------------------------------===// 8343171Sdim/// \file 9343171Sdim/// This pass inserts changes to the Mode register settings as required. 10343171Sdim/// Note that currently it only deals with the Double Precision Floating Point 11343171Sdim/// rounding mode setting, but is intended to be generic enough to be easily 12343171Sdim/// expanded. 13343171Sdim/// 14343171Sdim//===----------------------------------------------------------------------===// 15343171Sdim// 16343171Sdim#include "AMDGPU.h" 17343171Sdim#include "AMDGPUInstrInfo.h" 18343171Sdim#include "AMDGPUSubtarget.h" 19343171Sdim#include "SIInstrInfo.h" 20343171Sdim#include "SIMachineFunctionInfo.h" 21343171Sdim#include "llvm/ADT/Statistic.h" 22343171Sdim#include "llvm/CodeGen/MachineFunctionPass.h" 23343171Sdim#include "llvm/CodeGen/MachineInstrBuilder.h" 24343171Sdim#include "llvm/CodeGen/MachineRegisterInfo.h" 25343171Sdim#include "llvm/IR/Constants.h" 26343171Sdim#include "llvm/IR/Function.h" 27343171Sdim#include "llvm/IR/LLVMContext.h" 28343171Sdim#include "llvm/Support/Debug.h" 29343171Sdim#include "llvm/Support/raw_ostream.h" 30343171Sdim#include "llvm/Target/TargetMachine.h" 31343171Sdim#include <queue> 32343171Sdim 33343171Sdim#define DEBUG_TYPE "si-mode-register" 34343171Sdim 35343171SdimSTATISTIC(NumSetregInserted, "Number of setreg of mode register inserted."); 36343171Sdim 37343171Sdimusing namespace llvm; 38343171Sdim 39343171Sdimstruct Status { 40343171Sdim // Mask is a bitmask where a '1' indicates the corresponding Mode bit has a 41343171Sdim // known value 42343171Sdim unsigned Mask; 43343171Sdim unsigned Mode; 44343171Sdim 45343171Sdim Status() : Mask(0), Mode(0){}; 46343171Sdim 47353358Sdim Status(unsigned NewMask, unsigned NewMode) : Mask(NewMask), Mode(NewMode) { 48343171Sdim Mode &= Mask; 49343171Sdim }; 50343171Sdim 51343171Sdim // merge two status values such that only values that don't conflict are 52343171Sdim // preserved 53343171Sdim Status merge(const Status &S) const { 54343171Sdim return Status((Mask | S.Mask), ((Mode & ~S.Mask) | (S.Mode & S.Mask))); 55343171Sdim } 56343171Sdim 57343171Sdim // merge an unknown value by using the unknown value's mask to remove bits 58343171Sdim // from the result 59343171Sdim Status mergeUnknown(unsigned newMask) { 60343171Sdim return Status(Mask & ~newMask, Mode & ~newMask); 61343171Sdim } 62343171Sdim 63343171Sdim // intersect two Status values to produce a mode and mask that is a subset 64343171Sdim // of both values 65343171Sdim Status intersect(const Status &S) const { 66343171Sdim unsigned NewMask = (Mask & S.Mask) & (Mode ^ ~S.Mode); 67343171Sdim unsigned NewMode = (Mode & NewMask); 68343171Sdim return Status(NewMask, NewMode); 69343171Sdim } 70343171Sdim 71343171Sdim // produce the delta required to change the Mode to the required Mode 72343171Sdim Status delta(const Status &S) const { 73343171Sdim return Status((S.Mask & (Mode ^ S.Mode)) | (~Mask & S.Mask), S.Mode); 74343171Sdim } 75343171Sdim 76343171Sdim bool operator==(const Status &S) const { 77343171Sdim return (Mask == S.Mask) && (Mode == S.Mode); 78343171Sdim } 79343171Sdim 80343171Sdim bool operator!=(const Status &S) const { return !(*this == S); } 81343171Sdim 82343171Sdim bool isCompatible(Status &S) { 83343171Sdim return ((Mask & S.Mask) == S.Mask) && ((Mode & S.Mask) == S.Mode); 84343171Sdim } 85343171Sdim 86343171Sdim bool isCombinable(Status &S) { 87343171Sdim return !(Mask & S.Mask) || isCompatible(S); 88343171Sdim } 89343171Sdim}; 90343171Sdim 91343171Sdimclass BlockData { 92343171Sdimpublic: 93343171Sdim // The Status that represents the mode register settings required by the 94343171Sdim // FirstInsertionPoint (if any) in this block. Calculated in Phase 1. 95343171Sdim Status Require; 96343171Sdim 97343171Sdim // The Status that represents the net changes to the Mode register made by 98343171Sdim // this block, Calculated in Phase 1. 99343171Sdim Status Change; 100343171Sdim 101343171Sdim // The Status that represents the mode register settings on exit from this 102343171Sdim // block. Calculated in Phase 2. 103343171Sdim Status Exit; 104343171Sdim 105343171Sdim // The Status that represents the intersection of exit Mode register settings 106343171Sdim // from all predecessor blocks. Calculated in Phase 2, and used by Phase 3. 107343171Sdim Status Pred; 108343171Sdim 109343171Sdim // In Phase 1 we record the first instruction that has a mode requirement, 110343171Sdim // which is used in Phase 3 if we need to insert a mode change. 111343171Sdim MachineInstr *FirstInsertionPoint; 112343171Sdim 113343171Sdim BlockData() : FirstInsertionPoint(nullptr) {}; 114343171Sdim}; 115343171Sdim 116343171Sdimnamespace { 117343171Sdim 118343171Sdimclass SIModeRegister : public MachineFunctionPass { 119343171Sdimpublic: 120343171Sdim static char ID; 121343171Sdim 122343171Sdim std::vector<std::unique_ptr<BlockData>> BlockInfo; 123343171Sdim std::queue<MachineBasicBlock *> Phase2List; 124343171Sdim 125343171Sdim // The default mode register setting currently only caters for the floating 126343171Sdim // point double precision rounding mode. 127343171Sdim // We currently assume the default rounding mode is Round to Nearest 128343171Sdim // NOTE: this should come from a per function rounding mode setting once such 129343171Sdim // a setting exists. 130343171Sdim unsigned DefaultMode = FP_ROUND_ROUND_TO_NEAREST; 131343171Sdim Status DefaultStatus = 132343171Sdim Status(FP_ROUND_MODE_DP(0x3), FP_ROUND_MODE_DP(DefaultMode)); 133343171Sdim 134343171Sdimpublic: 135343171Sdim SIModeRegister() : MachineFunctionPass(ID) {} 136343171Sdim 137343171Sdim bool runOnMachineFunction(MachineFunction &MF) override; 138343171Sdim 139343171Sdim void getAnalysisUsage(AnalysisUsage &AU) const override { 140343171Sdim AU.setPreservesCFG(); 141343171Sdim MachineFunctionPass::getAnalysisUsage(AU); 142343171Sdim } 143343171Sdim 144343171Sdim void processBlockPhase1(MachineBasicBlock &MBB, const SIInstrInfo *TII); 145343171Sdim 146343171Sdim void processBlockPhase2(MachineBasicBlock &MBB, const SIInstrInfo *TII); 147343171Sdim 148343171Sdim void processBlockPhase3(MachineBasicBlock &MBB, const SIInstrInfo *TII); 149343171Sdim 150343171Sdim Status getInstructionMode(MachineInstr &MI, const SIInstrInfo *TII); 151343171Sdim 152343171Sdim void insertSetreg(MachineBasicBlock &MBB, MachineInstr *I, 153343171Sdim const SIInstrInfo *TII, Status InstrMode); 154343171Sdim}; 155343171Sdim} // End anonymous namespace. 156343171Sdim 157343171SdimINITIALIZE_PASS(SIModeRegister, DEBUG_TYPE, 158343171Sdim "Insert required mode register values", false, false) 159343171Sdim 160343171Sdimchar SIModeRegister::ID = 0; 161343171Sdim 162343171Sdimchar &llvm::SIModeRegisterID = SIModeRegister::ID; 163343171Sdim 164343171SdimFunctionPass *llvm::createSIModeRegisterPass() { return new SIModeRegister(); } 165343171Sdim 166343171Sdim// Determine the Mode register setting required for this instruction. 167343171Sdim// Instructions which don't use the Mode register return a null Status. 168343171Sdim// Note this currently only deals with instructions that use the floating point 169343171Sdim// double precision setting. 170343171SdimStatus SIModeRegister::getInstructionMode(MachineInstr &MI, 171343171Sdim const SIInstrInfo *TII) { 172343171Sdim if (TII->usesFPDPRounding(MI)) { 173343171Sdim switch (MI.getOpcode()) { 174343171Sdim case AMDGPU::V_INTERP_P1LL_F16: 175343171Sdim case AMDGPU::V_INTERP_P1LV_F16: 176343171Sdim case AMDGPU::V_INTERP_P2_F16: 177343171Sdim // f16 interpolation instructions need double precision round to zero 178343171Sdim return Status(FP_ROUND_MODE_DP(3), 179343171Sdim FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_ZERO)); 180343171Sdim default: 181343171Sdim return DefaultStatus; 182343171Sdim } 183343171Sdim } 184343171Sdim return Status(); 185343171Sdim} 186343171Sdim 187343171Sdim// Insert a setreg instruction to update the Mode register. 188343171Sdim// It is possible (though unlikely) for an instruction to require a change to 189343171Sdim// the value of disjoint parts of the Mode register when we don't know the 190343171Sdim// value of the intervening bits. In that case we need to use more than one 191343171Sdim// setreg instruction. 192343171Sdimvoid SIModeRegister::insertSetreg(MachineBasicBlock &MBB, MachineInstr *MI, 193343171Sdim const SIInstrInfo *TII, Status InstrMode) { 194343171Sdim while (InstrMode.Mask) { 195343171Sdim unsigned Offset = countTrailingZeros<unsigned>(InstrMode.Mask); 196343171Sdim unsigned Width = countTrailingOnes<unsigned>(InstrMode.Mask >> Offset); 197343171Sdim unsigned Value = (InstrMode.Mode >> Offset) & ((1 << Width) - 1); 198343171Sdim BuildMI(MBB, MI, 0, TII->get(AMDGPU::S_SETREG_IMM32_B32)) 199343171Sdim .addImm(Value) 200343171Sdim .addImm(((Width - 1) << AMDGPU::Hwreg::WIDTH_M1_SHIFT_) | 201343171Sdim (Offset << AMDGPU::Hwreg::OFFSET_SHIFT_) | 202343171Sdim (AMDGPU::Hwreg::ID_MODE << AMDGPU::Hwreg::ID_SHIFT_)); 203343171Sdim ++NumSetregInserted; 204343171Sdim InstrMode.Mask &= ~(((1 << Width) - 1) << Offset); 205343171Sdim } 206343171Sdim} 207343171Sdim 208343171Sdim// In Phase 1 we iterate through the instructions of the block and for each 209343171Sdim// instruction we get its mode usage. If the instruction uses the Mode register 210343171Sdim// we: 211343171Sdim// - update the Change status, which tracks the changes to the Mode register 212343171Sdim// made by this block 213343171Sdim// - if this instruction's requirements are compatible with the current setting 214343171Sdim// of the Mode register we merge the modes 215343171Sdim// - if it isn't compatible and an InsertionPoint isn't set, then we set the 216343171Sdim// InsertionPoint to the current instruction, and we remember the current 217343171Sdim// mode 218343171Sdim// - if it isn't compatible and InsertionPoint is set we insert a seteg before 219343171Sdim// that instruction (unless this instruction forms part of the block's 220343171Sdim// entry requirements in which case the insertion is deferred until Phase 3 221343171Sdim// when predecessor exit values are known), and move the insertion point to 222343171Sdim// this instruction 223343171Sdim// - if this is a setreg instruction we treat it as an incompatible instruction. 224343171Sdim// This is sub-optimal but avoids some nasty corner cases, and is expected to 225343171Sdim// occur very rarely. 226343171Sdim// - on exit we have set the Require, Change, and initial Exit modes. 227343171Sdimvoid SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB, 228343171Sdim const SIInstrInfo *TII) { 229360784Sdim auto NewInfo = std::make_unique<BlockData>(); 230343171Sdim MachineInstr *InsertionPoint = nullptr; 231343171Sdim // RequirePending is used to indicate whether we are collecting the initial 232343171Sdim // requirements for the block, and need to defer the first InsertionPoint to 233343171Sdim // Phase 3. It is set to false once we have set FirstInsertionPoint, or when 234343171Sdim // we discover an explict setreg that means this block doesn't have any 235343171Sdim // initial requirements. 236343171Sdim bool RequirePending = true; 237343171Sdim Status IPChange; 238343171Sdim for (MachineInstr &MI : MBB) { 239343171Sdim Status InstrMode = getInstructionMode(MI, TII); 240343171Sdim if ((MI.getOpcode() == AMDGPU::S_SETREG_B32) || 241343171Sdim (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32)) { 242343171Sdim // We preserve any explicit mode register setreg instruction we encounter, 243343171Sdim // as we assume it has been inserted by a higher authority (this is 244343171Sdim // likely to be a very rare occurrence). 245343171Sdim unsigned Dst = TII->getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm(); 246343171Sdim if (((Dst & AMDGPU::Hwreg::ID_MASK_) >> AMDGPU::Hwreg::ID_SHIFT_) != 247343171Sdim AMDGPU::Hwreg::ID_MODE) 248343171Sdim continue; 249343171Sdim 250343171Sdim unsigned Width = ((Dst & AMDGPU::Hwreg::WIDTH_M1_MASK_) >> 251343171Sdim AMDGPU::Hwreg::WIDTH_M1_SHIFT_) + 252343171Sdim 1; 253343171Sdim unsigned Offset = 254343171Sdim (Dst & AMDGPU::Hwreg::OFFSET_MASK_) >> AMDGPU::Hwreg::OFFSET_SHIFT_; 255343171Sdim unsigned Mask = ((1 << Width) - 1) << Offset; 256343171Sdim 257343171Sdim // If an InsertionPoint is set we will insert a setreg there. 258343171Sdim if (InsertionPoint) { 259343171Sdim insertSetreg(MBB, InsertionPoint, TII, IPChange.delta(NewInfo->Change)); 260343171Sdim InsertionPoint = nullptr; 261343171Sdim } 262343171Sdim // If this is an immediate then we know the value being set, but if it is 263343171Sdim // not an immediate then we treat the modified bits of the mode register 264343171Sdim // as unknown. 265343171Sdim if (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32) { 266343171Sdim unsigned Val = TII->getNamedOperand(MI, AMDGPU::OpName::imm)->getImm(); 267343171Sdim unsigned Mode = (Val << Offset) & Mask; 268343171Sdim Status Setreg = Status(Mask, Mode); 269343171Sdim // If we haven't already set the initial requirements for the block we 270343171Sdim // don't need to as the requirements start from this explicit setreg. 271343171Sdim RequirePending = false; 272343171Sdim NewInfo->Change = NewInfo->Change.merge(Setreg); 273343171Sdim } else { 274343171Sdim NewInfo->Change = NewInfo->Change.mergeUnknown(Mask); 275343171Sdim } 276343171Sdim } else if (!NewInfo->Change.isCompatible(InstrMode)) { 277343171Sdim // This instruction uses the Mode register and its requirements aren't 278343171Sdim // compatible with the current mode. 279343171Sdim if (InsertionPoint) { 280343171Sdim // If the required mode change cannot be included in the current 281343171Sdim // InsertionPoint changes, we need a setreg and start a new 282343171Sdim // InsertionPoint. 283343171Sdim if (!IPChange.delta(NewInfo->Change).isCombinable(InstrMode)) { 284343171Sdim if (RequirePending) { 285343171Sdim // This is the first insertionPoint in the block so we will defer 286343171Sdim // the insertion of the setreg to Phase 3 where we know whether or 287343171Sdim // not it is actually needed. 288343171Sdim NewInfo->FirstInsertionPoint = InsertionPoint; 289343171Sdim NewInfo->Require = NewInfo->Change; 290343171Sdim RequirePending = false; 291343171Sdim } else { 292343171Sdim insertSetreg(MBB, InsertionPoint, TII, 293343171Sdim IPChange.delta(NewInfo->Change)); 294343171Sdim IPChange = NewInfo->Change; 295343171Sdim } 296343171Sdim // Set the new InsertionPoint 297343171Sdim InsertionPoint = &MI; 298343171Sdim } 299343171Sdim NewInfo->Change = NewInfo->Change.merge(InstrMode); 300343171Sdim } else { 301343171Sdim // No InsertionPoint is currently set - this is either the first in 302343171Sdim // the block or we have previously seen an explicit setreg. 303343171Sdim InsertionPoint = &MI; 304343171Sdim IPChange = NewInfo->Change; 305343171Sdim NewInfo->Change = NewInfo->Change.merge(InstrMode); 306343171Sdim } 307343171Sdim } 308343171Sdim } 309343171Sdim if (RequirePending) { 310343171Sdim // If we haven't yet set the initial requirements for the block we set them 311343171Sdim // now. 312343171Sdim NewInfo->FirstInsertionPoint = InsertionPoint; 313343171Sdim NewInfo->Require = NewInfo->Change; 314343171Sdim } else if (InsertionPoint) { 315343171Sdim // We need to insert a setreg at the InsertionPoint 316343171Sdim insertSetreg(MBB, InsertionPoint, TII, IPChange.delta(NewInfo->Change)); 317343171Sdim } 318343171Sdim NewInfo->Exit = NewInfo->Change; 319343171Sdim BlockInfo[MBB.getNumber()] = std::move(NewInfo); 320343171Sdim} 321343171Sdim 322343171Sdim// In Phase 2 we revisit each block and calculate the common Mode register 323343171Sdim// value provided by all predecessor blocks. If the Exit value for the block 324343171Sdim// is changed, then we add the successor blocks to the worklist so that the 325343171Sdim// exit value is propagated. 326343171Sdimvoid SIModeRegister::processBlockPhase2(MachineBasicBlock &MBB, 327343171Sdim const SIInstrInfo *TII) { 328343171Sdim// BlockData *BI = BlockInfo[MBB.getNumber()]; 329343171Sdim unsigned ThisBlock = MBB.getNumber(); 330343171Sdim if (MBB.pred_empty()) { 331343171Sdim // There are no predecessors, so use the default starting status. 332343171Sdim BlockInfo[ThisBlock]->Pred = DefaultStatus; 333343171Sdim } else { 334343171Sdim // Build a status that is common to all the predecessors by intersecting 335343171Sdim // all the predecessor exit status values. 336343171Sdim MachineBasicBlock::pred_iterator P = MBB.pred_begin(), E = MBB.pred_end(); 337343171Sdim MachineBasicBlock &PB = *(*P); 338343171Sdim BlockInfo[ThisBlock]->Pred = BlockInfo[PB.getNumber()]->Exit; 339343171Sdim 340343171Sdim for (P = std::next(P); P != E; P = std::next(P)) { 341343171Sdim MachineBasicBlock *Pred = *P; 342343171Sdim BlockInfo[ThisBlock]->Pred = BlockInfo[ThisBlock]->Pred.intersect(BlockInfo[Pred->getNumber()]->Exit); 343343171Sdim } 344343171Sdim } 345343171Sdim Status TmpStatus = BlockInfo[ThisBlock]->Pred.merge(BlockInfo[ThisBlock]->Change); 346343171Sdim if (BlockInfo[ThisBlock]->Exit != TmpStatus) { 347343171Sdim BlockInfo[ThisBlock]->Exit = TmpStatus; 348343171Sdim // Add the successors to the work list so we can propagate the changed exit 349343171Sdim // status. 350343171Sdim for (MachineBasicBlock::succ_iterator S = MBB.succ_begin(), 351343171Sdim E = MBB.succ_end(); 352343171Sdim S != E; S = std::next(S)) { 353343171Sdim MachineBasicBlock &B = *(*S); 354343171Sdim Phase2List.push(&B); 355343171Sdim } 356343171Sdim } 357343171Sdim} 358343171Sdim 359343171Sdim// In Phase 3 we revisit each block and if it has an insertion point defined we 360343171Sdim// check whether the predecessor mode meets the block's entry requirements. If 361343171Sdim// not we insert an appropriate setreg instruction to modify the Mode register. 362343171Sdimvoid SIModeRegister::processBlockPhase3(MachineBasicBlock &MBB, 363343171Sdim const SIInstrInfo *TII) { 364343171Sdim// BlockData *BI = BlockInfo[MBB.getNumber()]; 365343171Sdim unsigned ThisBlock = MBB.getNumber(); 366343171Sdim if (!BlockInfo[ThisBlock]->Pred.isCompatible(BlockInfo[ThisBlock]->Require)) { 367343171Sdim Status Delta = BlockInfo[ThisBlock]->Pred.delta(BlockInfo[ThisBlock]->Require); 368343171Sdim if (BlockInfo[ThisBlock]->FirstInsertionPoint) 369343171Sdim insertSetreg(MBB, BlockInfo[ThisBlock]->FirstInsertionPoint, TII, Delta); 370343171Sdim else 371343171Sdim insertSetreg(MBB, &MBB.instr_front(), TII, Delta); 372343171Sdim } 373343171Sdim} 374343171Sdim 375343171Sdimbool SIModeRegister::runOnMachineFunction(MachineFunction &MF) { 376343171Sdim BlockInfo.resize(MF.getNumBlockIDs()); 377343171Sdim const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 378343171Sdim const SIInstrInfo *TII = ST.getInstrInfo(); 379343171Sdim 380343171Sdim // Processing is performed in a number of phases 381343171Sdim 382343171Sdim // Phase 1 - determine the initial mode required by each block, and add setreg 383343171Sdim // instructions for intra block requirements. 384343171Sdim for (MachineBasicBlock &BB : MF) 385343171Sdim processBlockPhase1(BB, TII); 386343171Sdim 387343171Sdim // Phase 2 - determine the exit mode from each block. We add all blocks to the 388343171Sdim // list here, but will also add any that need to be revisited during Phase 2 389343171Sdim // processing. 390343171Sdim for (MachineBasicBlock &BB : MF) 391343171Sdim Phase2List.push(&BB); 392343171Sdim while (!Phase2List.empty()) { 393343171Sdim processBlockPhase2(*Phase2List.front(), TII); 394343171Sdim Phase2List.pop(); 395343171Sdim } 396343171Sdim 397343171Sdim // Phase 3 - add an initial setreg to each block where the required entry mode 398343171Sdim // is not satisfied by the exit mode of all its predecessors. 399343171Sdim for (MachineBasicBlock &BB : MF) 400343171Sdim processBlockPhase3(BB, TII); 401343171Sdim 402343171Sdim BlockInfo.clear(); 403343171Sdim 404343171Sdim return NumSetregInserted > 0; 405343171Sdim} 406