lib/Target/R600/AMDILPeepholeOptimizer.cpp - platform/external/llvm - Git at Google

 //===-- AMDILPeepholeOptimizer.cpp - AMDGPU Peephole optimizations ---------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 /// \file
 //==-----------------------------------------------------------------------===//

 // Tag naming this pass's debug output; it is the type tested by DEBUGME below.
 #define DEBUG_TYPE "PeepholeOpt"
 // DEBUGME is the value the AMDGPUPeepholeOpt constructor copies into mDebug.
 // When DEBUG is defined at this point in the file it is true only if the
 // debug flag is set and DEBUG_TYPE is the current debug type; otherwise it is
 // the constant 0 and all mDebug-guarded output is disabled.
 #ifdef DEBUG
 #define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
 #else
 #define DEBUGME 0
 #endif

 #include "AMDILDevices.h"
 #include "AMDGPUInstrInfo.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/MathExtras.h"

 #include <sstream>

 #if 0
 STATISTIC(PointerAssignments, "Number of dynamic pointer "
     "assigments discovered");
 STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
 #endif

 using namespace llvm;
 // The Peephole optimization pass is used to do simple last minute optimizations
 // that are required for correct code or to remove redundant functions
 namespace {

 class OpaqueType;

 // Function pass that walks every instruction of a function and applies a set
 // of small, local rewrites (call expansion, bit insert/extract recognition,
 // alignment correction). Some rewrites are queued while walking and applied
 // once the walk has finished.
 class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass {
 public:
   // Target machine supplied at construction; the optimization level and the
   // AMDGPU subtarget are read from it.
   TargetMachine &TM;
   // Pass identification member handed to the FunctionPass constructor.
   static char ID;
   AMDGPUPeepholeOpt(TargetMachine &tm);
   ~AMDGPUPeepholeOpt();
   const char *getPassName() const;
   bool runOnFunction(Function &F);
   bool doInitialization(Module &M);
   bool doFinalization(Module &M);
   void getAnalysisUsage(AnalysisUsage &AU) const;
 protected:
 private:
   // Function to initiate all of the instruction level optimizations.
   // Returns true only when the iterator has already been moved past the
   // current instruction, so the caller must not advance it again.
   bool instLevelOptimizations(BasicBlock::iterator *inst);
   // Quick check to see if we need to dump all of the pointers into the
   // arena. If this is correct, then we set all pointers to exist in arena. This
   // is a workaround for aliasing of pointers in a struct/union.
   bool dumpAllIntoArena(Function &F);
   // Atomic conversions are pushed to a vector while the instructions are
   // being walked, so that nothing is invalidated inside safeNestedForEach,
   // and are handled afterwards. This function does the conversions if
   // required.
   void doAtomicConversionIfNeeded(Function &F);
   // Because __amdil_is_constant cannot be properly evaluated if
   // optimizations are disabled, the calls are placed in a vector
   // and evaluated after the __amdil_image* functions are evaluated
   // which should allow the __amdil_is_constant function to be
   // evaluated correctly.
   void doIsConstCallConversionIfNeeded();
   // Value returned by runOnFunction; reset to false at the start of each run.
   bool mChanged;
   // Copy of DEBUGME taken in the constructor; guards all dbgs()/dump() output.
   bool mDebug;
   // Set to true at the start of each run and cleared by optimizeCallInst when
   // it meets a callee whose name starts with "__atom" and contains "_g".
   bool mConvertAtomics;
   // Optimization level read from TM in the constructor.
   CodeGenOpt::Level optLevel;
   // Run a series of tests to see if we can optimize a CALL instruction.
   bool optimizeCallInst(BasicBlock::iterator *bbb);
   // A peephole optimization to optimize bit extract sequences.
   bool optimizeBitExtract(Instruction *inst);
   // A peephole optimization to optimize bit insert sequences.
   bool optimizeBitInsert(Instruction *inst);
   // Decomposes one operand of an 'or' into its source instruction and the
   // constant mask and/or shift applied to it; returns false when the operand
   // does not fit the shl/and patterns.
   bool setupBitInsert(Instruction *base,
                       Instruction *&src,
                       Constant *&mask,
                       Constant *&shift);
   // Expand the bit field insert instruction on versions of OpenCL that
   // don't support it.
   bool expandBFI(CallInst *CI);
   // Expand the bit field mask instruction on versions of OpenCL that
   // don't support it.
   bool expandBFM(CallInst *CI);
   // On 7XX and 8XX operations, we do not have 24 bit signed operations. So in
   // this case we need to expand them. These functions check for 24bit functions
   // and then expand.
   bool isSigned24BitOps(CallInst *CI);
   void expandSigned24BitOps(CallInst *CI);
   // One optimization that can occur is that if the required workgroup size is
   // specified then the result of get_local_size is known at compile time and
   // can be returned accordingly.
   bool isRWGLocalOpt(CallInst *CI);
   // On northern island cards, the division is slightly less accurate than on
   // previous generations, so we need to utilize a more accurate division. So we
   // can translate the accurate divide to a normal divide on all other cards.
   bool convertAccurateDivide(CallInst *CI);
   void expandAccurateDivide(CallInst *CI);
   // If the alignment is set incorrectly, it can produce really inefficient
   // code. This checks for this scenario and fixes it if possible.
   bool correctMisalignedMemOp(Instruction *inst);

   // If we are in no opt mode, then we need to make sure that
   // local samplers are properly propagated as constant propagation
   // doesn't occur and we need to know the value of kernel defined
   // samplers at compile time.
   bool propagateSamplerInst(CallInst *CI);

   // Helper functions

   // Group of functions that recursively calculate the size of a structure based
   // on its sub-types.
   size_t getTypeSize(Type * const T, bool dereferencePtr = false);
   size_t getTypeSize(StructType * const ST, bool dereferencePtr = false);
   size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false);
   size_t getTypeSize(FunctionType * const FT,bool dereferencePtr = false);
   size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false);
   size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false);
   size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false);
   size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false);

   // Per-run state, assigned at the start of runOnFunction: the LLVM context,
   // the function being processed and the AMDGPU subtarget.
   LLVMContext *mCTX;
   Function *mF;
   const AMDGPUSubtarget *mSTM;
   // (call, replacement callee) pairs queued by optimizeCallInst and applied
   // by doAtomicConversionIfNeeded.
   SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs;
   // __amdil_is_constant calls queued by optimizeCallInst when optimizations
   // are disabled; folded by doIsConstCallConversionIfNeeded.
   SmallVector<CallInst *, 16> isConstVec;
 }; // class AMDGPUPeepholeOpt
   // Storage for the pass identification member declared in the class.
   char AMDGPUPeepholeOpt::ID = 0;

 // Two-level traversal helper. For every element in [First, Last) it walks the
 // element's own begin()/end() range and hands F a pointer to the inner
 // iterator. F reports whether it already moved that iterator: a false result
 // makes this helper advance it, a true result leaves it where F put it. The
 // inner end iterator is read once per outer element. S is not read; it only
 // lets callers fix the inner iterator type. The functor is returned by value.
 template<class InputIterator, class SecondIterator, class Function>
 Function safeNestedForEach(InputIterator First, InputIterator Last,
                               SecondIterator S, Function F) {
   (void)S;
   while (First != Last) {
     SecondIterator Cur = First->begin();
     SecondIterator End = First->end();
     while (Cur != End) {
       const bool AlreadyAdvanced = F(&Cur);
       if (!AlreadyAdvanced) {
         ++Cur;
       }
     }
     ++First;
   }
   return F;
 }

 } // anonymous namespace

 namespace llvm {
   // Factory for the peephole pass: allocates a new AMDGPUPeepholeOpt bound to
   // the given target machine and returns it as a FunctionPass pointer.
   FunctionPass *
   createAMDGPUPeepholeOpt(TargetMachine &tm) {
     FunctionPass *Pass = new AMDGPUPeepholeOpt(tm);
     return Pass;
   }
 } // llvm namespace

 // Builds the pass for the given target machine. Every data member gets a
 // defined starting value here (listed in declaration order) so that none of
 // them is indeterminate before runOnFunction assigns the per-function state.
 // mDebug takes the value of DEBUGME and optLevel is read from the target
 // machine, exactly as before.
 AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm)
   : FunctionPass(ID), TM(tm), mChanged(false), mDebug(DEBUGME),
     mConvertAtomics(true), optLevel(tm.getOptLevel()),
     mCTX(NULL), mF(NULL), mSTM(NULL)  {
 }

 // The destructor performs no explicit cleanup.
 AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt() {}

 // Returns the fixed, human-readable name of this pass.
 const char *AMDGPUPeepholeOpt::getPassName() const {
   const char *PassName = "AMDGPU PeepHole Optimization Pass";
   return PassName;
 }

 // Reports whether Ty is a pointer type or contains one, looking recursively
 // through struct members and through array/vector element types. A null Ty
 // and every other kind of type yield false.
 bool
 containsPointerType(Type *Ty)  {
   if (!Ty) {
     return false;
   }
   const Type::TypeID Kind = Ty->getTypeID();
   if (Kind == Type::PointerTyID) {
     return true;
   }
   if (Kind == Type::VectorTyID || Kind == Type::ArrayTyID) {
     // Arrays and vectors contain a pointer exactly when their element does.
     return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
   }
   if (Kind != Type::StructTyID) {
     return false;
   }
   // A struct contains a pointer as soon as any one member does.
   const StructType *Struct = dyn_cast<StructType>(Ty);
   StructType::element_iterator Member = Struct->element_begin();
   const StructType::element_iterator MemberEnd = Struct->element_end();
   for (; Member != MemberEnd; ++Member) {
     if (containsPointerType(*Member)) {
       return true;
     }
   }
   return false;
 }

 // Returns true when at least one argument of F is a pointer to a struct that
 // itself contains a pointer (directly or nested).
 bool
 AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F)  {
   Function::const_arg_iterator ArgIt = F.arg_begin();
   const Function::const_arg_iterator ArgEnd = F.arg_end();
   for (; ArgIt != ArgEnd; ++ArgIt) {
     const PointerType *PtrTy = dyn_cast<PointerType>(ArgIt->getType());
     if (!PtrTy) {
       continue;
     }
     Type *Pointee = PtrTy->getElementType();
     // FIXME: Because a pointer inside of a struct/union may be aliased to
     // another pointer we need to take the conservative approach and place all
     // pointers into the arena until more advanced detection is implemented.
     if (dyn_cast<StructType>(Pointee) && containsPointerType(Pointee)) {
       return true;
     }
   }
   return false;
 }
 void
 AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded() {
   if (isConstVec.empty()) {
     return;
   }
   for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
     CallInst *CI = isConstVec[x];
     Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
     Type *aType = Type::getInt32Ty(*mCTX);
     Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
       : ConstantInt::get(aType, 0);
     CI->replaceAllUsesWith(Val);
     CI->eraseFromParent();
   }
   isConstVec.clear();
 }
 // Applies the atomic callee replacements queued by optimizeCallInst: for each
 // queued (call, function) pair the call's last operand (the callee) is
 // redirected to the queued function. Marks the function as changed when at
 // least one replacement was applied.
 //
 // The queue is emptied once it has been applied, in the same way that
 // doIsConstCallConversionIfNeeded empties isConstVec. Without this, entries
 // recorded for one function would be re-applied on every later run, touching
 // call instructions that may no longer exist.
 void
 AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F)  {
   // Don't do anything if we don't have any atomic operations.
   if (atomicFuncs.empty()) {
     return;
   }
   // Change the function name for the atomic if it is required
   for (size_t x = 0, y = atomicFuncs.size(); x < y; ++x) {
     CallInst *CI = atomicFuncs[x].first;
     CI->setOperand(CI->getNumOperands()-1, atomicFuncs[x].second);
   }
   atomicFuncs.clear();
   mChanged = true;
 }

 // Pass entry point for one function. Resets the per-run state, runs
 // instLevelOptimizations over every instruction of every basic block, then
 // applies the work that was queued during the walk (atomic callee
 // replacements and deferred __amdil_is_constant folding). When debug output
 // is enabled the function is dumped before and after. Returns mChanged.
 bool
 AMDGPUPeepholeOpt::runOnFunction(Function &MF)  {
   // Per-run state: these assignments are independent of one another.
   mF = &MF;
   mCTX = &MF.getType()->getContext();
   mSTM = &TM.getSubtarget<AMDGPUSubtarget>();
   mConvertAtomics = true;
   mChanged = false;
   if (mDebug) {
     MF.dump();
   }

   // The third argument only fixes the inner iterator type for the helper.
   safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
      std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations),
                   this));

   // Deferred rewrites, applied only after the instruction walk is over.
   doAtomicConversionIfNeeded(MF);
   doIsConstCallConversionIfNeeded();

   if (mDebug) {
     MF.dump();
   }
   return mChanged;
 }

 // Runs the call-specific rewrites on the instruction *bbb points at.
 //
 // Returns true only on the paths that erase the call; on those paths *bbb has
 // already been advanced past the erased instruction, so the caller must not
 // advance it again. Returns false for non-calls and for every path that
 // leaves the call in place (including the ones that only queue work for
 // doAtomicConversionIfNeeded / doIsConstCallConversionIfNeeded).
 bool
 AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb)  {
   Instruction *inst = (*bbb);
   CallInst *CI = dyn_cast<CallInst>(inst);
   if (!CI) {
     return false;
   }
   if (isSigned24BitOps(CI)) {
     expandSigned24BitOps(CI);
     ++(*bbb);
     CI->eraseFromParent();
     return true;
   }
   if (propagateSamplerInst(CI)) {
     return false;
   }
   if (expandBFI(CI) || expandBFM(CI)) {
     ++(*bbb);
     CI->eraseFromParent();
     return true;
   }
   if (convertAccurateDivide(CI)) {
     expandAccurateDivide(CI);
     ++(*bbb);
     CI->eraseFromParent();
     return true;
   }

   // The callee is the last operand of the call.
   StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
   if (calleeName.startswith("__amdil_is_constant")) {
     // If we do not have optimizations, then this
     // cannot be properly evaluated, so we add the
     // call instruction to a vector and process
     // them at the end of processing after the
     // samplers have been correctly handled.
     if (optLevel == CodeGenOpt::None) {
       isConstVec.push_back(CI);
       return false;
     } else {
       // Fold to 1 when the argument is a Constant, to 0 otherwise.
       Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
       Type *aType = Type::getInt32Ty(*mCTX);
       Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
         : ConstantInt::get(aType, 0);
       CI->replaceAllUsesWith(Val);
       ++(*bbb);
       CI->eraseFromParent();
       return true;
     }
   }

   if (calleeName.equals("__amdil_is_asic_id_i32")) {
     // Fold to (device flag & argument) for a constant integer argument and
     // to 0 for anything else.
     ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
     Type *aType = Type::getInt32Ty(*mCTX);
     Value *Val = CV;
     if (Val) {
       Val = ConstantInt::get(aType,
           mSTM->device()->getDeviceFlag() & CV->getZExtValue());
     } else {
       Val = ConstantInt::get(aType, 0);
     }
     CI->replaceAllUsesWith(Val);
     ++(*bbb);
     CI->eraseFromParent();
     return true;
   }
   Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
   if (!F) {
     return false;
   }
   // An "__atom*" call (other than "_xchg") whose result is unused is queued
   // for redirection to the matching "_noret" function.
   if (F->getName().startswith("__atom") && !CI->getNumUses()
       && F->getName().find("_xchg") == StringRef::npos) {
     std::string buffer(F->getName().str() + "_noret");
     Function *noRetF = dyn_cast<Function>(
           F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
     // getOrInsertFunction can hand back something other than a Function (a
     // cast of an existing symbol with a different prototype). In that case
     // the dyn_cast is NULL: keep the original callee rather than queueing a
     // NULL replacement or dereferencing NULL below.
     if (noRetF) {
       F = noRetF;
       atomicFuncs.push_back(std::make_pair(CI, F));
     }
   }

   if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment)
       && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) {
     return false;
   }
   if (!mConvertAtomics) {
     return false;
   }
   StringRef name = F->getName();
   if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
     mConvertAtomics = false;
   }
   return false;
 }

 // Decomposes one operand of an 'or' for optimizeBitInsert.
 //
 // base must be a 'shl' or an 'and'. Its second operand is stored into shift
 // (for shl) or mask (for and) when it is a Constant, and its first operand
 // must be an instruction, which is stored into src. For an 'and' base that
 // is all. For a 'shl' base the source is inspected one level deeper: a 'shl'
 // source supplies the shift if none was found yet (and src moves to that
 // shl's first operand), otherwise an 'and' source supplies the mask.
 //
 // Returns false when base is null, has another opcode, has a non-instruction
 // first operand, or when neither a mask nor a shift was found for a 'shl'
 // base. The out-parameters are only assigned, never reset, so callers are
 // expected to pass them in as NULL.
 bool
 AMDGPUPeepholeOpt::setupBitInsert(Instruction *base,
     Instruction *&src,
     Constant *&mask,
     Constant *&shift) {
   if (!base) {
     if (mDebug) {
       dbgs() << "Null pointer passed into function.\n";
     }
     return false;
   }
   const unsigned baseOpcode = base->getOpcode();
   const bool baseIsAnd = (baseOpcode == Instruction::And);
   if (!baseIsAnd && baseOpcode != Instruction::Shl) {
     if (mDebug) {
       dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
     }
     // If the base is neither a Shl or a And, we don't fit any of the patterns above.
     return false;
   }
   if (baseIsAnd) {
     mask = dyn_cast<Constant>(base->getOperand(1));
   } else {
     shift = dyn_cast<Constant>(base->getOperand(1));
   }
   src = dyn_cast<Instruction>(base->getOperand(0));
   if (!src) {
     if (mDebug) {
       dbgs() << "Failed setup since the base operand is not an instruction!\n";
     }
     return false;
   }
   // If we find an 'and' operation, then we don't need to
   // find the next operation as we already know the
   // bits that are valid at this point.
   if (baseIsAnd) {
     return true;
   }
   const unsigned srcOpcode = src->getOpcode();
   if (srcOpcode == Instruction::Shl && !shift) {
     shift = dyn_cast<Constant>(src->getOperand(1));
     // NOTE(review): src becomes NULL here when the inner shl's first operand
     // is not an instruction, and the function can still return true below.
     src = dyn_cast<Instruction>(src->getOperand(0));
   } else if (srcOpcode == Instruction::And && !mask) {
     mask = dyn_cast<Constant>(src->getOperand(1));
   }
   if (mask || shift) {
     return true;
   }
   if (mDebug) {
     dbgs() << "Failed setup since both mask and shift are NULL!\n";
   }
   // Did not find a constant mask or a shift.
   return false;
 }
 bool
 AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst)  {
   if (!inst) {
     return false;
   }
   if (!inst->isBinaryOp()) {
     return false;
   }
   if (inst->getOpcode() != Instruction::Or) {
     return false;
   }
   if (optLevel == CodeGenOpt::None) {
     return false;
   }
   // We want to do an optimization on a sequence of ops that in the end equals a
   // single ISA instruction.
   // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
   // Some simplified versions of this pattern are as follows:
   // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
   // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
   // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
   // (A & B) | (D << F) when (1 << F) >= B
   // (A << C) | (D & E) when (1 << C) >= E
   if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
     // The HD4XXX hardware doesn't support the ubit_insert instruction.
     return false;
   }
   Type *aType = inst->getType();
   bool isVector = aType->isVectorTy();
   int numEle = 1;
   // This optimization only works on 32bit integers.
   if (aType->getScalarType()
       != Type::getInt32Ty(inst->getContext())) {
     return false;
   }
   if (isVector) {
     const VectorType *VT = dyn_cast<VectorType>(aType);
     numEle = VT->getNumElements();
     // We currently cannot support more than 4 elements in a intrinsic and we
     // cannot support Vec3 types.
     if (numEle > 4 || numEle == 3) {
       return false;
     }
   }
   // TODO: Handle vectors.
   if (isVector) {
     if (mDebug) {
       dbgs() << "!!! Vectors are not supported yet!\n";
     }
     return false;
   }
   Instruction *LHSSrc = NULL, *RHSSrc = NULL;
   Constant *LHSMask = NULL, *RHSMask = NULL;
   Constant *LHSShift = NULL, *RHSShift = NULL;
   Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
   Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
   if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
     if (mDebug) {
       dbgs() << "Found an OR Operation that failed setup!\n";
       inst->dump();
       if (LHS) { LHS->dump(); }
       if (LHSSrc) { LHSSrc->dump(); }
       if (LHSMask) { LHSMask->dump(); }
       if (LHSShift) { LHSShift->dump(); }
     }
     // There was an issue with the setup for BitInsert.
     return false;
   }
   if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
     if (mDebug) {
       dbgs() << "Found an OR Operation that failed setup!\n";
       inst->dump();
       if (RHS) { RHS->dump(); }
       if (RHSSrc) { RHSSrc->dump(); }
       if (RHSMask) { RHSMask->dump(); }
       if (RHSShift) { RHSShift->dump(); }
     }
     // There was an issue with the setup for BitInsert.
     return false;
   }
   if (mDebug) {
     dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n";
     dbgs() << "Op:        "; inst->dump();
     dbgs() << "LHS:       "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
     dbgs() << "LHS Src:   "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
     dbgs() << "LHS Mask:  "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
     dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
     dbgs() << "RHS:       "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
     dbgs() << "RHS Src:   "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
     dbgs() << "RHS Mask:  "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
     dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
   }
   Constant *offset = NULL;
   Constant *width = NULL;
   uint32_t lhsMaskVal = 0, rhsMaskVal = 0;
   uint32_t lhsShiftVal = 0, rhsShiftVal = 0;
   uint32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
   uint32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
   lhsMaskVal = (LHSMask
       ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
   rhsMaskVal = (RHSMask
       ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
   lhsShiftVal = (LHSShift
       ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
   rhsShiftVal = (RHSShift
       ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
   lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
   rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
   lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
   rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
   // TODO: Handle the case of A & B | D & ~B(i.e. inverted masks).
   if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
     return false;
   }
   if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
     offset = ConstantInt::get(aType, lhsMaskOffset, false);
     width = ConstantInt::get(aType, lhsMaskWidth, false);
     RHSSrc = RHS;
     if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
       return false;
     }
     if (!LHSShift) {
       LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
           "MaskShr", LHS);
     } else if (lhsShiftVal != lhsMaskOffset) {
       LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
           "MaskShr", LHS);
     }
     if (mDebug) {
       dbgs() << "Optimizing LHS!\n";
     }
   } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
     offset = ConstantInt::get(aType, rhsMaskOffset, false);
     width = ConstantInt::get(aType, rhsMaskWidth, false);
     LHSSrc = RHSSrc;
     RHSSrc = LHS;
     if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
       return false;
     }
     if (!RHSShift) {
       LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
           "MaskShr", RHS);
     } else if (rhsShiftVal != rhsMaskOffset) {
       LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
           "MaskShr", RHS);
     }
     if (mDebug) {
       dbgs() << "Optimizing RHS!\n";
     }
   } else {
     if (mDebug) {
       dbgs() << "Failed constraint 3!\n";
     }
     return false;
   }
   if (mDebug) {
     dbgs() << "Width:  "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
     dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
     dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
     dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
   }
   if (!offset || !width) {
     if (mDebug) {
       dbgs() << "Either width or offset are NULL, failed detection!\n";
     }
     return false;
   }
   // Lets create the function signature.
   std::vector<Type *> callTypes;
   callTypes.push_back(aType);
   callTypes.push_back(aType);
   callTypes.push_back(aType);
   callTypes.push_back(aType);
   FunctionType *funcType = FunctionType::get(aType, callTypes, false);
   std::string name = "__amdil_ubit_insert";
   if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
   Function *Func =
     dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
         getOrInsertFunction(StringRef(name), funcType));
   Value *Operands[4] = {
     width,
     offset,
     LHSSrc,
     RHSSrc
   };
   CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
   if (mDebug) {
     dbgs() << "Old Inst: ";
     inst->dump();
     dbgs() << "New Inst: ";
     CI->dump();
     dbgs() << "\n\n";
   }
   CI->insertBefore(inst);
   inst->replaceAllUsesWith(CI);
   return true;
 }

 // Tries to replace a scalar i32 "(A >> B) & C" by a bit-extract call.
 //
 // inst must be an 'and' whose first operand is a right shift (logical or
 // arithmetic) by a constant integer B < 32 and whose second operand is a
 // constant integer mask C with only its low bits set. With W the number of
 // set bits in C (W < 32 and W <= 32 - B), the call
 //   llvm.AMDGPU.bit.extract.u32.(A, B, W)
 // is inserted before inst and all uses of inst are redirected to it; inst
 // itself is left in place. Returns true only when that rewrite happened.
 //
 // Nothing is done when optimizations are disabled, on HD4XXX devices, for
 // non-i32 types or for vectors (the vector handling below is currently
 // unreachable because of the early vector bail-out). Constants that are not
 // ConstantInt (or ConstantVector on the vector path) make the function
 // return false instead of dereferencing a failed cast.
 bool
 AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst)  {
   if (!inst) {
     return false;
   }
   if (!inst->isBinaryOp()) {
     return false;
   }
   if (inst->getOpcode() != Instruction::And) {
     return false;
   }
   if (optLevel == CodeGenOpt::None) {
     return false;
   }
   if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
     // This does not work on HD4XXX hardware.
     return false;
   }
   Type *aType = inst->getType();
   bool isVector = aType->isVectorTy();

   // XXX Support vector types
   if (isVector) {
     return false;
   }
   int numEle = 1;
   // This only works on 32bit integers
   if (aType->getScalarType()
       != Type::getInt32Ty(inst->getContext())) {
     return false;
   }
   if (isVector) {
     const VectorType *VT = dyn_cast<VectorType>(aType);
     numEle = VT->getNumElements();
     // We currently cannot support more than 4 elements in a intrinsic and we
     // cannot support Vec3 types.
     if (numEle > 4 || numEle == 3) {
       return false;
     }
   }
   BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
   // If the first operand is not a shift instruction, then we can return as it
   // doesn't match this pattern.
   if (!ShiftInst || !ShiftInst->isShift()) {
     return false;
   }
   // A shift left does not match this pattern.
   if (ShiftInst->getOpcode() == Instruction::Shl) {
     return false;
   }
   // Only used to choose the signedness flag when building the constants; the
   // width check below (W <= 32 - B) keeps sign-fill bits out of the result.
   bool isSigned = ShiftInst->isArithmeticShift();
   Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
   Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
   // Lets make sure that the shift value and the and mask are constants.
   if (!AndMask || !ShrVal) {
     return false;
   }
   Constant *newMaskConst;
   Constant *shiftValConst;
   if (isVector) {
     // Handle the vector case
     std::vector<Constant *> maskVals;
     std::vector<Constant *> shiftVals;
     ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
     ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
     // Constants of vector type are not necessarily ConstantVector.
     if (!AndMaskVec || !ShrValVec) {
       return false;
     }
     Type *scalarType = AndMaskVec->getType()->getScalarType();
     assert(AndMaskVec->getNumOperands() ==
            ShrValVec->getNumOperands() && "cannot have a "
            "combination where the number of elements to a "
            "shift and an and are different!");
     for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
       ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
       ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
       if (!AndCI || !ShiftIC) {
         return false;
       }
       uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
       if (!isMask_32(maskVal)) {
         return false;
       }
       maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
       uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
       // If the mask or shiftval is greater than the bitcount, then break out.
       if (maskVal >= 32 || shiftVal >= 32) {
         return false;
       }
       // If the mask val is greater than the the number of original bits left
       // then this optimization is invalid.
       if (maskVal > (32 - shiftVal)) {
         return false;
       }
       maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
       shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
     }
     newMaskConst = ConstantVector::get(maskVals);
     shiftValConst = ConstantVector::get(shiftVals);
   } else {
     // Handle the scalar case. Both operands are known to be Constants, but
     // they must be integer constants for their values to be readable.
     ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMask);
     ConstantInt *ShiftCI = dyn_cast<ConstantInt>(ShrVal);
     if (!AndCI || !ShiftCI) {
       return false;
     }
     uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
     // This must be a mask value where all lower bits are set to 1 and then any
     // bit higher is set to 0.
     if (!isMask_32(maskVal)) {
       return false;
     }
     // Count the number of bits set in the mask, this is the width of the
     // resulting bit set that is extracted from the source value.
     maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
     uint32_t shiftVal = (uint32_t)ShiftCI->getZExtValue();
     // If the mask or shift val is greater than the bitcount, then break out.
     if (maskVal >= 32 || shiftVal >= 32) {
       return false;
     }
     // If the mask val is greater than the the number of original bits left then
     // this optimization is invalid.
     if (maskVal > (32 - shiftVal)) {
       return false;
     }
     newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
     shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
   }
   // Lets create the function signature.
   std::vector<Type *> callTypes;
   callTypes.push_back(aType);
   callTypes.push_back(aType);
   callTypes.push_back(aType);
   FunctionType *funcType = FunctionType::get(aType, callTypes, false);
   std::string name = "llvm.AMDGPU.bit.extract.u32";
   if (isVector) {
     name += ".v" + itostr(numEle) + "i32";
   } else {
     name += ".";
   }
   // Lets create the function.
   Function *Func =
     dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
                        getOrInsertFunction(StringRef(name), funcType));
   // Operand order: value being extracted from, bit offset, field width.
   Value *Operands[3] = {
     ShiftInst->getOperand(0),
     shiftValConst,
     newMaskConst
   };
   // Lets create the Call with the operands
   CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
   CI->setDoesNotAccessMemory();
   CI->insertBefore(inst);
   inst->replaceAllUsesWith(CI);
   return true;
 }

 // Expands a call whose callee name starts with "__amdil_bfi" into plain
 // bitwise instructions inserted before the call:
 //   __amdil_bfi(A, B, C) => (A & B) | (~A & C)
 // All uses of the call are redirected to the final 'or'. The call itself is
 // not erased here. Returns true when the expansion was done, false when CI
 // is null or is not a bfi call.
 //
 // The all-ones constant used for ~A is built from the operand's own type, so
 // scalar and vector operands are handled the same way and the constant's
 // width always matches the operand.
 bool
 AMDGPUPeepholeOpt::expandBFI(CallInst *CI) {
   if (!CI) {
     return false;
   }
   // The callee is the last operand of the call.
   Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
   if (!LHS->getName().startswith("__amdil_bfi")) {
     return false;
   }
   Type* type = CI->getOperand(0)->getType();
   Constant *negOneConst = Constant::getAllOnesValue(type);
   // __amdil_bfi => (A & B) | (~A & C)
   BinaryOperator *lhs =
     BinaryOperator::Create(Instruction::And, CI->getOperand(0),
         CI->getOperand(1), "bfi_and", CI);
   BinaryOperator *rhs =
     BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
         "bfi_not", CI);
   rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
       "bfi_and", CI);
   lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
   CI->replaceAllUsesWith(lhs);
   return true;
 }

 // Expands a call whose callee name starts with "__amdil_bfm" into plain
 // integer instructions inserted before the call:
 //   __amdil_bfm(src0, src1) => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1F)
 // All uses of the call are redirected to the final 'shl'. The call itself is
 // not erased here. Returns true when the expansion was done, false when CI
 // is null or is not a bfm call.
 //
 // The 0x1F and 1 constants are built from the operand's own type;
 // ConstantInt::get produces a splat when that type is a vector, so no
 // element-by-element construction is needed.
 bool
 AMDGPUPeepholeOpt::expandBFM(CallInst *CI) {
   if (!CI) {
     return false;
   }
   // The callee is the last operand of the call.
   Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
   if (!LHS->getName().startswith("__amdil_bfm")) {
     return false;
   }
   // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
   Type* type = CI->getOperand(0)->getType();
   Constant *newMaskConst = ConstantInt::get(type, 0x1F);
   Constant *newShiftConst = ConstantInt::get(type, 1);
   BinaryOperator *lhs =
     BinaryOperator::Create(Instruction::And, CI->getOperand(0),
         newMaskConst, "bfm_mask", CI);
   lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
       lhs, "bfm_shl", CI);
   lhs = BinaryOperator::Create(Instruction::Sub, lhs,
       newShiftConst, "bfm_sub", CI);
   BinaryOperator *rhs =
     BinaryOperator::Create(Instruction::And, CI->getOperand(1),
         newMaskConst, "bfm_mask", CI);
   lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
   CI->replaceAllUsesWith(lhs);
   return true;
 }

 // Applies the per-instruction rewrites to the instruction *bbb points at.
 //
 // The return value tells the caller whether the iterator was already moved:
 // true only when optimizeCallInst handled (and stepped past) the
 // instruction. Otherwise the remaining rewrites are tried in order, stopping
 // at the first one that reports success, and false is returned so that the
 // caller advances the iterator.
 bool
 AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb)  {
   // Capture the instruction first: optimizeCallInst may move the iterator.
   Instruction *current = (*bbb);
   if (optimizeCallInst(bbb)) {
     return true;
   }
   if (!optimizeBitExtract(current) && !optimizeBitInsert(current)) {
     correctMisalignedMemOp(current);
   }
   return false;
 }
 // Resets the alignment of an under-aligned struct load or store.
 //
 // For a load the accessed type is the instruction's own type; for a store it
 // is the type of the stored value. When that type is a struct whose size (as
 // computed by getTypeSize) exceeds the instruction's alignment, and the
 // alignment is below 4, the alignment is set to 0 and true is returned.
 // Every other case, including instructions that are neither loads nor
 // stores, returns false and changes nothing.
 bool
 AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst) {
   LoadInst *load = dyn_cast<LoadInst>(inst);
   StoreInst *store = dyn_cast<StoreInst>(inst);
   if (!load && !store) {
     return false;
   }
   unsigned currentAlign;
   Type *accessTy;
   if (load) {
     currentAlign = load->getAlignment();
     accessTy = inst->getType();
   } else {
     currentAlign = store->getAlignment();
     accessTy = store->getValueOperand()->getType();
   }
   unsigned size = getTypeSize(accessTy);
   if (size <= currentAlign) {
     return false;
   }
   if (!accessTy->isStructTy()) {
     return false;
   }
   if (currentAlign >= 4) {
     return false;
   }
   if (load) {
     load->setAlignment(0);
   } else {
     store->setAlignment(0);
   }
   return true;
 }
 // Tells whether CI is a signed 24-bit intrinsic call that has to be expanded.
 //
 // The callee is read from the last operand of the call. A call qualifies when
 // the callee name starts with "__amdil_imad24" or "__amdil_imul24" (the
 // latter prefix also covers "__amdil_imul24_high") and the device does not
 // report hardware support for AMDGPUDeviceInfo::Signed24BitOps.
 //
 // Returns false for a null CI.
 bool
 AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI)  {
   if (!CI) {
     return false;
   }
   Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
   // Only the first 14 characters are compared, so comparing against any
   // longer literal could never succeed; the two 14-character names below are
   // the complete set of accepted prefixes.
   const StringRef namePrefix = LHS->getName().substr(0, 14);
   if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24") {
     return false;
   }
   if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) {
     return false;
   }
   return true;
 }

 // Replaces the uses of a signed 24-bit intrinsic call with a 32-bit
 // equivalent inserted immediately before it.
 //
 // On 7XX and 8XX we do not have signed 24bit, so we need to
 // expand it to the following:
 //   imul24      turns into a 32bit multiply instruction
 //   imad24      turns into a call to __amdil_imad_{i32|v<N>i32}
 //   imul24_high turns into a call to __amdil_imul_high_{i32|v<N>i32}
 //
 // CI must satisfy isSigned24BitOps(). This function only redirects the uses
 // of CI to the replacement; it does not erase CI itself.
 void
 AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI)  {
   assert(isSigned24BitOps(CI) && "Must be a "
       "signed 24 bit operation to call this function!");
   // The callee is the last operand of the call.
   const StringRef calleeName =
     CI->getOperand(CI->getNumOperands()-1)->getName();
   if (calleeName.substr(0, 14) == "__amdil_imad24") {
     Type *aType = CI->getOperand(0)->getType();
     bool isVector = aType->isVectorTy();
     int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
     std::vector<Type*> callTypes;
     callTypes.push_back(CI->getOperand(0)->getType());
     callTypes.push_back(CI->getOperand(1)->getType());
     callTypes.push_back(CI->getOperand(2)->getType());
     FunctionType *funcType =
       FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
     // The replacement name carries the operand shape as a suffix.
     std::string name = "__amdil_imad";
     if (isVector) {
       name += "_v" + itostr(numEle) + "i32";
     } else {
       name += "_i32";
     }
     Function *Func = dyn_cast<Function>(
                        CI->getParent()->getParent()->getParent()->
                        getOrInsertFunction(StringRef(name), funcType));
     Value *Operands[3] = {
       CI->getOperand(0),
       CI->getOperand(1),
       CI->getOperand(2)
     };
     CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
     nCI->insertBefore(CI);
     CI->replaceAllUsesWith(nCI);
   } else if (calleeName.substr(0, 19) == "__amdil_imul24_high") {
     // This test must come before the plain imul24 test: "__amdil_imul24" is
     // a prefix of "__amdil_imul24_high", so checking the shorter name first
     // would lower the high-half multiply to an ordinary multiply.
     Type *aType = CI->getOperand(0)->getType();

     bool isVector = aType->isVectorTy();
     int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
     std::vector<Type*> callTypes;
     callTypes.push_back(CI->getOperand(0)->getType());
     callTypes.push_back(CI->getOperand(1)->getType());
     FunctionType *funcType =
       FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
     std::string name = "__amdil_imul_high";
     if (isVector) {
       name += "_v" + itostr(numEle) + "i32";
     } else {
       name += "_i32";
     }
     Function *Func = dyn_cast<Function>(
                        CI->getParent()->getParent()->getParent()->
                        getOrInsertFunction(StringRef(name), funcType));
     Value *Operands[2] = {
       CI->getOperand(0),
       CI->getOperand(1)
     };
     CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
     nCI->insertBefore(CI);
     CI->replaceAllUsesWith(nCI);
   } else if (calleeName.substr(0, 14) == "__amdil_imul24") {
     BinaryOperator *mulOp =
       BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
           CI->getOperand(1), "imul24", CI);
     CI->replaceAllUsesWith(mulOp);
   }
 }

 // Tells whether CI is a call whose callee (the last operand) is named
 // exactly "__amdil_get_local_size_int". A null CI yields false.
 bool
 AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI)  {
   if (CI == NULL) {
     return false;
   }
   Value *callee = CI->getOperand(CI->getNumOperands() - 1);
   return callee->getName() == "__amdil_get_local_size_int";
 }

 // Tells whether CI is an "__amdil_improved_div" call that should be expanded.
 //
 // Returns false for a null CI, and also whenever the device generation is
 // HD6XXX and the device name is "cayman". Otherwise the result is whether the
 // callee name (taken from the last operand) starts with
 // "__amdil_improved_div".
 bool
 AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI)  {
   if (!CI) {
     return false;
   }
   const bool isCayman =
     mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX
     && (mSTM->getDeviceName() == "cayman");
   if (isCayman) {
     return false;
   }
   Value *callee = CI->getOperand(CI->getNumOperands() - 1);
   return callee->getName().substr(0, 20) == "__amdil_improved_div";
 }

 // Replaces the uses of an accurate-divide call with an FDiv of its first two
 // operands, inserted before the call. CI must satisfy
 // convertAccurateDivide(); the call itself is not erased here.
 void
 AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI)  {
   assert(convertAccurateDivide(CI)
          && "expanding accurate divide can only happen if it is expandable!");
   Value *numerator = CI->getOperand(0);
   Value *denominator = CI->getOperand(1);
   BinaryOperator *quotient =
     BinaryOperator::Create(Instruction::FDiv, numerator, denominator,
                            "fdiv32", CI);
   CI->replaceAllUsesWith(quotient);
 }

 // Propagates a constant sampler value into an image-read intrinsic call.
 //
 // The rewrite is attempted only when optLevel is CodeGenOpt::None and CI is
 // a call to one of the __amdil_image{2d,3d}_read_{norm,unnorm} functions.
 // The sampler is operand 1 of the call. It must be a load, through a
 // PRIVATE_ADDRESS pointer, of a global variable that has a 32-bit integer
 // initializer. When all of that holds, every use of the load is replaced
 // with the initializer and true is returned; otherwise nothing changes and
 // false is returned.
 bool
 AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI) {
   if (optLevel != CodeGenOpt::None || !CI) {
     return false;
   }

   // The callee is the last operand of the call.
   StringRef calleeName = CI->getOperand(CI->getNumOperands() - 1)->getName();
   const bool isImageRead = calleeName == "__amdil_image2d_read_norm"
                         || calleeName == "__amdil_image2d_read_unnorm"
                         || calleeName == "__amdil_image3d_read_norm"
                         || calleeName == "__amdil_image3d_read_unnorm";
   if (!isImageRead) {
     return false;
   }

   LoadInst *samplerLoad = dyn_cast<LoadInst>(CI->getOperand(1));
   if (!samplerLoad
       || samplerLoad->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
     return false;
   }

   // Anything other than a global variable with a 32bit integer initializer
   // cannot be folded.
   GlobalVariable *gv =
     dyn_cast<GlobalVariable>(samplerLoad->getPointerOperand());
   if (!gv
       || !gv->hasInitializer()
       || !gv->getInitializer()->getType()->isIntegerTy(32)) {
     return false;
   }

   // Substitute the initializer for every use of the load.
   samplerLoad->replaceAllUsesWith(gv->getInitializer());
   return true;
 }

 // Module-level initialization hook. This pass does no work here: the module
 // is not touched and the result is always false.
 bool
 AMDGPUPeepholeOpt::doInitialization(Module &M)  {
   (void)M;
   const bool changed = false;
   return changed;
 }

 // Module-level finalization hook. This pass does no work here: the module
 // is not touched and the result is always false.
 bool
 AMDGPUPeepholeOpt::doFinalization(Module &M)  {
   (void)M;
   const bool changed = false;
   return changed;
 }

 // Declares the analyses this pass depends on and preserves.
 void
 AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const  {
   // MachineFunctionAnalysis is registered as a required analysis.
   AU.addRequired<MachineFunctionAnalysis>();
   // Let the FunctionPass base class add its own requirements.
   FunctionPass::getAnalysisUsage(AU);
   // Mark every analysis as preserved by this pass.
   AU.setPreservesAll();
 }

 // Computes the size in bytes of an arbitrary type by dispatching on its
 // type ID to the matching getTypeSize() overload.
 //
 // A null type has size 0. Pointer, integer, struct, array, function and
 // vector types are forwarded to their dedicated overloads together with
 // dereferencePtr. Every other type uses its primitive bit size divided by
 // eight; for x86_fp80, fp128, ppc_fp128 and label types an assertion fires
 // first in builds with assertions enabled.
 size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) {
   if (!T) {
     return 0;
   }
   switch (T->getTypeID()) {
   case Type::PointerTyID:
     return getTypeSize(dyn_cast<PointerType>(T), dereferencePtr);
   case Type::IntegerTyID:
     return getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
   case Type::StructTyID:
     return getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
   case Type::ArrayTyID:
     return getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
   case Type::FunctionTyID:
     return getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
   case Type::VectorTyID:
     return getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
   case Type::X86_FP80TyID:
   case Type::FP128TyID:
   case Type::PPC_FP128TyID:
   case Type::LabelTyID:
     assert(0 && "These types are not supported by this backend");
     // Without assertions these types share the primitive-size path below.
     break;
   default:
     // Float, double and all remaining types.
     break;
   }
   return T->getPrimitiveSizeInBits() >> 3;
 }

 // Size of a struct type: the plain sum of getTypeSize() over its elements,
 // with no padding added between them. A null struct type has size 0.
 size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST,
     bool dereferencePtr) {
   if (!ST) {
     return 0;
   }
   size_t total = 0;
   for (StructType::element_iterator it = ST->element_begin(),
        end = ST->element_end(); it != end; ++it) {
     Type *elementTy = *it;
     total += getTypeSize(elementTy, dereferencePtr);
   }
   return total;
 }

 // Size of an integer type: its bit width divided by eight, rounded down.
 // A null integer type has size 0. dereferencePtr is not used.
 size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT,
     bool dereferencePtr) {
   if (!IT) {
     return 0;
   }
   return IT->getBitWidth() >> 3;
 }

 // Function types have no size. Reaching this overload trips an assertion in
 // builds with assertions enabled; otherwise the result is 0. Neither
 // parameter is used.
 size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT,
     bool dereferencePtr) {
   (void)FT;
   (void)dereferencePtr;
   assert(0 && "Should not be able to calculate the size of an function type");
   return 0;
 }

 // Size of an array type: the element size from getTypeSize() multiplied by
 // the number of elements. A null array type has size 0.
 size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT,
     bool dereferencePtr) {
   if (!AT) {
     return 0;
   }
   const size_t elementBytes = getTypeSize(AT->getElementType(),
                                           dereferencePtr);
   return static_cast<size_t>(elementBytes * AT->getNumElements());
 }

 // Size of a vector type: its total bit width divided by eight, rounded
 // down. A null vector type has size 0. dereferencePtr is not used.
 size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT,
     bool dereferencePtr) {
   if (!VT) {
     return 0;
   }
   return VT->getBitWidth() >> 3;
 }

 // Size of a pointer type.
 //
 //   * null pointer type                        -> 0
 //   * pointer to a struct in PRIVATE_ADDRESS   -> size of the pointed-to
 //     struct, computed with the default dereferencePtr argument
 //   * dereferencePtr set                       -> sum of getTypeSize() over
 //     the pointer's contained types
 //   * anything else                            -> 4
 size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT,
     bool dereferencePtr) {
   if (!PT) {
     return 0;
   }
   Type *pointee = PT->getElementType();
   const bool isPrivateStruct =
     pointee->getTypeID() == Type::StructTyID &&
     PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
   if (isPrivateStruct) {
     return getTypeSize(dyn_cast<StructType>(pointee));
   }
   if (!dereferencePtr) {
     return 4;
   }
   size_t total = 0;
   const size_t count = PT->getNumContainedTypes();
   for (size_t idx = 0; idx < count; ++idx) {
     total += getTypeSize(PT->getContainedType(idx), dereferencePtr);
   }
   return total;
 }

 // Opaque types are given a fixed size of 4. Neither parameter is used.
 size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT,
     bool dereferencePtr) {
   (void)OT;
   (void)dereferencePtr;
   //assert(0 && "Should not be able to calculate the size of an opaque type");
   const size_t opaqueSize = 4;
   return opaqueSize;
 }